├── .cache └── .temp │ └── 1740397786743_resample.wav ├── C2SER-llm ├── config.yaml ├── infer_runtime.py ├── prompt_config.yaml ├── requirements.txt ├── setup.py └── wenet │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ └── __init__.cpython-39.pyc │ ├── bin │ ├── alignment.py │ ├── average_model.py │ ├── export_ipex.py │ ├── export_jit.py │ ├── export_onnx_bpu.py │ ├── export_onnx_cpu.py │ ├── export_onnx_gpu.py │ ├── recognize.py │ ├── recognize4llmasr.py │ ├── recognize_onnx_gpu.py │ └── train.py │ ├── cli │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── hub.cpython-310.pyc │ │ ├── hub.cpython-311.pyc │ │ ├── hub.cpython-39.pyc │ │ ├── model.cpython-310.pyc │ │ ├── model.cpython-311.pyc │ │ └── model.cpython-39.pyc │ ├── hub.py │ └── model.py │ ├── dataset │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── dataset.cpython-310.pyc │ │ └── dataset.cpython-311.pyc │ ├── datapipes.py │ ├── dataset.py │ ├── kaldi_io.py │ ├── process │ │ ├── __pycache__ │ │ │ ├── processor.cpython-310.pyc │ │ │ └── processor.cpython-311.pyc │ │ ├── processor.py │ │ ├── processor_base-version.py │ │ ├── processor_base-version_emotion-only_with-ssl-vec.py │ │ └── processor_instrcut-version.py │ └── wav_distortion.py │ ├── efficient_conformer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── attention.cpython-39.pyc │ │ └── subsampling.cpython-39.pyc │ ├── attention.py │ └── subsampling.py │ ├── llm_asr │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── downsampler.cpython-39.pyc │ │ ├── init_llmasr.cpython-39.pyc │ │ ├── llmasr_model.cpython-39.pyc │ │ └── utils4llmasr.cpython-39.pyc │ ├── downsampler.py │ ├── init_llmasr.py │ ├── llmasr_model.py │ └── utils4llmasr.py │ ├── paraformer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── embedding.cpython-39.pyc │ │ └── search.cpython-39.pyc │ ├── embedding.py │ └── search.py │ ├── squeezeformer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── conv2d.cpython-39.pyc │ │ └── subsampling.cpython-39.pyc │ ├── conv2d.py │ └── subsampling.py │ ├── text │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── base_tokenizer.cpython-310.pyc │ │ ├── base_tokenizer.cpython-311.pyc │ │ ├── base_tokenizer.cpython-39.pyc │ │ ├── bpe_tokenizer.cpython-310.pyc │ │ ├── bpe_tokenizer.cpython-311.pyc │ │ ├── bpe_tokenizer.cpython-39.pyc │ │ ├── char_tokenizer.cpython-310.pyc │ │ ├── char_tokenizer.cpython-311.pyc │ │ ├── char_tokenizer.cpython-39.pyc │ │ ├── hugging_face_tokenizer.cpython-310.pyc │ │ ├── hugging_face_tokenizer.cpython-311.pyc │ │ ├── hugging_face_tokenizer.cpython-39.pyc │ │ ├── paraformer_tokenizer.cpython-310.pyc │ │ ├── paraformer_tokenizer.cpython-311.pyc │ │ ├── paraformer_tokenizer.cpython-39.pyc │ │ ├── tokenize_utils.cpython-310.pyc │ │ ├── tokenize_utils.cpython-311.pyc │ │ ├── tokenize_utils.cpython-39.pyc │ │ ├── whisper_tokenizer.cpython-310.pyc │ │ ├── whisper_tokenizer.cpython-311.pyc │ │ └── whisper_tokenizer.cpython-39.pyc │ ├── base_tokenizer.py │ ├── bpe_tokenizer.py │ ├── char_tokenizer.py │ ├── hugging_face_tokenizer.py │ ├── paraformer_tokenizer.py │ ├── tokenize_utils.py │ └── whisper_tokenizer.py │ ├── transformer │ ├── __init__.py │ ├── __pycache__ │ │ ├── 
__init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── asr_model.cpython-310.pyc │ │ ├── asr_model.cpython-311.pyc │ │ ├── asr_model.cpython-39.pyc │ │ ├── attention.cpython-310.pyc │ │ ├── attention.cpython-311.pyc │ │ ├── attention.cpython-39.pyc │ │ ├── cmvn.cpython-310.pyc │ │ ├── cmvn.cpython-311.pyc │ │ ├── cmvn.cpython-39.pyc │ │ ├── convolution.cpython-310.pyc │ │ ├── convolution.cpython-311.pyc │ │ ├── convolution.cpython-39.pyc │ │ ├── ctc.cpython-310.pyc │ │ ├── ctc.cpython-311.pyc │ │ ├── ctc.cpython-39.pyc │ │ ├── decoder.cpython-310.pyc │ │ ├── decoder.cpython-311.pyc │ │ ├── decoder.cpython-39.pyc │ │ ├── decoder_layer.cpython-310.pyc │ │ ├── decoder_layer.cpython-311.pyc │ │ ├── decoder_layer.cpython-39.pyc │ │ ├── embedding.cpython-310.pyc │ │ ├── embedding.cpython-311.pyc │ │ ├── embedding.cpython-39.pyc │ │ ├── encoder.cpython-310.pyc │ │ ├── encoder.cpython-311.pyc │ │ ├── encoder.cpython-39.pyc │ │ ├── encoder_layer.cpython-310.pyc │ │ ├── encoder_layer.cpython-311.pyc │ │ ├── encoder_layer.cpython-39.pyc │ │ ├── label_smoothing_loss.cpython-310.pyc │ │ ├── label_smoothing_loss.cpython-311.pyc │ │ ├── label_smoothing_loss.cpython-39.pyc │ │ ├── norm.cpython-310.pyc │ │ ├── norm.cpython-311.pyc │ │ ├── norm.cpython-39.pyc │ │ ├── positionwise_feed_forward.cpython-310.pyc │ │ ├── positionwise_feed_forward.cpython-311.pyc │ │ ├── positionwise_feed_forward.cpython-39.pyc │ │ ├── search.cpython-310.pyc │ │ ├── search.cpython-311.pyc │ │ ├── search.cpython-39.pyc │ │ ├── subsampling.cpython-310.pyc │ │ ├── subsampling.cpython-311.pyc │ │ ├── subsampling.cpython-39.pyc │ │ ├── swish.cpython-310.pyc │ │ ├── swish.cpython-311.pyc │ │ └── swish.cpython-39.pyc │ ├── asr_model.py │ ├── attention.py │ ├── cmvn.py │ ├── convolution.py │ ├── ctc.py │ ├── decoder.py │ ├── decoder_layer.py │ ├── embedding.py │ ├── encoder.py │ ├── encoder_layer.py │ ├── label_smoothing_loss.py │ ├── norm.py │ ├── positionwise_feed_forward.py │ ├── search.py │ ├── subsampling.py │ └── swish.py │ ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── checkpoint.cpython-310.pyc │ │ ├── checkpoint.cpython-311.pyc │ │ ├── checkpoint.cpython-39.pyc │ │ ├── class_utils.cpython-310.pyc │ │ ├── class_utils.cpython-311.pyc │ │ ├── class_utils.cpython-39.pyc │ │ ├── cmvn.cpython-310.pyc │ │ ├── cmvn.cpython-311.pyc │ │ ├── cmvn.cpython-39.pyc │ │ ├── common.cpython-310.pyc │ │ ├── common.cpython-311.pyc │ │ ├── common.cpython-39.pyc │ │ ├── config.cpython-310.pyc │ │ ├── config.cpython-311.pyc │ │ ├── context_graph.cpython-310.pyc │ │ ├── context_graph.cpython-311.pyc │ │ ├── context_graph.cpython-39.pyc │ │ ├── ctc_utils.cpython-310.pyc │ │ ├── ctc_utils.cpython-311.pyc │ │ ├── ctc_utils.cpython-39.pyc │ │ ├── executor.cpython-310.pyc │ │ ├── executor.cpython-311.pyc │ │ ├── file_utils.cpython-310.pyc │ │ ├── file_utils.cpython-311.pyc │ │ ├── file_utils.cpython-39.pyc │ │ ├── fsdp_utils.cpython-310.pyc │ │ ├── fsdp_utils.cpython-311.pyc │ │ ├── init_dataset.cpython-310.pyc │ │ ├── init_dataset.cpython-311.pyc │ │ ├── init_model.cpython-310.pyc │ │ ├── init_model.cpython-311.pyc │ │ ├── init_model.cpython-39.pyc │ │ ├── init_tokenizer.cpython-310.pyc │ │ ├── init_tokenizer.cpython-311.pyc │ │ ├── init_tokenizer.cpython-39.pyc │ │ ├── mask.cpython-310.pyc │ │ ├── mask.cpython-311.pyc │ │ ├── mask.cpython-39.pyc │ │ ├── rope_utils.cpython-310.pyc │ │ ├── 
rope_utils.cpython-311.pyc │ │ ├── rope_utils.cpython-39.pyc │ │ ├── scheduler.cpython-310.pyc │ │ ├── scheduler.cpython-311.pyc │ │ ├── train_utils.cpython-310.pyc │ │ └── train_utils.cpython-311.pyc │ ├── checkpoint.py │ ├── class_utils.py │ ├── cmvn.py │ ├── common.py │ ├── config.py │ ├── context_graph.py │ ├── ctc_utils.py │ ├── executor.py │ ├── file_utils.py │ ├── fsdp_utils.py │ ├── init_dataset.py │ ├── init_model.py │ ├── init_tokenizer.py │ ├── mask.py │ ├── rope_utils.py │ ├── scheduler.py │ └── train_utils.py │ └── whisper │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-39.pyc │ ├── whisper.cpython-310.pyc │ ├── whisper.cpython-311.pyc │ └── whisper.cpython-39.pyc │ ├── convert_whisper_to_wenet_config_and_ckpt.py │ ├── whisper.py │ └── whisper_with_clap.py ├── Emo-Emilia └── Emo-Emilia-ALL.jsonl ├── Emotion2Vec-S ├── downstream_EmoBox │ └── k_fold_CV.sh ├── examples │ ├── .gitignore │ └── data2vec │ │ └── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── data2vec2.cpython-38.pyc │ │ ├── data2vec2.py │ │ └── modalities │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── audio.cpython-38.pyc │ │ ├── base.cpython-38.pyc │ │ └── modules.cpython-38.pyc │ │ ├── audio.py │ │ ├── base.py │ │ └── modules.py ├── extract_feature.sh ├── features │ ├── features_frm │ │ ├── 4YJy1uDx0jM_769.npy │ │ └── vo_EQAST002_1_paimon_07.npy │ └── features_utt │ │ ├── 4YJy1uDx0jM_769.npy │ │ └── vo_EQAST002_1_paimon_07.npy ├── speech_feature_extraction.py ├── test_wav │ ├── 4YJy1uDx0jM_769.wav │ └── vo_EQAST002_1_paimon_07.wav └── wav.scp ├── README.md └── figs └── c2ser.png /.cache/.temp/1740397786743_resample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/.cache/.temp/1740397786743_resample.wav -------------------------------------------------------------------------------- /C2SER-llm/config.yaml: -------------------------------------------------------------------------------- 1 | model: llmasr 2 | 3 | # tokenizer 4 | tokenizer: huggingface 5 | tokenizer_conf: 6 | llm_path: Qwen/Qwen2-7B 7 | use_lora: true 8 | lora_alpha: 32 9 | lora_rank: 8 10 | lora_dropout: 0.1 11 | speech_token_num: 4097 12 | 13 | fire_module: link_and_encoder_and_lora # link encoder llm link_and_encoder link_and_encoder_and_lora, 14 | downsample_rate: 4 # 1 2 4 8 15 | adapter_type: gxl 16 | llm_path: Qwen/Qwen2-7B 17 | optim: adamw 18 | optim_conf: 19 | betas: 20 | - 0.9 21 | - 0.99 22 | eps: 1.0e-06 23 | lr: 5.0e-05 24 | weight_decay: 0.01 25 | scheduler: warmuplr 26 | scheduler_conf: 27 | warmup_steps: 8000 28 | 29 | cmvn: null 30 | cmvn_conf: 31 | cmvn_file: null 32 | is_json_cmvn: null 33 | ctc_conf: 34 | ctc_blank_id: 50362 35 | 36 | dataset: asr 37 | dataset_conf: 38 | batch_conf: 39 | batch_size: 26 40 | batch_type: dynamic 41 | max_frames_in_batch: 3900 # 3900 42 | max_seq_in_batch: 1900 # 1900 43 | feats_type: log_mel_spectrogram 44 | filter_conf: 45 | max_length: 2900 46 | min_length: 0 47 | token_max_length: 200 48 | token_min_length: 1 49 | filter_no_extra_info: true # 如果没有task lang 等信息,直接过滤掉, 适用于通用多任务训练, 推理时应该关掉 50 | max_seq_len: 1000 #1000 51 | language_conf: 52 | limited_langs: 53 | - zh 54 | log_mel_spectrogram_conf: 55 | hop_length: 160 56 | n_fft: 400 57 | num_mel_bins: 80 58 | padding: 0 59 | resample_conf: 60 | resample_rate: 16000 61 | shuffle: 
true 62 | shuffle_conf: 63 | shuffle_size: 1500 64 | sort: true 65 | sort_conf: 66 | sort_size: 500 67 | spec_aug: true 68 | spec_aug_conf: 69 | max_f: 10 70 | max_t: 50 71 | num_f_mask: 2 72 | num_t_mask: 2 73 | spec_sub: true 74 | spec_sub_conf: 75 | max_t: 30 76 | num_t_sub: 3 77 | spec_trim: false 78 | speed_perturb: false 79 | eod_id: 151643 # for whisper 80 | split_num: 1 # 25000tar -> /split_Num 1000 81 | multi_num: 1 # 2 82 | prompt_conf_path: ./prompt_config.yaml 83 | continue_data: true 84 | 85 | decoder: transformer 86 | decoder_conf: 87 | activation_type: gelu 88 | attention_heads: 16 89 | dropout_rate: 0.1 90 | gradient_checkpointing: true 91 | input_layer: embed_learnable_pe 92 | key_bias: false 93 | linear_units: 4096 94 | normalize_before: true 95 | num_blocks: 24 96 | positional_dropout_rate: 0.0 97 | self_attention_dropout_rate: 0.0 98 | src_attention: true 99 | src_attention_dropout_rate: 0.0 100 | tie_word_embedding: true 101 | use_output_layer: true 102 | encoder: transformer 103 | encoder_conf: 104 | activation_type: gelu 105 | attention_dropout_rate: 0.0 106 | attention_heads: 16 107 | dropout_rate: 0.1 108 | gradient_checkpointing: true 109 | input_layer: conv1d2 110 | key_bias: false 111 | linear_units: 4096 112 | normalize_before: true 113 | num_blocks: 24 114 | output_size: 1024 115 | pos_enc_layer_type: abs_pos_whisper 116 | positional_dropout_rate: 0.1 117 | static_chunk_size: -1 118 | use_dynamic_chunk: false 119 | use_dynamic_left_chunk: false 120 | grad_clip: 5 121 | accum_grad: 4 122 | input_dim: 80 123 | log_interval: 10 124 | save_interval: 1250 125 | max_epoch: 100 126 | 127 | model_conf: 128 | ctc_weight: 0 129 | length_normalized_loss: false 130 | lsm_weight: 0.1 131 | 132 | init_step: true 133 | -------------------------------------------------------------------------------- /C2SER-llm/infer_runtime.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch.nn.functional as F 3 | from gxl_ai_utils.utils import utils_file 4 | from wenet.utils.init_tokenizer import init_tokenizer 5 | from gxl_ai_utils.config.gxl_config import GxlNode 6 | from wenet.utils.init_model import init_model 7 | import logging 8 | import librosa 9 | import torch 10 | import torchaudio 11 | import numpy as np 12 | 13 | logging.basicConfig(level=logging.DEBUG, 14 | format='%(asctime)s %(levelname)s %(message)s') 15 | config_path = "./C2SER-llm/config.yaml" 16 | checkpoint_path = "/home/work_nfs16/xlgeng/code/wenet_undersdand_and_speech_xlgeng_emotion_only/examples/wenetspeech/whisper/exp/two_stage_train/stage_2_plus_meld/step_9999.pt" 17 | args = GxlNode({ 18 | "checkpoint": checkpoint_path, 19 | }) 20 | configs = utils_file.load_dict_from_yaml(config_path) 21 | model, configs = init_model(args, configs) 22 | gpu_id = 0 23 | model = model.cuda(gpu_id) 24 | tokenizer = init_tokenizer(configs) 25 | print(model) 26 | resample_rate = 16000 27 | 28 | def do_resample(input_wav_path, output_wav_path): 29 | """""" 30 | print(f'input_wav_path: {input_wav_path}, output_wav_path: {output_wav_path}') 31 | waveform, sample_rate = torchaudio.load(input_wav_path) 32 | # 检查音频的维度 33 | num_channels = waveform.shape[0] 34 | # 如果音频是多通道的,则进行通道平均 35 | if num_channels > 1: 36 | waveform = torch.mean(waveform, dim=0, keepdim=True) 37 | waveform = torchaudio.transforms.Resample( 38 | orig_freq=sample_rate, new_freq=16000)(waveform) 39 | utils_file.makedir_for_file(output_wav_path) 40 | torchaudio.save(output_wav_path, waveform, 16000) 41 | 42 | 43 
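# A minimal batch-driver sketch for the do_decode() function defined below: it walks a
# Kaldi-style wav.scp and pairs each utterance with its precomputed Emotion2Vec-S
# utterance-level feature before running inference. The helper name, the default paths,
# and the assumption that each wav.scp line reads "<utt_id> <wav_path>" are illustrative
# assumptions, not part of the original script.
def decode_from_scp(scp_path="./Emotion2Vec-S/wav.scp",
                    feat_dir="./Emotion2Vec-S/features/features_utt",
                    prompt="Please consider the speaking style, content, and directly provide the speaker's emotion in this speech."):
    import os
    results = {}
    with open(scp_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            utt_id, wav_path = line.strip().split(maxsplit=1)
            ssl_path = os.path.join(feat_dir, f"{utt_id}.npy")  # utterance-level SSL vector
            results[utt_id] = do_decode(wav_path, prompt, ssl_path)
    return results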
| def do_decode(input_wav_path, input_prompt, ssl_vector_path): 44 | # input_prompt = TASK_PROMPT_MAPPING.get(input_prompt, "未知任务类型") 45 | print(f"wav_path: {input_wav_path}, prompt:{input_prompt}") 46 | timestamp_ms = int(time.time() * 1000) 47 | now_file_tmp_path_resample = f'./.cache/.temp/{timestamp_ms}_resample.wav' 48 | do_resample(input_wav_path, now_file_tmp_path_resample) 49 | input_wav_path = now_file_tmp_path_resample 50 | waveform, sample_rate = torchaudio.load(input_wav_path) 51 | waveform = waveform.squeeze(0) # (channel=1, sample) -> (sample,) 52 | print(f'wavform shape: {waveform.shape}, sample_rate: {sample_rate}') 53 | window = torch.hann_window(400) 54 | stft = torch.stft(waveform, 55 | 400, 56 | 160, 57 | window=window, 58 | return_complex=True) 59 | magnitudes = stft[..., :-1].abs() ** 2 60 | 61 | filters = torch.from_numpy( 62 | librosa.filters.mel(sr=sample_rate, 63 | n_fft=400, 64 | n_mels=80)) 65 | mel_spec = filters @ magnitudes 66 | 67 | # NOTE(): https://github.com/openai/whisper/discussions/269 68 | log_spec = torch.clamp(mel_spec, min=1e-10).log10() 69 | log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) 70 | log_spec = (log_spec + 4.0) / 4.0 71 | feat = log_spec.transpose(0, 1) 72 | feat_lens = torch.tensor([feat.shape[0]], dtype=torch.int64).to(gpu_id) 73 | feat = feat.unsqueeze(0).to(gpu_id) 74 | # feat = feat.half() 75 | # feat_lens = feat_lens.half() 76 | numpy_array = np.load(ssl_vector_path) 77 | 78 | tensor = torch.from_numpy(numpy_array) 79 | pad_amount = 1024 - tensor.size(1) 80 | padded_tensor_ssl = F.pad(tensor, (0, pad_amount), mode='constant', value=0) 81 | res_text = model.generate(wavs=feat, wavs_len=feat_lens, prompt=input_prompt, padded_tensor_ssl=padded_tensor_ssl)[0] 82 | print("result:", res_text) 83 | return res_text 84 | 85 | 86 | if __name__ == "__main__": 87 | input_wav_path = "./Emotion2Vec-S/test_wav/vo_EQAST002_1_paimon_07.wav" 88 | input_prompt = "Please consider the speaking style, content, and directly provide the speaker's emotion in this speech." # for stage1, more prompt refer to ./prompt_config.yaml 89 | ssl_vector_path = "./Emotion2Vec-S/features/features_utt/vo_EQAST002_1_paimon_07.npy" # for ssl, the path of ssl vector 90 | res_text_list = do_decode(input_wav_path, input_prompt, ssl_vector_path) 91 | # print(res_text_list) 92 | 93 | -------------------------------------------------------------------------------- /C2SER-llm/prompt_config.yaml: -------------------------------------------------------------------------------- 1 | : 2 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 3 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 4 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 5 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 6 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 7 | : 8 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 9 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 10 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 11 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 
12 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 13 | -------------------------------------------------------------------------------- /C2SER-llm/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24 2 | jsonlines==4.0.0 3 | torch==2.1.0 4 | transformers==4.44.0 5 | torchaudio==2.1.0 6 | librosa 7 | tensorboardX>=2.5 8 | tqdm 9 | absl-py 10 | psutil 11 | cloudpickle 12 | ml-dtypes 13 | tornado 14 | openai-whisper 15 | colorama 16 | peft 17 | sox 18 | deepspeed 19 | librosa 20 | gxl_ai_utils 21 | jsonlines 22 | -------------------------------------------------------------------------------- /C2SER-llm/setup.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from setuptools import setup, find_packages 3 | 4 | requirements = [ 5 | "numpy", 6 | "requests", 7 | "tqdm", 8 | "torch>=1.13.0", 9 | "torchaudio>=0.13.0", 10 | "openai-whisper", 11 | "librosa", 12 | ] 13 | 14 | extra_require = { 15 | "torch-npu": [ 16 | "torch==2.2.0", "torch-npu==2.2.0", "torchaudio==2.2.0", "decorator", 17 | "numpy<2.0.0", "attrs", "psutil" 18 | ], 19 | } 20 | 21 | if platform.system() == 'Windows': 22 | requirements += ['PySoundFile'] 23 | 24 | setup( 25 | name="wenet", 26 | install_requires=requirements, 27 | packages=find_packages(), 28 | entry_points={"console_scripts": [ 29 | "wenet = wenet.cli.transcribe:main", 30 | ]}, 31 | extras_require=extra_require, 32 | ) 33 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/__init__.py: -------------------------------------------------------------------------------- 1 | from wenet.cli.model import load_model # noqa 2 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/bin/average_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import argparse 17 | import glob 18 | import sys 19 | 20 | import yaml 21 | import torch 22 | 23 | 24 | def get_args(): 25 | parser = argparse.ArgumentParser(description='average model') 26 | parser.add_argument('--dst_model', required=True, help='averaged model') 27 | parser.add_argument('--src_path', 28 | required=True, 29 | help='src model path for average') 30 | parser.add_argument('--val_best', 31 | action="store_true", 32 | help='averaged model') 33 | parser.add_argument('--num', 34 | default=5, 35 | type=int, 36 | help='nums for averaged model') 37 | parser.add_argument('--min_epoch', 38 | default=0, 39 | type=int, 40 | help='min epoch used for averaging model') 41 | parser.add_argument('--max_epoch', 42 | default=sys.maxsize, 43 | type=int, 44 | help='max epoch used for averaging model') 45 | parser.add_argument('--min_step', 46 | default=0, 47 | type=int, 48 | help='min step used for averaging model') 49 | parser.add_argument('--max_step', 50 | default=sys.maxsize, 51 | type=int, 52 | help='max step used for averaging model') 53 | parser.add_argument('--mode', 54 | default="hybrid", 55 | choices=["hybrid", "epoch", "step"], 56 | type=str, 57 | help='average mode') 58 | 59 | args = parser.parse_args() 60 | print(args) 61 | return args 62 | 63 | 64 | def main(): 65 | args = get_args() 66 | checkpoints = [] 67 | val_scores = [] 68 | if args.val_best: 69 | if args.mode == "hybrid": 70 | yamls = glob.glob('{}/*.yaml'.format(args.src_path)) 71 | yamls = [ 72 | f for f in yamls 73 | if not (os.path.basename(f).startswith('train') 74 | or os.path.basename(f).startswith('init')) 75 | ] 76 | elif args.mode == "step": 77 | yamls = glob.glob('{}/step_*.yaml'.format(args.src_path)) 78 | else: 79 | yamls = glob.glob('{}/epoch_*.yaml'.format(args.src_path)) 80 | for y in yamls: 81 | with open(y, 'r') as f: 82 | dic_yaml = yaml.load(f, Loader=yaml.FullLoader) 83 | loss = dic_yaml['loss_dict']['loss'] 84 | epoch = dic_yaml['epoch'] 85 | step = dic_yaml['step'] 86 | tag = dic_yaml['tag'] 87 | if epoch >= args.min_epoch and epoch <= args.max_epoch \ 88 | and step >= args.min_step and step <= args.max_step: 89 | val_scores += [[epoch, step, loss, tag]] 90 | sorted_val_scores = sorted(val_scores, 91 | key=lambda x: x[2], 92 | reverse=False) 93 | print("best val (epoch, step, loss, tag) = " + 94 | str(sorted_val_scores[:args.num])) 95 | path_list = [ 96 | args.src_path + '/{}.pt'.format(score[-1]) 97 | for score in sorted_val_scores[:args.num] 98 | ] 99 | else: 100 | path_list = glob.glob('{}/[!init]*.pt'.format(args.src_path)) 101 | path_list = sorted(path_list, key=os.path.getmtime) 102 | path_list = path_list[-args.num:] 103 | print(path_list) 104 | avg = {} 105 | num = args.num 106 | assert num == len(path_list) 107 | for path in path_list: 108 | print('Processing {}'.format(path)) 109 | states = torch.load(path, map_location=torch.device('cpu')) 110 | for k in states.keys(): 111 | if k not in avg.keys(): 112 | avg[k] = states[k].clone() 113 | else: 114 | avg[k] += states[k] 115 | # average 116 | for k in avg.keys(): 117 | if avg[k] is 
not None: 118 | # pytorch 1.6 use true_divide instead of /= 119 | avg[k] = torch.true_divide(avg[k], num) 120 | print('Saving to {}'.format(args.dst_model)) 121 | torch.save(avg, args.dst_model) 122 | 123 | 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/bin/export_ipex.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021-2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from __future__ import print_function 5 | 6 | import argparse 7 | import logging 8 | import os 9 | 10 | import torch 11 | import yaml 12 | 13 | from wenet.utils.init_model import init_model 14 | import intel_extension_for_pytorch as ipex 15 | from intel_extension_for_pytorch.quantization import prepare, convert 16 | 17 | 18 | def get_args(): 19 | parser = argparse.ArgumentParser(description='export your script model') 20 | parser.add_argument('--config', required=True, help='config file') 21 | parser.add_argument('--checkpoint', required=True, help='checkpoint model') 22 | parser.add_argument('--output_file', default=None, help='output file') 23 | parser.add_argument('--dtype', 24 | default="fp32", 25 | help='choose the dtype to run:[fp32,bf16]') 26 | parser.add_argument('--output_quant_file', 27 | default=None, 28 | help='output quantized model file') 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def scripting(model): 34 | with torch.inference_mode(): 35 | script_model = torch.jit.script(model) 36 | script_model = torch.jit.freeze( 37 | script_model, 38 | preserved_attrs=[ 39 | "forward_encoder_chunk", "ctc_activation", 40 | "forward_attention_decoder", "subsampling_rate", 41 | "right_context", "sos_symbol", "eos_symbol", 42 | "is_bidirectional_decoder" 43 | ]) 44 | return script_model 45 | 46 | 47 | def main(): 48 | args = get_args() 49 | logging.basicConfig(level=logging.DEBUG, 50 | format='%(asctime)s %(levelname)s %(message)s') 51 | # No need gpu for model export 52 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 53 | 54 | with open(args.config, 'r') as fin: 55 | configs = yaml.load(fin, Loader=yaml.FullLoader) 56 | model, configs = init_model(args, configs) 57 | print(model) 58 | 59 | # Apply IPEX optimization 60 | model.eval() 61 | torch._C._jit_set_texpr_fuser_enabled(False) 62 | model.to(memory_format=torch.channels_last) 63 | if args.dtype == "fp32": 64 | ipex_model = ipex.optimize(model) 65 | elif args.dtype == "bf16": # For Intel 4th generation Xeon (SPR) 66 | ipex_model = ipex.optimize(model, 67 | dtype=torch.bfloat16, 68 | weights_prepack=False) 69 | 70 | # Export jit torch script model 71 | if args.output_file: 72 | if args.dtype == "fp32": 73 | script_model = scripting(ipex_model) 74 | elif args.dtype == "bf16": 75 | torch._C._jit_set_autocast_mode(True) 76 | with torch.cpu.amp.autocast(): 77 | script_model = scripting(ipex_model) 78 | script_model.save(args.output_file) 79 | print('Export model successfully, see {}'.format(args.output_file)) 80 | 81 | # Export quantized jit torch script model 82 | if args.output_quant_file: 83 | dynamic_qconfig = ipex.quantization.default_dynamic_qconfig 84 | dummy_data = (torch.zeros(1, 67, 80), 16, -16, 85 | torch.zeros(12, 4, 32, 128), torch.zeros(12, 1, 256, 7)) 86 | model = prepare(model, dynamic_qconfig, dummy_data) 87 | model = convert(model) 88 | script_quant_model = scripting(model) 89 | script_quant_model.save(args.output_quant_file) 90 | print('Export quantized model 
successfully, ' 91 | 'see {}'.format(args.output_quant_file)) 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/bin/export_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import logging 19 | import os 20 | 21 | import torch 22 | import yaml 23 | 24 | from wenet.utils.init_model import init_model 25 | 26 | 27 | def get_args(): 28 | parser = argparse.ArgumentParser(description='export your script model') 29 | parser.add_argument('--config', required=True, help='config file') 30 | parser.add_argument('--checkpoint', required=True, help='checkpoint model') 31 | parser.add_argument('--output_file', default=None, help='output file') 32 | parser.add_argument('--output_quant_file', 33 | default=None, 34 | help='output quantized model file') 35 | args = parser.parse_args() 36 | return args 37 | 38 | 39 | def main(): 40 | args = get_args() 41 | args.jit = True 42 | logging.basicConfig(level=logging.DEBUG, 43 | format='%(asctime)s %(levelname)s %(message)s') 44 | # No need gpu for model export 45 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 46 | 47 | with open(args.config, 'r') as fin: 48 | configs = yaml.load(fin, Loader=yaml.FullLoader) 49 | model, configs = init_model(args, configs) 50 | model.eval() 51 | print(model) 52 | # Export jit torch script model 53 | 54 | if args.output_file: 55 | script_model = torch.jit.script(model) 56 | script_model.save(args.output_file) 57 | print('Export model successfully, see {}'.format(args.output_file)) 58 | 59 | # Export quantized jit torch script model 60 | if args.output_quant_file: 61 | quantized_model = torch.quantization.quantize_dynamic( 62 | model, {torch.nn.Linear}, dtype=torch.qint8) 63 | print(quantized_model) 64 | script_quant_model = torch.jit.script(quantized_model) 65 | script_quant_model.save(args.output_quant_file) 66 | print('Export quantized model successfully, ' 67 | 'see {}'.format(args.output_quant_file)) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/__init__.cpython-310.pyc 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/hub.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/hub.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/hub.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/hub.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/hub.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/hub.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/model.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/hub.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Mddct(hamddct@gmail.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import requests 17 | import sys 18 | import tarfile 19 | from pathlib import Path 20 | from urllib.request import urlretrieve 21 | 22 | import tqdm 23 | 24 | 25 | def download(url: str, dest: str, only_child=True): 26 | """ download from url to dest 27 | """ 28 | assert os.path.exists(dest) 29 | print('Downloading {} to {}'.format(url, dest)) 30 | 31 | def progress_hook(t): 32 | last_b = [0] 33 | 34 | def update_to(b=1, bsize=1, tsize=None): 35 | if tsize not in (None, -1): 36 | t.total = tsize 37 | displayed = t.update((b - last_b[0]) * bsize) 38 | last_b[0] = b 39 | return displayed 40 | 41 | return update_to 42 | 43 | # *.tar.gz 44 | name = url.split('?')[0].split('/')[-1] 45 | tar_path = os.path.join(dest, name) 46 | with tqdm.tqdm(unit='B', 47 | unit_scale=True, 48 | unit_divisor=1024, 49 | miniters=1, 50 | desc=(name)) as t: 51 | urlretrieve(url, 52 | filename=tar_path, 53 | reporthook=progress_hook(t), 54 | data=None) 55 | t.total = t.n 56 | 57 | with tarfile.open(tar_path) as f: 58 | if not only_child: 59 | f.extractall(dest) 60 | else: 61 | for tarinfo in f: 62 | if "/" not in tarinfo.name: 63 | continue 64 | name = os.path.basename(tarinfo.name) 65 | fileobj = f.extractfile(tarinfo) 66 | with open(os.path.join(dest, name), "wb") as writer: 67 | writer.write(fileobj.read()) 68 | 69 | 70 | class Hub(object): 71 | """Hub for wenet pretrain runtime model 72 | """ 73 | # TODO(Mddct): make assets class to support other language 74 | Assets = { 75 | # wenetspeech 76 | "chinese": "wenetspeech_u2pp_conformer_libtorch.tar.gz", 77 | # gigaspeech 78 | "english": "gigaspeech_u2pp_conformer_libtorch.tar.gz", 79 | # paraformer 80 | "paraformer": "paraformer.tar.gz" 81 | } 82 | 83 | def __init__(self) -> None: 84 | pass 85 | 86 | @staticmethod 87 | def get_model_by_lang(lang: str) -> str: 88 | if lang not in Hub.Assets.keys(): 89 | print('ERROR: Unsupported language {} !!!'.format(lang)) 90 | sys.exit(1) 91 | 92 | # NOTE(Mddct): model_dir structure 93 | # Path.Home()/.wenet 94 | # - chs 95 | # - units.txt 96 | # - final.zip 97 | # - en 98 | # - units.txt 99 | # - final.zip 100 | model = Hub.Assets[lang] 101 | model_dir = os.path.join(Path.home(), ".wenet", lang) 102 | if not os.path.exists(model_dir): 103 | os.makedirs(model_dir) 104 | # TODO(Mddct): model metadata 105 | if set(["final.zip", 106 | "units.txt"]).issubset(set(os.listdir(model_dir))): 107 | return model_dir 108 | # If not exist, download 109 | response = requests.get( 110 | "https://modelscope.cn/api/v1/datasets/wenet/wenet_pretrained_models/oss/tree" # noqa 111 | ) 112 | model_info = next(data for data in response.json()["Data"] 113 | if data["Key"] == model) 114 | model_url = model_info['Url'] 115 | download(model_url, model_dir, only_child=True) 116 | return model_dir 117 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__pycache__/dataset.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__pycache__/dataset.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__pycache__/dataset.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__pycache__/dataset.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/process/__pycache__/processor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/process/__pycache__/processor.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/process/__pycache__/processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/process/__pycache__/processor.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/efficient_conformer/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/efficient_conformer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/efficient_conformer/__pycache__/attention.cpython-39.pyc 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/efficient_conformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/subsampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) 2 | # 2022 58.com(Wuba) Inc AI Lab. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Subsampling layer definition.""" 17 | 18 | from typing import Tuple, Union 19 | 20 | import torch 21 | from wenet.transformer.subsampling import BaseSubsampling 22 | 23 | 24 | class Conv2dSubsampling2(BaseSubsampling): 25 | """Convolutional 2D subsampling (to 1/4 length). 26 | 27 | Args: 28 | idim (int): Input dimension. 29 | odim (int): Output dimension. 30 | dropout_rate (float): Dropout rate. 31 | 32 | """ 33 | 34 | def __init__(self, idim: int, odim: int, dropout_rate: float, 35 | pos_enc_class: torch.nn.Module): 36 | """Construct an Conv2dSubsampling4 object.""" 37 | super().__init__() 38 | self.conv = torch.nn.Sequential(torch.nn.Conv2d(1, odim, 3, 2), 39 | torch.nn.ReLU()) 40 | self.out = torch.nn.Sequential( 41 | torch.nn.Linear(odim * ((idim - 1) // 2), odim)) 42 | self.pos_enc = pos_enc_class 43 | # The right context for every conv layer is computed by: 44 | # (kernel_size - 1) * frame_rate_of_this_layer 45 | self.subsampling_rate = 2 46 | # 2 = (3 - 1) * 1 47 | self.right_context = 2 48 | 49 | def forward( 50 | self, 51 | x: torch.Tensor, 52 | x_mask: torch.Tensor, 53 | offset: Union[int, torch.Tensor] = 0 54 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 55 | """Subsample x. 56 | 57 | Args: 58 | x (torch.Tensor): Input tensor (#batch, time, idim). 59 | x_mask (torch.Tensor): Input mask (#batch, 1, time). 60 | 61 | Returns: 62 | torch.Tensor: Subsampled tensor (#batch, time', odim), 63 | where time' = time // 2. 64 | torch.Tensor: Subsampled mask (#batch, 1, time'), 65 | where time' = time // 2. 
66 | torch.Tensor: positional encoding 67 | 68 | """ 69 | x = x.unsqueeze(1) # (b, c=1, t, f) 70 | x = self.conv(x) 71 | b, c, t, f = x.size() 72 | x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) 73 | x, pos_emb = self.pos_enc(x, offset) 74 | return x, pos_emb, x_mask[:, :, :-2:2] 75 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/downsampler.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/downsampler.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/init_llmasr.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/init_llmasr.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/llmasr_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/llmasr_model.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/utils4llmasr.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/utils4llmasr.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/downsampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class GxlConv1dSubsampling2(nn.Module): 6 | """Conv1d subsampling module. 7 | 8 | Args: 9 | idim (int): Input dimension. 10 | odim (int): Output dimension. 11 | dropout_rate (float): Dropout rate. 
12 | 13 | """ 14 | 15 | def __init__(self, idim: int, odim: int): 16 | """Construct an Conv1dSubsampling object.""" 17 | super().__init__() 18 | self.conv = torch.nn.Sequential( 19 | torch.nn.Conv1d(idim, odim, 3, 1), 20 | torch.nn.GELU(), 21 | torch.nn.Conv1d(odim, odim, 3, 2), 22 | torch.nn.GELU(), 23 | ) 24 | 25 | def forward(self, x): 26 | """ 27 | 28 | Args: 29 | x: (B, T, idim) 30 | 31 | Returns: 32 | """ 33 | x = x.transpose(1, 2) 34 | x = self.conv(x) 35 | x = x.transpose(1, 2) 36 | return x 37 | 38 | 39 | class GxlConv1dSubsampling4(nn.Module): 40 | """Conv1d subsampling module. 41 | 42 | Args: 43 | idim (int): Input dimension. 44 | odim (int): Output dimension. 45 | dropout_rate (float): Dropout rate. 46 | 47 | """ 48 | 49 | def __init__(self, idim: int, odim: int): 50 | """Construct an Conv1dSubsampling object.""" 51 | super().__init__() 52 | self.conv = torch.nn.Sequential( 53 | torch.nn.ConstantPad1d((2, 0), 0.0), 54 | torch.nn.Conv1d(idim, odim, 3, 1), 55 | torch.nn.GELU(), 56 | torch.nn.ConstantPad1d((2, 0), 0.0), 57 | torch.nn.Conv1d(odim, odim, 3, 2), 58 | torch.nn.GELU(), 59 | torch.nn.ConstantPad1d((2, 0), 0.0), 60 | torch.nn.Conv1d(odim, odim, 3, 2), 61 | torch.nn.GELU(), 62 | ) 63 | 64 | def forward(self, x, mask_pad): 65 | """ 66 | 67 | Args: 68 | x: (B, T, idim) 69 | 70 | Returns: 71 | """ 72 | x = x.transpose(1, 2) 73 | x = self.conv(x) 74 | x = x.transpose(1, 2) 75 | mask_pad = mask_pad[:, :, 0::2] 76 | mask_pad = mask_pad[:, :, 0::2] 77 | return x, mask_pad 78 | 79 | 80 | class GxlConv1dSubsampling6(nn.Module): 81 | """Conv1d subsampling module. 82 | 83 | Args: 84 | idim (int): Input dimension. 85 | odim (int): Output dimension. 86 | dropout_rate (float): Dropout rate. 87 | 88 | """ 89 | 90 | def __init__(self, idim: int, odim: int): 91 | """Construct an Conv1dSubsampling object.""" 92 | super().__init__() 93 | self.conv = torch.nn.Sequential( 94 | torch.nn.Conv1d(idim, odim, 3, 1), 95 | torch.nn.GELU(), 96 | torch.nn.Conv1d(odim, odim, 3, 2), 97 | torch.nn.GELU(), 98 | torch.nn.Conv1d(odim, odim, 3, 3), 99 | torch.nn.GELU(), 100 | ) 101 | 102 | def forward(self, x): 103 | """ 104 | 105 | Args: 106 | x: (B, T, idim) 107 | 108 | Returns: 109 | """ 110 | x = x.transpose(1, 2) 111 | x = self.conv(x) 112 | x = x.transpose(1, 2) 113 | return x 114 | 115 | 116 | class GxlConv1dSubsampling8(nn.Module): 117 | """Conv1d subsampling module. 118 | 119 | Args: 120 | idim (int): Input dimension. 121 | odim (int): Output dimension. 122 | dropout_rate (float): Dropout rate. 
123 | 124 | """ 125 | 126 | def __init__(self, idim: int, odim: int): 127 | """Construct an Conv1dSubsampling object.""" 128 | super().__init__() 129 | self.conv = torch.nn.Sequential( 130 | torch.nn.Conv1d(idim, odim, 3, 1), 131 | torch.nn.GELU(), 132 | torch.nn.Conv1d(odim, odim, 3, 2), 133 | torch.nn.GELU(), 134 | torch.nn.Conv1d(odim, odim, 3, 2), 135 | torch.nn.GELU(), 136 | torch.nn.Conv1d(odim, odim, 3, 2), 137 | torch.nn.GELU(), 138 | ) 139 | 140 | def forward(self, x): 141 | """ 142 | 143 | Args: 144 | x: (B, T, idim) 145 | 146 | Returns: 147 | """ 148 | x = x.transpose(1, 2) 149 | x = self.conv(x) 150 | x = x.transpose(1, 2) 151 | return x 152 | 153 | class LyzConv1dSubsampling(torch.nn.Module): 154 | def __init__( 155 | self, 156 | enc_out_dim: int = 512, 157 | llm_embed_dim: int = 4096, 158 | kernel_size: int = 5, 159 | activation_func: str = 'relu', 160 | norm: str = 'batch', 161 | ): 162 | super().__init__() 163 | 164 | if enc_out_dim * 4 < llm_embed_dim: 165 | self.left_padding1 = nn.ConstantPad1d((kernel_size - 1, 0), 0.0) 166 | self.conv1d1 = nn.Conv1d(enc_out_dim, 2 * enc_out_dim, kernel_size, 1, 0) 167 | self.bn1 = nn.BatchNorm1d(2 * enc_out_dim, eps=1e-3, momentum=0.99) 168 | self.relu1 = nn.ReLU() 169 | 170 | self.left_padding2 = nn.ConstantPad1d((kernel_size - 1, 0), 0.0) 171 | self.conv1d2 = nn.Conv1d(2 * enc_out_dim, 4 * enc_out_dim, kernel_size, 2, 0) 172 | self.bn2 = nn.BatchNorm1d(4 * enc_out_dim, eps=1e-3, momentum=0.99) 173 | self.relu2 = nn.ReLU() 174 | 175 | self.project = nn.Linear(4 * enc_out_dim, llm_embed_dim) 176 | self.cnn_num = 2 177 | else: 178 | self.left_padding2 = nn.ConstantPad1d((kernel_size - 1, 0), 0.0) 179 | self.conv1d2 = nn.Conv1d(enc_out_dim, 2 * enc_out_dim, kernel_size, 2, 0) 180 | if norm == 'batch': 181 | self.bn2 = nn.BatchNorm1d(2 * enc_out_dim, eps=1e-3, momentum=0.99) 182 | elif norm == 'layer': 183 | self.bn2 = nn.LayerNorm(2 * enc_out_dim, eps=1e-3) 184 | if activation_func == 'gelu': 185 | self.relu2 = nn.GELU() 186 | else: 187 | self.relu2 = nn.ReLU() 188 | self.project = nn.Linear(2 * enc_out_dim, llm_embed_dim) 189 | self.cnn_num = 1 190 | 191 | def forward(self, x, mask_pad): 192 | """ 193 | x: B, T, enc_out_dim 194 | mask: (B, T) or (B, 1, T) 195 | """ 196 | x = x.transpose(1, 2) # B, channels, T 197 | 198 | # mask batch padding 199 | if mask_pad.size(2) > 0: # time > 0 200 | x.masked_fill_(~mask_pad, 0.0) 201 | 202 | if self.cnn_num == 2: 203 | x = self.left_padding1(x) 204 | x = self.conv1d1(x) 205 | x = self.bn1(x) 206 | x = self.relu1(x) 207 | 208 | x = self.left_padding2(x) 209 | x = self.conv1d2(x) 210 | if isinstance(self.bn2, nn.LayerNorm): 211 | x = x.transpose(1, 2) 212 | x = self.bn2(x) 213 | if isinstance(self.bn2, nn.LayerNorm): 214 | x = x.transpose(1, 2) 215 | x = self.relu2(x) 216 | 217 | x = x.transpose(1, 2) 218 | x = self.project(x) 219 | 220 | return x, mask_pad[:, :, 0::2] 221 | 222 | def get_downsampler(downsample_rate, ndim=1280): 223 | down_sample_2 = nn.Identity() 224 | if downsample_rate == 2: 225 | down_sample_2 = GxlConv1dSubsampling2(ndim, ndim) 226 | elif downsample_rate == 4: 227 | down_sample_2 = GxlConv1dSubsampling4(ndim, ndim) 228 | elif downsample_rate == 8: 229 | down_sample_2 = GxlConv1dSubsampling8(ndim, ndim) 230 | elif downsample_rate == 6: 231 | down_sample_2 = GxlConv1dSubsampling6(ndim, ndim) 232 | return down_sample_2 -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/init_llmasr.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | 5 | from wenet.llm_asr.llmasr_model import LLMASR_Model 6 | from wenet.transformer.cmvn import GlobalCMVN 7 | from wenet.utils.checkpoint import load_checkpoint, load_trained_modules 8 | from wenet.utils.cmvn import load_cmvn 9 | 10 | from gxl_ai_utils.utils import utils_file 11 | 12 | def init_llmasr(args, configs, is_inference=False): 13 | llm_path = configs["llm_path"] 14 | lora = configs["use_lora"] 15 | lora_alpha = configs["lora_alpha"] 16 | lora_rank = configs["lora_rank"] 17 | lora_dropout = configs["lora_dropout"] 18 | # prompt_pattern = configs['prompt_pattern'] 19 | 20 | encoder_output_dim = -1 21 | if configs['encoder'] == 'transformer': 22 | if configs.get('cmvn', None) == 'global_cmvn': 23 | mean, istd = load_cmvn(configs['cmvn_conf']['cmvn_file'], 24 | configs['cmvn_conf']['is_json_cmvn']) 25 | global_cmvn = GlobalCMVN( 26 | torch.from_numpy(mean).float(), 27 | torch.from_numpy(istd).float()) 28 | else: 29 | global_cmvn = None 30 | encoder_type = configs.get('encoder', 'conformer') 31 | input_dim = configs['input_dim'] 32 | from wenet.utils.init_model import WENET_ENCODER_CLASSES 33 | encoder = WENET_ENCODER_CLASSES[encoder_type]( 34 | input_dim, 35 | global_cmvn=global_cmvn, 36 | **configs['encoder_conf'], 37 | **configs['encoder_conf']['efficient_conf'] 38 | if 'efficient_conf' in configs['encoder_conf'] else {}) 39 | encoder_output_dim = configs['encoder_conf']['output_size'] 40 | elif configs['encoder'] == 'whisper': 41 | raise NotImplementedError('whisper 还没实现') 42 | elif configs['encoder'] == 'hubert': 43 | raise NotImplementedError('hubert 还没实现') 44 | else: 45 | encoder = None 46 | logging.info(f'encoder output dim:{encoder_output_dim}') 47 | 48 | 49 | # encoder = encoder.to(torch.float16) 50 | speech_token_num = configs.get('speech_token_num', 0) 51 | train_speech_out = speech_token_num != 0 52 | 53 | model = LLMASR_Model( 54 | encoder=encoder, 55 | encoder_output_dim=encoder_output_dim, 56 | llm_path=llm_path, 57 | lora=lora, 58 | lora_alpha=lora_alpha, 59 | lora_rank=lora_rank, 60 | lora_dropout=lora_dropout, 61 | is_inference=is_inference, 62 | downsample_rate=configs.get('downsample_rate',1), 63 | adapter_type=configs.get('adapter_type', 'lyz'), 64 | speech_token_num=speech_token_num, 65 | train_speech_out=train_speech_out, 66 | ) 67 | 68 | utils_file.print_model_size(model.encoder) 69 | utils_file.print_model_size(model.llama_model) 70 | # utils_file.print_model_size(model.speech_transformer) 71 | # utils_file.print_model_size(model.speech_llama_proj) 72 | 73 | logging.info(f'开始加载初始化模型') 74 | if hasattr(args, 'checkpoint') and args.checkpoint is not None: 75 | logging.info(f'设置了初始化模型位置,开始加载,参数文件位置:{args.checkpoint}') 76 | infos = load_checkpoint(model, args.checkpoint) 77 | elif hasattr(args, 'checkpoint') and args.enc_init is not None: 78 | infos = load_trained_modules(model, args) 79 | else: 80 | infos = {} 81 | 82 | if configs.get('init_step', False): 83 | infos = {} 84 | configs["init_infos"] = infos 85 | print(configs) 86 | logging.info('加载初始化模型完毕') 87 | 88 | if not is_inference: 89 | logging.info('不更换LLM的参数') 90 | else: 91 | logging.info(' 不更换LLM的参数') 92 | 93 | logging.info('开始选择性冻结模块') 94 | fire_module = configs.get("fire_module", None) 95 | if fire_module is None: 96 | logging.info('没有选择解冻的模块,也就是没有训练参数,直接报错返回') 97 | raise ValueError('没有选择解冻的模块,也就是没有训练参数,直接报错返回') 98 | for k, p in model.named_parameters(): 99 | if fire_module == 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/utils4llmasr.py: --------------------------------------------------------------------------------
1 | import random
2 | from typing import Tuple
3 | 
4 | import torch
5 | 
6 | from wenet.utils.common import pad_list
7 | from gxl_ai_utils.utils import utils_file
8 | 
9 | 
10 | def add_sos_eos4speech_llm(ys_pad: torch.Tensor, sos: int, eos: int,
11 |                            ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
12 |     """Add <sos> and <eos> labels.
13 |     Append an <eos> to ys_out; ys_in is kept basically unchanged.
14 | 
15 |     Args:
16 |         ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
17 |         sos (int): index of <sos>
18 |         eos (int): index of <eos>
19 |         ignore_id (int): index of padding
20 | 
21 |     Returns:
22 |         ys_in (torch.Tensor) : (B, Lmax)
23 |         ys_out (torch.Tensor) : (B, Lmax + 1)
24 | 
25 |     Examples:
26 |         >>> sos_id = 10
27 |         >>> eos_id = 11
28 |         >>> ignore_id = -1
29 |         >>> ys_pad
30 |         tensor([[ 1,  2,  3,  4,  5],
31 |                 [ 4,  5,  6, -1, -1],
32 |                 [ 7,  8,  9, -1, -1]], dtype=torch.int32)
33 |         >>> ys_in, ys_out = add_sos_eos4speech_llm(ys_pad, sos_id, eos_id, ignore_id)
34 |         >>> ys_in
35 |         tensor([[ 1,  2,  3,  4,  5],
36 |                 [ 4,  5,  6, 11, 11],
37 |                 [ 7,  8,  9, 11, 11]])
38 |         >>> ys_out
39 |         tensor([[ 1,  2,  3,  4,  5, 11],
40 |                 [ 4,  5,  6, 11, -1, -1],
41 |                 [ 7,  8,  9, 11, -1, -1]])
42 |     """
43 |     _sos = torch.tensor([sos],
44 |                         dtype=torch.long,
45 |                         requires_grad=False,
46 |                         device=ys_pad.device)
47 |     _eos = torch.tensor([eos],
48 |                         dtype=torch.long,
49 |                         requires_grad=False,
50 |                         device=ys_pad.device)
51 |     ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
52 |     # ys_in = [torch.cat([_sos, y], dim=0) for y in ys]
53 |     ys_in = [y for y in ys]
54 |     ys_out = [torch.cat([y, _eos], dim=0) for y in ys]
55 |     return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
56 | 
57 | global_prompt_dict = None
58 | def get_prompt_by_task(task_name):
59 |     """
60 |     Return a prompt for the given task, sampled at random from the prompts configured for it so the wording stays varied.
61 |     Args:
62 |         task_name: key into conf/prompt.yaml
63 | 
64 |     Returns:
65 |         a randomly selected prompt string for the task
66 |     """
67 |     global global_prompt_dict
68 |     if global_prompt_dict is None:
69 |         global_prompt_dict = utils_file.load_dict_from_yaml('conf/prompt.yaml')
70 |     random_index = random.randint(0, len(global_prompt_dict[task_name])-1)
71 |     return global_prompt_dict[task_name][random_index]
72 | 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/paraformer/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/__pycache__/__init__.cpython-39.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/paraformer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/__pycache__/embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/paraformer/__pycache__/embedding.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/__pycache__/search.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/paraformer/__pycache__/search.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/embedding.py: -------------------------------------------------------------------------------- 1 | from wenet.transformer.embedding import WhisperPositionalEncoding 2 | 3 | 4 | class ParaformerPositinoalEncoding(WhisperPositionalEncoding): 5 | """ Sinusoids position encoding used in paraformer.encoder 6 | """ 7 | 8 | def __init__(self, 9 | depth: int, 10 | d_model: int, 11 | dropout_rate: float = 0.1, 12 | max_len: int = 1500): 13 | super().__init__(depth, dropout_rate, max_len) 14 | self.xscale = d_model**0.5 15 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/squeezeformer/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/squeezeformer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/__pycache__/conv2d.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/squeezeformer/__pycache__/conv2d.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/squeezeformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/conv2d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Conv2d Module with Valid Padding""" 15 | 16 | import torch.nn.functional as F 17 | from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional 18 | 19 | 20 | class Conv2dValid(_ConvNd): 21 | """ 22 | Conv2d operator for VALID mode padding. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | in_channels: int, 28 | out_channels: int, 29 | kernel_size: _size_2_t, 30 | stride: _size_2_t = 1, 31 | padding: Union[str, _size_2_t] = 0, 32 | dilation: _size_2_t = 1, 33 | groups: int = 1, 34 | bias: bool = True, 35 | padding_mode: str = 'zeros', # TODO: refine this type 36 | device=None, 37 | dtype=None, 38 | valid_trigx: bool = False, 39 | valid_trigy: bool = False) -> None: 40 | factory_kwargs = {'device': device, 'dtype': dtype} 41 | kernel_size_ = _pair(kernel_size) 42 | stride_ = _pair(stride) 43 | padding_ = padding if isinstance(padding, str) else _pair(padding) 44 | dilation_ = _pair(dilation) 45 | super(Conv2dValid, 46 | self).__init__(in_channels, out_channels, 47 | kernel_size_, stride_, padding_, dilation_, False, 48 | _pair(0), groups, bias, padding_mode, 49 | **factory_kwargs) 50 | self.valid_trigx = valid_trigx 51 | self.valid_trigy = valid_trigy 52 | 53 | def _conv_forward(self, input: Tensor, weight: Tensor, 54 | bias: Optional[Tensor]): 55 | validx, validy = 0, 0 56 | if self.valid_trigx: 57 | validx = (input.size(-2) * 58 | (self.stride[-2] - 1) - 1 + self.kernel_size[-2]) // 2 59 | if self.valid_trigy: 60 | validy = (input.size(-1) * 61 | (self.stride[-1] - 1) - 1 + self.kernel_size[-1]) // 2 62 | return F.conv2d(input, weight, bias, self.stride, (validx, validy), 63 | self.dilation, self.groups) 64 | 65 | def forward(self, input: Tensor) -> Tensor: 66 | return self._conv_forward(input, self.weight, self.bias) 67 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/base_tokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod, abstractproperty 2 | from typing import Dict, List, Tuple, Union 3 | 4 | T = Union[str, bytes] 5 | 6 | 7 | class BaseTokenizer(ABC): 8 | 9 | def tokenize(self, line: str) -> Tuple[List[T], List[int]]: 10 | tokens = self.text2tokens(line) 11 | ids = self.tokens2ids(tokens) 12 | return tokens, ids 13 | 14 | def detokenize(self, ids: List[int]) -> Tuple[str, List[T]]: 15 | tokens = self.ids2tokens(ids) 16 | text = self.tokens2text(tokens) 17 | return text, tokens 18 | 19 | @abstractmethod 20 | def text2tokens(self, line: str) -> List[T]: 21 | raise NotImplementedError("abstract method") 22 | 23 | @abstractmethod 24 | def tokens2text(self, tokens: List[T]) -> str: 25 | raise NotImplementedError("abstract method") 26 | 27 | @abstractmethod 28 | def tokens2ids(self, tokens: List[T]) -> List[int]: 29 | raise NotImplementedError("abstract method") 30 | 31 | @abstractmethod 32 | def ids2tokens(self, ids: List[int]) -> List[T]: 33 | raise NotImplementedError("abstract method") 34 | 35 | @abstractmethod 36 | def vocab_size(self) -> int: 37 | raise NotImplementedError("abstract method") 38 | 39 | @abstractproperty 40 | def symbol_table(self) -> Dict[T, int]: 41 | raise NotImplementedError("abstract method") 42 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/bpe_tokenizer.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Optional, Union 3 | from wenet.text.char_tokenizer import CharTokenizer 4 | from wenet.text.tokenize_utils import tokenize_by_bpe_model 5 | 6 | 7 | class BpeTokenizer(CharTokenizer): 8 | 9 | def __init__( 10 | self, 11 | bpe_model: Union[PathLike, str], 12 | symbol_table: Union[str, PathLike, Dict], 13 | non_lang_syms: Optional[Union[str, PathLike, List]] = None, 14 | split_with_space: bool = False, 15 | connect_symbol: str = '', 16 | unk='', 17 | ) -> None: 18 | super().__init__(symbol_table, non_lang_syms, 
split_with_space, 19 | connect_symbol, unk) 20 | self._model = bpe_model 21 | # NOTE(Mddct): multiprocessing.Process() issues 22 | # don't build sp here 23 | self.bpe_model = None 24 | 25 | def _build_sp(self): 26 | if self.bpe_model is None: 27 | import sentencepiece as spm 28 | self.bpe_model = spm.SentencePieceProcessor() 29 | self.bpe_model.load(self._model) 30 | 31 | def text2tokens(self, line: str) -> List[str]: 32 | self._build_sp() 33 | line = line.strip() 34 | if self.non_lang_syms_pattern is not None: 35 | parts = self.non_lang_syms_pattern.split(line.upper()) 36 | parts = [w for w in parts if len(w.strip()) > 0] 37 | else: 38 | parts = [line] 39 | 40 | tokens = [] 41 | for part in parts: 42 | if part in self.non_lang_syms: 43 | tokens.append(part) 44 | else: 45 | tokens.extend(tokenize_by_bpe_model(self.bpe_model, part)) 46 | return tokens 47 | 48 | def tokens2text(self, tokens: List[str]) -> str: 49 | self._build_sp() 50 | text = super().tokens2text(tokens) 51 | return text.replace("▁", ' ').strip() 52 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/char_tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from os import PathLike 4 | from typing import Dict, List, Optional, Union 5 | from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols 6 | from wenet.text.base_tokenizer import BaseTokenizer 7 | 8 | 9 | class CharTokenizer(BaseTokenizer): 10 | 11 | def __init__( 12 | self, 13 | symbol_table: Union[str, PathLike, Dict], 14 | non_lang_syms: Optional[Union[str, PathLike, List]] = None, 15 | split_with_space: bool = False, 16 | connect_symbol: str = '', 17 | unk='', 18 | ) -> None: 19 | self.non_lang_syms_pattern = None 20 | if non_lang_syms is not None: 21 | self.non_lang_syms_pattern = re.compile( 22 | r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") 23 | if not isinstance(symbol_table, Dict): 24 | self._symbol_table = read_symbol_table(symbol_table) 25 | else: 26 | # symbol_table = {"我": 1, "是": 2, "{NOISE}": 3} 27 | self._symbol_table = symbol_table 28 | if not isinstance(non_lang_syms, List): 29 | self.non_lang_syms = read_non_lang_symbols(non_lang_syms) 30 | else: 31 | # non_lang_syms=["{NOISE}"] 32 | self.non_lang_syms = non_lang_syms 33 | self.char_dict = {v: k for k, v in self._symbol_table.items()} 34 | self.split_with_space = split_with_space 35 | self.connect_symbol = connect_symbol 36 | self.unk = unk 37 | 38 | def text2tokens(self, line: str) -> List[str]: 39 | line = line.strip() 40 | if self.non_lang_syms_pattern is not None: 41 | parts = self.non_lang_syms_pattern.split(line.upper()) 42 | parts = [w for w in parts if len(w.strip()) > 0] 43 | else: 44 | parts = [line] 45 | 46 | tokens = [] 47 | for part in parts: 48 | if part in self.non_lang_syms: 49 | tokens.append(part) 50 | else: 51 | if self.split_with_space: 52 | part = part.split(" ") 53 | for ch in part: 54 | if ch == ' ': 55 | ch = "▁" 56 | tokens.append(ch) 57 | return tokens 58 | 59 | def tokens2text(self, tokens: List[str]) -> str: 60 | return self.connect_symbol.join(tokens) 61 | 62 | def tokens2ids(self, tokens: List[str]) -> List[int]: 63 | ids = [] 64 | for ch in tokens: 65 | if ch in self._symbol_table: 66 | ids.append(self._symbol_table[ch]) 67 | elif self.unk in self._symbol_table: 68 | ids.append(self._symbol_table[self.unk]) 69 | return ids 70 | 71 | def ids2tokens(self, ids: List[int]) -> List[str]: 72 | content = [self.char_dict[w] for w in ids] 73 | return content 
74 | 75 | def vocab_size(self) -> int: 76 | return len(self.char_dict) 77 | 78 | @property 79 | def symbol_table(self) -> Dict[str, int]: 80 | return self._symbol_table 81 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/hugging_face_tokenizer.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Union 3 | from wenet.text.base_tokenizer import BaseTokenizer, T as Type 4 | 5 | 6 | class HuggingFaceTokenizer(BaseTokenizer): 7 | 8 | def __init__(self, model: Union[str, PathLike], *args, **kwargs) -> None: 9 | # NOTE(Mddct): don't build here, pickle issues 10 | self.model = model 11 | self.tokenizer = None 12 | 13 | self.args = args 14 | self.kwargs = kwargs 15 | 16 | def __getstate__(self): 17 | state = self.__dict__.copy() 18 | del state['tokenizer'] 19 | return state 20 | 21 | def __setstate__(self, state): 22 | self.__dict__.update(state) 23 | recovery = {'tokenizer': None} 24 | self.__dict__.update(recovery) 25 | 26 | def _build_hugging_face(self): 27 | from transformers import AutoTokenizer 28 | if self.tokenizer is None: 29 | self.tokenizer = AutoTokenizer.from_pretrained( 30 | self.model, **self.kwargs) 31 | self.t2i = self.tokenizer.get_vocab() 32 | 33 | def text2tokens(self, line: str) -> List[Type]: 34 | self._build_hugging_face() 35 | return self.tokenizer.tokenize(line) 36 | 37 | def tokens2text(self, tokens: List[Type]) -> str: 38 | self._build_hugging_face() 39 | ids = self.tokens2ids(tokens) 40 | return self.tokenizer.decode(ids) 41 | 42 | def tokens2ids(self, tokens: List[Type]) -> List[int]: 43 | self._build_hugging_face() 44 | return self.tokenizer.convert_tokens_to_ids(tokens) 45 | 46 | def ids2tokens(self, ids: List[int]) -> List[Type]: 47 | self._build_hugging_face() 48 | return self.tokenizer.convert_ids_to_tokens(ids) 49 | 50 | def vocab_size(self) -> int: 51 | self._build_hugging_face() 52 | # TODO: we need special tokenize size in future 53 | return len(self.tokenizer) 54 | 55 | @property 56 | def symbol_table(self) -> Dict[Type, int]: 57 | self._build_hugging_face() 58 | return self.t2i 59 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/paraformer_tokenizer.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Optional, Union 3 | from wenet.paraformer.search import paraformer_beautify_result 4 | from wenet.text.char_tokenizer import CharTokenizer 5 | from wenet.text.tokenize_utils import tokenize_by_seg_dict 6 | 7 | 8 | def read_seg_dict(path): 9 | seg_table = {} 10 | with open(path, 'r', encoding='utf8') as fin: 11 | for line in fin: 12 | arr = line.strip().split('\t') 13 | assert len(arr) == 2 14 | seg_table[arr[0]] = arr[1] 15 | return seg_table 16 | 17 | 18 | class ParaformerTokenizer(CharTokenizer): 19 | 20 | def __init__(self, 21 | symbol_table: Union[str, PathLike, Dict], 22 | seg_dict: Optional[Union[str, PathLike, Dict]] = None, 23 | split_with_space: bool = False, 24 | connect_symbol: str = '', 25 | unk='') -> None: 26 | super().__init__(symbol_table, None, split_with_space, connect_symbol, 27 | unk) 28 | self.seg_dict = seg_dict 29 | if seg_dict is not None and not isinstance(seg_dict, Dict): 30 | self.seg_dict = read_seg_dict(seg_dict) 31 | 32 | def text2tokens(self, line: str) -> List[str]: 33 | assert self.seg_dict is not None 34 | 35 | # TODO(Mddct): 
duplicated here, refine later 36 | line = line.strip() 37 | if self.non_lang_syms_pattern is not None: 38 | parts = self.non_lang_syms_pattern.split(line) 39 | parts = [w for w in parts if len(w.strip()) > 0] 40 | else: 41 | parts = [line] 42 | 43 | tokens = [] 44 | for part in parts: 45 | if part in self.non_lang_syms: 46 | tokens.append(part) 47 | else: 48 | tokens.extend(tokenize_by_seg_dict(self.seg_dict, part)) 49 | return tokens 50 | 51 | def tokens2text(self, tokens: List[str]) -> str: 52 | return paraformer_beautify_result(tokens) 53 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/tokenize_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2023 Tsinghua Univ. (authors: Xingchen Song) 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | 17 | 18 | def tokenize_by_bpe_model(sp, txt): 19 | return _tokenize_by_seg_dic_or_bpe_model(txt, sp=sp, upper=True) 20 | 21 | 22 | def tokenize_by_seg_dict(seg_dict, txt): 23 | return _tokenize_by_seg_dic_or_bpe_model(txt, 24 | seg_dict=seg_dict, 25 | upper=False) 26 | 27 | 28 | def _tokenize_by_seg_dic_or_bpe_model( 29 | txt, 30 | sp=None, 31 | seg_dict=None, 32 | upper=True, 33 | ): 34 | if sp is None: 35 | assert seg_dict is not None 36 | if seg_dict is None: 37 | assert sp is not None 38 | tokens = [] 39 | # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: 40 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 41 | pattern = re.compile(r'([\u4e00-\u9fff])') 42 | # Example: 43 | # txt = "你好 ITS'S OKAY 的" 44 | # chars = ["你", "好", " ITS'S OKAY ", "的"] 45 | chars = pattern.split(txt.upper() if upper else txt) 46 | mix_chars = [w for w in chars if len(w.strip()) > 0] 47 | for ch_or_w in mix_chars: 48 | # ch_or_w is a single CJK charater(i.e., "你"), do nothing. 49 | if pattern.fullmatch(ch_or_w) is not None: 50 | tokens.append(ch_or_w) 51 | # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), 52 | # encode ch_or_w using bpe_model. 
53 | else: 54 | if sp is not None: 55 | for p in sp.encode_as_pieces(ch_or_w): 56 | tokens.append(p) 57 | else: 58 | for en_token in ch_or_w.split(): 59 | en_token = en_token.strip() 60 | if en_token in seg_dict: 61 | tokens.extend(seg_dict[en_token].split(' ')) 62 | else: 63 | tokens.append(en_token) 64 | 65 | return tokens 66 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/whisper_tokenizer.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Optional, Tuple, Union 3 | from wenet.text.base_tokenizer import BaseTokenizer 4 | 5 | from wenet.utils.file_utils import read_non_lang_symbols 6 | 7 | 8 | class WhisperTokenizer(BaseTokenizer): 9 | 10 | def __init__( 11 | self, 12 | multilingual: bool, 13 | num_languages: int = 99, 14 | language: Optional[str] = None, 15 | task: Optional[str] = None, 16 | non_lang_syms: Optional[Union[str, PathLike, List]] = None, 17 | *args, 18 | **kwargs, 19 | ) -> None: 20 | # NOTE(Mddct): don't build here, pickle issues 21 | self.tokenizer = None 22 | # TODO: we don't need this in future 23 | self.multilingual = multilingual 24 | self.num_languages = num_languages 25 | self.language = language 26 | self.task = task 27 | 28 | if not isinstance(non_lang_syms, List): 29 | self.non_lang_syms = read_non_lang_symbols(non_lang_syms) 30 | else: 31 | # non_lang_syms=["{NOISE}"] 32 | self.non_lang_syms = non_lang_syms 33 | # TODO(Mddct): add special tokens, like non_lang_syms 34 | del self.non_lang_syms 35 | 36 | def __getstate__(self): 37 | state = self.__dict__.copy() 38 | del state['tokenizer'] 39 | return state 40 | 41 | def __setstate__(self, state): 42 | self.__dict__.update(state) 43 | recovery = {'tokenizer': None} 44 | self.__dict__.update(recovery) 45 | 46 | def _build_tiktoken(self): 47 | if self.tokenizer is None: 48 | from whisper.tokenizer import get_tokenizer 49 | self.tokenizer = get_tokenizer(multilingual=self.multilingual, 50 | num_languages=self.num_languages, 51 | language=self.language, 52 | task=self.task) 53 | self.t2i = {} 54 | self.i2t = {} 55 | for i in range(self.tokenizer.encoding.n_vocab): 56 | unit = str( 57 | self.tokenizer.encoding.decode_single_token_bytes(i)) 58 | if len(unit) == 0: 59 | unit = str(i) 60 | unit = unit.replace(" ", "") 61 | # unit = bytes(unit, 'utf-8') 62 | self.t2i[unit] = i 63 | self.i2t[i] = unit 64 | assert len(self.t2i) == len(self.i2t) 65 | 66 | def tokenize(self, line: str) -> Tuple[List[str], List[int]]: 67 | self._build_tiktoken() 68 | ids = self.tokenizer.encoding.encode(line) 69 | text = [self.i2t[d] for d in ids] 70 | return text, ids 71 | 72 | def detokenize(self, ids: List[int]) -> Tuple[str, List[str]]: 73 | self._build_tiktoken() 74 | tokens = [self.i2t[d] for d in ids] 75 | text = self.tokenizer.encoding.decode(ids) 76 | return text, tokens 77 | 78 | def text2tokens(self, line: str) -> List[str]: 79 | self._build_tiktoken() 80 | return self.tokenize(line)[0] 81 | 82 | def tokens2text(self, tokens: List[str]) -> str: 83 | self._build_tiktoken() 84 | ids = [self.t2i[t] for t in tokens] 85 | return self.detokenize(ids)[0] 86 | 87 | def tokens2ids(self, tokens: List[str]) -> List[int]: 88 | self._build_tiktoken() 89 | ids = [self.t2i[t] for t in tokens] 90 | return ids 91 | 92 | def ids2tokens(self, ids: List[int]) -> List[str]: 93 | self._build_tiktoken() 94 | return [self.tokenizer.encoding.decode([id]) for id in ids] 95 | 96 | def vocab_size(self) -> int: 
97 | self._build_tiktoken() 98 | return len(self.t2i) 99 | 100 | @property 101 | def symbol_table(self) -> Dict[str, int]: 102 | self._build_tiktoken() 103 | return self.t2i 104 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/attention.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/attention.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/attention.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/attention.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/norm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/norm.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/norm.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/norm.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/norm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/norm.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/search.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/search.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/search.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/search.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/search.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/search.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/swish.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/swish.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/swish.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/swish.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/swish.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/swish.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/cmvn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | 18 | class GlobalCMVN(torch.nn.Module): 19 | 20 | def __init__(self, 21 | mean: torch.Tensor, 22 | istd: torch.Tensor, 23 | norm_var: bool = True): 24 | """ 25 | Args: 26 | mean (torch.Tensor): mean stats 27 | istd (torch.Tensor): inverse std, std which is 1.0 / std 28 | """ 29 | super().__init__() 30 | assert mean.shape == istd.shape 31 | self.norm_var = norm_var 32 | # The buffer can be accessed from this module using self.mean 33 | self.register_buffer("mean", mean) 34 | self.register_buffer("istd", istd) 35 | 36 | def forward(self, x: torch.Tensor): 37 | """ 38 | Args: 39 | x (torch.Tensor): (batch, max_len, feat_dim) 40 | 41 | Returns: 42 | (torch.Tensor): normalized feature 43 | """ 44 | x = x - self.mean 45 | if self.norm_var: 46 | x = x * self.istd 47 | return x 48 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Modified from ESPnet(https://github.com/espnet/espnet) 15 | """ConvolutionModule definition.""" 16 | 17 | from typing import Tuple 18 | 19 | import torch 20 | from torch import nn 21 | 22 | from wenet.utils.class_utils import WENET_NORM_CLASSES 23 | 24 | 25 | class ConvolutionModule(nn.Module): 26 | """ConvolutionModule in Conformer model.""" 27 | 28 | def __init__( 29 | self, 30 | channels: int, 31 | kernel_size: int = 15, 32 | activation: nn.Module = nn.ReLU(), 33 | norm: str = "batch_norm", 34 | causal: bool = False, 35 | bias: bool = True, 36 | norm_eps: float = 1e-5, 37 | ): 38 | """Construct an ConvolutionModule object. 39 | Args: 40 | channels (int): The number of channels of conv layers. 41 | kernel_size (int): Kernel size of conv layers. 42 | causal (int): Whether use causal convolution or not 43 | """ 44 | super().__init__() 45 | 46 | self.pointwise_conv1 = nn.Conv1d( 47 | channels, 48 | 2 * channels, 49 | kernel_size=1, 50 | stride=1, 51 | padding=0, 52 | bias=bias, 53 | ) 54 | # self.lorder is used to distinguish if it's a causal convolution, 55 | # if self.lorder > 0: it's a causal convolution, the input will be 56 | # padded with self.lorder frames on the left in forward. 
57 | # else: it's a symmetrical convolution 58 | if causal: 59 | padding = 0 60 | self.lorder = kernel_size - 1 61 | else: 62 | # kernel_size should be an odd number for none causal convolution 63 | assert (kernel_size - 1) % 2 == 0 64 | padding = (kernel_size - 1) // 2 65 | self.lorder = 0 66 | self.depthwise_conv = nn.Conv1d( 67 | channels, 68 | channels, 69 | kernel_size, 70 | stride=1, 71 | padding=padding, 72 | groups=channels, 73 | bias=bias, 74 | ) 75 | 76 | assert norm in ['batch_norm', 'layer_norm', 'rms_norm'] 77 | if norm == "batch_norm": 78 | self.use_layer_norm = False 79 | self.norm = WENET_NORM_CLASSES['batch_norm'](channels, 80 | eps=norm_eps) 81 | else: 82 | self.use_layer_norm = True 83 | self.norm = WENET_NORM_CLASSES[norm](channels, eps=norm_eps) 84 | 85 | self.pointwise_conv2 = nn.Conv1d( 86 | channels, 87 | channels, 88 | kernel_size=1, 89 | stride=1, 90 | padding=0, 91 | bias=bias, 92 | ) 93 | self.activation = activation 94 | 95 | def forward( 96 | self, 97 | x: torch.Tensor, 98 | mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), 99 | cache: torch.Tensor = torch.zeros((0, 0, 0)), 100 | ) -> Tuple[torch.Tensor, torch.Tensor]: 101 | """Compute convolution module. 102 | Args: 103 | x (torch.Tensor): Input tensor (#batch, time, channels). 104 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), 105 | (0, 0, 0) means fake mask. 106 | cache (torch.Tensor): left context cache, it is only 107 | used in causal convolution (#batch, channels, cache_t), 108 | (0, 0, 0) meas fake cache. 109 | Returns: 110 | torch.Tensor: Output tensor (#batch, time, channels). 111 | """ 112 | # exchange the temporal dimension and the feature dimension 113 | x = x.transpose(1, 2) # (#batch, channels, time) 114 | 115 | # mask batch padding 116 | if mask_pad.size(2) > 0: # time > 0 117 | x.masked_fill_(~mask_pad, 0.0) 118 | 119 | if self.lorder > 0: 120 | if cache.size(2) == 0: # cache_t == 0 121 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 122 | else: 123 | assert cache.size(0) == x.size(0) # equal batch 124 | assert cache.size(1) == x.size(1) # equal channel 125 | x = torch.cat((cache, x), dim=2) 126 | assert (x.size(2) > self.lorder) 127 | new_cache = x[:, :, -self.lorder:] 128 | else: 129 | # It's better we just return None if no cache is required, 130 | # However, for JIT export, here we just fake one tensor instead of 131 | # None. 132 | new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) 133 | 134 | # GLU mechanism 135 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 136 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 137 | 138 | # 1D Depthwise Conv 139 | x = self.depthwise_conv(x) 140 | if self.use_layer_norm: 141 | x = x.transpose(1, 2) 142 | x = self.activation(self.norm(x)) 143 | if self.use_layer_norm: 144 | x = x.transpose(1, 2) 145 | x = self.pointwise_conv2(x) 146 | # mask batch padding 147 | if mask_pad.size(2) > 0: # time > 0 148 | x.masked_fill_(~mask_pad, 0.0) 149 | 150 | return x.transpose(1, 2), new_cache 151 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/ctc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
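A shape-only sketch, not part of this repo, of the ConvolutionModule defined in convolution.py above, in its non-causal configuration; the channel count and sequence length are illustrative.

import torch
from wenet.transformer.convolution import ConvolutionModule

conv = ConvolutionModule(channels=256, kernel_size=15, causal=False)
x = torch.randn(2, 50, 256)                    # (batch, time, channels)
mask = torch.ones(2, 1, 50, dtype=torch.bool)  # (batch, 1, time) padding mask
y, new_cache = conv(x, mask_pad=mask)          # y: (2, 50, 256); cache is a dummy tensor when non-causal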
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Modified from ESPnet(https://github.com/espnet/espnet) 15 | 16 | from typing import Tuple 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | 21 | 22 | class CTC(torch.nn.Module): 23 | """CTC module""" 24 | 25 | def __init__( 26 | self, 27 | odim: int, 28 | encoder_output_size: int, 29 | dropout_rate: float = 0.0, 30 | reduce: bool = True, 31 | blank_id: int = 0, 32 | ): 33 | """ Construct CTC module 34 | Args: 35 | odim: dimension of outputs 36 | encoder_output_size: number of encoder projection units 37 | dropout_rate: dropout rate (0.0 ~ 1.0) 38 | reduce: reduce the CTC loss into a scalar 39 | blank_id: blank label. 40 | """ 41 | super().__init__() 42 | eprojs = encoder_output_size 43 | self.dropout_rate = dropout_rate 44 | self.ctc_lo = torch.nn.Linear(eprojs, odim) 45 | 46 | reduction_type = "sum" if reduce else "none" 47 | self.ctc_loss = torch.nn.CTCLoss(blank=blank_id, 48 | reduction=reduction_type, 49 | zero_infinity=True) 50 | 51 | def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, 52 | ys_pad: torch.Tensor, 53 | ys_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 54 | """Calculate CTC loss. 55 | 56 | Args: 57 | hs_pad: batch of padded hidden state sequences (B, Tmax, D) 58 | hlens: batch of lengths of hidden state sequences (B) 59 | ys_pad: batch of padded character id sequence tensor (B, Lmax) 60 | ys_lens: batch of lengths of character sequence (B) 61 | """ 62 | # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) 63 | ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) 64 | # ys_hat: (B, L, D) -> (L, B, D) 65 | ys_hat = ys_hat.transpose(0, 1) 66 | ys_hat = ys_hat.log_softmax(2) 67 | loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) 68 | # Batch-size average 69 | loss = loss / ys_hat.size(1) 70 | ys_hat = ys_hat.transpose(0, 1) 71 | return loss, ys_hat 72 | 73 | def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 74 | """log_softmax of frame activations 75 | 76 | Args: 77 | Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 78 | Returns: 79 | torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) 80 | """ 81 | return F.log_softmax(self.ctc_lo(hs_pad), dim=2) 82 | 83 | def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 84 | """argmax of frame activations 85 | 86 | Args: 87 | torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 88 | Returns: 89 | torch.Tensor: argmax applied 2d tensor (B, Tmax) 90 | """ 91 | return torch.argmax(self.ctc_lo(hs_pad), dim=2) 92 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
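A minimal sketch, not part of this repo, of how the CTC module defined in ctc.py above is called; the vocabulary size, encoder dimension and lengths are placeholders.

import torch
from wenet.transformer.ctc import CTC

ctc = CTC(odim=5000, encoder_output_size=256)
hs_pad = torch.randn(2, 120, 256)                   # encoder output (B, Tmax, D)
hlens = torch.tensor([120, 90])                     # valid encoder lengths
ys_pad = torch.randint(1, 5000, (2, 30))            # padded label ids (B, Lmax); 0 is the blank
ys_lens = torch.tensor([30, 25])                    # valid label lengths
loss, ys_hat = ctc(hs_pad, hlens, ys_pad, ys_lens)  # batch-averaged loss and (B, Tmax, odim) log-probs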
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Decoder self-attention layer definition.""" 16 | from typing import Dict, Optional, Tuple 17 | 18 | import torch 19 | from torch import nn 20 | from wenet.transformer.attention import T_CACHE 21 | 22 | from wenet.utils.class_utils import WENET_NORM_CLASSES 23 | 24 | 25 | class DecoderLayer(nn.Module): 26 | """Single decoder layer module. 27 | 28 | Args: 29 | size (int): Input dimension. 30 | self_attn (torch.nn.Module): Self-attention module instance. 31 | `MultiHeadedAttention` instance can be used as the argument. 32 | src_attn (torch.nn.Module): Inter-attention module instance. 33 | `MultiHeadedAttention` instance can be used as the argument. 34 | If `None` is passed, Inter-attention is not used, such as 35 | CIF, GPT, and other decoder only model. 36 | feed_forward (torch.nn.Module): Feed-forward module instance. 37 | `PositionwiseFeedForward` instance can be used as the argument. 38 | dropout_rate (float): Dropout rate. 39 | normalize_before (bool): 40 | True: use layer_norm before each sub-block. 41 | False: to use layer_norm after each sub-block. 42 | """ 43 | 44 | def __init__( 45 | self, 46 | size: int, 47 | self_attn: nn.Module, 48 | src_attn: Optional[nn.Module], 49 | feed_forward: nn.Module, 50 | dropout_rate: float, 51 | normalize_before: bool = True, 52 | layer_norm_type: str = 'layer_norm', 53 | norm_eps: float = 1e-5, 54 | ): 55 | """Construct an DecoderLayer object.""" 56 | super().__init__() 57 | self.size = size 58 | self.self_attn = self_attn 59 | self.src_attn = src_attn 60 | self.feed_forward = feed_forward 61 | assert layer_norm_type in ['layer_norm', 'rms_norm'] 62 | self.norm1 = WENET_NORM_CLASSES[layer_norm_type](size, eps=norm_eps) 63 | self.norm2 = WENET_NORM_CLASSES[layer_norm_type](size, eps=norm_eps) 64 | self.norm3 = WENET_NORM_CLASSES[layer_norm_type](size, eps=norm_eps) 65 | self.dropout = nn.Dropout(dropout_rate) 66 | self.normalize_before = normalize_before 67 | 68 | def forward( 69 | self, 70 | tgt: torch.Tensor, 71 | tgt_mask: torch.Tensor, 72 | memory: torch.Tensor, 73 | memory_mask: torch.Tensor, 74 | cache: Optional[Dict[str, Optional[T_CACHE]]] = None 75 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 76 | """Compute decoded features. 77 | 78 | Args: 79 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 80 | tgt_mask (torch.Tensor): Mask for input tensor 81 | (#batch, maxlen_out). 82 | memory (torch.Tensor): Encoded memory 83 | (#batch, maxlen_in, size). 84 | memory_mask (torch.Tensor): Encoded memory mask 85 | (#batch, maxlen_in). 86 | cache (torch.Tensor): cached tensors. 87 | (#batch, maxlen_out - 1, size). 88 | 89 | Returns: 90 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 91 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 92 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 93 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
94 | 95 | """ 96 | if cache is not None: 97 | att_cache = cache['self_att_cache'] 98 | cross_att_cache = cache['cross_att_cache'] 99 | else: 100 | att_cache, cross_att_cache = None, None 101 | 102 | residual = tgt 103 | if self.normalize_before: 104 | tgt = self.norm1(tgt) 105 | 106 | if att_cache is None: 107 | tgt_q = tgt 108 | tgt_q_mask = tgt_mask 109 | att_cache = (torch.empty(0, 0, 0, 0), torch.empty(0, 0, 0, 0)) 110 | else: 111 | tgt_q = tgt[:, -1:, :] 112 | residual = residual[:, -1:, :] 113 | tgt_q_mask = tgt_mask[:, -1:, :] 114 | 115 | x, new_att_cache = self.self_attn( 116 | tgt_q, 117 | tgt_q, 118 | tgt_q, 119 | tgt_q_mask, 120 | cache=att_cache, 121 | ) 122 | if cache is not None: 123 | cache['self_att_cache'] = new_att_cache 124 | x = residual + self.dropout(x) 125 | if not self.normalize_before: 126 | x = self.norm1(x) 127 | 128 | if self.src_attn is not None: 129 | residual = x 130 | if self.normalize_before: 131 | x = self.norm2(x) 132 | if cross_att_cache is None: 133 | cross_att_cache = (torch.empty(0, 0, 0, 134 | 0), torch.empty(0, 0, 0, 0)) 135 | x, new_cross_cache = self.src_attn(x, 136 | memory, 137 | memory, 138 | memory_mask, 139 | cache=cross_att_cache) 140 | if cache is not None: 141 | cache['cross_att_cache'] = new_cross_cache 142 | x = residual + self.dropout(x) 143 | if not self.normalize_before: 144 | x = self.norm2(x) 145 | 146 | residual = x 147 | if self.normalize_before: 148 | x = self.norm3(x) 149 | x = residual + self.dropout(self.feed_forward(x)) 150 | if not self.normalize_before: 151 | x = self.norm3(x) 152 | 153 | return x, tgt_mask, memory, memory_mask 154 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Label smoothing module.""" 16 | 17 | import torch 18 | from torch import nn 19 | 20 | 21 | class LabelSmoothingLoss(nn.Module): 22 | """Label-smoothing loss. 23 | 24 | In a standard CE loss, the label's data distribution is: 25 | [0,1,2] -> 26 | [ 27 | [1.0, 0.0, 0.0], 28 | [0.0, 1.0, 0.0], 29 | [0.0, 0.0, 1.0], 30 | ] 31 | 32 | In the smoothing version CE Loss,some probabilities 33 | are taken from the true label prob (1.0) and are divided 34 | among other labels. 35 | 36 | e.g. 
37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class RMSNorm(torch.nn.Module): 5 | """ https://arxiv.org/pdf/1910.07467.pdf 6 | """ 7 | 8 | def __init__( 9 | self, 10 | dim: int, 11 | eps: float = 1e-6, 12 | add_unit_offset: bool = True, 13 | ): 14 | super().__init__() 15 | self.eps = eps 16 | self.weight = torch.nn.Parameter(torch.ones(dim)) 17 | self.add_unit_offset = add_unit_offset 18 | 19 | def _norm(self, x): 20 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 21 | 22 | def forward(self, x): 23 | x = self._norm(x.float()).type_as(x) 24 | if self.add_unit_offset: 25 | return x * (1 + self.weight) 26 | else: 27 | return x * self.weight 28 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
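A minimal sketch, not part of this repo, of the LabelSmoothingLoss defined in label_smoothing_loss.py above; the vocabulary size is a placeholder, and -1 is assumed here as the padding id.

import torch
from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss

criterion = LabelSmoothingLoss(size=5000, padding_idx=-1, smoothing=0.1)
logits = torch.randn(2, 10, 5000)          # raw decoder outputs (batch, seqlen, class)
target = torch.randint(0, 5000, (2, 10))   # gold label ids
target[1, 8:] = -1                         # last two positions of sample 1 are padding
loss = criterion(logits, target)           # scalar KL loss, normalized by batch size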
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Positionwise feed forward layer definition.""" 16 | 17 | import torch 18 | 19 | 20 | class PositionwiseFeedForward(torch.nn.Module): 21 | """Positionwise feed forward layer. 22 | 23 | FeedForward are appied on each position of the sequence. 24 | The output dim is same with the input dim. 25 | 26 | Args: 27 | idim (int): Input dimenstion. 28 | hidden_units (int): The number of hidden units. 29 | dropout_rate (float): Dropout rate. 30 | activation (torch.nn.Module): Activation function 31 | """ 32 | 33 | def __init__( 34 | self, 35 | idim: int, 36 | hidden_units: int, 37 | dropout_rate: float, 38 | activation: torch.nn.Module = torch.nn.ReLU(), 39 | bias: bool = True, 40 | *dummy_args, 41 | **dummy_kwargs, 42 | ): 43 | """Construct a PositionwiseFeedForward object.""" 44 | super(PositionwiseFeedForward, self).__init__() 45 | self.w_1 = torch.nn.Linear(idim, hidden_units, bias=bias) 46 | self.activation = activation 47 | self.dropout = torch.nn.Dropout(dropout_rate) 48 | self.w_2 = torch.nn.Linear(hidden_units, idim, bias=bias) 49 | 50 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 51 | """Forward function. 52 | 53 | Args: 54 | xs: input tensor (B, L, D) 55 | Returns: 56 | output tensor, (B, L, D) 57 | """ 58 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 59 | 60 | 61 | class MoEFFNLayer(torch.nn.Module): 62 | """ 63 | Mixture of expert with Positionwise feed forward layer 64 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf 65 | The output dim is same with the input dim. 66 | 67 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 68 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 69 | Args: 70 | n_expert: number of expert. 71 | n_expert_activated: The actual number of experts used for each frame 72 | idim (int): Input dimenstion. 73 | hidden_units (int): The number of hidden units. 74 | dropout_rate (float): Dropout rate. 75 | activation (torch.nn.Module): Activation function 76 | """ 77 | 78 | def __init__( 79 | self, 80 | idim: int, 81 | hidden_units: int, 82 | dropout_rate: float, 83 | activation: torch.nn.Module = torch.nn.ReLU(), 84 | bias: bool = False, 85 | n_expert: int = 8, 86 | n_expert_activated: int = 2, 87 | ): 88 | super(MoEFFNLayer, self).__init__() 89 | self.gate = torch.nn.Linear(idim, n_expert, bias=False) 90 | self.experts = torch.nn.ModuleList( 91 | PositionwiseFeedForward( 92 | idim, hidden_units, dropout_rate, activation, bias=bias) 93 | for _ in range(n_expert)) 94 | self.n_expert = n_expert 95 | self.n_expert_activated = n_expert_activated 96 | 97 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 98 | """Foward function. 
99 | Args: 100 | xs: input tensor (B, L, D) 101 | Returns: 102 | output tensor, (B, L, D) 103 | 104 | """ 105 | B, L, D = xs.size( 106 | ) # batch size, sequence length, embedding dimension (idim) 107 | xs = xs.view(-1, D) # (B*L, D) 108 | router = self.gate(xs) # (B*L, n_expert) 109 | logits, selected_experts = torch.topk( 110 | router, self.n_expert_activated 111 | ) # probs:(B*L, n_expert_activated), selected_exp: (B*L, n_expert_activated) 112 | weights = torch.nn.functional.softmax( 113 | logits, dim=1, 114 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_activated) 115 | output = torch.zeros_like(xs) # (B*L, D) 116 | for i, expert in enumerate(self.experts): 117 | mask = selected_experts == i 118 | token_ids, ith_expert = torch.where(mask) 119 | output[token_ids] += weights[token_ids, ith_expert, None] * expert( 120 | xs[token_ids]) 121 | return output.view(B, L, D) 122 | 123 | 124 | class GatedVariantsMLP(torch.nn.Module): 125 | """ https://arxiv.org/pdf/2002.05202.pdf 126 | """ 127 | 128 | def __init__( 129 | self, 130 | idim: int, 131 | hidden_units: int, 132 | dropout_rate: float, 133 | activation: torch.nn.Module = torch.nn.GELU(), 134 | bias: bool = True, 135 | *dummy_args, 136 | **dummy_kwargs, 137 | ): 138 | """Construct a PositionwiseFeedForward object.""" 139 | super(GatedVariantsMLP, self).__init__() 140 | self.gate = torch.nn.Linear(idim, hidden_units, bias=False) 141 | self.activation = activation 142 | # w_1 as up proj 143 | self.w_1 = torch.nn.Linear(idim, hidden_units, bias=bias) 144 | self.dropout = torch.nn.Dropout(dropout_rate) 145 | # w_2 as down proj 146 | self.w_2 = torch.nn.Linear(hidden_units, idim, bias=bias) 147 | 148 | def forward(self, x) -> torch.Tensor: 149 | """Foward function. 150 | Args: 151 | xs: input tensor (B, L, D) 152 | Returns: 153 | output tensor, (B, L, D) 154 | 155 | """ 156 | gate = self.activation(self.gate(x)) 157 | up = self.w_1(x) 158 | fuse = gate * up 159 | return self.w_2(self.dropout(fuse)) 160 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
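A shape-only sketch, not part of this repo, of the MoEFFNLayer defined in positionwise_feed_forward.py above: each frame's gate picks its top-2 experts and their outputs are mixed with the softmaxed gate weights. The sizes are placeholders.

import torch
from wenet.transformer.positionwise_feed_forward import MoEFFNLayer

moe = MoEFFNLayer(idim=256, hidden_units=1024, dropout_rate=0.1,
                  n_expert=8, n_expert_activated=2)
xs = torch.randn(2, 50, 256)   # (B, L, D)
out = moe(xs)                  # (B, L, D), per-frame mixture of the two selected experts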
16 | """Swish() activation function for Conformer.""" 17 | import math 18 | 19 | import torch 20 | 21 | 22 | class Swish(torch.nn.Module): 23 | """Construct an Swish object.""" 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | """Return Swish activation function.""" 27 | return x * torch.sigmoid(x) 28 | 29 | class New_gelu4npu(torch.nn.Module): 30 | """Construct an Swish object.""" 31 | 32 | def forward(self, x: torch.Tensor) -> torch.Tensor: 33 | """Return Swish activation function.""" 34 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 35 | 36 | def new_gelu_func(x: torch.Tensor): 37 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 38 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/common.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/common.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/common.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/common.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- 
/C2SER-llm/wenet/utils/__pycache__/config.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/config.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/executor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/executor.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/executor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/executor.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/fsdp_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/fsdp_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/fsdp_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/fsdp_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_dataset.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_dataset.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_dataset.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_dataset.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_model.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_model.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_model.cpython-39.pyc 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/mask.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/mask.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/mask.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/mask.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/mask.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/mask.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/scheduler.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/scheduler.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/scheduler.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/scheduler.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/train_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/train_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/train_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/train_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import logging 16 | import os 17 | import re 18 | 19 | import yaml 20 | import torch 21 | from collections import OrderedDict 22 | 23 | import datetime 24 | 25 | 26 | def load_checkpoint(model: torch.nn.Module, path: str) -> dict: 27 | rank = int(os.environ.get('RANK', 0)) 28 | logging.info('[Rank {}] Checkpoint: loading from checkpoint {}'.format( 29 | rank, path)) 30 | checkpoint = torch.load(path, map_location='cpu') 31 | missing_keys, unexpected_keys = model.load_state_dict(checkpoint, 32 | strict=False) 33 | if rank == 0: 34 | for key in missing_keys: 35 | logging.info("missing tensor: {}".format(key)) 36 | for key in unexpected_keys: 37 | logging.info("unexpected tensor: {}".format(key)) 38 | info_path = re.sub('.pt$', '.yaml', path) 39 | configs = {} 40 | if os.path.exists(info_path): 41 | with open(info_path, 'r') as fin: 42 | configs = yaml.load(fin, Loader=yaml.FullLoader) 43 | if configs is None: 44 | configs = {} 45 | return configs 46 | 47 | 48 | def save_state_dict_and_infos(state_dict, path: str, infos=None): 49 | rank = int(os.environ.get('RANK', 0)) 50 | logging.info('[Rank {}] Checkpoint: save to checkpoint {}'.format( 51 | rank, path)) 52 | torch.save(state_dict, path) 53 | info_path = re.sub('.pt$', '.yaml', path) 54 | if infos is None: 55 | infos = {} 56 | infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') 57 | with open(info_path, 'w') as fout: 58 | data = yaml.dump(infos) 59 | fout.write(data) 60 | 61 | 62 | def save_checkpoint(model: torch.nn.Module, path: str, infos=None): 63 | ''' 64 | Args: 65 | infos (dict or None): any info you want to save. 66 | ''' 67 | if isinstance(model, torch.nn.DataParallel): 68 | state_dict = model.module.state_dict() 69 | elif isinstance(model, torch.nn.parallel.DistributedDataParallel): 70 | state_dict = model.module.state_dict() 71 | else: 72 | state_dict = model.state_dict() 73 | save_state_dict_and_infos(state_dict, path, infos) 74 | 75 | 76 | def filter_modules(model_state_dict, modules): 77 | rank = int(os.environ.get('RANK', 0)) 78 | new_mods = [] 79 | incorrect_mods = [] 80 | mods_model = model_state_dict.keys() 81 | for mod in modules: 82 | if any(key.startswith(mod) for key in mods_model): 83 | new_mods += [mod] 84 | else: 85 | incorrect_mods += [mod] 86 | if incorrect_mods and rank == 0: 87 | logging.warning( 88 | "module(s) %s don't match or (partially match) " 89 | "available modules in model.", 90 | incorrect_mods, 91 | ) 92 | logging.warning("for information, the existing modules in model are:") 93 | logging.warning("%s", mods_model) 94 | 95 | return new_mods 96 | 97 | 98 | def load_trained_modules(model: torch.nn.Module, args: None): 99 | # Load encoder modules with pre-trained model(s). 
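A minimal sketch, not part of this repo, of how save_checkpoint and load_checkpoint above pair up; the model and the file names are hypothetical.

import torch
from wenet.utils.checkpoint import save_checkpoint, load_checkpoint

model = torch.nn.Linear(4, 2)
save_checkpoint(model, 'tmp_model.pt', infos={'epoch': 3})  # also writes tmp_model.yaml next to the .pt
configs = load_checkpoint(model, 'tmp_model.pt')            # returns the infos read back from the .yaml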
100 | enc_model_path = args.enc_init 101 | enc_modules = args.enc_init_mods 102 | main_state_dict = model.state_dict() 103 | logging.warning("model(s) found for pre-initialization") 104 | if os.path.isfile(enc_model_path): 105 | logging.info('Checkpoint: loading from checkpoint %s for CPU' % 106 | enc_model_path) 107 | model_state_dict = torch.load(enc_model_path, map_location='cpu') 108 | modules = filter_modules(model_state_dict, enc_modules) 109 | partial_state_dict = OrderedDict() 110 | for key, value in model_state_dict.items(): 111 | if any(key.startswith(m) for m in modules): 112 | partial_state_dict[key] = value 113 | main_state_dict.update(partial_state_dict) 114 | else: 115 | logging.warning("model was not found : %s", enc_model_path) 116 | 117 | model.load_state_dict(main_state_dict) 118 | configs = {} 119 | return configs 120 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright [2023-11-28] 4 | import torch 5 | from torch.nn import BatchNorm1d, LayerNorm 6 | from wenet.paraformer.embedding import ParaformerPositinoalEncoding 7 | from wenet.transformer.norm import RMSNorm 8 | from wenet.transformer.positionwise_feed_forward import ( 9 | GatedVariantsMLP, MoEFFNLayer, PositionwiseFeedForward) 10 | 11 | from wenet.transformer.swish import Swish, New_gelu4npu 12 | from wenet.transformer.subsampling import ( 13 | LinearNoSubsampling, 14 | EmbedinigNoSubsampling, 15 | Conv1dSubsampling2, 16 | Conv2dSubsampling4, 17 | Conv2dSubsampling6, 18 | Conv2dSubsampling8, 19 | StackNFramesSubsampling, 20 | ) 21 | from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 22 | from wenet.squeezeformer.subsampling import DepthwiseConv2dSubsampling4 23 | from wenet.transformer.embedding import (PositionalEncoding, 24 | RelPositionalEncoding, 25 | RopePositionalEncoding, 26 | WhisperPositionalEncoding, 27 | LearnablePositionalEncoding, 28 | NoPositionalEncoding) 29 | from wenet.transformer.attention import (MultiHeadedAttention, 30 | MultiHeadedCrossAttention, 31 | RelPositionMultiHeadedAttention, 32 | RopeMultiHeadedAttention, 33 | ShawRelPositionMultiHeadedAttention) 34 | from wenet.efficient_conformer.attention import ( 35 | GroupedRelPositionMultiHeadedAttention) 36 | 37 | WENET_ACTIVATION_CLASSES = { 38 | "hardtanh": torch.nn.Hardtanh, 39 | "tanh": torch.nn.Tanh, 40 | "relu": torch.nn.ReLU, 41 | "selu": torch.nn.SELU, 42 | "swish": getattr(torch.nn, "SiLU", Swish), 43 | "gelu": New_gelu4npu, 44 | } 45 | 46 | WENET_RNN_CLASSES = { 47 | "rnn": torch.nn.RNN, 48 | "lstm": torch.nn.LSTM, 49 | "gru": torch.nn.GRU, 50 | } 51 | 52 | WENET_SUBSAMPLE_CLASSES = { 53 | "linear": LinearNoSubsampling, 54 | "embed": EmbedinigNoSubsampling, 55 | "conv1d2": Conv1dSubsampling2, 56 | "conv2d2": Conv2dSubsampling2, 57 | "conv2d": Conv2dSubsampling4, 58 | "dwconv2d4": DepthwiseConv2dSubsampling4, 59 | "conv2d6": Conv2dSubsampling6, 60 | "conv2d8": Conv2dSubsampling8, 61 | 'paraformer_dummy': torch.nn.Identity, 62 | 'stack_n_frames': StackNFramesSubsampling, 63 | } 64 | 65 | WENET_EMB_CLASSES = { 66 | "embed": PositionalEncoding, 67 | "abs_pos": PositionalEncoding, 68 | "rel_pos": RelPositionalEncoding, 69 | "no_pos": NoPositionalEncoding, 70 | "abs_pos_whisper": WhisperPositionalEncoding, 71 | "embed_learnable_pe": LearnablePositionalEncoding, 72 | "abs_pos_paraformer": 
ParaformerPositinoalEncoding,
73 |     'rope_pos': RopePositionalEncoding,
74 | }
75 | 
76 | WENET_ATTENTION_CLASSES = {
77 |     "selfattn": MultiHeadedAttention,
78 |     "rel_selfattn": RelPositionMultiHeadedAttention,
79 |     "grouped_rel_selfattn": GroupedRelPositionMultiHeadedAttention,
80 |     "crossattn": MultiHeadedCrossAttention,
81 |     'shaw_rel_selfattn': ShawRelPositionMultiHeadedAttention,
82 |     'rope_abs_selfattn': RopeMultiHeadedAttention,
83 | }
84 | 
85 | WENET_MLP_CLASSES = {
86 |     'position_wise_feed_forward': PositionwiseFeedForward,
87 |     'moe': MoEFFNLayer,
88 |     'gated': GatedVariantsMLP
89 | }
90 | 
91 | WENET_NORM_CLASSES = {
92 |     'layer_norm': LayerNorm,
93 |     'batch_norm': BatchNorm1d,
94 |     'rms_norm': RMSNorm
95 | }
96 | 
--------------------------------------------------------------------------------
/C2SER-llm/wenet/utils/cmvn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #    http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import json
16 | import logging
17 | import math
18 | import sys
19 | import numpy as np
20 | 
21 | def _load_json_cmvn(json_cmvn_file):
22 |     """ Load the json format cmvn stats file and calculate cmvn
23 | 
24 |     Args:
25 |         json_cmvn_file: cmvn stats file in json format
26 | 
27 |     Returns:
28 |         a numpy array of [means, istd], where istd is 1.0 / std
29 |     """
30 |     with open(json_cmvn_file) as f:
31 |         cmvn_stats = json.load(f)
32 | 
33 |     means = cmvn_stats['mean_stat']
34 |     variance = cmvn_stats['var_stat']
35 |     count = cmvn_stats['frame_num']
36 |     for i in range(len(means)):
37 |         means[i] /= count
38 |         variance[i] = variance[i] / count - means[i] * means[i]
39 |         if variance[i] < 1.0e-20:
40 |             variance[i] = 1.0e-20
41 |         variance[i] = 1.0 / math.sqrt(variance[i])
42 |     cmvn = np.array([means, variance])
43 |     return cmvn
44 | 
45 | 
46 | def _load_kaldi_cmvn(kaldi_cmvn_file):
47 |     """ Load the kaldi format cmvn stats file and calculate cmvn
48 | 
49 |     Args:
50 |         kaldi_cmvn_file: kaldi text style global cmvn file, which
51 |            is generated by:
52 |            compute-cmvn-stats --binary=false scp:feats.scp global_cmvn
53 | 
54 |     Returns:
55 |         a numpy array of [means, istd], where istd is 1.0 / std
56 |     """
57 |     means = []
58 |     variance = []
59 |     with open(kaldi_cmvn_file, 'r') as fid:
60 |         # kaldi binary file start with '\0B'
61 |         if fid.read(2) == '\0B':
62 |             logging.error('kaldi cmvn binary file is not supported, please '
63 |                           'recompute it by: compute-cmvn-stats --binary=false '
64 |                           ' scp:feats.scp global_cmvn')
65 |             sys.exit(1)
66 |         fid.seek(0)
67 |         arr = fid.read().split()
68 |         assert (arr[0] == '[')
69 |         assert (arr[-2] == '0')
70 |         assert (arr[-1] == ']')
71 |         feat_dim = int((len(arr) - 2 - 2) / 2)
72 |         for i in range(1, feat_dim + 1):
73 |             means.append(float(arr[i]))
74 |         count = float(arr[feat_dim + 1])
75 |         for i in range(feat_dim + 2, 2 * feat_dim + 2):
76 |             variance.append(float(arr[i]))
77 | 
78 |     for i in range(len(means)):
79 |         means[i] /= count
80 |         variance[i] = variance[i] / count - means[i] * means[i]
81 |         if variance[i] < 1.0e-20:
82 |             variance[i] = 1.0e-20
83 |         variance[i] = 1.0 / math.sqrt(variance[i])
84 |     cmvn = np.array([means, variance])
85 |     return cmvn
86 | 
87 | 
88 | def load_cmvn(cmvn_file, is_json):
89 |     if is_json:
90 |         cmvn = _load_json_cmvn(cmvn_file)
91 |     else:
92 |         cmvn = _load_kaldi_cmvn(cmvn_file)
93 |     return cmvn[0], cmvn[1]
94 | 
--------------------------------------------------------------------------------
/C2SER-llm/wenet/utils/config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Shaoshang Qi
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #    http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import copy
16 | 
17 | 
18 | def override_config(configs, override_list):
19 |     new_configs = copy.deepcopy(configs)
20 |     for item in override_list:
21 |         arr = item.split()
22 |         if len(arr) != 2:
23 |             print(f"the override {item} format is not correct, skip it")
24 |             continue
25 |         keys = arr[0].split('.')
26 |         s_configs = new_configs
27 |         for i, key in enumerate(keys):
28 |             if key not in s_configs:
29 |                 print(f"the override key {arr[0]} not found in configs, skip it")
30 |                 break  # skip this item instead of raising a KeyError below
31 |             if i == len(keys) - 1:
32 |                 param_type = type(s_configs[key])
33 |                 if param_type != bool:
34 |                     s_configs[key] = param_type(arr[1])
35 |                 else:
36 |                     s_configs[key] = arr[1] in ['true', 'True']
37 |                 print(f"override {arr[0]} with {arr[1]}")
38 |             else:
39 |                 s_configs = s_configs[key]
40 |     return new_configs
41 | 
--------------------------------------------------------------------------------
/C2SER-llm/wenet/utils/ctc_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #    http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
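A sketch, not part of this repo, of how load_cmvn above is typically wired into the GlobalCMVN front-end from wenet/transformer/cmvn.py; the stats file name is hypothetical and is assumed to be a JSON file with mean_stat / var_stat / frame_num fields.

import torch
from wenet.utils.cmvn import load_cmvn
from wenet.transformer.cmvn import GlobalCMVN

mean, istd = load_cmvn('global_cmvn.json', is_json=True)  # per-dimension mean and 1.0 / std as numpy rows
cmvn_layer = GlobalCMVN(torch.from_numpy(mean).float(),
                        torch.from_numpy(istd).float())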
14 | 15 | from typing import List, Tuple 16 | 17 | import numpy as np 18 | 19 | import torch 20 | import torchaudio.functional as F 21 | 22 | 23 | def remove_duplicates_and_blank(hyp: List[int], 24 | blank_id: int = 0) -> List[int]: 25 | new_hyp: List[int] = [] 26 | cur = 0 27 | while cur < len(hyp): 28 | if hyp[cur] != blank_id: 29 | new_hyp.append(hyp[cur]) 30 | prev = cur 31 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 32 | cur += 1 33 | return new_hyp 34 | 35 | 36 | def replace_duplicates_with_blank(hyp: List[int], 37 | blank_id: int = 0) -> List[int]: 38 | new_hyp: List[int] = [] 39 | cur = 0 40 | while cur < len(hyp): 41 | new_hyp.append(hyp[cur]) 42 | prev = cur 43 | cur += 1 44 | while cur < len( 45 | hyp) and hyp[cur] == hyp[prev] and hyp[cur] != blank_id: 46 | new_hyp.append(blank_id) 47 | cur += 1 48 | return new_hyp 49 | 50 | 51 | def gen_ctc_peak_time(hyp: List[int], blank_id: int = 0) -> List[int]: 52 | times = [] 53 | cur = 0 54 | while cur < len(hyp): 55 | if hyp[cur] != blank_id: 56 | times.append(cur) 57 | prev = cur 58 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 59 | cur += 1 60 | return times 61 | 62 | 63 | def gen_timestamps_from_peak( 64 | peaks: List[int], 65 | max_duration: float, 66 | frame_rate: float = 0.04, 67 | max_token_duration: float = 1.0, 68 | ) -> List[Tuple[float, float]]: 69 | """ 70 | Args: 71 | peaks: ctc peaks time stamp 72 | max_duration: max_duration of the sentence 73 | frame_rate: frame rate of every time stamp, in seconds 74 | max_token_duration: max duration of the token, in seconds 75 | Returns: 76 | list(start, end) of each token 77 | """ 78 | times = [] 79 | half_max = max_token_duration / 2 80 | for i in range(len(peaks)): 81 | if i == 0: 82 | start = max(0, peaks[0] * frame_rate - half_max) 83 | else: 84 | start = max((peaks[i - 1] + peaks[i]) / 2 * frame_rate, 85 | peaks[i] * frame_rate - half_max) 86 | 87 | if i == len(peaks) - 1: 88 | end = min(max_duration, peaks[-1] * frame_rate + half_max) 89 | else: 90 | end = min((peaks[i] + peaks[i + 1]) / 2 * frame_rate, 91 | peaks[i] * frame_rate + half_max) 92 | times.append((start, end)) 93 | return times 94 | 95 | 96 | def insert_blank(label, blank_id=0): 97 | """Insert blank token between every two label token.""" 98 | label = np.expand_dims(label, 1) 99 | blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id 100 | label = np.concatenate([blanks, label], axis=1) 101 | label = label.reshape(-1) 102 | label = np.append(label, label[0]) 103 | return label 104 | 105 | 106 | def force_align(ctc_probs: torch.Tensor, y: torch.Tensor, blank_id=0) -> list: 107 | """ctc forced alignment. 
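A worked example, not part of this repo, of the CTC post-processing helpers defined earlier in this file, with blank_id = 0.

from wenet.utils.ctc_utils import insert_blank, remove_duplicates_and_blank

hyp = [0, 3, 3, 0, 0, 5, 5, 5, 0, 7]
print(remove_duplicates_and_blank(hyp))  # [3, 5, 7]: collapse repeats, then drop blanks
print(insert_blank([3, 5, 7]).tolist())  # [0, 3, 0, 5, 0, 7, 0]: a blank around every label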
14 | 15 | from typing import List, Tuple 16 | 17 | import numpy as np 18 | 19 | import torch 20 | import torchaudio.functional as F 21 | 22 | 23 | def remove_duplicates_and_blank(hyp: List[int], 24 | blank_id: int = 0) -> List[int]: 25 | new_hyp: List[int] = [] 26 | cur = 0 27 | while cur < len(hyp): 28 | if hyp[cur] != blank_id: 29 | new_hyp.append(hyp[cur]) 30 | prev = cur 31 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 32 | cur += 1 33 | return new_hyp
" 56 | "More details can be found in discussions here : " 57 | "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) 58 | return syms 59 | 60 | 61 | def read_symbol_table(symbol_table_file): 62 | symbol_table = {} 63 | with open(symbol_table_file, 'r', encoding='utf8') as fin: 64 | for line in fin: 65 | arr = line.strip().split() 66 | assert len(arr) == 2 67 | symbol_table[arr[0]] = int(arr[1]) 68 | return symbol_table 69 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/fsdp_utils.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import os 3 | from torch.distributed.fsdp import (FullyShardedDataParallel as FSDP, 4 | FullStateDictConfig, StateDictType) 5 | 6 | from torch.distributed.fsdp.wrap import (lambda_auto_wrap_policy, 7 | transformer_auto_wrap_policy) 8 | from wenet.LLM.decoder import DecoderOnly 9 | from wenet.branchformer.encoder_layer import BranchformerEncoderLayer 10 | from wenet.e_branchformer.encoder_layer import EBranchformerEncoderLayer 11 | from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer 12 | from wenet.paraformer.layers import AliParaformerEncoderLayer, SanmDecoderLayer 13 | from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer 14 | from wenet.transformer.encoder_layer import (ConformerEncoderLayer, 15 | TransformerEncoderLayer) 16 | from wenet.transformer.decoder_layer import DecoderLayer 17 | from wenet.utils.checkpoint import save_state_dict_and_infos 18 | from wenet.utils.init_model import WENET_DECODER_CLASSES, WENET_ENCODER_CLASSES 19 | 20 | WENET_ENCODER_LAYERS_CLASSES = { 21 | 'transformer_encoder_layer': TransformerEncoderLayer, 22 | 'conformer_encoder_layer': ConformerEncoderLayer, 23 | 'paraformer_encoder_layer': AliParaformerEncoderLayer, 24 | 'squeezeformer_encoder_layer': SqueezeformerEncoderLayer, 25 | 'ebranchformer_encoder_layer': EBranchformerEncoderLayer, 26 | 'efficient_conformer_encoder_layer': StrideConformerEncoderLayer, 27 | 'branchformer_encoder_layer': BranchformerEncoderLayer, 28 | } 29 | 30 | WENET_DECODER_LAYERS_CLASSES = { 31 | 'transformer_decoder_layer': DecoderLayer, 32 | 'paraformer_decoder_layer': SanmDecoderLayer, 33 | # TODO(Mddct): 34 | # 1 wrap transducer's predictor and joint 35 | # 2 wrap paraformer's cif and ignore lstm 36 | } 37 | 38 | 39 | def wenet_fsdp_wrap_policy(mode): 40 | # different wrap methods 41 | # please refer: https://openmmlab.medium.com/its-2023-is-pytorch-s-fsdp-the-best-choice-for-training-large-models-fe8d2848832f # noqa 42 | assert mode in ['no_shard', 'model', 'zero2', 'zero3'] 43 | if mode == 'no_shard': 44 | return None 45 | else: 46 | # TODO(Mddct): Support user customization 47 | # see more wrap methods: 48 | # https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/utils/fsdp_utils.py#L13 # noqa 49 | if mode == 'model': 50 | enc_dec_wrap_policy = partial( 51 | lambda_auto_wrap_policy, 52 | lambda_fn=lambda module: isinstance( 53 | module, 54 | tuple(WENET_ENCODER_CLASSES.values()) + tuple( 55 | WENET_DECODER_CLASSES.values()))) 56 | return enc_dec_wrap_policy 57 | else: 58 | to_wrap_class = set() 59 | to_wrap_class.update(set(WENET_ENCODER_LAYERS_CLASSES.values())) 60 | to_wrap_class.update(set(WENET_DECODER_LAYERS_CLASSES.values())) 61 | layers_wrap_policy = partial(transformer_auto_wrap_policy, 62 | transformer_layer_cls=to_wrap_class) 63 | return layers_wrap_policy 64 | 65 | 66 | fullstate_save_policy = 
FullStateDictConfig(offload_to_cpu=True, 67 | rank0_only=True) 68 | 69 | 70 | def fsdp_save_model(model, save_model_path, info_dict): 71 | # TODO(Mddct); When the model is large, saving a model will take a long time. 72 | # We only need to keep the sharding in an asynchronous manner, but it is 73 | # good now. This feature will be supported when llm is supported in the future. 74 | 75 | rank = int(os.environ.get('RANK', 0)) 76 | with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, 77 | fullstate_save_policy): 78 | state_dict = model.state_dict() 79 | if rank == 0: 80 | save_state_dict_and_infos(state_dict, save_model_path, info_dict) 81 | 82 | 83 | def check_gradient_checkpoint(model): 84 | ckpt_laye_types = [] 85 | if hasattr(model, 'encoder') and hasattr(model.encoder, 86 | 'gradient_checkpointing'): 87 | if model.encoder.gradient_checkpointing: 88 | model.encoder.gradient_checkpointing = False 89 | ckpt_laye_types += list(WENET_ENCODER_LAYERS_CLASSES.values()) 90 | if hasattr(model, 'decoder') and hasattr(model.decoder, 91 | 'gradient_checkpointing'): 92 | if model.decoder.gradient_checkpointing: 93 | model.decoder.gradient_checkpointing = False 94 | ckpt_laye_types += list(WENET_DECODER_LAYERS_CLASSES.values()) 95 | if isinstance(model.decoder, DecoderOnly): 96 | ckpt_laye_types += [DecoderOnly] 97 | return tuple(ckpt_laye_types) 98 | 99 | 100 | def apply_fsdp_checkpointing(model, ckpt_layer_types: tuple): 101 | # NOTE(Mddct): torch.utils.checkpoint is currently incompatible with 102 | # wenet's model mode. Using this writing method, Please refer to 103 | # https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/policies/activation_checkpointing_functions.py#L21 # noqa 104 | if len(ckpt_layer_types) == 0: 105 | return 106 | from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( 107 | checkpoint_wrapper, 108 | CheckpointImpl, 109 | apply_activation_checkpointing, 110 | ) 111 | non_reentrant_wrapper = partial( 112 | checkpoint_wrapper, 113 | checkpoint_impl=CheckpointImpl.NO_REENTRANT, 114 | ) 115 | apply_activation_checkpointing( 116 | model, 117 | checkpoint_wrapper_fn=non_reentrant_wrapper, 118 | check_fn=lambda submodule: isinstance(submodule, ckpt_layer_types)) 119 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/init_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional 3 | from wenet.dataset.dataset import Dataset 4 | 5 | from wenet.text.base_tokenizer import BaseTokenizer 6 | 7 | 8 | def init_asr_dataset(data_type, 9 | data_list_file, 10 | tokenizer: Optional[BaseTokenizer] = None, 11 | conf=None, 12 | partition=True): 13 | return Dataset(data_type, data_list_file, tokenizer, conf, partition) 14 | 15 | 16 | def init_dataset(dataset_type, 17 | data_type, 18 | data_list_file, 19 | tokenizer: Optional[BaseTokenizer] = None, 20 | conf=None, 21 | partition=True, 22 | split='train'): 23 | assert dataset_type in ['asr', 'ssl'] 24 | 25 | if split != 'train': 26 | cv_conf = copy.deepcopy(conf) 27 | cv_conf['cycle'] = 1 28 | cv_conf['speed_perturb'] = False 29 | cv_conf['spec_aug'] = False 30 | cv_conf['spec_sub'] = False 31 | cv_conf['spec_trim'] = False 32 | cv_conf['shuffle'] = False 33 | cv_conf['list_shuffle'] = False 34 | conf = cv_conf 35 | 36 | if dataset_type == 'asr': 37 | return init_asr_dataset(data_type, data_list_file, tokenizer, conf, 38 | partition) 39 | else: 40 | from 
wenet.ssl.init_dataset import init_dataset as init_ssl_dataset 41 | return init_ssl_dataset(data_type, data_list_file, conf, partition) 42 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/init_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import torch 17 | 18 | from wenet.llm_asr.init_llmasr import init_llmasr 19 | from wenet.transformer.asr_model import ASRModel 20 | from wenet.transformer.cmvn import GlobalCMVN 21 | from wenet.transformer.ctc import CTC 22 | from wenet.transformer.encoder import TransformerEncoder, ConformerEncoder 23 | from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder 24 | from wenet.whisper.whisper import Whisper 25 | from wenet.utils.cmvn import load_cmvn 26 | from wenet.utils.checkpoint import load_checkpoint, load_trained_modules 27 | 28 | 29 | WENET_ENCODER_CLASSES = { 30 | "transformer": TransformerEncoder, 31 | "conformer": ConformerEncoder, 32 | } 33 | 34 | WENET_DECODER_CLASSES = { 35 | "transformer": TransformerDecoder, 36 | "bitransformer": BiTransformerDecoder, 37 | } 38 | 39 | WENET_CTC_CLASSES = { 40 | "ctc": CTC, 41 | } 42 | 43 | WENET_MODEL_CLASSES = { 44 | "asr_model": ASRModel, 45 | "whisper": Whisper, 46 | } 47 | 48 | 49 | def init_speech_model(args, configs): 50 | # TODO(xcsong): Forcefully read the 'cmvn' attribute. 
51 | if configs.get('cmvn', None) == 'global_cmvn': 52 | mean, istd = load_cmvn(configs['cmvn_conf']['cmvn_file'], 53 | configs['cmvn_conf']['is_json_cmvn']) 54 | global_cmvn = GlobalCMVN( 55 | torch.from_numpy(mean).float(), 56 | torch.from_numpy(istd).float()) 57 | else: 58 | global_cmvn = None 59 | 60 | input_dim = configs['input_dim'] 61 | vocab_size = configs['output_dim'] 62 | 63 | encoder_type = configs.get('encoder', 'conformer') 64 | decoder_type = configs.get('decoder', 'bitransformer') 65 | ctc_type = configs.get('ctc', 'ctc') 66 | 67 | encoder = WENET_ENCODER_CLASSES[encoder_type]( 68 | input_dim, 69 | global_cmvn=global_cmvn, 70 | **configs['encoder_conf'], 71 | **configs['encoder_conf']['efficient_conf'] 72 | if 'efficient_conf' in configs['encoder_conf'] else {}) 73 | 74 | decoder = WENET_DECODER_CLASSES[decoder_type](vocab_size, 75 | encoder.output_size(), 76 | **configs['decoder_conf']) 77 | 78 | ctc = WENET_CTC_CLASSES[ctc_type]( 79 | vocab_size, 80 | encoder.output_size(), 81 | blank_id=configs['ctc_conf']['ctc_blank_id'] 82 | if 'ctc_conf' in configs else 0) 83 | 84 | model_type = configs.get('model', 'asr_model') 85 | 86 | model = WENET_MODEL_CLASSES[model_type]( 87 | vocab_size=vocab_size, 88 | encoder=encoder, 89 | decoder=decoder, 90 | ctc=ctc, 91 | special_tokens=configs.get('tokenizer_conf', 92 | {}).get('special_tokens', None), 93 | **configs['model_conf']) 94 | return model, configs 95 | 96 | 97 | 98 | def init_model(args, configs): 99 | 100 | model_type = configs.get('model', 'asr_model') 101 | configs['model'] = model_type 102 | if model_type == "llmasr": 103 | model = init_llmasr(args, configs) 104 | return model 105 | else: 106 | model, configs = init_speech_model(args, configs) 107 | 108 | 109 | # If specify checkpoint, load some info from checkpoint 110 | if hasattr(args, 'checkpoint') and args.checkpoint is not None: 111 | infos = load_checkpoint(model, args.checkpoint) 112 | elif hasattr(args, 'enc_init') and args.enc_init is not None: 113 | infos = load_trained_modules(model, args) 114 | else: 115 | infos = {} 116 | if configs.get('init_step', False): 117 | infos = {} 118 | configs["init_infos"] = infos 119 | 120 | if hasattr(args, 'use_lora') and args.use_lora: 121 | if hasattr(args, 'lora_ckpt_path') and args.lora_ckpt_path: 122 | load_checkpoint(model, args.lora_ckpt_path) 123 | 124 | print(configs) 125 | # Trye to tie some weights 126 | if hasattr(model, 'tie_or_clone_weights'): 127 | if not hasattr(args, 'jit'): 128 | args.jit = True # i.e. export onnx/jit/ipex 129 | model.tie_or_clone_weights(args.jit) 130 | 131 | if int(os.environ.get('RANK', 0)) == 0: 132 | print(configs) 133 | 134 | return model, configs 135 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/init_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Wenet Community. (authors: Dinghao Zhou) 2 | # (authors: Xingchen Song) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | 18 | from wenet.text.base_tokenizer import BaseTokenizer 19 | from wenet.text.bpe_tokenizer import BpeTokenizer 20 | from wenet.text.char_tokenizer import CharTokenizer 21 | from wenet.text.hugging_face_tokenizer import HuggingFaceTokenizer 22 | from wenet.text.paraformer_tokenizer import ParaformerTokenizer 23 | from wenet.text.whisper_tokenizer import WhisperTokenizer 24 | 25 | 26 | def init_tokenizer(configs) -> BaseTokenizer: 27 | # TODO(xcsong): Forcefully read the 'tokenizer' attribute. 28 | tokenizer_type = configs.get("tokenizer", "char") 29 | if tokenizer_type == "whisper": 30 | tokenizer = WhisperTokenizer( 31 | multilingual=configs['tokenizer_conf']['is_multilingual'], 32 | num_languages=configs['tokenizer_conf']['num_languages']) 33 | elif tokenizer_type == "char": 34 | tokenizer = CharTokenizer( 35 | configs['tokenizer_conf']['symbol_table_path'], 36 | configs['tokenizer_conf']['non_lang_syms_path'], 37 | split_with_space=configs['tokenizer_conf'].get( 38 | 'split_with_space', False), 39 | connect_symbol=configs['tokenizer_conf'].get('connect_symbol', '')) 40 | elif tokenizer_type == "bpe": 41 | tokenizer = BpeTokenizer( 42 | configs['tokenizer_conf']['bpe_path'], 43 | configs['tokenizer_conf']['symbol_table_path'], 44 | configs['tokenizer_conf']['non_lang_syms_path'], 45 | split_with_space=configs['tokenizer_conf'].get( 46 | 'split_with_space', False)) 47 | elif tokenizer_type == 'paraformer': 48 | tokenizer = ParaformerTokenizer( 49 | symbol_table=configs['tokenizer_conf']['symbol_table_path'], 50 | seg_dict=configs['tokenizer_conf']['seg_dict_path']) 51 | elif tokenizer_type == 'huggingface': 52 | tokenizer = HuggingFaceTokenizer( 53 | model=configs['tokenizer_conf']['llm_path']) 54 | else: 55 | raise NotImplementedError 56 | logging.info("use {} tokenizer".format(configs["tokenizer"])) 57 | 58 | return tokenizer 59 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/rope_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | # copy from:https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L84 5 | def precompute_freqs_cis(dim: int, 6 | end: int, 7 | theta: float = 10000.0) -> torch.Tensor: 8 | """Precomputes the frequency cis.""" 9 | freqs = 1.0 / (theta**(torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) 10 | t = torch.arange(end, device=freqs.device) 11 | freqs = torch.outer(t, freqs).float() 12 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 13 | return freqs_cis 14 | 15 | 16 | # modified from: 17 | # https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L95 18 | def google_apply_rotary_emb(x: torch.Tensor, 19 | freqs_cis: torch.Tensor) -> torch.Tensor: 20 | """Applies the rotary embedding to the query and key tensors.""" 21 | x_ = torch.view_as_complex( 22 | torch.stack(torch.chunk(x.float(), 2, dim=-1), dim=-1)) 23 | x_out = torch.view_as_real(x_ * freqs_cis).type_as(x) 24 | x_out = torch.cat(torch.chunk(x_out, 2, dim=-1), dim=-2) 25 | x_out = x_out.reshape(x_out.shape[0], x_out.shape[1], x_out.shape[2], -1) 26 | return x_out 27 | 28 | 29 | def llama_apply_rotary_emb(x: torch.Tensor, 30 | freqs_cis: torch.Tensor) -> torch.Tensor: 31 | x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2)) 32 | x_out = torch.view_as_real(x_ * freqs_cis).flatten(3) 33 | 
return x_out.type_as(x) 34 | 35 | 36 | WENET_APPLY_ROTARY_EMB = { 37 | 'google': google_apply_rotary_emb, 38 | 'llama': llama_apply_rotary_emb, 39 | } 40 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/whisper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Wenet Community. (authors: Xingchen Song) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Modified from [Whisper](https://github.com/openai/whisper) 16 | 17 | import torch 18 | 19 | from typing import Tuple, Dict, List 20 | 21 | from wenet.transformer.asr_model import ASRModel 22 | from wenet.transformer.ctc import CTC 23 | from wenet.transformer.encoder import TransformerEncoder 24 | from wenet.transformer.decoder import TransformerDecoder 25 | from wenet.utils.common import IGNORE_ID, add_whisper_tokens, th_accuracy 26 | 27 | 28 | class Whisper(ASRModel): 29 | 30 | def __init__( 31 | self, 32 | vocab_size: int, 33 | encoder: TransformerEncoder, 34 | decoder: TransformerDecoder, 35 | ctc: CTC = None, 36 | ctc_weight: float = 0.5, 37 | ignore_id: int = IGNORE_ID, 38 | reverse_weight: float = 0.0, 39 | lsm_weight: float = 0.0, 40 | length_normalized_loss: bool = False, 41 | special_tokens: dict = None, 42 | ): 43 | super().__init__(vocab_size, encoder, decoder, ctc, ctc_weight, 44 | ignore_id, reverse_weight, lsm_weight, 45 | length_normalized_loss, special_tokens) 46 | assert reverse_weight == 0.0 47 | self.sos = special_tokens["sot"] 48 | self.eos = special_tokens["eot"] 49 | self.decode_maxlen = self.decoder.embed[1].max_len 50 | 51 | # TODO(xcsong): time align 52 | def set_alignment_heads(self, dump: bytes): 53 | raise NotImplementedError 54 | 55 | @property 56 | def is_multilingual(self): 57 | return self.vocab_size >= 51865 58 | 59 | @property 60 | def num_languages(self): 61 | return self.vocab_size - 51765 - int(self.is_multilingual) 62 | 63 | def _calc_att_loss( 64 | self, 65 | encoder_out: torch.Tensor, 66 | encoder_mask: torch.Tensor, 67 | ys_pad: torch.Tensor, 68 | ys_pad_lens: torch.Tensor, 69 | infos: Dict[str, List[str]], 70 | ) -> Tuple[torch.Tensor, float]: 71 | prev_len = ys_pad.size(1) 72 | ys_in_pad, ys_out_pad = add_whisper_tokens(self.special_tokens, 73 | ys_pad, 74 | self.ignore_id, 75 | tasks=infos['tasks'], 76 | no_timestamp=True, 77 | langs=infos['langs'], 78 | use_prev=False) 79 | cur_len = ys_in_pad.size(1) 80 | ys_in_lens = ys_pad_lens + cur_len - prev_len 81 | 82 | # 1. Forward decoder 83 | decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, 84 | ys_in_pad, ys_in_lens) 85 | 86 | # 2. Compute attention loss 87 | loss_att = self.criterion_att(decoder_out, ys_out_pad) 88 | acc_att = th_accuracy( 89 | decoder_out.view(-1, self.vocab_size), 90 | ys_out_pad, 91 | ignore_label=self.ignore_id, 92 | ) 93 | return loss_att, acc_att 94 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/whisper_with_clap.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Wenet Community. (authors: Xingchen Song) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | # Modified from [Whisper](https://github.com/openai/whisper) 16 | 17 | import torch 18 | 19 | from typing import Tuple, Dict, List 20 | 21 | from torch import nn 22 | 23 | from wenet.transformer.asr_model import ASRModel 24 | from wenet.transformer.ctc import CTC 25 | from wenet.transformer.encoder import TransformerEncoder 26 | from wenet.transformer.decoder import TransformerDecoder 27 | from wenet.utils.common import IGNORE_ID, add_whisper_tokens, th_accuracy 28 | 29 | 30 | class Whisper(ASRModel): 31 | 32 | def __init__( 33 | self, 34 | vocab_size: int, 35 | encoder: TransformerEncoder, 36 | decoder: TransformerDecoder, 37 | ctc: CTC = None, 38 | ctc_weight: float = 0.5, 39 | ignore_id: int = IGNORE_ID, 40 | reverse_weight: float = 0.0, 41 | lsm_weight: float = 0.0, 42 | length_normalized_loss: bool = False, 43 | special_tokens: dict = None, 44 | ): 45 | super().__init__(vocab_size, encoder, decoder, ctc, ctc_weight, 46 | ignore_id, reverse_weight, lsm_weight, 47 | length_normalized_loss, special_tokens) 48 | assert reverse_weight == 0.0 49 | self.sos = special_tokens["sot"] 50 | self.eos = special_tokens["eot"] 51 | self.decode_maxlen = self.decoder.embed[1].max_len 52 | 53 | # 添加clap 54 | self.clip_length = 40 55 | self.prefix_length = 40 56 | num_layers = 12 57 | dim_embedding = 1024 58 | dim_clip = 512 59 | # 修改一下使用nn.transformer 60 | nhead = 8 61 | self.ttt = nn.TransformerEncoder( 62 | encoder_layer=nn.TransformerEncoderLayer(d_model=dim_embedding, nhead=nhead), 63 | num_layers=num_layers 64 | ) 65 | self.linear = nn.Linear(dim_clip, self.clip_length * dim_embedding) 66 | self.prefix_const = nn.Parameter(torch.randn(self.prefix_length, dim_embedding), requires_grad=True) 67 | 68 | from transformers import ClapModel, AutoFeatureExtractor 69 | # 加载模型和处理器 70 | self.model = ClapModel.from_pretrained( 71 | "/home/work_nfs11/wjtian/work_space/wenet_whisper_finetune/examples/wenetspeech/whisper/pretrain_ckpt/clap-htsat-unfused") 72 | self.processor = AutoFeatureExtractor.from_pretrained( 73 | "/home/work_nfs11/wjtian/work_space/wenet_whisper_finetune/examples/wenetspeech/whisper/pretrain_ckpt/clap-htsat-unfused") 74 | for param in self.model.parameters(): 75 | param.requires_grad = False 76 | 77 | # TODO(xcsong): time align 78 | def set_alignment_heads(self, dump: bytes): 79 | raise NotImplementedError 80 | 81 | @property 82 | def is_multilingual(self): 83 | return self.vocab_size >= 51865 84 | 85 | @property 86 | def num_languages(self): 87 | return self.vocab_size - 51765 - int(self.is_multilingual) 88 | 89 | def _calc_att_loss( 90 | self, 91 | encoder_out: torch.Tensor, 92 | encoder_mask: torch.Tensor, 93 | ys_pad: torch.Tensor, 94 | ys_pad_lens: torch.Tensor, 95 | infos: Dict[str, List[str]], 96 | ) -> Tuple[torch.Tensor, float]: 97 | prev_len = ys_pad.size(1) 98 | ys_in_pad, ys_out_pad = add_whisper_tokens(self.special_tokens, 99 | ys_pad, 100 | self.ignore_id, 101 | tasks=infos['tasks'], 102 | no_timestamp=True, 103 | langs=infos['langs'], 104 | use_prev=False) 105 | cur_len = ys_in_pad.size(1) 106 | ys_in_lens = ys_pad_lens + cur_len - prev_len 107 | 108 | # 1. Forward decoder 109 | decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, 110 | ys_in_pad, ys_in_lens) 111 | 112 | # 2. 
Compute attention loss 113 | loss_att = self.criterion_att(decoder_out, ys_out_pad) 114 | acc_att = th_accuracy( 115 | decoder_out.view(-1, self.vocab_size), 116 | ys_out_pad, 117 | ignore_label=self.ignore_id, 118 | ) 119 | return loss_att, acc_att 120 | -------------------------------------------------------------------------------- /Emotion2Vec-S/downstream_EmoBox/k_fold_CV.sh: -------------------------------------------------------------------------------- 1 | cd examples/sb 2 | data=/path/to/your/data_files 3 | lrs=(1e-3 1e-4) # Learning rate list 4 | hidden_sizes=(128 256) # Hidden size list 5 | gpus=(0 1 2 3) # GPU list 6 | task_id=0 7 | declare -A dataset_folds=( 8 | ["mesd"]=1 9 | ) 10 | declare -A dataset_classes=( 11 | ["mesd"]=6 12 | ) 13 | datasets=("mesd") 14 | 15 | for dataset in "${datasets[@]}"; do 16 | folds=${dataset_folds[$dataset]} 17 | n_classes=${dataset_classes[$dataset]} 18 | 19 | for lr in "${lrs[@]}"; do 20 | for hidden_size in "${hidden_sizes[@]}"; do 21 | gpu=${gpus[$task_id % ${#gpus[@]}]} 22 | export CUDA_VISIBLE_DEVICES=$gpu 23 | task_number=$((task_id + 1)) 24 | for fold in $(seq 1 $folds); do 25 | echo "Training fold $fold with lr=$lr, hidden_size=$hidden_size on GPU $gpu, task_number=$task_number, dataset=$dataset..." 26 | python3 train.py \ 27 | hparams/data2vec2-large_freeze.yaml \ 28 | --output_folder /path/to/your/${dataset}-S/fold${fold}_lr${lr}_hidden${hidden_size} \ 29 | --seed 1234 \ 30 | --batch_size 32 \ 31 | --lr $lr \ 32 | --train_annotation ${data}/${dataset}/fold_${fold}/${dataset}_train_fold_${fold}.json \ 33 | --test_annotation ${data}/${dataset}/fold_${fold}/${dataset}_test_fold_${fold}.json \ 34 | --number_of_epochs 100 \ 35 | --feat_dir /path/to/your/dump_${dataset}-S \ 36 | --label_map ${data}/${dataset}/label_map.json \ 37 | --device cuda \ 38 | --out_n_neurons ${n_classes} \ 39 | --hidden_size $hidden_size & 40 | done 41 | task_id=$((task_id + 1)) 42 | done 43 | done 44 | done 45 | 46 | wait 47 | echo "All training tasks completed." 
-------------------------------------------------------------------------------- /Emotion2Vec-S/examples/.gitignore: -------------------------------------------------------------------------------- 1 | !*/*.sh 2 | !*/*.md 3 | -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/__init__.py -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/__pycache__/data2vec2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/__pycache__/data2vec2.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__init__.py -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/audio.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/audio.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/base.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- 
/Emotion2Vec-S/examples/data2vec/models/modalities/audio.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from functools import partial 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from dataclasses import dataclass, field 11 | from typing import Callable, Dict, Optional 12 | from fairseq.models.wav2vec import ConvFeatureExtractionModel 13 | from fairseq.modules import ( 14 | LayerNorm, 15 | SamePad, 16 | TransposeLast, 17 | ) 18 | from fairseq.tasks import FairseqTask 19 | from .base import D2vModalityConfig, ModalitySpecificEncoder, get_alibi_bias 20 | from .modules import BlockEncoder, Decoder1d 21 | from enum import Enum, auto 22 | 23 | class Modality(Enum): 24 | AUDIO = auto() 25 | IMAGE = auto() 26 | TEXT = auto() 27 | 28 | @dataclass 29 | class D2vAudioConfig(D2vModalityConfig): 30 | type: Modality = Modality.AUDIO 31 | extractor_mode: str = "layer_norm" 32 | feature_encoder_spec: str = field( 33 | default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", 34 | metadata={ 35 | "help": "string describing convolutional feature extraction layers in form of a python list that contains " 36 | "[(dim, kernel_size, stride), ...]" 37 | }, 38 | ) 39 | conv_pos_width: int = field( 40 | default=95, 41 | metadata={"help": "number of filters for convolutional positional embeddings"}, 42 | ) 43 | conv_pos_groups: int = field( 44 | default=16, 45 | metadata={"help": "number of groups for convolutional positional embedding"}, 46 | ) 47 | conv_pos_depth: int = field( 48 | default=5, 49 | metadata={"help": "depth of positional encoder network"}, 50 | ) 51 | conv_pos_pre_ln: bool = False 52 | 53 | 54 | class AudioEncoder(ModalitySpecificEncoder): 55 | 56 | modality_cfg: D2vAudioConfig 57 | 58 | def __init__( 59 | self, 60 | modality_cfg: D2vAudioConfig, 61 | embed_dim: int, 62 | make_block: Callable[[float], nn.ModuleList], 63 | norm_layer: Callable[[int], nn.LayerNorm], 64 | layer_norm_first: bool, 65 | alibi_biases: Dict, 66 | task: Optional[FairseqTask], 67 | ): 68 | 69 | self.feature_enc_layers = eval(modality_cfg.feature_encoder_spec) 70 | feature_embed_dim = self.feature_enc_layers[-1][0] 71 | 72 | local_encoder = ConvFeatureExtractionModel( 73 | conv_layers=self.feature_enc_layers, 74 | dropout=0.0, 75 | mode=modality_cfg.extractor_mode, 76 | conv_bias=False, 77 | ) 78 | 79 | project_features = nn.Sequential( 80 | TransposeLast(), 81 | nn.LayerNorm(feature_embed_dim), 82 | nn.Linear(feature_embed_dim, embed_dim), 83 | ) 84 | 85 | num_pos_layers = modality_cfg.conv_pos_depth 86 | k = max(3, modality_cfg.conv_pos_width // num_pos_layers) 87 | 88 | positional_encoder = nn.Sequential( 89 | TransposeLast(), 90 | *[ 91 | nn.Sequential( 92 | nn.Conv1d( 93 | embed_dim, 94 | embed_dim, 95 | kernel_size=k, 96 | padding=k // 2, 97 | groups=modality_cfg.conv_pos_groups, 98 | ), 99 | SamePad(k), 100 | TransposeLast(), 101 | LayerNorm(embed_dim, elementwise_affine=False), 102 | TransposeLast(), 103 | nn.GELU(), 104 | ) 105 | for _ in range(num_pos_layers) 106 | ], 107 | TransposeLast(), 108 | ) 109 | 110 | if modality_cfg.conv_pos_pre_ln: 111 | positional_encoder = nn.Sequential(LayerNorm(embed_dim), positional_encoder) 112 | 113 | dpr = np.linspace( 114 | modality_cfg.start_drop_path_rate, 115 | modality_cfg.end_drop_path_rate, 116 | 
modality_cfg.prenet_depth, 117 | ) 118 | context_encoder = BlockEncoder( 119 | nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)), 120 | norm_layer(embed_dim) if not layer_norm_first else None, 121 | layer_norm_first, 122 | modality_cfg.prenet_layerdrop, 123 | modality_cfg.prenet_dropout, 124 | ) 125 | 126 | decoder = ( 127 | Decoder1d(modality_cfg.decoder, embed_dim) 128 | if modality_cfg.decoder is not None 129 | else None 130 | ) 131 | 132 | alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases) 133 | 134 | super().__init__( 135 | modality_cfg=modality_cfg, 136 | embed_dim=embed_dim, 137 | local_encoder=local_encoder, 138 | project_features=project_features, 139 | fixed_positional_encoder=None, 140 | relative_positional_encoder=positional_encoder, 141 | context_encoder=context_encoder, 142 | decoder=decoder, 143 | get_alibi_bias=alibi_bias_fn, 144 | ) 145 | 146 | def convert_padding_mask(self, x, padding_mask): 147 | def get_feat_extract_output_lengths(input_lengths: torch.LongTensor): 148 | """ 149 | Computes the output length of the convolutional layers 150 | """ 151 | 152 | def _conv_out_length(input_length, kernel_size, stride): 153 | return torch.floor((input_length - kernel_size) / stride + 1) 154 | 155 | for i in range(len(self.feature_enc_layers)): 156 | input_lengths = _conv_out_length( 157 | input_lengths, 158 | self.feature_enc_layers[i][1], 159 | self.feature_enc_layers[i][2], 160 | ) 161 | 162 | return input_lengths.to(torch.long) 163 | 164 | if padding_mask is not None: 165 | input_lengths = (1 - padding_mask.long()).sum(-1) 166 | # apply conv formula to get real output_lengths 167 | output_lengths = get_feat_extract_output_lengths(input_lengths) 168 | 169 | if padding_mask.any(): 170 | padding_mask = torch.zeros(x.shape[:2], dtype=x.dtype, device=x.device) 171 | 172 | # these two operations makes sure that all values 173 | # before the output lengths indices are attended to 174 | padding_mask[ 175 | ( 176 | torch.arange(padding_mask.shape[0], device=padding_mask.device), 177 | output_lengths - 1, 178 | ) 179 | ] = 1 180 | padding_mask = ( 181 | 1 - padding_mask.flip([-1]).cumsum(-1).flip([-1]) 182 | ).bool() 183 | else: 184 | padding_mask = torch.zeros( 185 | x.shape[:2], dtype=torch.bool, device=x.device 186 | ) 187 | 188 | return padding_mask 189 | 190 | def reset_parameters(self): 191 | super().reset_parameters() 192 | for mod in self.project_features.children(): 193 | if isinstance(mod, nn.Linear): 194 | mod.reset_parameters() 195 | if self.decoder is not None: 196 | self.decoder.reset_parameters() 197 | -------------------------------------------------------------------------------- /Emotion2Vec-S/extract_feature.sh: -------------------------------------------------------------------------------- 1 | datasets=("m3ed" "iemocap") # Add dataset names to this array e.g., iempcap 2 | 3 | for dataset in "${datasets[@]}"; do 4 | echo "Processing dataset: $dataset" 5 | python3 speech_feature_extraction.py \ 6 | --model_path C2SER/Emotion2Vec-S/ckpt/checkpoint.pt \ 7 | --model_dir C2SER/Emotion2Vec-S/examples/data2vec/ \ 8 | --dump_dir C2SER/Emotion2Vec-S/fea_${dataset} \ 9 | --device cuda \ 10 | --data C2SER/Emotion2Vec-S/${dataset}.scp \ 11 | --level frame 12 | done -------------------------------------------------------------------------------- /Emotion2Vec-S/features/features_frm/4YJy1uDx0jM_769.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/features/features_frm/4YJy1uDx0jM_769.npy -------------------------------------------------------------------------------- /Emotion2Vec-S/features/features_frm/vo_EQAST002_1_paimon_07.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/features/features_frm/vo_EQAST002_1_paimon_07.npy -------------------------------------------------------------------------------- /Emotion2Vec-S/features/features_utt/4YJy1uDx0jM_769.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/features/features_utt/4YJy1uDx0jM_769.npy -------------------------------------------------------------------------------- /Emotion2Vec-S/features/features_utt/vo_EQAST002_1_paimon_07.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/features/features_utt/vo_EQAST002_1_paimon_07.npy -------------------------------------------------------------------------------- /Emotion2Vec-S/speech_feature_extraction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import json 5 | import numpy as np 6 | import argparse 7 | from tqdm import tqdm 8 | import torchaudio 9 | import torch.nn.functional as F 10 | import fairseq 11 | from dataclasses import dataclass 12 | 13 | SAMPLING_RATE=16000 14 | 15 | @dataclass 16 | class UserDirModule: 17 | user_dir: str 18 | 19 | def extract_fairseq_feature(wav_path, model, device): 20 | try: 21 | wav, sr = torchaudio.load(wav_path) 22 | # 合并多声道为单声道(取平均) 23 | if wav.size(0) > 1: 24 | wav = torch.mean(wav, dim=0, keepdim=True) 25 | if sr != SAMPLING_RATE: 26 | wav = torchaudio.functional.resample(wav, sr, SAMPLING_RATE) 27 | wav = wav[0, :].view(1, -1) 28 | wav = wav.to(device) 29 | out = model.extract_features(wav) 30 | return out 31 | except Exception as e: 32 | print(f"Error processing audio file {wav_path}: {e}") 33 | return None 34 | 35 | if __name__ == '__main__': 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--model_path', type=str, default="/home/work_nfs15/sywang/work_space/fairseq/1_public/checkpoint.pt", help="Path to the model checkpoint file") 39 | parser.add_argument('--model_dir', type=str, default="./Emotion2Vec-S/examples/data2vec/", help="Path to the model directory") 40 | parser.add_argument('--dump_dir', type=str, default="./features_frm", help="Directory to save extracted features") 41 | parser.add_argument('--device', type=str, default='cuda', help="Device to use for computation (e.g., 'cuda' or 'cpu')") 42 | parser.add_argument('--data', type=str, default="./Emotion2Vec-S/wav.scp", help="Path to the wav.scp file containing audio paths") 43 | parser.add_argument('--level', type=str, default="frame", help="frame or utterance") 44 | args = parser.parse_args() 45 | 46 | data = {} 47 | with open(args.data, 'r') as f: 48 | for line in f: 49 | seg_id, wav_path = line.strip().split(maxsplit=1) 50 | data[seg_id] = wav_path 51 | 52 | os.makedirs(args.dump_dir, exist_ok=True) 53 | 54 | seg_ids = data.keys() 55 | print(f'Loaded {len(seg_ids)} audio entries') 56 | # load models 57 | 
my_model_path = UserDirModule(args.model_dir) 58 | fairseq.utils.import_user_module(my_model_path) 59 | model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([args.model_path]) 60 | model = model[0].to(args.device) 61 | 62 | for seg_id in tqdm(seg_ids): 63 | 64 | wav_path = data[seg_id] 65 | if not os.path.exists(wav_path): 66 | print(f"WARNING: {wav_path} does not exist") 67 | continue 68 | try: 69 | torchaudio.load(wav_path) 70 | except: 71 | print(f'ERROR: Failed to load {wav_path}') 72 | continue 73 | 74 | # 提取特征 75 | feat = extract_fairseq_feature(wav_path, model, args.device) 76 | 77 | if feat is not None: 78 | # 处理特征输出 79 | if args.level == 'frame': 80 | feat = feat['x'].cpu().detach().numpy()[0] 81 | elif args.level == 'utterance': 82 | feat = feat['utt_x'].cpu().detach().numpy()[0] 83 | else: 84 | raise ValueError("Unknown level: {}".format(args.level)) 85 | 86 | save_path = os.path.join(args.dump_dir, f"{seg_id}.npy") 87 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 88 | np.save(save_path, feat) 89 | print(f"Processed: {seg_id} | Shape: {feat.shape} | Saved to: {save_path}") 90 | else: 91 | print(f"Skipped problematic file: {seg_id}") 92 | -------------------------------------------------------------------------------- /Emotion2Vec-S/test_wav/4YJy1uDx0jM_769.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/test_wav/4YJy1uDx0jM_769.wav -------------------------------------------------------------------------------- /Emotion2Vec-S/test_wav/vo_EQAST002_1_paimon_07.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/test_wav/vo_EQAST002_1_paimon_07.wav -------------------------------------------------------------------------------- /Emotion2Vec-S/wav.scp: -------------------------------------------------------------------------------- 1 | 4YJy1uDx0jM_769 ./Emotion2Vec-S/test_wav/4YJy1uDx0jM_769.wav 2 | vo_EQAST002_1_paimon_07 ./Emotion2Vec-S/test_wav/vo_EQAST002_1_paimon_07.wav -------------------------------------------------------------------------------- /figs/c2ser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/figs/c2ser.png --------------------------------------------------------------------------------