├── src ├── __init__.py ├── models │ ├── __init__.py │ ├── src_glm4 │ │ ├── cosyvoice │ │ │ ├── __init__.py │ │ │ ├── cli │ │ │ │ ├── __init__.py │ │ │ │ └── cosyvoice.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── block_mask_util.py │ │ │ │ ├── file_utils.py │ │ │ │ ├── class_utils.py │ │ │ │ ├── common.py │ │ │ │ └── frontend_utils.py │ │ │ ├── dataset │ │ │ │ └── __init__.py │ │ │ ├── transformer │ │ │ │ ├── __init__.py │ │ │ │ ├── activation.py │ │ │ │ └── label_smoothing_loss.py │ │ │ ├── flow │ │ │ │ ├── length_regulator.py │ │ │ │ └── stable │ │ │ │ │ └── stable_diffusion_test.py │ │ │ └── hifigan │ │ │ │ └── f0_predictor.py │ │ ├── speech_tokenizer │ │ │ ├── __init__.py │ │ │ ├── configuration_whisper.py │ │ │ └── utils.py │ │ └── audio_process.py │ ├── src_kimi │ │ └── kimia_infer │ │ │ ├── __init__.py │ │ │ ├── api │ │ │ └── __init__.py │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── tokenizer │ │ │ │ ├── __init__.py │ │ │ │ ├── whisper_Lv3 │ │ │ │ │ └── mel_filters.npz │ │ │ │ └── glm4_tokenizer.py │ │ │ └── detokenizer │ │ │ │ ├── vocoder │ │ │ │ ├── alias_free_activation │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cuda │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── compat.h │ │ │ │ │ │ ├── anti_alias_activation.cpp │ │ │ │ │ │ ├── activation1d.py │ │ │ │ │ │ └── load.py │ │ │ │ │ └── torch │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── act.py │ │ │ │ │ │ ├── resample.py │ │ │ │ │ │ └── filter.py │ │ │ │ └── utils.py │ │ │ │ ├── flow_matching │ │ │ │ └── scheduler.py │ │ │ │ └── bigvgan_wrapper.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── special_tokens.py │ │ │ └── data.py │ ├── src_llama_omni │ │ ├── datasets │ │ │ └── __init__.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── speech_generator │ │ │ │ └── builder.py │ │ │ ├── speech_projector │ │ │ │ ├── builder.py │ │ │ │ └── speech_projector.py │ │ │ └── speech_encoder │ │ │ │ ├── builder.py │ │ │ │ └── speech_encoder.py │ │ ├── constants.py │ │ └── arguments.py │ ├── src_speechgpt2 │ │ └── Codec │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── activation_function.py │ │ │ │ ├── projector.py │ │ │ │ ├── quantizer.py │ │ │ │ └── residual_block.py │ │ │ └── moshi_modules │ │ │ │ ├── __init__.py │ │ │ │ ├── rope.py │ │ │ │ ├── gating.py │ │ │ │ └── resample.py │ │ │ └── utils.py │ ├── src_minicpm │ │ └── ref_audios │ │ │ ├── assistant_male_voice.wav │ │ │ ├── assistant_female_voice.wav │ │ │ └── assistant_default_female_voice.wav │ ├── src_baichuan │ │ ├── constants.py │ │ └── cosy24k_vocoder │ │ │ ├── README.md │ │ │ ├── hifigan │ │ │ ├── __init__.py │ │ │ └── f0_predictor.py │ │ │ └── cosy24k_vocoder.py │ ├── src_freezeomni │ │ ├── decoder │ │ │ └── ticodec │ │ │ │ ├── vqvae_tester.py │ │ │ │ └── vqvae.py │ │ ├── utils.py │ │ └── encoder │ │ │ ├── cmvn.py │ │ │ └── subsampling.py │ ├── telechat2.py │ ├── kimi_audio.py │ ├── model_utils.py │ ├── qwen.py │ └── api.py ├── prompt │ ├── __init__.py │ └── template.py ├── evaluator │ ├── __init__.py │ ├── emo2vec.py │ ├── text_utils.py │ ├── asr.py │ ├── base.py │ ├── dialect.py │ └── dnsmos.py ├── summarizer │ ├── __init__.py │ └── summarizer.py └── config.py ├── assets └── contact.jpg ├── .gitmodules ├── registry ├── dataset │ ├── scene.yaml │ ├── multiturn.yaml │ ├── choice.yaml │ ├── human.yaml │ ├── paralinguistic.yaml │ ├── aqa.yaml │ └── dialect.yaml ├── model │ ├── api.yaml │ ├── text.yaml │ └── offline.yaml ├── evaluator │ ├── match.yaml │ ├── speech.yaml │ └── llm.yaml ├── infer_task │ ├── multiturn.yaml │ ├── scene.yaml │ 
├── choice.yaml │ ├── human.yaml │ ├── paralinguistic.yaml │ ├── aqa.yaml │ └── dialect.yaml ├── eval_task │ ├── objective.yaml │ ├── speech.yaml │ └── llm.yaml ├── summarizer │ └── base.yaml └── template │ ├── aqa.yaml │ ├── text_llm.yaml │ └── multiturn.yaml ├── requirements ├── freeze_omni_requirements.txt ├── qwen2_5_omni_requirements.txt ├── minicpm_omni_requirements.txt ├── llama_omni_requirements.txt ├── kimi_audio_requirements.txt ├── glm4voice_requirements.txt └── speechgpt2_requirements.txt ├── requirements_eval.txt ├── main.py ├── requirements_all.txt ├── tools ├── test_eval_speech.py ├── save_csv.py ├── test_api.py └── parquet2jsonl.py ├── run_text.sh └── run.sh /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/summarizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/speech_tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/contact.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/assets/contact.jpg -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_minicpm/ref_audios/assistant_male_voice.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/src/models/src_minicpm/ref_audios/assistant_male_voice.wav -------------------------------------------------------------------------------- /src/models/src_minicpm/ref_audios/assistant_female_voice.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/src/models/src_minicpm/ref_audios/assistant_female_voice.wav -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/models/src_glm4/third_party/Matcha-TTS"] 2 | path = src/models/src_glm4/third_party/Matcha-TTS 3 | url = git@github.com:shivammehta25/Matcha-TTS.git 4 | -------------------------------------------------------------------------------- /registry/dataset/scene.yaml: -------------------------------------------------------------------------------- 1 | aed_combine-zh: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/aed_combine-zh 5 | ref_col: answer 6 | query_col: query -------------------------------------------------------------------------------- /registry/model/api.yaml: -------------------------------------------------------------------------------- 1 | gpt4o-audio: 2 | class: src.models.api.GPT4oAudio 3 | args: 4 | llm_name: gpt-4o-audio-preview 5 | 
api_keys: 6 | key1: "xxx" 7 | max_workers: 1 -------------------------------------------------------------------------------- /src/models/src_minicpm/ref_audios/assistant_default_female_voice.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/src/models/src_minicpm/ref_audios/assistant_default_female_voice.wav -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/tokenizer/whisper_Lv3/mel_filters.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/src/models/src_kimi/kimia_infer/models/tokenizer/whisper_Lv3/mel_filters.npz -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.omni_speech_llama import OmniSpeechLlamaForCausalLM, OmniSpeechConfig 2 | from .language_model.omni_speech2s_llama import OmniSpeech2SLlamaForCausalLM -------------------------------------------------------------------------------- /registry/dataset/multiturn.yaml: -------------------------------------------------------------------------------- 1 | 2 | multiturn_memory-zh: 3 | class: src.dataset.BatchLoader 4 | args: 5 | file: Tele-AI/TELEVAL/multiturn_memory-zh 6 | batch_size: 1 # suggest bsz=1 since multiturn may OOM 7 | tuple_decode: False -------------------------------------------------------------------------------- /src/models/src_llama_omni/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | SPEECH_TOKEN_INDEX = -200 9 | DEFAULT_SPEECH_TOKEN = "" -------------------------------------------------------------------------------- /requirements/freeze_omni_requirements.txt: -------------------------------------------------------------------------------- 1 | cryptography 2 | flask==3.0.3 3 | flask_socketio==5.3.4 4 | librosa==0.10.2.post1 5 | numpy==1.24.4 6 | silero-vad==5.1.2 7 | soundfile==0.12.1 8 | torch==2.2.0 9 | torchaudio==2.2.0 10 | transformers==4.45.2 11 | PyYAML==6.0.2 -------------------------------------------------------------------------------- /registry/evaluator/match.yaml: -------------------------------------------------------------------------------- 1 | exist_match: 2 | class: src.evaluator.base.ExistMatch 3 | args: 4 | keep_punc: False 5 | max_workers: 1 6 | 7 | single_option_match: 8 | class: src.evaluator.base.SingleOptionMatch 9 | args: 10 | max_workers: 1 11 | cushion: True 12 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * 7 | -------------------------------------------------------------------------------- /registry/infer_task/multiturn.yaml: -------------------------------------------------------------------------------- 1 | multiturn-memory-zh: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: multiturn_memory-zh 5 | template: multiturn-audio 6 | model: qwen2_5_omni 7 | save_pred_audio: False 8 | eval_task: basic 9 | reverse_spkr: False 10 | use_model_history: True 11 | save_latest_only: True 12 | -------------------------------------------------------------------------------- /registry/eval_task/objective.yaml: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------qa------------------------------------- 3 | 4 | basic: 5 | class: src.config.EvalTaskCfg 6 | args: 7 | evaluator: exist_match 8 | summarizer: AvgInfo 9 | 10 | choice: 11 | class: src.config.EvalTaskCfg 12 | args: 13 | evaluator: single_option_match 14 | summarizer: AvgInfo 15 | -------------------------------------------------------------------------------- /registry/dataset/choice.yaml: -------------------------------------------------------------------------------- 1 | ceval-zh: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/ceval-zh 5 | ref_col: answer 6 | query_col: query 7 | batch_size: 1 8 | 9 | agieval-zh: 10 | class: src.dataset.BatchLoader 11 | args: 12 | file: Tele-AI/TELEVAL/agieval-zh 13 | ref_col: answer 14 | query_col: query 15 | batch_size: 1 -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_generator/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_generator import SpeechGeneratorCTC 2 | 3 | 4 | def build_speech_generator(config): 5 | generator_type = getattr(config, 'speech_generator_type', 'ctc') 6 | if generator_type == 'ctc': 7 | return SpeechGeneratorCTC(config) 8 | 9 | raise ValueError(f'Unknown generator type: {generator_type}') 10 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_projector/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_projector import EncoderProjectorConcat 2 | 3 | 4 | def build_speech_projector(config): 5 | projector_type = getattr(config, 'speech_projector_type', 'linear') 6 | if projector_type == 'linear': 7 | return EncoderProjectorConcat(config) 8 | 9 | raise ValueError(f'Unknown projector type: {projector_type}') 10 | -------------------------------------------------------------------------------- /src/models/src_baichuan/constants.py: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "../baichuan-omni/model" 2 | COSY_VOCODER = "../third_party/cosy24k_vocoder" 3 | g_cache_dir = "../cache" 4 | sampling_rate = 24000 5 | wave_concat_overlap = int(sampling_rate * 0.01) 6 | role_prefix = { 7 | 'system': '', 8 | 'user': '', 9 | 'assistant': '', 10 | 'audiogen': '' 11 | } 12 | max_frames = 8 -------------------------------------------------------------------------------- /registry/dataset/human.yaml: -------------------------------------------------------------------------------- 1 | human_acceptance-zh: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/human_accept-zh 5 | ref_col: answer 6 | 
query_col: query 7 | batch_size: 1 8 | 9 | human_chitchat-zh: 10 | class: src.dataset.BatchLoader 11 | args: 12 | file: Tele-AI/TELEVAL/human_chitchat-zh 13 | ref_col: answer 14 | query_col: query 15 | batch_size: 1 -------------------------------------------------------------------------------- /requirements/qwen2_5_omni_requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | gradio==5.23.1 3 | gradio_client==1.8.0 4 | qwen-omni-utils==0.0.4 5 | librosa==0.11.0 6 | ffmpeg==1.4 7 | ffmpeg-python==0.2.0 8 | soundfile==0.13.1 9 | modelscope_studio==1.2.2 10 | transformers==4.52.3 11 | accelerate 12 | av 13 | 14 | # Optional dependency 15 | # Uncomment the following line if you need flash-attn 16 | flash-attn==2.7.4.post1 -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_encoder import WhisperWrappedEncoder 2 | 3 | 4 | def build_speech_encoder(config): 5 | speech_encoder_type = getattr(config, 'speech_encoder_type', None) 6 | if "whisper" in speech_encoder_type.lower(): 7 | return WhisperWrappedEncoder.load(config) 8 | 9 | raise ValueError(f'Unknown speech encoder: {speech_encoder_type}') 10 | -------------------------------------------------------------------------------- /registry/summarizer/base.yaml: -------------------------------------------------------------------------------- 1 | AvgInfo: 2 | class: src.summarizer.summarizer.AvgInfo 3 | args: {} 4 | 5 | AvgThreshold: 6 | class: src.summarizer.summarizer.AvgThreshold 7 | args: 8 | rescale: power # linear power 9 | power: 2 10 | threshold: 60 11 | 12 | AvgWER: 13 | class: src.summarizer.summarizer.AvgWER 14 | args: {} 15 | 16 | AvgMOS: 17 | class: src.summarizer.summarizer.AvgMOS 18 | args: {} -------------------------------------------------------------------------------- /registry/infer_task/scene.yaml: -------------------------------------------------------------------------------- 1 | aed-audio-instruct: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: aed_combine-zh 5 | template: zeroshot-aqa 6 | model: qwen2_5_omni 7 | eval_task: aed_instruct 8 | 9 | aed-text-instruct: # not recommended 10 | class: src.config.InferTaskCfg 11 | args: 12 | dataset: aed_combine-zh 13 | template: text-instruct-caption 14 | model: qwen2_5_omni 15 | eval_task: aed_instruct -------------------------------------------------------------------------------- /src/models/src_baichuan/cosy24k_vocoder/README.md: -------------------------------------------------------------------------------- 1 | # CosyVoice 2.0 HiFi-GAN Vocoder Module 2 | This module contains the HiFi-GAN vocoder component extracted from [CosyVoice 2.0](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B), providing high-quality speech waveform generation capabilities and optimized for ease of integration.
3 | 4 | The weights (hift.pt) are derived from [CosyVoice 2.0](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) (Apache 2.0 licensed) -------------------------------------------------------------------------------- /registry/infer_task/choice.yaml: -------------------------------------------------------------------------------- 1 | choice-ceval-zh: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: ceval-zh 5 | template: zeroshot-aqa 6 | model: qwen2_5_omni 7 | save_pred_audio: False 8 | eval_task: choice 9 | 10 | choice-agieval-zh: 11 | class: src.config.InferTaskCfg 12 | args: 13 | dataset: agieval-zh 14 | template: zeroshot-aqa 15 | model: qwen2_5_omni 16 | save_pred_audio: False 17 | eval_task: choice -------------------------------------------------------------------------------- /requirements_eval.txt: -------------------------------------------------------------------------------- 1 | jinja2 2 | tqdm 3 | requests 4 | pandas 5 | regex 6 | datasets 7 | soundfile 8 | librosa 9 | transformers 10 | WeTextProcessing==1.0.3 11 | #vllm # (choice) for llm_offline judgement 12 | jiwer 13 | funasr 14 | zhon 15 | zhconv 16 | onnxruntime==1.18.1 # (choice) for dialect classify and dnsmos 17 | torch 18 | torchaudio 19 | scipy 20 | --find-links https://csukuangfj.github.io/kaldifeat/cuda.html 21 | kaldifeat==1.25.5 # (choice) for dialect classify -------------------------------------------------------------------------------- /registry/infer_task/human.yaml: -------------------------------------------------------------------------------- 1 | acceptance-human-zh: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: human_acceptance-zh 5 | template: zeroshot-aqa 6 | model: qwen2_5_omni 7 | save_pred_audio: False 8 | eval_task: human_acceptance 9 | 10 | chitchat-human-zh: 11 | class: src.config.InferTaskCfg 12 | args: 13 | dataset: human_chitchat-zh 14 | template: zeroshot-aqa 15 | model: qwen2_5_omni 16 | save_pred_audio: False 17 | eval_task: human_likeness -------------------------------------------------------------------------------- /registry/eval_task/speech.yaml: -------------------------------------------------------------------------------- 1 | wer: 2 | class: src.config.EvalTaskCfg 3 | args: 4 | evaluator: paraformer-zh 5 | summarizer: AvgWER 6 | 7 | dnsmos: 8 | class: src.config.EvalTaskCfg 9 | args: 10 | evaluator: MS-DNSMOS 11 | summarizer: AvgMOS 12 | 13 | emotion_response: 14 | class: src.config.EvalTaskCfg 15 | args: 16 | evaluator: emo2vec-large 17 | summarizer: AvgInfo 18 | 19 | dialect_classify: 20 | class: src.config.EvalTaskCfg 21 | args: 22 | evaluator: TeleSpeech-Dialect 23 | summarizer: AvgInfo -------------------------------------------------------------------------------- /requirements/minicpm_omni_requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow==10.1.0 2 | torch==2.3.1 3 | torchaudio==2.3.1 4 | torchvision==0.18.1 5 | transformers==4.44.2 6 | sentencepiece==0.2.0 7 | vector-quantize-pytorch==1.18.5 8 | vocos==0.1.0 9 | accelerate==1.2.1 10 | timm==0.9.10 11 | soundfile==0.12.1 12 | librosa==0.9.0 13 | decord 14 | moviepy 15 | 16 | # for web demo 17 | aiofiles==23.2.1 18 | onnxruntime==1.20.1 19 | fastapi 20 | uvicorn 21 | gradio==4.44.1 22 | http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl -------------------------------------------------------------------------------- /requirements/llama_omni_requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch==2.1.2 2 | torchvision==0.16.2 3 | torchaudio==2.1.2 4 | transformers==4.43.4 5 | tokenizers==0.19.1 6 | sentencepiece==0.1.99 7 | shortuuid 8 | accelerate==0.33.0 9 | peft==0.11.1 10 | bitsandbytes==0.43.1 11 | pydantic 12 | markdown2 13 | numpy 14 | scikit-learn==1.2.2 15 | gradio==4.43.0 16 | gradio_client==1.3.0 17 | requests 18 | httpx==0.27.2 19 | uvicorn 20 | fastapi 21 | soundfile 22 | einops==0.6.1 23 | einops-exts==0.0.4 24 | timm==0.6.13 25 | openai-whisper 26 | setuptools==59.5.0 27 | omegaconf==2.0.6 28 | fairseq -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/activation_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """Activation functions.""" 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | def get_activation(nonlinear_activation, nonlinear_activation_params={}): 12 | if hasattr(nn, nonlinear_activation): 13 | return getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 14 | else: 15 | raise NotImplementedError(f"Activation {nonlinear_activation} is not supported!") -------------------------------------------------------------------------------- /requirements/kimi_audio_requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.4.1 2 | torchaudio==2.4.1 3 | packaging 4 | jinja2 5 | openai-whisper 6 | jsonlines 7 | pandas 8 | validators 9 | sty 10 | transformers 11 | librosa 12 | accelerate 13 | aiohttp 14 | colorama 15 | omegaconf==2.3.0 16 | sox 17 | six==1.16.0 18 | hyperpyyaml 19 | conformer==0.3.2 20 | diffusers 21 | pillow 22 | sentencepiece 23 | easydict 24 | fire 25 | ujson 26 | cairosvg 27 | immutabledict 28 | rich 29 | wget 30 | gdown 31 | datasets 32 | torchdyn==1.0.6 33 | huggingface_hub 34 | loguru 35 | decord 36 | blobfile 37 | timm 38 | sacrebleu==1.5.1 39 | soundfile 40 | tqdm -------------------------------------------------------------------------------- /registry/template/aqa.yaml: -------------------------------------------------------------------------------- 1 | zeroshot-aqa: 2 | class: src.prompt.template.DataTemplate 3 | args: 4 | template: 5 | - role: user 6 | content: 7 | audio: "{{audio}}" 8 | 9 | zeroshot-qa: 10 | class: src.prompt.template.DataTemplate 11 | args: 12 | template: 13 | - role: user 14 | content: 15 | text: "{{query}}" 16 | 17 | text-instruct-caption: # not recommended 18 | class: src.prompt.template.DataTemplate 19 | args: 20 | template: 21 | - role: instruct 22 | content: 23 | text: "{{query}}" 24 | - role: user 25 | content: 26 | audio: "{{audio_only}}" 27 | -------------------------------------------------------------------------------- /registry/dataset/paralinguistic.yaml: -------------------------------------------------------------------------------- 1 | esd: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/esd-zh 5 | ref_col: query_emo 6 | query_col: query 7 | extra_col: ["query_emo_zh", "answer", "answer_emo", "answer_emo_zh"] 8 | batch_size: 1 9 | 10 | para_mix300-zh: 11 | class: src.dataset.BatchLoader 12 | args: 13 | file: Tele-AI/TELEVAL/para_mix300-zh 14 | query_col: query 15 | ref_col: answer 16 | extra_col: ["para_name"] 17 | 18 | age-zh: 19 | class: src.dataset.BatchLoader 20 | args:
file: Tele-AI/TELEVAL/age-zh 22 | query_col: query 23 | ref_col: age 24 | extra_col: ["answer_age", "answer_common"] -------------------------------------------------------------------------------- /registry/model/text.yaml: -------------------------------------------------------------------------------- 1 | qwen2-7b-instruct: 2 | class: src.models.qwen.Qwen2Instruct 3 | args: 4 | path: path/to/Qwen2-7B-Instruct 5 | sample_params: 6 | gen_type: greedy 7 | 8 | qwen2.5-7b-instruct: 9 | class: src.models.qwen.Qwen2Instruct 10 | args: 11 | path: path/to/Qwen2.5-7B-Instruct 12 | sample_params: 13 | gen_type: greedy 14 | 15 | qwen3-8b-instruct: 16 | class: src.models.qwen.Qwen3Instruct 17 | args: 18 | path: path/to/Qwen3-8B 19 | sample_params: 20 | gen_type: greedy 21 | 22 | qwen2.5-72b-instruct: 23 | args: 24 | path: path/to/qwen2.5-72b-instruct 25 | sample_params: 26 | gen_type: default # as judgement -------------------------------------------------------------------------------- /src/models/src_baichuan/cosy24k_vocoder/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """HIFI-GAN""" -------------------------------------------------------------------------------- /registry/evaluator/speech.yaml: -------------------------------------------------------------------------------- 1 | paraformer-zh: 2 | class: src.evaluator.asr.ASR 3 | args: 4 | model: paraformer-zh # path/to/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch 5 | max_workers: 1 6 | 7 | MS-DNSMOS: 8 | class: src.evaluator.dnsmos.DNSMOS 9 | args: 10 | model: path/to/sig_bak_ovr.onnx 11 | max_workers: 1 12 | 13 | emo2vec-large: 14 | class: src.evaluator.emo2vec.Emo2vec 15 | args: 16 | model: iic/emotion2vec_plus_large # path/to/emotion2vec_large 17 | strict: True 18 | 19 | TeleSpeech-Dialect: 20 | class: src.evaluator.dialect.DialectClassify 21 | args: 22 | model: path/to/ecapa_tdnn-mfcc40-ch512-cls14.onnx 23 | max_workers: 1 -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/moshi_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 
10 | """Modules used for building the models.""" 11 | 12 | # flake8: noqa 13 | from .conv import ( 14 | NormConv1d, 15 | NormConvTranspose1d, 16 | StreamingConv1d, 17 | StreamingConvTranspose1d, 18 | pad_for_conv1d, 19 | pad1d, 20 | unpad1d, 21 | ) 22 | from .transformer import StreamingTransformer 23 | -------------------------------------------------------------------------------- /registry/infer_task/paralinguistic.yaml: -------------------------------------------------------------------------------- 1 | emotion-esd: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: esd-zh 5 | template: zeroshot-aqa 6 | model: qwen2_5_omni 7 | eval_task: emotion_understand # ["emotion_understand", "wer", "dnsmos", "emotion_response"] 8 | save_pred_audio: True 9 | 10 | aqa-para_mix300-zh: 11 | class: src.config.InferTaskCfg 12 | args: 13 | dataset: para_mix300-zh 14 | template: zeroshot-aqa 15 | model: qwen2_5_omni 16 | eval_task: para_care # ["para_care", "basic"] 17 | save_pred_audio: False 18 | 19 | care-age-zh: 20 | class: src.config.InferTaskCfg 21 | args: 22 | dataset: age-zh 23 | template: zeroshot-aqa 24 | model: qwen2_5_omni 25 | eval_task: age_care 26 | save_pred_audio: False -------------------------------------------------------------------------------- /registry/template/text_llm.yaml: -------------------------------------------------------------------------------- 1 | # for text LLM 2 | text-choice: 3 | class: src.prompt.template.DataTemplate 4 | args: 5 | template: 6 | - role: user 7 | content: 8 | text: "以下是单选题,请以'正确答案是'为引导,直接给出答案选项。例如'正确答案是C'。{{query}}" 9 | 10 | text-emo: 11 | class: src.prompt.template.DataTemplate 12 | args: 13 | template: 14 | - role: user 15 | content: 16 | text: "用户以 '{{query_emo_zh}}' 的情绪输入 '{{query}}' 。结合用户的情绪作出适当的回答。" 17 | 18 | # as judge 19 | judge-qwen2.5_72b_instruct: 20 | class: src.prompt.template.DataTemplate 21 | args: 22 | template: 23 | - role: system 24 | content: 25 | text: "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." 
26 | - role: user 27 | content: 28 | text: "{{text}}" 29 | -------------------------------------------------------------------------------- /requirements/glm4voice_requirements.txt: -------------------------------------------------------------------------------- 1 | conformer==0.3.2 2 | deepspeed==0.14.2; sys_platform == 'linux' 3 | diffusers==0.27.2 4 | fastapi==0.115.3 5 | fastapi-cli==0.0.4 6 | gdown==5.1.0 7 | gradio==5.3.0 8 | grpcio==1.57.0 9 | grpcio-tools==1.57.0 10 | huggingface_hub==0.25.2 11 | hydra-core==1.3.2 12 | HyperPyYAML==1.2.2 13 | inflect==7.3.1 14 | librosa==0.10.2 15 | lightning==2.2.4 16 | matplotlib==3.7.5 17 | modelscope==1.15.0 18 | networkx==3.1 19 | numpy==1.24.4 20 | omegaconf==2.3.0 21 | onnxruntime-gpu==1.16.0; sys_platform == 'linux' 22 | onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' 23 | openai-whisper==20231117 24 | protobuf==4.25 25 | pydantic==2.7.0 26 | rich==13.7.1 27 | Requests==2.32.3 28 | safetensors==0.4.5 29 | soundfile==0.12.1 30 | tensorboard==2.14.0 31 | transformers==4.44.1 32 | uvicorn==0.32.0 33 | wget==3.2 34 | WeTextProcessing==1.0.3 35 | torch==2.3.0 36 | torchaudio==2.3.0 -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict, List, Union, Tuple, Any 3 | from src.dataset import BatchLoader, BatchSaver 4 | 5 | TemplateStruct = Union[str, Dict[str, Any], List[Dict[str, Union[str, List[Dict[str, str]]]]]] 6 | RefType = Union[str, List["RefType"], Tuple["RefType", ...]] 7 | RefsType = List[RefType] 8 | 9 | @dataclass 10 | class EvalTaskCfg: 11 | evaluator: str 12 | summarizer: str 13 | 14 | @dataclass 15 | class InferTaskCfg: 16 | dataset: Union[str, List[str]] 17 | template: str 18 | model: str 19 | eval_task: str 20 | save_pred_audio: bool = False 21 | reverse_spkr : bool = False # for multiturn 22 | use_model_history: bool = True # for multiturn 23 | save_latest_only: bool = False # for multiturn_memory 24 | 25 | @dataclass 26 | class DatasetRuntimeCtx: 27 | name: str 28 | loader: BatchLoader 29 | saver: BatchSaver 30 | summary_file: str = None -------------------------------------------------------------------------------- /registry/eval_task/llm.yaml: -------------------------------------------------------------------------------- 1 | 2 | emotion_understand: 3 | class: src.config.EvalTaskCfg 4 | args: 5 | evaluator: emo_llm 6 | summarizer: AvgThreshold 7 | 8 | aed_instruct: 9 | class: src.config.EvalTaskCfg 10 | args: 11 | evaluator: aed_llm 12 | summarizer: AvgThreshold 13 | 14 | dialect_follow: 15 | class: src.config.EvalTaskCfg 16 | args: 17 | evaluator: dialect_llm 18 | summarizer: AvgThreshold 19 | 20 | human_acceptance: 21 | class: src.config.EvalTaskCfg 22 | args: 23 | evaluator: acceptance_llm 24 | summarizer: AvgThreshold 25 | 26 | human_likeness: 27 | class: src.config.EvalTaskCfg 28 | args: 29 | evaluator: human_likeness_llm 30 | summarizer: AvgThreshold 31 | 32 | para_care: 33 | class: src.config.EvalTaskCfg 34 | args: 35 | evaluator: para_care_llm 36 | summarizer: AvgThreshold 37 | 38 | age_care: 39 | class: src.config.EvalTaskCfg 40 | args: 41 | evaluator: age_care_llm 42 | summarizer: AvgThreshold -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/act.py: 
-------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__( 10 | self, 11 | activation, 12 | up_ratio: int = 2, 13 | down_ratio: int = 2, 14 | up_kernel_size: int = 12, 15 | down_kernel_size: int = 12, 16 | ): 17 | super().__init__() 18 | self.up_ratio = up_ratio 19 | self.down_ratio = down_ratio 20 | self.act = activation 21 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 22 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 23 | 24 | # x: [B,C,T] 25 | def forward(self, x): 26 | x = self.upsample(x) 27 | x = self.act(x) 28 | x = self.downsample(x) 29 | 30 | return x 31 | -------------------------------------------------------------------------------- /registry/template/multiturn.yaml: -------------------------------------------------------------------------------- 1 | multiturn-audio: 2 | class: src.prompt.template.DataTemplate 3 | args: 4 | template: | 5 | { 6 | "nrounds": {{ nrounds }}, 7 | "dialogue": [ 8 | {% for i in range(1, nrounds + 1) %} 9 | { 10 | "role": "A", 11 | "round": "{{ i }}", 12 | "content": { 13 | "audio": {{ getvar("user_audio" ~ i) | tojson }}, 14 | "text": {{ getvar("user_text" ~ i) | tojson }} 15 | } 16 | }, 17 | { 18 | "role": "B", 19 | "round": "{{ i }}", 20 | "content": { 21 | "audio": {{ getvar("bot_audio" ~ i) | tojson }}, 22 | "text": {% if loop.last and answer is not none %} 23 | {{ answer | tojson }} 24 | {% else %} 25 | {{ getvar("bot_text" ~ i) | tojson }} 26 | {% endif %} 27 | } 28 | }{% if not loop.last %},{% endif %} 29 | {% endfor %} 30 | ] 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied fron NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. 
*/ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_projector/speech_projector.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/projector.py 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class EncoderProjectorConcat(nn.Module): 9 | def __init__(self, config): 10 | super().__init__() 11 | self.k = config.speech_encoder_ds_rate 12 | self.encoder_dim = config.speech_encoder_hidden_size 13 | self.llm_dim = config.hidden_size 14 | self.linear1 = nn.Linear(self.encoder_dim * self.k, 2048) 15 | self.relu = nn.ReLU() 16 | self.linear2 = nn.Linear(2048, config.hidden_size) 17 | 18 | def forward(self, x): 19 | batch_size, seq_len, dim = x.size() 20 | num_frames_to_discard = seq_len % self.k 21 | if num_frames_to_discard > 0: 22 | x = x[:, :-num_frames_to_discard, :] 23 | seq_len = x.size(1) 24 | 25 | x = x.contiguous() 26 | x = x.view(batch_size, seq_len // self.k, dim * self.k) 27 | x = self.linear1(x) 28 | x = self.relu(x) 29 | x = self.linear2(x) 30 | return x -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | from src.task import Pipeline 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--mode", default="eval", choices=["infer", "eval"]) 11 | parser.add_argument("--task", default="aqa") 12 | parser.add_argument("--model", default=None) 13 | 14 | parser.add_argument("--bsz", default=None) 15 | parser.add_argument("--save_dir", default="") 16 | parser.add_argument("--eval_task", default=None) 17 | parser.add_argument("--save_pred_audio", default=None) 18 | 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | def main(): 24 | args = get_args() 25 | logging.basicConfig( 26 | level=logging.INFO, 27 | format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", 28 | handlers=[logging.StreamHandler()], 29 | encoding="utf-8" 30 | ) 31 | user_args = vars(args) 32 | logger.info(f"Processing task: \nglobal args: {user_args}") 33 | t = Pipeline.create(**user_args) 34 | t.run() 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <torch/extension.h> 18 | 19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta); 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)"); 23 | } -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/block_mask_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def create_grid_mask(seq_length, trunck_length, fill_triangle): 5 | assert seq_length > 0 6 | 7 | # first create a grid mask, ignoring seen_length for now: 8 | if fill_triangle: 9 | mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1) 10 | # lower triangle and main diagonal are all 1 11 | else: 12 | mask = torch.zeros(seq_length, seq_length) 13 | 14 | for i in range(seq_length): 15 | trunck_idx = i // trunck_length 16 | trunck_start = trunck_idx * trunck_length 17 | trunck_end = trunck_length + trunck_start 18 | mask[i][trunck_start:trunck_end] = 1 19 | 20 | return mask 21 | 22 | 23 | if __name__ == "__main__": 24 | mask = create_grid_mask(seq_length=8, trunck_length=3, fill_triangle=True).int() 25 | print(mask) 26 | # tensor([[1, 1, 1, 0, 0, 0, 0, 0], 27 | # [1, 1, 1, 0, 0, 0, 0, 0], 28 | # [1, 1, 1, 0, 0, 0, 0, 0], 29 | # [1, 1, 1, 1, 1, 1, 0, 0], 30 | # [1, 1, 1, 1, 1, 1, 0, 0], 31 | # [1, 1, 1, 1, 1, 1, 0, 0], 32 | # [1, 1, 1, 1, 1, 1, 1, 1], 33 | # [1, 1, 1, 1, 1, 1, 1, 1]] 34 | 35 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_encoder/speech_encoder.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/encoder.py 2 | 3 | import types 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class WhisperWrappedEncoder: 10 | 11 | @classmethod 12 | def load(cls, model_config): 13 | 14 | def replace_layer_norm(module): 15 | from whisper.model import LayerNorm 16 | for name, child in module.named_children(): 17 | if isinstance(child, LayerNorm): 18 | old_params = child.state_dict() 19 | new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine) 20 | new_layer_norm.load_state_dict(old_params) 21 | setattr(module, name, new_layer_norm) 22 | else: 23 | replace_layer_norm(child) 24 | 25 | import whisper 26 | encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder 27 | replace_layer_norm(encoder) 28 | return encoder -------------------------------------------------------------------------------- /requirements_all.txt: -------------------------------------------------------------------------------- 1 | jinja2 2 | tqdm 3 | requests 4 | pandas 5 | regex 6 | datasets 7 | soundfile 8 | librosa 9 | torch==2.6.0 # average 10 | torchaudio==2.6.0 # average 11 | transformers==4.45.0 # average 12 | flash-attn==2.7.2.post1 13 | accelerate>=0.26.0 14 | 15 | # minicpm 16 | pillow 17 | vector-quantize-pytorch==1.18.5 18 | vocos==0.1.0 19 | 20 | # speech-gpt2 21 | einops 22 | 23 | # baichuan 24 | av==12.3.0 25 | fire==0.4.0 26 | ujson==5.10.0 27 | easydict==1.13 28 | diffusers==0.24.0 29 | deepspeed 30 | decord==0.6.0 31 |
opencv-python==4.10.0.84 32 | imagesize==1.4.1 33 | cairosvg==2.7.1 34 | 35 | # glm4voice 36 | hyperpyyaml 37 | conformer==0.3.2 38 | diffusers==0.27.2 39 | huggingface_hub==0.25.2 40 | lightning==2.2.4 41 | rich==13.7.1 42 | gdown==5.1.0 43 | wget==3.2 44 | matplotlib 45 | 46 | # llama-omni 47 | openai-whisper==20240930 48 | fairseq==0.12.2 49 | 50 | # qwen2.5-omni 51 | qwen-omni-utils==0.0.4 52 | # transformers==4.52.3 53 | 54 | # kimi 55 | loguru 56 | blobfile 57 | timm 58 | torchdyn==1.0.6 59 | # transformers>=4.48.3 60 | 61 | # ============ evaluate ============== 62 | WeTextProcessing==1.0.3 63 | #vllm # choice 64 | jiwer 65 | funasr 66 | zhon 67 | zhconv 68 | onnxruntime==1.18.1 69 | scipy 70 | --find-links https://csukuangfj.github.io/kaldifeat/cuda.html 71 | kaldifeat==1.25.5 # choice, for dialect classify -------------------------------------------------------------------------------- /src/models/src_freezeomni/decoder/ticodec/vqvae_tester.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import torch 5 | import torch.nn as nn 6 | 7 | from .vqvae import VQVAE 8 | 9 | class VqvaeTester(nn.Module): 10 | def __init__(self, config_path, model_path, sample_rate=24000): 11 | super().__init__() 12 | self.vqvae = VQVAE(config_path, model_path, with_encoder=True) 13 | self.sample_rate = sample_rate 14 | 15 | @torch.no_grad() 16 | def forward(self, wav_path): 17 | # mono-channel audio 18 | # wav.shape (T, ), loaded at the model's sample rate 19 | wav, sr = librosa.load(wav_path, sr=self.sample_rate) 20 | fid = os.path.basename(wav_path)[:-4] 21 | wav = torch.tensor(wav).unsqueeze(0) 22 | wav = wav.cuda() 23 | # vq_codes is acoustic token 24 | vq_codes, global_token = self.vqvae.encode(wav) 25 | # import pdb; pdb.set_trace()  # leftover debug breakpoint, disabled 26 | syn = self.vqvae(vq_codes, global_token) 27 | return fid, syn 28 | 29 | @torch.no_grad() 30 | def vq(self, wav_path): 31 | wav, sr = librosa.load(wav_path, sr=self.sample_rate) 32 | fid = os.path.basename(wav_path)[:-4] 33 | wav = torch.tensor(wav).unsqueeze(0) 34 | wav = wav.cuda() 35 | # vq_codes is acoustic token 36 | vq_codes, global_token = self.vqvae.encode(wav) 37 | return fid, vq_codes, global_token 38 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/projector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Projector modules.""" 5 | 6 | import torch 7 | 8 | from .conv_layers import NonCausalConv1d, CausalConv1d 9 | 10 | 11 | class Projector(torch.nn.Module): 12 | def __init__(self, 13 | mode, 14 | input_channels, 15 | output_channels, 16 | stride=1, 17 | bias=False, 18 | model='conv1d', 19 | ): 20 | super().__init__() 21 | self.mode = mode 22 | if self.mode == 'noncausal': 23 | Conv1d = NonCausalConv1d 24 | elif self.mode == 'causal': 25 | Conv1d = CausalConv1d 26 | else: 27 | raise NotImplementedError(f"Mode ({mode}) is not supported!") 28 | 29 | if model == 'conv1d': 30 | self.project = Conv1d(input_channels, output_channels, kernel_size=1, stride=stride, bias=bias) 31 | elif model == 'conv1d_bn': 32 | self.project = torch.nn.Sequential( 33 | Conv1d(input_channels, output_channels, kernel_size=1, stride=stride, bias=bias), 34 | torch.nn.BatchNorm1d(output_channels) 35 | ) 36 | else: 37 | raise NotImplementedError(f"Model ({model}) is not supported!") 38 | 39 | def forward(self, x): 40 | return self.project(x) 41 | 42 | def encode(self,
x): 43 | return self.project.inference(x) 44 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/tokenizer/glm4_tokenizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import librosa 3 | import os 4 | 5 | from transformers import WhisperFeatureExtractor 6 | from src.models.src_glm4.speech_tokenizer.modeling_whisper import WhisperVQEncoder 7 | from src.models.src_glm4.speech_tokenizer.utils import extract_speech_token 8 | from torch import nn 9 | 10 | 11 | class Glm4Tokenizer(nn.Module): 12 | def __init__(self, tokenizer_path): 13 | super().__init__() 14 | self.whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path).eval() 15 | self.feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path) 16 | 17 | def tokenize(self, speech=None, audio_path=None, sr=16000): 18 | if audio_path: 19 | audio, sr = librosa.load(audio_path, sr=16000) 20 | audio = torch.tensor(audio).unsqueeze(0) 21 | audio_info = (audio, sr) 22 | else: 23 | assert speech is not None 24 | assert sr 25 | if isinstance(speech, list): 26 | speech = torch.tensor(speech).unsqueeze(0) 27 | if len(speech.shape) == 1: 28 | speech = speech.unsqueeze(0) 29 | audio_info = (speech, sr) 30 | 31 | audio_tokens = extract_speech_token( 32 | self.whisper_model, self.feature_extractor, [audio_info] 33 | )[0] 34 | audio_tokens = torch.tensor(audio_tokens).unsqueeze(0) 35 | return audio_tokens 36 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import sys 4 | import debugpy 5 | import torchaudio 6 | import torch 7 | import numpy as np 8 | 9 | def set_logging(): 10 | rank = os.environ.get("RANK", 0) 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | stream=sys.stdout, 14 | format=f"%(asctime)s [RANK {rank}] (%(module)s:%(lineno)d) %(levelname)s : %(message)s", 15 | ) 16 | 17 | def waiting_for_debug(ip, port): 18 | rank = os.environ.get("RANK", "0") 19 | debugpy.listen((ip, port)) 20 | logging.info(f"[rank = {rank}] Waiting for debugger attach...") 21 | debugpy.wait_for_client() 22 | logging.info(f"[rank = {rank}] Debugger attached") 23 | 24 | def load_audio(audio_path, target_sample_rate): 25 | wav, raw_sample_rate = torchaudio.load(audio_path) # (1, T) tensor 26 | if raw_sample_rate != target_sample_rate: 27 | wav = torchaudio.functional.resample(wav, raw_sample_rate, target_sample_rate) # tensor 28 | wav = np.expand_dims(wav.squeeze(0).numpy(), axis=1) 29 | wav = torch.tensor(wav).reshape(1, 1, -1) 30 | return wav 31 | 32 | def save_audio(audio_outpath, audio_out, sample_rate): 33 | print(audio_outpath, audio_out, sample_rate) 34 | torchaudio.save( 35 | audio_outpath, 36 | audio_out, 37 | sample_rate=sample_rate, 38 | encoding='PCM_S', 39 | bits_per_sample=16 40 | ) 41 | logging.info(f"success save audio at {audio_outpath}") -------------------------------------------------------------------------------- /src/prompt/template.py: -------------------------------------------------------------------------------- 1 | """ 2 | from https://github.com/OpenBMB/UltraEval-Audio/blob/main/src/prompt/base.py 3 | """ 4 | import json 5 | from functools import singledispatch 6 | from typing import Any, Dict, List 7 | from jinja2 import StrictUndefined, Template 8 | from jinja2.exceptions import UndefinedError 9 | 
from src.config import TemplateStruct 10 | 11 | @singledispatch 12 | def _load(t: Any, **kwargs: Any) -> Any: 13 | return t 14 | 15 | 16 | @_load.register 17 | def _(t: str, **kwargs: Any) -> str: 18 | def getvar(name: str, default=None): # for multiturn 19 | return kwargs.get(name, default) 20 | 21 | template = Template(t, undefined=StrictUndefined) 22 | try: 23 | rendered = template.render(**kwargs, getvar=getvar) 24 | # add for multiturn template 25 | try: 26 | return json.loads(rendered) 27 | except json.JSONDecodeError: 28 | return rendered 29 | except UndefinedError as e: 30 | raise ValueError("{}: template is {}\ndoc is {}".format(e, t, kwargs)) 31 | 32 | 33 | @_load.register 34 | def _(t: list, **kwargs: Any) -> List[Any]: 35 | return [_load(item, **kwargs) for item in t] 36 | 37 | 38 | @_load.register 39 | def _(t: dict, **kwargs: Any) -> Dict[Any, Any]: 40 | return {k: _load(v, **kwargs) for k, v in t.items()} 41 | 42 | 43 | class DataTemplate: 44 | def __init__(self, template: TemplateStruct): 45 | self.template = template 46 | 47 | def load(self, **kwargs): 48 | return _load(self.template, **kwargs) 49 | -------------------------------------------------------------------------------- /tools/test_eval_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | 5 | from src.dataset import BatchLoader, BatchSaver 6 | from src.registry import registry 7 | 8 | dataset_name = "esd" 9 | eval_task = "dialect_classify" # wer dnsmos emotion_response dialect_classify 10 | eval_task_cfg = registry.get_eval_task(eval_task) 11 | evaluator = registry.get_evaluator(eval_task_cfg.evaluator) 12 | summarizer = registry.get_summarizer(eval_task_cfg.summarizer) 13 | 14 | pred_file = f"{dataset_name}.jsonl" 15 | save_file = f"{dataset_name}_{eval_task}.jsonl" 16 | 17 | scores = [] 18 | all_results = [] 19 | data_loader = BatchLoader(pred_file, batch_size=1) 20 | saver = BatchSaver(save_file) 21 | 22 | for batch_data in data_loader: 23 | keys, preds, refs, pred_info_list = [ 24 | list(x) for x in zip(*[ 25 | ( 26 | d["key"], 27 | d["pred"], 28 | d["ref"] if isinstance(d["ref"], list) else [d["ref"]], 29 | {k: d[k] for k in d if k not in ("pred", "ref")} 30 | ) 31 | for d in batch_data 32 | ]) 33 | ] 34 | eval_results = evaluator.evaluate(preds, refs, pred_info_list) 35 | if len(eval_results) != len(pred_info_list): 36 | raise ValueError("Lost some results...") 37 | 38 | for result, pred_info in zip(eval_results, pred_info_list): 39 | result.update(pred_info) 40 | scores.append(result["score"]) 41 | all_results.append(result) 42 | 43 | saver.save_all(all_results) 44 | stat = summarizer.statistic(scores) 45 | print(f"stage: {eval_task}, total_score: {stat}") -------------------------------------------------------------------------------- /src/models/src_freezeomni/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import re 3 | import os 4 | import yaml 5 | from .audioLLM import AudioLLM 6 | 7 | from .encoder.cmvn import GlobalCMVN, load_cmvn 8 | from .encoder.encoder import speechEncoder 9 | 10 | def load_checkpoint(model: torch.nn.Module, path: str) -> dict: 11 | if torch.cuda.is_available(): 12 | print('Checkpoint: loading from checkpoint %s for GPU' % path) 13 | checkpoint = torch.load(path) 14 | else: 15 | print('Checkpoint: loading from checkpoint %s for CPU' % path) 16 | checkpoint = 
torch.load(path, map_location='cpu') 17 | 18 | # load parm from checkpoint 19 | model.load_state_dict(checkpoint, strict=False) 20 | 21 | info_path = re.sub('.pt$', '.yaml', path) 22 | configs = {} 23 | # get configs 24 | if os.path.exists(info_path): 25 | with open(info_path, 'r') as fin: 26 | configs = yaml.safe_load(fin) 27 | return configs 28 | 29 | def init_encoder_llm(configs): 30 | if configs['cmvn_file'] is not None: 31 | # read cmvn 32 | mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) 33 | # init cmvn layer 34 | global_cmvn = GlobalCMVN( 35 | torch.from_numpy(mean).float(), 36 | torch.from_numpy(istd).float()) 37 | else: 38 | global_cmvn = None 39 | 40 | input_dim = configs['input_dim'] 41 | vocab_size = configs['output_dim'] 42 | 43 | # init speech encoder 44 | encoder = speechEncoder(input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) 45 | # init audioLLM 46 | model = AudioLLM(encoder=encoder, **configs['model_conf']) 47 | 48 | return model 49 | -------------------------------------------------------------------------------- /registry/dataset/aqa.yaml: -------------------------------------------------------------------------------- 1 | llamaqa-en: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/llamaqa-en 5 | ref_col: answer 6 | query_col: query 7 | batch_size: 1 8 | 9 | llamaqa-zh: 10 | class: src.dataset.BatchLoader 11 | args: 12 | file: Tele-AI/TELEVAL/llamaqa-zh 13 | ref_col: answer 14 | query_col: query 15 | batch_size: 1 16 | 17 | triviaqa-en: 18 | class: src.dataset.BatchLoader 19 | args: 20 | file: Tele-AI/TELEVAL/triviaqa-en 21 | ref_col: answer 22 | query_col: query 23 | batch_size: 1 24 | 25 | triviaqa-zh: 26 | class: src.dataset.BatchLoader 27 | args: 28 | file: Tele-AI/TELEVAL/triviaqa-zh 29 | ref_col: answer 30 | query_col: query 31 | batch_size: 1 32 | 33 | webq-en: 34 | class: src.dataset.BatchLoader 35 | args: 36 | file: Tele-AI/TELEVAL/webq-en 37 | ref_col: answer 38 | query_col: query 39 | batch_size: 1 40 | 41 | webq-zh: 42 | class: src.dataset.BatchLoader 43 | args: 44 | file: Tele-AI/TELEVAL/webq-zh 45 | ref_col: answer 46 | query_col: query 47 | batch_size: 1 48 | 49 | chinesesimpleqa-zh: 50 | class: src.dataset.BatchLoader 51 | args: 52 | file: Tele-AI/TELEVAL/chinesesimpleqa-zh 53 | ref_col: answer 54 | query_col: query 55 | batch_size: 1 56 | 57 | chinese_quiz-zh: 58 | class: src.dataset.BatchLoader 59 | args: 60 | file: Tele-AI/TELEVAL/chinese_quiz-zh 61 | ref_col: answer 62 | query_col: query 63 | batch_size: 1 64 | 65 | livelihood_policy-zh: 66 | class: src.dataset.BatchLoader 67 | args: 68 | file: Tele-AI/TELEVAL/livelihood_policy-zh 69 | ref_col: answer 70 | query_col: query 71 | batch_size: 1 -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/utils/special_tokens.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class ExtraTokens: 6 | msg_end: int 7 | user_msg_start: int 8 | assistant_msg_start: int 9 | 10 | media_begin: int 11 | media_end: int 12 | 13 | kimia_text_blank: int 14 | kimia_text_eos: int 15 | 16 | kimia_user_msg_start: int 17 | kimia_assistant_msg_start: int 18 | 19 | kimia_speech_ct_id: int 20 | kimia_speech_ctd_id: int 21 | 22 | pad: int 23 | 24 | 25 | def instantiate_extra_tokens(tokenizer): 26 | if hasattr(tokenizer, "special_tokens"): 27 | map_fn = lambda x: tokenizer.special_tokens[x] 28 | elif hasattr(tokenizer, 
"convert_tokens_to_ids"): 29 | map_fn = lambda x: tokenizer.convert_tokens_to_ids(x) 30 | else: 31 | raise ValueError(f"Invalid tokenizer type: {type(tokenizer)}") 32 | return ExtraTokens( 33 | msg_end=map_fn("<|im_msg_end|>"), # 0 34 | user_msg_start=map_fn("<|im_user_msg_start|>"), # 1 35 | assistant_msg_start=map_fn("<|im_assistant_msg_start|>"), # 2 36 | media_begin=map_fn("<|im_media_begin|>"), # 13 37 | media_end=map_fn("<|im_media_end|>"), # 15 38 | kimia_text_blank=map_fn("<|im_kimia_text_blank|>"), # 18 39 | kimia_text_eos=map_fn("<|im_kimia_text_eos|>"), # 19 40 | kimia_user_msg_start=map_fn("<|im_kimia_user_msg_start|>"), # 22 41 | kimia_assistant_msg_start=map_fn("<|im_kimia_assistant_msg_start|>"), # 23 42 | kimia_speech_ct_id=map_fn("<|im_kimia_speech_ct_id|>"), # 27 43 | kimia_speech_ctd_id=map_fn("<|im_kimia_speech_ctd_id|>"), # 28 44 | pad=tokenizer.pad_id, 45 | ) 46 | -------------------------------------------------------------------------------- /src/evaluator/emo2vec.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from src.evaluator.base import Evaluator 3 | 4 | class Emo2vec(Evaluator): 5 | def __init__(self, model: str, strict: bool = True): 6 | from funasr import AutoModel 7 | self.model = AutoModel(model=model, hub="ms", disable_update=True) 8 | self.strict = strict 9 | 10 | def evaluate(self, preds, refs, pred_info_list: List[Dict], **kwargs): 11 | # emo2vec model support batch generate 12 | pred_audios = [info["pred_audio"] for info in pred_info_list] 13 | model_outputs = self.model.generate( 14 | pred_audios, output_dir=None, granularity="utterance", extract_embedding=False 15 | ) 16 | 17 | results = [] 18 | for output, info in zip(model_outputs, pred_info_list): 19 | label_scores = { 20 | label.split("/")[-1].lower(): score 21 | for label, score in zip(output["labels"], output["scores"]) 22 | } 23 | ref_emotions = [emo.lower() for emo in info["answer_emo"]] 24 | 25 | if self.strict: 26 | neutral_count = sum(1 for emo in ref_emotions if emo == "neutral") 27 | if neutral_count <= len(ref_emotions) // 2: 28 | # remove "neutral" 29 | filtered_ref_emotions = [emo for emo in ref_emotions if emo != "neutral"] 30 | else: 31 | filtered_ref_emotions = ref_emotions 32 | else: 33 | filtered_ref_emotions = ref_emotions 34 | 35 | score = max((label_scores.get(emo, 0) for emo in filtered_ref_emotions), default=0) 36 | results.append({"key": info["key"], "score": score}) 37 | return results 38 | -------------------------------------------------------------------------------- /src/models/src_baichuan/cosy24k_vocoder/cosy24k_vocoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .hifigan.generator import HiFTGenerator 4 | from .hifigan.f0_predictor import ConvRNNF0Predictor 5 | 6 | 7 | class Cosy24kVocoder(nn.Module): 8 | def __init__(self): 9 | super().__init__() 10 | self.hifigan_generator = HiFTGenerator( 11 | in_channels=80, 12 | base_channels=512, 13 | nb_harmonics=8, 14 | sampling_rate=24000, 15 | nsf_alpha=0.1, 16 | nsf_sigma=0.003, 17 | nsf_voiced_threshold=10, 18 | upsample_rates=[8, 5, 3], 19 | upsample_kernel_sizes=[16, 11, 7], 20 | resblock_kernel_sizes=[3, 7, 11], 21 | resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], 22 | source_resblock_kernel_sizes=[7, 7, 11], 23 | source_resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], 24 | lrelu_slope=0.1, 25 | audio_limit=0.99, 26 | 
f0_predictor=ConvRNNF0Predictor( 27 | num_class=1, 28 | in_channels=80, 29 | cond_channels=512, 30 | ), 31 | ) 32 | 33 | def decode(self, mel, device="cuda"): 34 | """ 35 | Args: mel: (batch_size, n_frames, n_mel) 36 | """ 37 | generated_speech, f0 = self.hifigan_generator.forward( 38 | {"speech_feat": mel.transpose(1, 2)}, device=device 39 | ) 40 | return generated_speech 41 | 42 | @classmethod 43 | def from_pretrained(cls, model_path: str): 44 | """Load a pretrained model from a checkpoint.""" 45 | model = cls() 46 | model.hifigan_generator.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))) 47 | model.eval() 48 | return model 49 | -------------------------------------------------------------------------------- /src/evaluator/text_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import zhconv 4 | from zhon.hanzi import punctuation as zh_punct 5 | 6 | from tn.chinese.normalizer import Normalizer as ZhNormalizer 7 | from tn.english.normalizer import Normalizer as EnNormalizer 8 | 9 | 10 | class TextProcessor: 11 | RE_PUNCTUATION = re.compile(rf'[{re.escape(zh_punct + string.punctuation)}]') 12 | RE_SPACES = re.compile(r'[\s\u3000]+') 13 | 14 | def __init__(self, language: str = "zh"): 15 | self.language = language 16 | if self.language == "zh": 17 | self.normalizer = ZhNormalizer() 18 | elif self.language == "en": 19 | self.normalizer = EnNormalizer() 20 | else: 21 | raise ValueError(f"Unsupported language: {self.language}") 22 | 23 | @staticmethod 24 | def clean_text(text: str, remove_punct: bool = True, remove_space: bool = True) -> str: 25 | if remove_punct: 26 | text = TextProcessor.RE_PUNCTUATION.sub('', text) 27 | if remove_space: 28 | text = TextProcessor.RE_SPACES.sub('', text) 29 | return text 30 | 31 | @staticmethod 32 | def convert_cn(text: str) -> str: 33 | return zhconv.convert(text, 'zh-cn') 34 | 35 | def normalize_text(self, text: str) -> str: 36 | return self.normalizer.normalize(text) 37 | 38 | def normalize_and_clean(self, text: str, 39 | do_normalize: bool = True, 40 | simplified_zh: bool = True, 41 | remove_punct: bool = True, 42 | remove_space: bool = True) -> str: 43 | if simplified_zh: 44 | text = self.convert_cn(text) 45 | if do_normalize: 46 | text = self.normalize_text(text) 47 | return self.clean_text(text, remove_punct=remove_punct, remove_space=remove_space) -------------------------------------------------------------------------------- /registry/model/offline.yaml: -------------------------------------------------------------------------------- 1 | kimi-audio-7b-instruct: 2 | class: src.models.kimi_audio.Kimi 3 | args: 4 | path: path/to/Kimi-Audio-7B-Instruct 5 | whisper_path: path/to/whisper-large-v3 6 | glm4_tokenizer: path/to/glm-4-voice-tokenizer 7 | sample_params: 8 | gen_type: greedy 9 | 10 | qwen2_5_omni: 11 | class: src.models.qwen2_omni.Qwen2Omni 12 | args: 13 | path: path/to/Qwen2.5-Omni-7B 14 | sample_params: 15 | gen_type: greedy 16 | 17 | glm-4-voice-9b: 18 | class: src.models.glm4voice.GLM4voice 19 | args: 20 | path: path/to/glm-4-voice-9b 21 | speech_tokenizer_path: path/to/glm-4-voice-tokenizer 22 | flow_path: path/to/glm-4-voice-decoder 23 | sample_params: 24 | gen_type: greedy 25 | 26 | MiniCPMo2_6-audio: 27 | class: src.models.mini_cpm.MiniCPMoAudio 28 | args: 29 | path: path/to/MiniCPM-o-2_6 30 | sample_params: 31 | gen_type: greedy 32 | 33 | baichuan_omni_1d5: 34 | class: src.models.baichuan.BaichuanOmni 35 | args: 36 | path: 
path/to/Baichuan-Omni-1d5 37 | cosy_vocoder_path: path/to/Baichuan-Omni-1d5/hift.pt # third_party/cosy24k_vocoder/hift.pt 38 | sample_params: 39 | gen_type: greedy 40 | 41 | llama_omni: 42 | class: src.models.llama_omni.LlamaOmni 43 | args: 44 | path: path/to/Llama-3.1-8B-Omni 45 | vocoder_path: path/to/Llama-3.1-8B-Omni/vocoder 46 | sample_params: 47 | gen_type: greedy 48 | 49 | speechgpt2: 50 | class: src.models.speechgpt2.SpeechGPT2 51 | args: 52 | path: path/to/SpeechGPT-2-preview-7B 53 | codec_ckpt_path: path/to/SpeechGPT-2.0-preview-Codec/sg2_codec_ckpt.pkl 54 | sample_params: 55 | gen_type: greedy 56 | 57 | freeze_omni: 58 | class: src.models.freeze_omni.FreezeOmni 59 | args: 60 | path: path/to/Freeze-Omni/checkpoints 61 | llm_path: path/to/Qwen2-7B-Instruct 62 | sample_params: 63 | gen_type: greedy -------------------------------------------------------------------------------- /src/models/src_glm4/speech_tokenizer/configuration_whisper.py: -------------------------------------------------------------------------------- 1 | from transformers import WhisperConfig 2 | 3 | 4 | class WhisperVQConfig(WhisperConfig): 5 | def __init__(self, 6 | pooling_kernel_size=None, 7 | pooling_type="max", 8 | pooling_position=0, 9 | quantize_vocab_size=None, 10 | quantize_position=16, 11 | quantize_commit_coefficient=0.25, 12 | quantize_loss_scale=1.0, 13 | quantize_ema_decay=None, 14 | quantize_restart_interval=None, 15 | quantize_encoder_only=False, 16 | quantize_causal_encoder=False, 17 | quantize_causal_block_size=None, 18 | skip_language_detection=False, 19 | encoder_causal_attention=False, 20 | encoder_causal_convolution=False, 21 | **kwargs): 22 | self.pooling_kernel_size = pooling_kernel_size 23 | self.pooling_type = pooling_type 24 | self.pooling_position = pooling_position 25 | self.quantize_vocab_size = quantize_vocab_size 26 | self.quantize_position = quantize_position 27 | self.quantize_commit_coefficient = quantize_commit_coefficient 28 | self.quantize_loss_scale = quantize_loss_scale 29 | self.quantize_ema_decay = quantize_ema_decay 30 | self.quantize_restart_interval = quantize_restart_interval 31 | self.quantize_encoder_only = quantize_encoder_only 32 | self.quantize_causal_encoder = quantize_causal_encoder 33 | self.quantize_causal_block_size = quantize_causal_block_size 34 | self.skip_language_detection = skip_language_detection 35 | self.encoder_causal_attention = encoder_causal_attention 36 | self.encoder_causal_convolution = encoder_causal_convolution 37 | super().__init__(**kwargs) 38 | -------------------------------------------------------------------------------- /src/evaluator/asr.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from jiwer import compute_measures 3 | from src.evaluator.base import Evaluator 4 | from src.utils import parallel_batch 5 | from src.evaluator.text_utils import TextProcessor 6 | 7 | class ASR(Evaluator): 8 | """ 9 | Part from https://github.com/BytedanceSpeech/seed-tts-eval/tree/main 10 | """ 11 | def __init__(self, model: str, max_workers=None): 12 | if max_workers is not None: 13 | self.max_workers = max_workers 14 | from funasr import AutoModel 15 | self.model = AutoModel(model=model, disable_update=True) 16 | self.text_processor = TextProcessor(language="zh") 17 | 18 | @parallel_batch(default_workers=4) 19 | def evaluate(self, pred: str, ref: str, pred_info: Dict, **kwargs): 20 | pred_audio = pred_info["pred_audio"] 21 | res = self.model.generate(input=pred_audio, 
batch_size_s=300) 22 | transcription = res[0]["text"] 23 | 24 | clean_truth, clean_hypo, wer, subs, dele, inse, ref_len = self.compute_wer(hypo=transcription, truth=pred) 25 | score = { 26 | "ref_len": ref_len, 27 | "subs": subs, 28 | "dele": dele, 29 | "inse": inse, 30 | "wer": wer 31 | } 32 | return {"key": pred_info["key"], "clean_trans": clean_hypo, "clean_text": clean_truth, "score": score} 33 | 34 | def compute_wer(self, hypo, truth): 35 | truth = self.text_processor.normalize_and_clean(truth) 36 | hypo = self.text_processor.normalize_and_clean(hypo) 37 | 38 | truth_chars = " ".join(truth) 39 | hypo_chars = " ".join(hypo) 40 | measures = compute_measures(truth_chars, hypo_chars) 41 | ref_len = len(truth) 42 | 43 | wer = measures["wer"] 44 | subs = measures["substitutions"] 45 | dele = measures["deletions"] 46 | inse = measures["insertions"] 47 | 48 | return truth_chars, hypo_chars, wer, subs, dele, inse, ref_len -------------------------------------------------------------------------------- /src/evaluator/base.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, Union, List 2 | from src.evaluator.process import SimpleTokenizer, OptionExtractor 3 | from src.config import RefType, RefsType 4 | from src.utils import parallel_batch 5 | 6 | class Evaluator: 7 | def evaluate(self, pred, label, **kwargs) -> Dict[str, Any]: 8 | raise NotImplementedError 9 | 10 | class ExistMatch(Evaluator): 11 | """ 12 | referred to https://github.com/DevSinghSachan/emdr2/blob/main/tasks/openqa/dense_retriever/evaluation/qa_validation.py 13 | """ 14 | 15 | def __init__(self, keep_punc=False, max_workers=None): 16 | self.keep_punc = keep_punc 17 | if max_workers is not None: 18 | self.max_workers = max_workers 19 | 20 | @parallel_batch(default_workers=4) 21 | def evaluate(self, pred: str, ref: RefsType, pred_info: Dict, **kwargs): 22 | # NOTE (TTTdas): If strict sequential matching is required, set keep_punc=False and simply put the ref into a string 23 | if not isinstance(ref, List): 24 | raise ValueError(f"Need List type ref for ExistMatch, but got {type(ref)} instead") 25 | match = SimpleTokenizer.has_answer(ref, str(pred), uncased=True, keep_punc=self.keep_punc) 26 | return {"key": pred_info["key"], "pred": pred, "ref": ref, "score": 1 if match else 0} 27 | 28 | 29 | class SingleOptionMatch(Evaluator): 30 | def __init__(self, max_workers=None, cushion=False): 31 | self.cushion = cushion 32 | if max_workers is not None: 33 | self.max_workers = max_workers 34 | 35 | @parallel_batch(default_workers=4) 36 | def evaluate(self, pred: str, ref: Union[str, List], pred_info: Dict, **kwargs): 37 | if isinstance(ref, list): 38 | assert len(ref) == 1 39 | ref = ref[0] 40 | match_dict = OptionExtractor.has_answer(ref, str(pred), pred_info.get("query", None), cushion=self.cushion) 41 | return {"key": pred_info["key"], "pred": pred, "ref": ref, "score": match_dict} -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /registry/evaluator/llm.yaml: -------------------------------------------------------------------------------- 1 | acceptance_llm: 2 | class: src.evaluator.llm.LLMScorer 3 | args: 4 | llm_name: gpt4o 5 | judge_task: value_align 6 | api_keys: 7 | key1: "xxx" 8 | key2: "xxx" 9 | key3: "xxx" 10 | max_workers: 3 11 | 12 | human_likeness_llm: 13 | class: src.evaluator.llm.LLMScorer 14 | args: 15 | llm_name: gpt4o 16 | judge_task: humanlike 17 | api_keys: 18 | key1: "xxx" 19 | key2: "xxx" 20 | key3: "xxx" 21 | max_workers: 3 22 | 23 | emo_llm: 24 | class: src.evaluator.llm.LLMScorer 25 | args: 26 | llm_name: gpt4o 27 | judge_task: emotion_understand 28 | api_keys: 29 | key1: "xxx" 30 | key2: "xxx" 31 | key3: "xxx" 32 | max_workers: 3 33 | 34 | aed_llm: 35 | class: src.evaluator.llm.LLMScorer 36 | args: 37 | llm_name: gpt4o 38 | judge_task: aed 39 | api_keys: 40 | key1: "xxx" 41 | key2: "xxx" 42 | key3: "xxx" 43 | max_workers: 3 44 | 45 | dialect_llm: 46 | class: src.evaluator.llm.LLMScorer 47 | args: 48 | llm_name: gpt4o 49 | judge_task: dialect_follow 50 | api_keys: 51 | key1: "xxx" 52 | key2: "xxx" 53 | key3: "xxx" 54 | max_workers: 3 55 | 56 | para_care_llm: 57 | class: src.evaluator.llm.LLMScorer 58 | args: 59 | llm_name: gpt4o 60 | judge_task: para_care 61 | api_keys: 62 | key1: "xxx" 63 | key2: "xxx" 64 | key3: "xxx" 65 | max_workers: 3 66 | 67 | age_care_llm: 68 | class: src.evaluator.llm.LLMScorer 69 | args: 70 | llm_name: gpt4o 71 | judge_task: age_care 72 | api_keys: 73 | key1: "xxx" 74 | key2: "xxx" 75 | key3: "xxx" 76 | max_workers: 3 77 | 78 | llm_offline: 79 | class: src.evaluator.llm.LLMOfflineScorer 80 | args: 81 | llm_name: qwen2.3-72b-instruct 82 | template: judge-qwen2.3_72b_instruct 83 | judge_task: emotion_understand 84 | generate_params: 85 | ngpus: 8 86 | max_tokens: 1024 87 | temperature: 0.7 88 | top_p: 0.8 89 | repetition_penalty: 
1.03 90 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import json 17 | import torchaudio 18 | 19 | 20 | def read_lists(list_file): 21 | lists = [] 22 | with open(list_file, 'r', encoding='utf8') as fin: 23 | for line in fin: 24 | lists.append(line.strip()) 25 | return lists 26 | 27 | def read_json_lists(list_file): 28 | lists = read_lists(list_file) 29 | results = {} 30 | for fn in lists: 31 | with open(fn, 'r', encoding='utf8') as fin: 32 | results.update(json.load(fin)) 33 | return results 34 | 35 | def load_wav(wav, target_sr): 36 | speech, sample_rate = torchaudio.load(wav) 37 | speech = speech.mean(dim=0, keepdim=True) 38 | if sample_rate != target_sr: 39 | assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) 40 | speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) 41 | return speech 42 | 43 | def speed_change(waveform, sample_rate, speed_factor: str): 44 | effects = [ 45 | ["tempo", speed_factor], # speed_factor 46 | ["rate", f"{sample_rate}"] 47 | ] 48 | augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( 49 | waveform, 50 | sample_rate, 51 | effects 52 | ) 53 | return augmented_waveform, new_sample_rate 54 | -------------------------------------------------------------------------------- /registry/infer_task/aqa.yaml: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------audio qa------------------------------------- 3 | aqa-llamaqa-en: 4 | class: src.config.InferTaskCfg 5 | args: 6 | dataset: llamaqa-en 7 | template: zeroshot-aqa 8 | model: qwen2_5_omni 9 | save_pred_audio: False 10 | eval_task: basic 11 | 12 | aqa-llamaqa-zh: 13 | class: src.config.InferTaskCfg 14 | args: 15 | dataset: llamaqa-zh 16 | template: zeroshot-aqa 17 | model: qwen2_5_omni 18 | save_pred_audio: False 19 | eval_task: basic 20 | 21 | aqa-triviaqa-en: 22 | class: src.config.InferTaskCfg 23 | args: 24 | dataset: triviaqa-en 25 | template: zeroshot-aqa 26 | model: qwen2_5_omni 27 | save_pred_audio: False 28 | eval_task: basic 29 | 30 | aqa-triviaqa-zh: 31 | class: src.config.InferTaskCfg 32 | args: 33 | dataset: triviaqa-zh 34 | template: zeroshot-aqa 35 | model: qwen2_5_omni 36 | save_pred_audio: False 37 | eval_task: basic 38 | 39 | aqa-webq-en: 40 | class: src.config.InferTaskCfg 41 | args: 42 | dataset: webq-en 43 | template: zeroshot-aqa 44 | model: qwen2_5_omni 45 | save_pred_audio: False 46 | eval_task: basic 47 | 48 | aqa-webq-zh: 49 | class: src.config.InferTaskCfg 50 | args: 51 | dataset: webq-zh 52 | template: zeroshot-aqa 53 | model: qwen2_5_omni 
54 | save_pred_audio: False 55 | eval_task: basic 56 | 57 | aqa-chinesesimpleqa-zh: 58 | class: src.config.InferTaskCfg 59 | args: 60 | dataset: chinesesimpleqa-zh 61 | template: zeroshot-aqa 62 | model: qwen2_5_omni 63 | save_pred_audio: False 64 | eval_task: basic 65 | 66 | aqa-chinese_quiz-zh: 67 | class: src.config.InferTaskCfg 68 | args: 69 | dataset: chinese_quiz-zh 70 | template: zeroshot-aqa 71 | model: qwen2_5_omni 72 | save_pred_audio: False 73 | eval_task: basic 74 | 75 | aqa-livelihood_policy-zh: 76 | class: src.config.InferTaskCfg 77 | args: 78 | dataset: livelihood_policy-zh 79 | template: zeroshot-aqa 80 | model: qwen2_5_omni 81 | save_pred_audio: False 82 | eval_task: basic 83 | 84 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = ( 15 | int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 16 | ) 17 | self.stride = ratio 18 | self.pad = self.kernel_size // ratio - 1 19 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 20 | self.pad_right = ( 21 | self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 22 | ) 23 | filter = kaiser_sinc_filter1d( 24 | cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size 25 | ) 26 | self.register_buffer("filter", filter) 27 | 28 | # x: [B, C, T] 29 | def forward(self, x): 30 | _, C, _ = x.shape 31 | 32 | x = F.pad(x, (self.pad, self.pad), mode="replicate") 33 | x = self.ratio * F.conv_transpose1d( 34 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C 35 | ) 36 | x = x[..., self.pad_left : -self.pad_right] 37 | 38 | return x 39 | 40 | 41 | class DownSample1d(nn.Module): 42 | def __init__(self, ratio=2, kernel_size=None): 43 | super().__init__() 44 | self.ratio = ratio 45 | self.kernel_size = ( 46 | int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 47 | ) 48 | self.lowpass = LowPassFilter1d( 49 | cutoff=0.5 / ratio, 50 | half_width=0.6 / ratio, 51 | stride=ratio, 52 | kernel_size=self.kernel_size, 53 | ) 54 | 55 | def forward(self, x): 56 | xx = self.lowpass(x) 57 | 58 | return xx 59 | -------------------------------------------------------------------------------- /tools/save_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import shutil 5 | import pandas as pd 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--root_dir", default="res") 10 | parser.add_argument("--transpose", default=True) 11 | return parser.parse_args() 12 | 13 | def main(): 14 | results = dict() 15 | args = get_args() 16 | summary_dir = os.path.join(args.root_dir, "summary") 17 | column_order = [] 18 | for model_name in os.listdir(summary_dir): 19 | model_path = os.path.join(summary_dir, model_name) 20 | if not os.path.isdir(model_path): 21 | continue 22 | 23 | results[model_name] = 
dict() 24 | 25 | jsonl_files = [f for f in os.listdir(model_path) if f.endswith(".jsonl")] 26 | jsonl_files.sort() 27 | 28 | 29 | for jsonl_file in jsonl_files: 30 | dataset_name = os.path.splitext(jsonl_file)[0] 31 | if dataset_name not in column_order: 32 | column_order.append(dataset_name) 33 | 34 | file_path = os.path.join(model_path, jsonl_file) 35 | 36 | try: 37 | with open(file_path, "r", encoding="utf-8") as f: 38 | score_str = "" 39 | line = f.readline().strip() 40 | data = json.loads(line) 41 | for key, value in data.items(): 42 | score_str += str(value) + " " 43 | results[model_name][dataset_name] = score_str 44 | except Exception as e: 45 | print(f"fail to read {file_path}: {e}") 46 | raise e 47 | 48 | df = pd.DataFrame.from_dict(results, orient="index") 49 | df = df.reindex(columns=column_order) 50 | 51 | if args.transpose: 52 | df = df.T 53 | df.to_csv(f"{args.root_dir}/results.csv", encoding="utf-8") 54 | 55 | print("========================== results ==========================", flush=True) 56 | terminal_width = shutil.get_terminal_size().columns 57 | pd.set_option("display.max_columns", None) 58 | pd.set_option("display.width", terminal_width) 59 | print(df) 60 | 61 | if __name__ == "__main__": 62 | main() -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /src/models/src_baichuan/cosy24k_vocoder/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /src/models/telechat2.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | from typing import Dict, Any 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from src.models.base import Model 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | class TeleChat2(Model): 10 | def __init__(self, path: str, sample_params: Dict[str, Any] = None): 11 | super().__init__(sample_params) 12 | logger.info("start load model from {}".format(path)) 13 | self.model = AutoModelForCausalLM.from_pretrained( 14 | path, 15 | device_map="auto", 16 | trust_remote_code=True, 17 | torch_dtype=torch.float16 18 | ).eval() 19 | logger.info("successfully load model from {}".format(path)) 20 | 21 | self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 22 | config = { 23 | "greedy": { 24 | "do_sample": False, 25 | "max_new_tokens": 1024, 26 | "top_k": None, 27 | "num_beams": 1, 28 | "temperature": None, 29 | "top_p": None 30 | } 31 | } 32 | self.generation_config = config.get(self.sample_params.get("gen_type", "greedy"), None) 33 | logger.info("generation_config: {}".format(self.generation_config)) 34 | 35 | 36 | def generate_once(self, audio, **kwargs): 37 | content = kwargs["query"] 38 | 39 | messages = [ 40 | {"role": "user", "content": content} 41 | ] 42 | text = self.tokenizer.apply_chat_template( 43 | messages, 44 | tokenize=False, 45 | add_generation_prompt=True 46 | ) 47 | model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) 48 | 49 | generated_ids = self.model.generate( 50 | **model_inputs, 51 | **self.generation_config 52 | ) 53 | 54 | generated_ids = [ 55 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 56 | ] 57 | response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 58 | 59 | return {"pred": response} 60 | 61 | -------------------------------------------------------------------------------- /src/models/src_freezeomni/decoder/ticodec/vqvae.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from .models import Encoder 7 | from .models import Generator 8 | 
from .models import Quantizer 9 | 10 | class AttrDict(dict): 11 | def __init__(self, *args, **kwargs): 12 | super(AttrDict, self).__init__(*args, **kwargs) 13 | self.__dict__ = self 14 | 15 | class VQVAE(nn.Module): 16 | def __init__(self, 17 | config_path, 18 | ckpt_path, 19 | with_encoder=False): 20 | super(VQVAE, self).__init__() 21 | ckpt = torch.load(ckpt_path) 22 | with open(config_path) as f: 23 | data = f.read() 24 | json_config = json.loads(data) 25 | self.h = AttrDict(json_config) 26 | # self.gst = GST() 27 | # self.gst = Proposed(n_specs=128, token_num=10, E=128, n_layers=4) 28 | self.quantizer = Quantizer(self.h) 29 | self.generator = Generator(self.h) 30 | self.generator.load_state_dict(ckpt['generator']) 31 | self.quantizer.load_state_dict(ckpt['quantizer']) 32 | # self.gst.load_state_dict(ckpt['gst']) 33 | if with_encoder: 34 | self.encoder = Encoder(self.h) 35 | self.encoder.load_state_dict(ckpt['encoder']) 36 | 37 | def forward(self, x, global_style_token): 38 | # x is the codebook 39 | # x.shape (B, T, Nq) 40 | quant_emb = self.quantizer.embed(x) 41 | global_style_quantized_emb = self.quantizer.embed_gst(global_style_token).squeeze(-1) 42 | return self.generator(quant_emb, global_style_quantized_emb) 43 | 44 | def encode(self, x): 45 | batch_size = x.size(0) 46 | if len(x.shape) == 3 and x.shape[-1] == 1: 47 | x = x.squeeze(-1) 48 | # print(x.shape) 49 | 50 | c, global_features = self.encoder(x.unsqueeze(1)) 51 | # mid = mid.transpose(1, 2).unsqueeze(1) 52 | # global_style = self.gst(mid) 53 | q, loss_q, local_token, g, global_style_token = self.quantizer(c, global_features) 54 | local_token = [code.reshape(batch_size, -1) for code in local_token] 55 | global_style_token = torch.stack(global_style_token, -1).unsqueeze(1) 56 | # shape: [N, T, 4] 57 | return torch.stack(local_token, -1), global_style_token 58 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/quantizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import torch 4 | import logging 5 | 6 | from .vq_module import ResidualVQ 7 | 8 | class Quantizer(torch.nn.Module): 9 | def __init__( 10 | self, 11 | train_codebook, 12 | code_dim, 13 | codebook_num, 14 | codebook_size, 15 | kmeans_init, 16 | kmeans_iters, 17 | decay, 18 | threshold_ema_dead_code, 19 | model, 20 | ): 21 | self.quantizer_type = model 22 | super().__init__() 23 | # speech 24 | if model == 'residual_vq': 25 | self.codebook = ResidualVQ( 26 | train_codebook=train_codebook, 27 | dim=code_dim, 28 | num_quantizers=codebook_num, 29 | codebook_size=codebook_size, 30 | kmeans_init=kmeans_init, 31 | kmeans_iters=kmeans_iters, 32 | decay=decay, 33 | threshold_ema_dead_code=threshold_ema_dead_code 34 | ) 35 | else: 36 | raise NotImplementedError(f"Model ({model}) is not supported!") 37 | 38 | def patch_accelerator(self, accelerator): 39 | logging.info(f"[in models/melvqgan/modules/quantizer.py/ Quantizer] patch accelerator !") 40 | self.codebook.patch_accelerator(accelerator) 41 | 42 | def initial(self): 43 | self.codebook.initial() 44 | 45 | def forward(self, z): 46 | zq, embed_nums, vqloss, perplexity, all_layers_output = self.codebook(z.transpose(2, 1)) 47 | all_layers_output = [output.transpose(2, 1) for output in all_layers_output] 48 | zq = zq.transpose(2, 1) 49 | return zq, embed_nums, vqloss, perplexity, all_layers_output 50 | 51 | def inference(self, z): 52 | zq, indices = 
self.codebook.forward_index(z.transpose(2, 1)) 53 | zq = zq.transpose(2, 1) 54 | return zq, indices 55 | 56 | def encode(self, z): # for the model 57 | indices = self.codebook.encode(z.transpose(2, 1)) 58 | return indices # (num_layers, bs, len) 59 | 60 | def decode(self, indices): # for the model (num_layers, bs, len) 61 | zq = self.codebook.decode(indices) 62 | zq = zq.transpose(1, 2) 63 | return zq # (bs, length, dim) 64 | -------------------------------------------------------------------------------- /tools/test_api.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import os 4 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 5 | 6 | from src.dataset import BatchLoader, BatchSaver 7 | from src.registry import registry 8 | 9 | 10 | if __name__ == "__main__": 11 | static_only = False 12 | model_name = "freeze_omni" 13 | evaluator_name = "emo_llm" 14 | judge_task = "emotion" 15 | jsonl_files = ["esd"] 16 | 17 | summarizer = registry.get_summarizer("AvgThreshold") 18 | 19 | for file in jsonl_files: 20 | input_file = f"res/prediction/{model_name}/{file}.jsonl" 21 | save_file = f"res/result/{model_name}/{file}_{judge_task}.jsonl" 22 | 23 | print("processing file: ", input_file) 24 | if static_only: 25 | scores = [] 26 | with open(save_file, "r", encoding="utf-8") as f: 27 | for line in f: 28 | data = json.loads(line) 29 | scores.append(int(data["score"])) 30 | stat = summarizer.statistic(scores) 31 | print(f"file: {file}, total_score: {stat}") 32 | raise RuntimeError 33 | 34 | scores = [] 35 | dataloader = BatchLoader(input_file, batch_size=4) 36 | saver = BatchSaver(save_file) 37 | evaluator = registry.get_evaluator(evaluator_name) 38 | 39 | for idx, batch_data in enumerate(dataloader): 40 | keys, preds, refs, pred_info_list = [ 41 | list(x) for x in zip(*[ 42 | ( 43 | d["key"], 44 | d["pred"], 45 | d["ref"] if isinstance(d["ref"], list) else [d["ref"]], 46 | {k: d[k] for k in d if k not in ("pred", "ref")} 47 | ) 48 | for d in batch_data 49 | ]) 50 | ] 51 | 52 | eval_results = evaluator.evaluate(preds, refs, pred_info_list) 53 | if len(eval_results) != len(pred_info_list): 54 | raise ValueError("Lost some results...") 55 | 56 | for result, pred_info in zip(eval_results, pred_info_list): 57 | scores.append(result["score"]) 58 | result.update(pred_info) 59 | saver.save_one(result) 60 | 61 | stat = summarizer.statistic(scores) 62 | print(f"file: {file}, total_score: {stat}") -------------------------------------------------------------------------------- /tools/parquet2jsonl.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 5 | 6 | import json 7 | import glob 8 | import yaml 9 | from src.utils import load_and_process_parquet_dataset 10 | 11 | def collect_yaml_file_info(yaml_dir): 12 | result = {} 13 | yaml_files = glob.glob(os.path.join(yaml_dir, "*.yaml")) 14 | 15 | for yaml_file in yaml_files: 16 | with open(yaml_file, "r", encoding="utf-8") as f: 17 | data = yaml.safe_load(f) 18 | 19 | for name, config in data.items(): 20 | file_path = config.get("args", {}).get("file", None) 21 | result[name] = file_path 22 | 23 | return result 24 | 25 | def export_parquet_to_jsonl(repo_or_path="Tele-AI/TELEVAL", data_dir_pattern="llamaqa-zh", save_root_dir="./", is_local=False): 26 | print(f"processing {repo_or_path}, {data_dir_pattern} data from huggingface, 
saving to {save_root_dir}") 27 | if "*.parquet" in data_dir_pattern: 28 | base_subdir = os.path.normpath(os.path.dirname(data_dir_pattern)) 29 | else: 30 | base_subdir = os.path.normpath(data_dir_pattern) 31 | 32 | jsonl_filename = os.path.basename(base_subdir) + ".jsonl" 33 | jsonl_path = os.path.join(save_root_dir, base_subdir, jsonl_filename) 34 | audio_output_dir = os.path.join(save_root_dir, "audios", base_subdir) 35 | 36 | os.makedirs(os.path.dirname(jsonl_path), exist_ok=True) 37 | 38 | if os.path.exists(jsonl_path): 39 | print(f"JSONL already exists and will be overwritten: {jsonl_path}") 40 | 41 | records = load_and_process_parquet_dataset( 42 | repo_or_path, data_dir_pattern, audio_output_dir, key_col="key", is_local=is_local, tuple_decode=False 43 | ) 44 | 45 | with open(jsonl_path, "w", encoding="utf-8") as fout: 46 | for record in records: 47 | print(json.dumps(record, ensure_ascii=False), file=fout) 48 | 49 | print(f"JSONL saved to: {jsonl_path}") 50 | print(f"Audio files saved under: {audio_output_dir}") 51 | 52 | if __name__ == "__main__": 53 | save_root_dir = "audiobench_data" 54 | all_dataset = collect_yaml_file_info("registry/dataset") 55 | for dataset, repo_data_dir in all_dataset.items(): 56 | parts = repo_data_dir.split("/", 2) 57 | repo_or_path, data_dir_pattern = "/".join(parts[:2]), parts[-1] 58 | export_parquet_to_jsonl(repo_or_path, data_dir_pattern, save_root_dir) -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/utils/data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class KimiAContent: 5 | def __init__( 6 | self, audio_token_ids=None, text_token_ids=None, is_continuous_mask=None 7 | ): 8 | self.audio_token_ids: list[int] = audio_token_ids or [] 9 | self.text_token_ids: list[int] = text_token_ids or [] 10 | self.is_continuous_mask: list[int] = is_continuous_mask or [] 11 | 12 | self.continuous_feature = [] 13 | 14 | def audio_append(self, index: int, is_continuous: bool = False): 15 | self.audio_token_ids.append(index) 16 | self.is_continuous_mask.append(is_continuous) 17 | 18 | def text_append(self, index: int): 19 | self.text_token_ids.append(index) 20 | 21 | def audio_extend(self, ids: list[int], is_continuous: bool = False): 22 | self.audio_token_ids.extend(ids) 23 | self.is_continuous_mask.extend([is_continuous] * len(ids)) 24 | 25 | def text_extend(self, ids: list[int]): 26 | self.text_token_ids.extend(ids) 27 | 28 | def audio_prepend(self, index: int, is_continuous: bool = False): 29 | self.audio_token_ids = [index] + self.audio_token_ids 30 | self.is_continuous_mask = [is_continuous] + self.is_continuous_mask 31 | 32 | def text_prepend(self, index: int): 33 | self.text_token_ids = [index] + self.text_token_ids 34 | 35 | def audio_pretend(self, ids: list[int], is_continuous: bool = False): 36 | self.audio_token_ids = ids + self.audio_token_ids 37 | self.is_continuous_mask = [is_continuous] * len(ids) + self.is_continuous_mask 38 | 39 | def text_pretend(self, ids: list[int]): 40 | self.text_token_ids = ids + self.text_token_ids 41 | 42 | def merge(self, other: "KimiAContent"): 43 | self.audio_token_ids.extend(other.audio_token_ids) 44 | self.text_token_ids.extend(other.text_token_ids) 45 | self.is_continuous_mask.extend(other.is_continuous_mask) 46 | self.continuous_feature.extend(other.continuous_feature) 47 | 48 | def to_tensor(self) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 49 | return ( 50 | 
torch.tensor([self.audio_token_ids], dtype=torch.long), 51 | torch.tensor([self.text_token_ids], dtype=torch.long), 52 | torch.tensor([self.is_continuous_mask], dtype=torch.bool), 53 | ) 54 | 55 | def is_valid(self): 56 | return ( 57 | len(self.audio_token_ids) 58 | == len(self.text_token_ids) 59 | == len(self.is_continuous_mask) 60 | ) 61 | -------------------------------------------------------------------------------- /src/models/kimi_audio.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import soundfile as sf 4 | from typing import Dict, Any 5 | 6 | from src.models.base import Model 7 | from src.models.src_kimi.kimia_infer.api.kimia import KimiAudio 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class Kimi(Model): 12 | def __init__(self, path: str, whisper_path: str, glm4_tokenizer: str, sample_params: Dict[str, Any] = None): 13 | super().__init__(sample_params) 14 | self.model = KimiAudio( 15 | model_path=path, 16 | whisper_path=whisper_path, 17 | glm4_tokenizer=glm4_tokenizer, 18 | load_detokenizer=True, 19 | split_device=False, # split needs 4.48.3 20 | ) 21 | 22 | config = { 23 | "default": { 24 | "audio_temperature": 0.8, 25 | "audio_top_k": 10, 26 | "text_temperature": 0.0, 27 | "text_top_k": 5, 28 | "audio_repetition_penalty": 1.0, 29 | "audio_repetition_window_size": 64, 30 | "text_repetition_penalty": 1.0, 31 | "text_repetition_window_size": 16 32 | }, 33 | "greedy": { 34 | "audio_temperature": 1e-7, 35 | "text_temperature": 1e-7, 36 | "audio_repetition_penalty": 1.0, 37 | "text_repetition_penalty": 1.0 38 | } # NOTE (TTTdas): temperature > 1e-6 will do sampling 39 | } 40 | self.generation_config = config.get(self.sample_params.get("gen_type", "greedy"), None) 41 | logger.info("generation_config: {}".format(self.generation_config)) 42 | 43 | def generate_once(self, audio, **kwargs): 44 | messages = [] 45 | instruction = kwargs.get("instruct", "") 46 | if len(instruction) > 0: 47 | messages.append({"role": "user", "message_type": "text", "content": instruction}) 48 | 49 | messages.append({"role": "user", "message_type": "audio", "content": audio}) 50 | wav, text = self.model.generate(messages, **self.generation_config, output_type="both") 51 | if kwargs.get("pred_audio"): 52 | sf.write( 53 | kwargs["pred_audio"], 54 | wav.detach().cpu().view(-1).numpy(), 55 | 24000, 56 | ) 57 | 58 | return {"pred": text, "pred_audio": kwargs.get("pred_audio")} 59 | 60 | def generate_multiturn(self, audio, user_history, assistant_history, **kwargs): 61 | raise NotImplementedError("Waiting for Kimi-audio debug...") -------------------------------------------------------------------------------- /requirements/speechgpt2_requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.1.0 2 | accelerate==1.3.0 3 | aiofiles==23.2.1 4 | annotated-types==0.7.0 5 | anyio==4.8.0 6 | audioread==3.0.1 7 | certifi==2024.12.14 8 | cffi==1.17.1 9 | charset-normalizer==3.4.1 10 | click==8.1.8 11 | cramjam==2.9.1 12 | debugpy==1.8.12 13 | decorator==5.1.1 14 | einops==0.8.0 15 | fastapi==0.115.7 16 | fastparquet==2024.11.0 17 | ffmpy==0.5.0 18 | filelock==3.17.0 19 | fsspec==2024.12.0 20 | gradio==5.13.1 21 | gradio_client==1.6.0 22 | grpcio==1.70.0 23 | h11==0.14.0 24 | httpcore==1.0.7 25 | httpx==0.28.1 26 | huggingface-hub==0.27.1 27 | idna==3.10 28 | Jinja2==3.1.5 29 | jiwer==3.0.5 30 | joblib==1.4.2 31 | lazy_loader==0.4 32 | librosa==0.10.2.post1 33 | llvmlite==0.44.0 34 | 
Markdown==3.7 35 | markdown-it-py==3.0.0 36 | MarkupSafe==2.1.5 37 | mdurl==0.1.2 38 | mpmath==1.3.0 39 | msgpack==1.1.0 40 | networkx==3.4.2 41 | numba==0.61.0 42 | numpy==2.1.3 43 | nvidia-cublas-cu12==12.4.5.8 44 | nvidia-cuda-cupti-cu12==12.4.127 45 | nvidia-cuda-nvrtc-cu12==12.4.127 46 | nvidia-cuda-runtime-cu12==12.4.127 47 | nvidia-cudnn-cu12==9.1.0.70 48 | nvidia-cufft-cu12==11.2.1.3 49 | nvidia-curand-cu12==10.3.5.147 50 | nvidia-cusolver-cu12==11.6.1.9 51 | nvidia-cusparse-cu12==12.3.1.170 52 | nvidia-nccl-cu12==2.21.5 53 | nvidia-nvjitlink-cu12==12.4.127 54 | nvidia-nvtx-cu12==12.4.127 55 | orjson==3.10.15 56 | packaging==24.2 57 | pandas==2.2.3 58 | pillow==11.1.0 59 | platformdirs==4.3.6 60 | pooch==1.8.2 61 | protobuf==5.29.3 62 | psutil==6.1.1 63 | pycparser==2.22 64 | pydantic==2.10.6 65 | pydantic_core==2.27.2 66 | pydub==0.25.1 67 | Pygments==2.19.1 68 | python-dateutil==2.9.0.post0 69 | python-multipart==0.0.20 70 | pytz==2024.2 71 | PyYAML==6.0.2 72 | RapidFuzz==3.11.0 73 | regex==2024.11.6 74 | requests==2.32.3 75 | rich==13.9.4 76 | ruff==0.9.3 77 | safehttpx==0.1.6 78 | safetensors==0.5.2 79 | scikit-learn==1.6.1 80 | scipy==1.15.1 81 | semantic-version==2.10.0 82 | setuptools==75.1.0 83 | shellingham==1.5.4 84 | six==1.17.0 85 | sniffio==1.3.1 86 | sounddevice==0.5.1 87 | soundfile==0.13.1 88 | soxr==0.5.0.post1 89 | starlette==0.45.3 90 | sympy==1.13.1 91 | tensorboard==2.18.0 92 | tensorboard-data-server==0.7.2 93 | tensorboardX==2.6.2.2 94 | threadpoolctl==3.5.0 95 | tokenizers==0.20.3 96 | tomlkit==0.13.2 97 | torch==2.5.1 98 | torchaudio==2.5.1 99 | torchvision==0.20.1 100 | tqdm==4.67.1 101 | transformers==4.46.1 102 | triton==3.1.0 103 | typer==0.15.1 104 | typing_extensions==4.12.2 105 | tzdata==2025.1 106 | urllib3==2.3.0 107 | uroman==1.3.1.1 108 | uvicorn==0.34.0 109 | websockets==14.2 110 | Werkzeug==3.1.3 111 | wheel==0.44.0 112 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/arguments.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | from dataclasses import dataclass, field 4 | from typing import Optional 5 | 6 | 7 | @dataclass 8 | class ModelArguments: 9 | model_name_or_path: Optional[str] = field(default="facebook/opt-125m") 10 | version: Optional[str] = field(default="v0") 11 | freeze_backbone: bool = field(default=False) 12 | tune_speech_projector: bool = field(default=False) 13 | tune_speech_encoder: bool = field(default=False) 14 | tune_speech_generator_only: bool = field(default=False) 15 | speech_encoder_type: Optional[str] = field(default=None) 16 | speech_encoder: Optional[str] = field(default=None) 17 | pretrain_speech_projector: Optional[str] = field(default=None) 18 | speech_projector_type: Optional[str] = field(default='linear') 19 | speech_generator_type: Optional[str] = field(default='ctc') 20 | ctc_decoder_config: str = "(2,4096,32,11008)" 21 | ctc_upsample_factor: int = 1 22 | ctc_loss_weight: float = 1.0 23 | unit_vocab_size: int = 1000 24 | speech_encoder_ds_rate: int = 5 25 | speech_encoder_hidden_size: int = 1280 26 | 27 | 28 | @dataclass 29 | class DataArguments: 30 | data_path: str = field(default=None, 31 | metadata={"help": "Path to the training data."}) 32 | is_multimodal: bool = False 33 | input_type: str = field(default="mel") 34 | speech_normalize: bool = False 35 | mel_size: int = 128 36 | has_tgt_units: bool = False 37 | 38 | 39 | @dataclass 40 | class 
TrainingArguments(transformers.TrainingArguments): 41 | cache_dir: Optional[str] = field(default=None) 42 | optim: str = field(default="adamw_torch") 43 | freeze_speech_projector: bool = field(default=False) 44 | model_max_length: int = field( 45 | default=512, 46 | metadata={ 47 | "help": 48 | "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 49 | }, 50 | ) 51 | double_quant: bool = field( 52 | default=True, 53 | metadata={"help": "Compress the quantization statistics through double quantization."} 54 | ) 55 | quant_type: str = field( 56 | default="nf4", 57 | metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} 58 | ) 59 | bits: int = field( 60 | default=16, 61 | metadata={"help": "How many bits to use."} 62 | ) 63 | lora_enable: bool = False 64 | lora_r: int = 64 65 | lora_alpha: int = 16 66 | lora_dropout: float = 0.05 67 | lora_weight_path: str = "" 68 | lora_bias: str = "none" 69 | speech_projector_lr: Optional[float] = None 70 | group_by_modality_length: bool = field(default=False) -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/moshi_modules/rope.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import math 3 | import torch 4 | 5 | 6 | def apply_rope( 7 | q: torch.Tensor, 8 | k: torch.Tensor, 9 | offset: torch.Tensor, 10 | max_period: float = 10_000, 11 | time_before_heads: bool = False, 12 | ): 13 | """ 14 | Args: 15 | q (torch.Tensor): queries, shape `[B, T, H, D]`. 16 | k (torch.Tensor): keys, shape `[B, T, H, D]`. 17 | offset (int): current offset, e.g. when streaming. 18 | max_period (float): maximum period for the cos and sin. 19 | time_before_heads (bool): if True, expected [B, T, H, D], else [B, H, T ,D] 20 | """ 21 | 22 | if time_before_heads: 23 | B, T, H, D = q.shape 24 | else: 25 | B, H, T, D = q.shape 26 | assert k.shape == q.shape 27 | assert D > 0 28 | assert D % 2 == 0 29 | assert max_period > 0 30 | 31 | ds = torch.arange(D // 2, device=q.device, dtype=torch.float32) 32 | freqs = torch.exp(ds * (-math.log(max_period) * 2 / D)) 33 | ts = offset.float() + torch.arange(T, device=q.device, dtype=torch.float32) 34 | if time_before_heads: 35 | ts = ts.view(-1, 1, 1) 36 | else: 37 | ts = ts.view(1, -1, 1) 38 | 39 | dims = q.shape[:-1] 40 | q = q.view(*dims, D // 2, 2) 41 | k = k.view(*dims, D // 2, 2) 42 | 43 | # convention is `r` suffix is real part, `i` is imaginary. 44 | qr = q[..., 0].float() 45 | qi = q[..., 1].float() 46 | 47 | kr = k[..., 0].float() 48 | ki = k[..., 1].float() 49 | 50 | rotr = torch.cos(freqs * ts) 51 | roti = torch.sin(freqs * ts) 52 | qor = qr * rotr - qi * roti 53 | qoi = qr * roti + qi * rotr 54 | 55 | kor = kr * rotr - ki * roti 56 | koi = kr * roti + ki * rotr 57 | 58 | dtype = q.dtype 59 | qo = torch.stack([qor.to(dtype), qoi.to(dtype)], dim=-1) 60 | ko = torch.stack([kor.to(dtype), koi.to(dtype)], dim=-1) 61 | 62 | return qo.view(*dims, D), ko.view(*dims, D) 63 | 64 | 65 | class RotaryEmbedding(nn.Module): 66 | """Rotary positional embedding (RoPE) from [Su et al 2022](https://arxiv.org/abs/2104.09864). 67 | 68 | Args: 69 | max_period (float): Maximum period of the rotation frequencies. 
70 | """ 71 | 72 | def __init__(self, max_period: float = 10000.0): 73 | super().__init__() 74 | self.max_period = max_period 75 | 76 | def forward( 77 | self, 78 | q: torch.Tensor, 79 | k: torch.Tensor, 80 | offset: torch.Tensor, 81 | time_before_heads: bool = False, 82 | ): 83 | """Apply rope rotation to query or key tensor.""" 84 | return apply_rope(q, k, offset, self.max_period, time_before_heads) 85 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/moshi_modules/gating.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | def gating_forward_kernel( 6 | weight_in: torch.Tensor, weight_out: torch.Tensor, activation, x: torch.Tensor 7 | ): 8 | x = F.linear(x, weight_in) 9 | B, T, _ = x.shape 10 | x = x.view(B, T, 2, -1) 11 | x = activation(x[..., 0, :]) * x[..., 1, :] 12 | x = F.linear(x, weight_out) 13 | return x 14 | 15 | 16 | class ActivationGating(nn.Module): 17 | """ 18 | Gating FFN layer, using the given activation. 19 | Args: 20 | dim (int): dimension of the input and output of the transformer. 21 | activation (any callable Tensor to Tensor): activation function to use. 22 | **factory_kwargs: other kwargs passed to the linear layer, in particular device and dtype. 23 | """ 24 | 25 | _fsdp_final = True 26 | 27 | def __init__(self, dim: int, dim_feedforward: int, activation, **factory_kwargs): 28 | super().__init__() 29 | # We should have 8 d^2 param, instead we will have 30 | # 2 * h * d + h * d = 3 h * d = 8 d^2 31 | # so h = 8 d / 3 but following Hervé's advice we use 21 / 8 as an approx. 32 | if dim_feedforward == 4 * dim: 33 | hidden = (21 * dim) // 8 34 | else: 35 | hidden = (2 * dim_feedforward) // 3 36 | self.linear_in = nn.Linear(dim, 2 * hidden, bias=False, **factory_kwargs) 37 | self.linear_out = nn.Linear(hidden, dim, bias=False, **factory_kwargs) 38 | self.activation = activation 39 | 40 | def forward(self, x: torch.Tensor): 41 | return gating_forward_kernel( 42 | self.linear_in.weight, self.linear_out.weight, self.activation, x 43 | ) 44 | 45 | 46 | def _get_activation(name: str): 47 | if name in ["sigmoid", "tanh", "relu"]: 48 | return getattr(torch, name) 49 | elif name in ["leaky_relu", "elu", "gelu", "silu", "mish", "softsign"]: 50 | return getattr(torch.nn.functional, name) 51 | elif name == "identity": 52 | return torch.nn.Identity() 53 | else: 54 | raise ValueError(f"Unknown activation {name}") 55 | 56 | 57 | def _make_gating( 58 | name: str, dim: int, dim_feedforward: int, **factory_kwargs 59 | ) -> nn.Module: 60 | return ActivationGating( 61 | dim, dim_feedforward, _get_activation(name), **factory_kwargs 62 | ) 63 | 64 | 65 | def make_gating( 66 | name: str, dim: int, dim_feedforward: int, **factory_kwargs 67 | ) -> nn.Module: 68 | gating = _make_gating(name, dim, dim_feedforward, **factory_kwargs) 69 | max_params = 2 * dim * dim_feedforward 70 | params = sum(p.numel() for p in gating.parameters()) 71 | assert ( 72 | params <= max_params 73 | ), f"{name} gating has {params} params, max is {max_params}" 74 | return gating 75 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/activation1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 
3 | 4 | import torch 5 | import torch.nn as nn 6 | from ..torch.resample import UpSample1d, DownSample1d 7 | 8 | # load fused CUDA kernel: this enables importing anti_alias_activation_cuda 9 | from . import load 10 | 11 | anti_alias_activation_cuda = load.load() 12 | 13 | 14 | class FusedAntiAliasActivation(torch.autograd.Function): 15 | """ 16 | Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs. 17 | The hyperparameters are hard-coded in the kernel to maximize speed. 18 | NOTE: The fused kenrel is incorrect for Activation1d with different hyperparameters. 19 | """ 20 | 21 | @staticmethod 22 | def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta): 23 | activation_results = anti_alias_activation_cuda.forward( 24 | inputs, up_ftr, down_ftr, alpha, beta 25 | ) 26 | 27 | return activation_results 28 | 29 | @staticmethod 30 | def backward(ctx, output_grads): 31 | raise NotImplementedError 32 | return output_grads, None, None 33 | 34 | 35 | class Activation1d(nn.Module): 36 | def __init__( 37 | self, 38 | activation, 39 | up_ratio: int = 2, 40 | down_ratio: int = 2, 41 | up_kernel_size: int = 12, 42 | down_kernel_size: int = 12, 43 | fused: bool = True, 44 | ): 45 | super().__init__() 46 | self.up_ratio = up_ratio 47 | self.down_ratio = down_ratio 48 | self.act = activation 49 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 50 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 51 | 52 | self.fused = fused # Whether to use fused CUDA kernel or not 53 | 54 | def forward(self, x): 55 | if not self.fused: 56 | x = self.upsample(x) 57 | x = self.act(x) 58 | x = self.downsample(x) 59 | return x 60 | else: 61 | if self.act.__class__.__name__ == "Snake": 62 | beta = self.act.alpha.data # Snake uses same params for alpha and beta 63 | else: 64 | beta = ( 65 | self.act.beta.data 66 | ) # Snakebeta uses different params for alpha and beta 67 | alpha = self.act.alpha.data 68 | if ( 69 | not self.act.alpha_logscale 70 | ): # Exp baked into cuda kernel, cancel it out with a log 71 | alpha = torch.log(alpha) 72 | beta = torch.log(beta) 73 | 74 | x = FusedAntiAliasActivation.apply( 75 | x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta 76 | ) 77 | return x 78 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/load.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import pathlib 6 | import subprocess 7 | 8 | from torch.utils import cpp_extension 9 | 10 | """ 11 | Setting this param to a list has a problem of generating different compilation commands (with diferent order of architectures) and leading to recompilation of fused kernels. 
12 | Set it to empty stringo avoid recompilation and assign arch flags explicity in extra_cuda_cflags below 13 | """ 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(): 18 | # Check if cuda 11 is installed for compute capability 8.0 19 | cc_flag = [] 20 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 21 | if int(bare_metal_major) >= 11: 22 | cc_flag.append("-gencode") 23 | cc_flag.append("arch=compute_80,code=sm_80") 24 | 25 | # Build path 26 | srcpath = pathlib.Path(__file__).parent.absolute() 27 | buildpath = srcpath / "build" 28 | _create_build_dir(buildpath) 29 | 30 | # Helper function to build the kernels. 31 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 32 | return cpp_extension.load( 33 | name=name, 34 | sources=sources, 35 | build_directory=buildpath, 36 | extra_cflags=[ 37 | "-O3", 38 | ], 39 | extra_cuda_cflags=[ 40 | "-O3", 41 | "-gencode", 42 | "arch=compute_70,code=sm_70", 43 | "--use_fast_math", 44 | ] 45 | + extra_cuda_flags 46 | + cc_flag, 47 | verbose=True, 48 | ) 49 | 50 | extra_cuda_flags = [ 51 | "-U__CUDA_NO_HALF_OPERATORS__", 52 | "-U__CUDA_NO_HALF_CONVERSIONS__", 53 | "--expt-relaxed-constexpr", 54 | "--expt-extended-lambda", 55 | ] 56 | 57 | sources = [ 58 | srcpath / "anti_alias_activation.cpp", 59 | srcpath / "anti_alias_activation_cuda.cu", 60 | ] 61 | anti_alias_activation_cuda = _cpp_extention_load_helper( 62 | "anti_alias_activation_cuda", sources, extra_cuda_flags 63 | ) 64 | 65 | return anti_alias_activation_cuda 66 | 67 | 68 | def _get_cuda_bare_metal_version(cuda_dir): 69 | raw_output = subprocess.check_output( 70 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 71 | ) 72 | output = raw_output.split() 73 | release_idx = output.index("release") + 1 74 | release = output[release_idx].split(".") 75 | bare_metal_major = release[0] 76 | bare_metal_minor = release[1][0] 77 | 78 | return raw_output, bare_metal_major, bare_metal_minor 79 | 80 | 81 | def _create_build_dir(buildpath): 82 | try: 83 | os.mkdir(buildpath) 84 | except OSError: 85 | if not os.path.isdir(buildpath): 86 | print(f"Creation of the build directory {buildpath} failed") 87 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
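# These registries map the string names used in CosyVoice configs to the transformer building
# blocks imported below (activations, subsampling layers, positional embeddings, attention).
# A minimal lookup sketch, assuming the keys defined in this file:
#   act_cls = COSYVOICE_ACTIVATION_CLASSES["swish"]   # resolves to torch.nn.SiLU (or the Swish fallback)
#   act = act_cls()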
15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention, 33 | BlockRelPositionMultiHeadedAttention) 34 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 35 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 36 | 37 | 38 | COSYVOICE_ACTIVATION_CLASSES = { 39 | "hardtanh": torch.nn.Hardtanh, 40 | "tanh": torch.nn.Tanh, 41 | "relu": torch.nn.ReLU, 42 | "selu": torch.nn.SELU, 43 | "swish": getattr(torch.nn, "SiLU", Swish), 44 | "gelu": torch.nn.GELU, 45 | } 46 | 47 | COSYVOICE_SUBSAMPLE_CLASSES = { 48 | "linear": LinearNoSubsampling, 49 | "linear_legacy": LegacyLinearNoSubsampling, 50 | "embed": EmbedinigNoSubsampling, 51 | "conv1d2": Conv1dSubsampling2, 52 | "conv2d": Conv2dSubsampling4, 53 | "conv2d6": Conv2dSubsampling6, 54 | "conv2d8": Conv2dSubsampling8, 55 | 'paraformer_dummy': torch.nn.Identity 56 | } 57 | 58 | COSYVOICE_EMB_CLASSES = { 59 | "embed": PositionalEncoding, 60 | "abs_pos": PositionalEncoding, 61 | "rel_pos": RelPositionalEncoding, 62 | "rel_pos_espnet": EspnetRelPositionalEncoding, 63 | "no_pos": NoPositionalEncoding, 64 | "abs_pos_whisper": WhisperPositionalEncoding, 65 | "embed_learnable_pe": LearnablePositionalEncoding, 66 | } 67 | 68 | COSYVOICE_ATTENTION_CLASSES = { 69 | "selfattn": MultiHeadedAttention, 70 | "rel_selfattn": RelPositionMultiHeadedAttention, 71 | "block_rel_selfattn": BlockRelPositionMultiHeadedAttention, 72 | } 73 | -------------------------------------------------------------------------------- /src/models/model_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import functools 3 | import logging 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from accelerate import infer_auto_device_map 6 | from accelerate.utils import get_balanced_memory 7 | from collections import Counter 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | def get_no_split_module_candidates(model): 12 | class_counter = Counter() 13 | 14 | def count_module_classes(module): 15 | for child in module.children(): 16 | class_name = child.__class__.__name__ 17 | class_counter[class_name] += 1 18 | count_module_classes(child) 19 | 20 | count_module_classes(model) 21 | 22 | candidates = {name for name, count in class_counter.items() if count > 1} 23 | return candidates 24 | 25 | def load_model_with_auto_device_map( 26 | model_name: str, 27 | max_memory: dict = None, 28 | no_split_module_classes: list = [], 29 | dtype=torch.float16, 30 | return_tokenizer=False 31 | ): 32 | """ 33 | Automatically infer device_map and load a multi-GPU model. 34 | 35 | Args: 36 | model_name (str): Model name or path. 37 | max_memory (dict, optional): Max memory per GPU, e.g., {0: "20GiB", 1: "20GiB"}. 38 | If None, get balance memory. 39 | no_split_module_classes (list, optional): List of module class names that must not be split. 40 | dtype (torch.dtype, optional): Model precision (default: float16). 
41 | return_tokenizer (bool, optional): Whether to return tokenizer along with model. 42 | 43 | Returns: 44 | model or (model, tokenizer) 45 | """ 46 | # load to CPU first 47 | model = AutoModelForCausalLM.from_pretrained( 48 | model_name, 49 | torch_dtype=dtype, 50 | trust_remote_code=True, 51 | device_map=None 52 | ) 53 | candidates = get_no_split_module_candidates(model) 54 | logger.info(f"Folloing modules can be split: {candidates}") 55 | 56 | illegal = [cls for cls in no_split_module_classes if cls not in candidates] 57 | if illegal: 58 | raise ValueError(f"{illegal} not in allowed no_split_module_classes: {candidates}") 59 | 60 | if max_memory is None: 61 | max_memory = get_balanced_memory(model, dtype=dtype) 62 | # n_gpus = torch.cuda.device_count() 63 | # if n_gpus == 0: 64 | # raise ValueError("No CUDA GPUs detected for max_memory auto-inference.") 65 | # max_memory = {i: "20GiB" for i in range(n_gpus)} 66 | 67 | device_map = infer_auto_device_map( 68 | model, 69 | max_memory=max_memory, 70 | no_split_module_classes=no_split_module_classes 71 | ) 72 | 73 | model = AutoModelForCausalLM.from_pretrained( 74 | model_name, 75 | torch_dtype=dtype, 76 | trust_remote_code=True, 77 | device_map=device_map 78 | ) 79 | 80 | if return_tokenizer: 81 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 82 | return model, tokenizer 83 | 84 | return model -------------------------------------------------------------------------------- /run_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=$PWD:$PYTHONPATH 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | 5 | max_memory=400 6 | save_dir="res/test" 7 | 8 | stage=1 9 | stop_stage=1 10 | eval_bsz=1 11 | 12 | text_qa_tasks="text-llamaqa-en,text-llamaqa-zh,text-triviaqa-en,text-triviaqa-zh,text-webq-en,text-webq-zh,text-chinesesimpleqa-zh" 13 | text_choice_tasks="text-agieval-zh,text-ceval-zh" 14 | text_dialect_tasks="text-sichuanese,text-shanghainese,text-northeastern_mandarin,text-henan_dialect,text-cantonese" 15 | text_chitchat_dialect_tasks="text-chitchat-sichuanese,text-chitchat-shanghainese,text-chitchat-northeastern_mandarin,text-chitchat-henan_dialect,text-chitchat-cantonese" 16 | 17 | text_down_tasks="text-chinese_quiz-zh,text-livelihood_policy-zh" 18 | text_down_dialect_tasks="text-livelihood_policy-sichuanese,text-livelihood_policy-shanghainese,text-livelihood_policy-northeastern_mandarin,text-livelihood_policy-henan_dialect,text-livelihood_policy-cantonese" 19 | 20 | text_emo_tasks="text-emo" 21 | 22 | 23 | declare -A model_tasks 24 | model_tasks=( 25 | ["qwen3-8b-instruct"]="$text_down_tasks,$text_dialect_tasks,$text_down_tasks" 26 | ) 27 | 28 | gpu_list=($(echo $CUDA_VISIBLE_DEVICES | tr ',' ' ')) 29 | gpu_counts=${#gpu_list[@]} 30 | 31 | get_free_gpu() { 32 | while true; do 33 | for gpu in "${gpu_list[@]}"; do 34 | used_mem=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "NR==$((gpu+1))") 35 | if [[ "$used_mem" -lt "$max_memory" ]]; then 36 | echo "$gpu" 37 | return 38 | fi 39 | done 40 | sleep 30 41 | done 42 | } 43 | 44 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 45 | for model in "${!model_tasks[@]}"; do 46 | IFS=',' read -r -a values <<< "${model_tasks[$model]}" 47 | for task in "${values[@]}"; do 48 | gpu=$(get_free_gpu) 49 | echo "***********************************************" 50 | echo "processing model: $model using task: $task on GPU: $gpu" 51 | echo "***********************************************" 52 | 
CUDA_VISIBLE_DEVICES=$gpu python main.py \ 53 | --mode "infer" \ 54 | --save_dir $save_dir \ 55 | --model $model \ 56 | --task $task & 57 | sleep 40 # Increase sleep time appropriately according to the speed of loading the model 58 | done 59 | done 60 | wait 61 | fi 62 | 63 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 64 | for model in "${!model_tasks[@]}"; do 65 | IFS=',' read -r -a values <<< "${model_tasks[$model]}" 66 | # read -a values <<< "${model_tasks[$model]}" 67 | for task in "${values[@]}"; do 68 | python main.py \ 69 | --mode "eval" \ 70 | --save_dir $save_dir \ 71 | --model $model \ 72 | --bsz $eval_bsz \ 73 | --task $task 74 | done 75 | done 76 | wait 77 | python tools/save_csv.py --root_dir $save_dir 78 | fi -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/flow_matching/scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from abc import abstractmethod, ABC 3 | 4 | try: 5 | from torchdyn.core import NeuralODE 6 | 7 | NEURALODE_INSTALLED = True 8 | except ImportError: 9 | NEURALODE_INSTALLED = False 10 | 11 | 12 | class SchedulerBase(ABC): 13 | def __init__(self) -> None: 14 | pass 15 | 16 | @abstractmethod 17 | def set_timesteps(self): 18 | pass 19 | 20 | @abstractmethod 21 | def step(self): 22 | pass 23 | 24 | @abstractmethod 25 | def add_noise(self): 26 | pass 27 | 28 | 29 | class StreamingFlowMatchingScheduler(SchedulerBase): 30 | def __init__( 31 | self, 32 | timesteps=1000, 33 | sigma_min=1e-4, 34 | ) -> None: 35 | super().__init__() 36 | 37 | self.sigma_min = sigma_min 38 | self.timesteps = timesteps 39 | self.t_min = 0 40 | self.t_max = 1 - self.sigma_min 41 | 42 | self.neural_ode = None 43 | 44 | def set_timesteps(self, timesteps=15): 45 | self.timesteps = timesteps 46 | 47 | def step(self, xt, predicted_v): 48 | 49 | h = (self.t_max - self.t_min) / self.timesteps 50 | h = h * torch.ones(xt.shape[0], dtype=xt.dtype, device=xt.device) 51 | 52 | xt = xt + h * predicted_v 53 | return xt 54 | 55 | def sample(self, ode_wrapper, time_steps, xt, verbose=False, x0=None): 56 | h = (self.t_max - self.t_min) / self.timesteps 57 | h = h * torch.ones(xt.shape[0], dtype=xt.dtype, device=xt.device) 58 | 59 | if verbose: 60 | gt_v = x0 - xt 61 | 62 | for t in time_steps: 63 | predicted_v = ode_wrapper(t, xt) 64 | if verbose: 65 | dist = torch.mean(torch.nn.functional.l1_loss(gt_v, predicted_v)) 66 | print("Time: {}, Distance: {}".format(t, dist)) 67 | xt = xt + h * predicted_v 68 | return xt 69 | 70 | def sample_by_neuralode(self, ode_wrapper, time_steps, xt, verbose=False, x0=None): 71 | if not NEURALODE_INSTALLED: 72 | raise ImportError("NeuralODE is not installed, please install it first.") 73 | 74 | if self.neural_ode is None: 75 | self.neural_ode = NeuralODE( 76 | ode_wrapper, 77 | solver="euler", 78 | sensitivity="adjoint", 79 | atol=self.sigma_min, 80 | rtol=self.sigma_min, 81 | ) 82 | 83 | eval_points, traj = self.neural_ode(xt, time_steps) 84 | return traj[-1] 85 | 86 | def add_noise( 87 | self, 88 | original_samples: torch.FloatTensor, 89 | noise: torch.FloatTensor, 90 | timesteps: torch.IntTensor, 91 | ): 92 | ut = original_samples - (1 - self.sigma_min) * noise # 和ut的梯度没关系 93 | t_unsqueeze = timesteps.unsqueeze(1).unsqueeze(1).float() / self.timesteps 94 | x_noisy = ( 95 | t_unsqueeze * original_samples 96 | + (1.0 - (1 - self.sigma_min) * t_unsqueeze) * noise 97 | ) 98 | return x_noisy, ut 99 | 
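# Minimal usage sketch, assuming `ode_wrapper` is any callable (t, xt) -> predicted_v supplied by
# the caller; the zero-velocity stand-in below only demonstrates the Euler loop in `sample`, and
# the shapes are illustrative.
if __name__ == "__main__":
    scheduler = StreamingFlowMatchingScheduler()
    scheduler.set_timesteps(15)
    xt = torch.randn(1, 80, 100)  # illustrative (batch, num_mels, frames) noise to start from
    time_steps = torch.linspace(scheduler.t_min, scheduler.t_max, scheduler.timesteps)

    def placeholder_velocity(t, x):
        # Stands in for the flow-matching model that predicts the velocity field.
        return torch.zeros_like(x)

    x0_hat = scheduler.sample(placeholder_velocity, time_steps, xt)
    print(x0_hat.shape)  # torch.Size([1, 80, 100])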
-------------------------------------------------------------------------------- /registry/dataset/dialect.yaml: -------------------------------------------------------------------------------- 1 | 2 | # -----------------chinese quiz dialect------------------ 3 | chinese_quiz-sichuanese: 4 | class: src.dataset.BatchLoader 5 | args: 6 | file: Tele-AI/TELEVAL/chinese_quiz-sichuanese 7 | ref_col: answer 8 | query_col: query 9 | 10 | chinese_quiz-shanghainese: 11 | class: src.dataset.BatchLoader 12 | args: 13 | file: Tele-AI/TELEVAL/chinese_quiz-shanghainese 14 | ref_col: answer 15 | query_col: query 16 | 17 | chinese_quiz-northeastern_mandarin: 18 | class: src.dataset.BatchLoader 19 | args: 20 | file: Tele-AI/TELEVAL/chinese_quiz-northeastern_mandarin 21 | ref_col: answer 22 | query_col: query 23 | 24 | chinese_quiz-henan_dialect: 25 | class: src.dataset.BatchLoader 26 | args: 27 | file: Tele-AI/TELEVAL/chinese_quiz-henan_dialect 28 | ref_col: answer 29 | query_col: query 30 | 31 | chinese_quiz-cantonese: 32 | class: src.dataset.BatchLoader 33 | args: 34 | file: Tele-AI/TELEVAL/chinese_quiz-cantonese 35 | ref_col: answer 36 | query_col: query 37 | 38 | # -----------------livelihood policy dialect------------------ 39 | livelihood_policy-sichuanese: 40 | class: src.dataset.BatchLoader 41 | args: 42 | file: Tele-AI/TELEVAL/livelihood_policy-sichuanese 43 | ref_col: answer 44 | query_col: query 45 | 46 | livelihood_policy-shanghainese: 47 | class: src.dataset.BatchLoader 48 | args: 49 | file: Tele-AI/TELEVAL/livelihood_policy-shanghainese 50 | ref_col: answer 51 | query_col: query 52 | 53 | livelihood_policy-northeastern_mandarin: 54 | class: src.dataset.BatchLoader 55 | args: 56 | file: Tele-AI/TELEVAL/livelihood_policy-northeastern_mandarin 57 | ref_col: answer 58 | query_col: query 59 | 60 | livelihood_policy-henan_dialect: 61 | class: src.dataset.BatchLoader 62 | args: 63 | file: Tele-AI/TELEVAL/livelihood_policy-henan_dialect 64 | ref_col: answer 65 | query_col: query 66 | 67 | livelihood_policy-cantonese: 68 | class: src.dataset.BatchLoader 69 | args: 70 | file: Tele-AI/TELEVAL/livelihood_policy-cantonese 71 | ref_col: answer 72 | query_col: query 73 | 74 | # -----------------chitchat dialect------------------ 75 | chitchat-sichuanese: 76 | class: src.dataset.BatchLoader 77 | args: 78 | file: Tele-AI/TELEVAL/chitchat-sichuanese 79 | ref_col: dialect 80 | query_col: query 81 | 82 | chitchat-shanghainese: 83 | class: src.dataset.BatchLoader 84 | args: 85 | file: Tele-AI/TELEVAL/chitchat-shanghainese 86 | ref_col: dialect 87 | query_col: query 88 | 89 | chitchat-northeastern_mandarin: 90 | class: src.dataset.BatchLoader 91 | args: 92 | file: Tele-AI/TELEVAL/chitchat-northeastern_mandarin 93 | ref_col: dialect 94 | query_col: query 95 | 96 | chitchat-henan_dialect: 97 | class: src.dataset.BatchLoader 98 | args: 99 | file: Tele-AI/TELEVAL/chitchat-henan_dialect 100 | ref_col: dialect 101 | query_col: query 102 | 103 | chitchat-cantonese: 104 | class: src.dataset.BatchLoader 105 | args: 106 | file: Tele-AI/TELEVAL/chitchat-cantonese 107 | ref_col: dialect 108 | query_col: query 109 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang 
Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /src/models/src_glm4/audio_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import librosa 3 | import soundfile as sf 4 | import numpy as np 5 | from pathlib import Path 6 | import io 7 | 8 | # Split audio stream at silence points to prevent playback stuttering issues 9 | # caused by AAC encoder frame padding when streaming audio through Gradio audio components. 
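# Minimal usage sketch (illustrative): feed streaming chunks in and emit a WAV segment whenever
# a long-enough silence is found, e.g.
#   processor = AudioStreamProcessor(sr=22050)
#   for i, chunk in enumerate(chunks):  # `chunks`: float32 numpy arrays at the processor's sample rate
#       wav_bytes = processor.process(chunk, last=(i == len(chunks) - 1))
#       if wav_bytes is not None:
#           sink(wav_bytes)  # hypothetical consumer, e.g. a Gradio streaming audio component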
10 | class AudioStreamProcessor: 11 | def __init__(self, sr=22050, min_silence_duration=0.1, threshold_db=-40): 12 | self.sr = sr 13 | self.min_silence_duration = min_silence_duration 14 | self.threshold_db = threshold_db 15 | self.buffer = np.array([]) 16 | 17 | 18 | def process(self, audio_data, last=False): 19 | """ 20 | Add audio data and process it 21 | params: 22 | audio_data: audio data in numpy array 23 | last: whether this is the last chunk of data 24 | returns: 25 | Processed audio data, returns None if no split point is found 26 | """ 27 | 28 | # Add new data to buffer 29 | self.buffer = np.concatenate([self.buffer, audio_data]) if len(self.buffer) > 0 else audio_data 30 | 31 | if last: 32 | result = self.buffer 33 | self.buffer = np.array([]) 34 | return self._to_wav_bytes(result) 35 | 36 | # Find silence boundary 37 | split_point = self._find_silence_boundary(self.buffer) 38 | 39 | if split_point is not None: 40 | # Modified: Extend split point to the end of silence 41 | silence_end = self._find_silence_end(split_point) 42 | result = self.buffer[:silence_end] 43 | self.buffer = self.buffer[silence_end:] 44 | return self._to_wav_bytes(result) 45 | 46 | return None 47 | 48 | def _find_silence_boundary(self, audio): 49 | """ 50 | Find the starting point of silence boundary in audio 51 | """ 52 | # Convert audio to decibels 53 | db = librosa.amplitude_to_db(np.abs(audio), ref=np.max) 54 | 55 | # Find points below threshold 56 | silence_points = np.where(db < self.threshold_db)[0] 57 | 58 | if len(silence_points) == 0: 59 | return None 60 | 61 | # Calculate minimum silence samples 62 | min_silence_samples = int(self.min_silence_duration * self.sr) 63 | 64 | # Search backwards for continuous silence segment starting point 65 | for i in range(len(silence_points) - min_silence_samples, -1, -1): 66 | if i < 0: 67 | break 68 | if np.all(np.diff(silence_points[i:i+min_silence_samples]) == 1): 69 | return silence_points[i] 70 | 71 | return None 72 | 73 | def _find_silence_end(self, start_point): 74 | """ 75 | Find the end point of silence segment 76 | """ 77 | db = librosa.amplitude_to_db(np.abs(self.buffer[start_point:]), ref=np.max) 78 | silence_points = np.where(db >= self.threshold_db)[0] 79 | 80 | if len(silence_points) == 0: 81 | return len(self.buffer) 82 | 83 | return start_point + silence_points[0] 84 | 85 | def _to_wav_bytes(self, audio_data): 86 | """ 87 | trans_to_wav_bytes 88 | """ 89 | wav_buffer = io.BytesIO() 90 | sf.write(wav_buffer, audio_data, self.sr, format='WAV') 91 | return wav_buffer.getvalue() 92 | 93 | 94 | -------------------------------------------------------------------------------- /src/evaluator/dialect.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import kaldifeat 3 | import logging 4 | import numpy as np 5 | import onnxruntime as ort 6 | from scipy.special import softmax 7 | 8 | from src.evaluator.base import Evaluator 9 | from src.utils import parallel_batch, preprocess_audio 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class DialectSession: 14 | """ 15 | from https://github.com/Tele-AI/TeleSpeech-DialectIdentify 16 | """ 17 | def __init__(self, onnx_file: str, device: str = "cpu"): 18 | self.session = self._init_session(onnx_file, device) 19 | self.mfcc_extractor = self._init_mfcc_extractor() 20 | self.sr = 16000 21 | self.DIALECT_TOKENS = { 22 | 0: "ct", 1: "kej", 2: "mand", 3: "min", 4: "wuy", 5: "zha", 6: "zhc", 23 | 7: "zhd", 8: "zhg", 9: "zhj", 10: "zhs", 11: "zhu", 
12: "zhw", 13: "zhx" 24 | } 25 | logger.info(f"Loading dialect classify model: {onnx_file} Successfully") 26 | 27 | def _init_session(self, onnx_file, device): 28 | sess_options = ort.SessionOptions() 29 | sess_options.intra_op_num_threads = 1 30 | sess_options.inter_op_num_threads = 1 31 | sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL 32 | sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL 33 | sess_options.add_session_config_entry("session.intra_op.allow_spinning", "1") 34 | provider = "CPUExecutionProvider" if device == "cpu" else "CUDAExecutionProvider" 35 | return ort.InferenceSession( 36 | onnx_file, 37 | providers=[provider], 38 | sess_options=sess_options, 39 | ) 40 | 41 | def _init_mfcc_extractor(self, sr: int = 16000): 42 | opts = kaldifeat.MfccOptions() 43 | opts.device = "cpu" 44 | opts.frame_opts.dither = 0 45 | opts.frame_opts.snip_edges = False 46 | opts.frame_opts.samp_freq = sr 47 | opts.use_energy = False 48 | opts.mel_opts.num_bins = 40 49 | opts.mel_opts.low_freq = 40 50 | opts.mel_opts.high_freq = -200 51 | opts.num_ceps = 40 52 | return kaldifeat.Mfcc(opts) 53 | 54 | def classify(self, wav_file: str) -> str: 55 | wav = preprocess_audio(wav_file, target_sr=self.sr) 56 | wav = wav * (1 << 15) 57 | feats = self.mfcc_extractor(wav.squeeze()) 58 | out = self.session.run( 59 | input_feed={"feats": feats.unsqueeze(0).numpy()}, 60 | output_names=["labels"] 61 | )[0] 62 | pred = np.argmax(softmax(out, axis=1)) 63 | return self.DIALECT_TOKENS[int(pred)] 64 | 65 | class DialectClassify(Evaluator): 66 | def __init__(self, model: str, max_workers=None): 67 | if max_workers is not None: 68 | self.max_workers = max_workers 69 | self.onnx_sess = DialectSession(model) 70 | self.dialect_mapping = { 71 | "ct": "粤语", "zhs": "河南话", "zhc": "四川话", 72 | "zhd": "东北话", "wuy": "上海话", "mand": "普通话" 73 | } 74 | 75 | @parallel_batch(default_workers=4) 76 | def evaluate(self, pred: str, ref: str, pred_info: Dict, **kwargs): 77 | pred_audio = pred_info["pred_audio"] 78 | res = self.onnx_sess.classify(pred_audio) 79 | mapped_dialect = self.dialect_mapping.get(res, None) 80 | logger.info(f"key: {pred_info['key']} recognition dialect: {mapped_dialect}") 81 | 82 | score = int(mapped_dialect == ref) if mapped_dialect else 0 83 | return {"key": pred_info["key"], "score": score} -------------------------------------------------------------------------------- /src/models/qwen.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, Any 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from src.models.base import Model 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | class Qwen2Instruct(Model): 9 | def __init__(self, path: str, sample_params: Dict[str, Any] = None): 10 | super().__init__(sample_params) 11 | logger.info("start load model from {}".format(path)) 12 | self.model = AutoModelForCausalLM.from_pretrained( 13 | path, 14 | torch_dtype="auto", 15 | device_map="auto", 16 | ).eval() 17 | logger.info("successfully load model from {}".format(path)) 18 | 19 | self.tokenizer = AutoTokenizer.from_pretrained(path) 20 | config = { 21 | "greedy": { 22 | "do_sample": False, 23 | "max_new_tokens": 1024, 24 | "top_k": None, 25 | "num_beams": 1, 26 | "temperature": None, 27 | "top_p": None 28 | } 29 | } 30 | self.generation_config = config.get(self.sample_params.get("gen_type", "greedy"), None) 31 | logger.info("generation_config: {}".format(self.generation_config)) 32 | 
self.system_prompt_qwen2 = "You are a helpful assistant." 33 | self.system_prompt_qwen2d5 = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." 34 | 35 | def generate_once(self, audio, **kwargs): 36 | system_prompt = self.system_prompt_qwen2d5 37 | content = kwargs.get("instruct", "") + kwargs["query"] 38 | 39 | messages = [ 40 | {"role": "system", "content": system_prompt}, 41 | {"role": "user", "content": content} 42 | ] 43 | 44 | text = self.tokenizer.apply_chat_template( 45 | messages, 46 | tokenize=False, 47 | add_generation_prompt=True 48 | ) 49 | model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) 50 | 51 | generated_ids = self.model.generate( 52 | **model_inputs, 53 | **self.generation_config 54 | ) 55 | generated_ids = [ 56 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 57 | ] 58 | response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 59 | return {"pred": response} 60 | 61 | class Qwen3Instruct(Qwen2Instruct): 62 | def __init__(self, path: str, sample_params: Dict[str, Any] = None): 63 | # transformers>=4.51.0 64 | super().__init__(path, sample_params) 65 | 66 | def generate_once(self, audio, **kwargs): 67 | content = kwargs.get("instruct", "") + kwargs["query"] 68 | 69 | messages = [ 70 | {"role": "user", "content": content} 71 | ] 72 | 73 | text = self.tokenizer.apply_chat_template( 74 | messages, 75 | tokenize=False, 76 | add_generation_prompt=True, 77 | enable_thinking=False # Switches between thinking and non-thinking modes. Default is True. 78 | ) 79 | model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) 80 | 81 | generated_ids = self.model.generate( 82 | **model_inputs, 83 | **self.generation_config 84 | ) 85 | output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 86 | response = self.tokenizer.decode(output_ids, skip_special_tokens=True) 87 | return {"pred": response} -------------------------------------------------------------------------------- /src/summarizer/summarizer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, List, Union, Any 3 | from collections import Counter 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | class Summarizer: 8 | def __init__(self, rescale="base", power=2): 9 | rescale_map = { 10 | "base": lambda x: x, 11 | "linear": self.linear_rescale, 12 | "power": lambda x: self.power_rescale(x, power=power) 13 | } 14 | logger.info(f"Using rescale type: {rescale}") 15 | self.rescale_func = rescale_map[rescale] 16 | 17 | def _check_scores(self, scores: List[Any]): 18 | if any(s is None for s in scores if not isinstance(s, dict)): 19 | raise ValueError("Scores list contains None values, need re-run evaluator.") 20 | 21 | @staticmethod 22 | def linear_rescale(score): 23 | return score * 20 24 | 25 | @staticmethod 26 | def power_rescale(score, power): 27 | return ((score / 5) ** power) * 100 28 | 29 | # @staticmethod 30 | # def sigmoid_rescale(score, scale=10): 31 | # x = (score - 2.5) # move to 3 32 | # return (1 / (1 + np.exp(-scale * x / 5))) * 100 33 | 34 | def statistic(self, scores: List[Any], **kwargs) -> Dict[str, Any]: 35 | raise NotImplementedError 36 | 37 | class AvgInfo(Summarizer): 38 | def statistic(self, scores: List[Union[float, Dict[str, float]]], **kwargs): 39 | if isinstance(scores[0], dict): 40 | keys = scores[0].keys() 41 | result = {} 42 | for key in keys: 43 | values = 
[float(s[key]) for s in scores if key in s] 44 | avg = sum(values) / len(values) * 100 45 | result[key] = "{}: {:.2f}%".format(key, avg) 46 | return result 47 | 48 | # common 49 | avg = sum(map(float, scores)) / len(scores) * 100 50 | return {"score": "AVG: {:.2f}%".format(avg)} 51 | 52 | class AvgThreshold(Summarizer): 53 | def __init__(self, rescale, threshold=60, power=2): 54 | super().__init__(rescale, power) 55 | self.threshold = threshold 56 | 57 | def statistic(self, scores: List[float], **kwargs): 58 | self._check_scores(scores) 59 | scores = list(map(lambda x: self.rescale_func(float(x)), scores)) 60 | score_count = Counter(scores) 61 | 62 | avg = sum(scores) / len(scores) 63 | above_threshold = sum(count for score, count in score_count.items() if score > self.threshold) 64 | return {"score": "AVG: {:.2f}".format(avg), "above_threshold": "above{}: {}".format(self.threshold, above_threshold)} 65 | 66 | class AvgMOS(Summarizer): 67 | def statistic(self, scores: List[float], **kwargs): 68 | avg = sum(map(float, scores)) / len(scores) 69 | return {"score": "DNSMOS: {:.2f}".format(avg)} 70 | 71 | class AvgWER(Summarizer): 72 | def statistic(self, scores: List[Dict], **kwargs): 73 | """ 74 | score = {"ref_len": ref_len, "subs": subs, "dele": dele, "inse": inse, "wer": wer} 75 | """ 76 | total_ref_len = 0 77 | total_subs = 0.0 78 | total_dele = 0.0 79 | total_inse = 0.0 80 | 81 | for score in scores: 82 | total_ref_len += score.get("ref_len", 0) 83 | total_subs += score.get("subs", 0.0) 84 | total_dele += score.get("dele", 0.0) 85 | total_inse += score.get("inse", 0.0) 86 | 87 | if total_ref_len == 0: 88 | raise ValueError("Not enough ref_len to static") 89 | 90 | avg_wer = (total_subs + total_dele + total_inse) / total_ref_len * 100 91 | return {"score": "WER: {:.2f}%".format(avg_wer)} -------------------------------------------------------------------------------- /src/models/src_freezeomni/encoder/cmvn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import math 4 | 5 | import numpy as np 6 | 7 | class GlobalCMVN(torch.nn.Module): 8 | def __init__(self, 9 | mean: torch.Tensor, 10 | istd: torch.Tensor, 11 | norm_var: bool = True): 12 | """ 13 | Args: 14 | mean (torch.Tensor): mean stats 15 | istd (torch.Tensor): inverse std, std which is 1.0 / std 16 | """ 17 | super().__init__() 18 | assert mean.shape == istd.shape 19 | self.norm_var = norm_var 20 | # The buffer can be accessed from this module using self.mean 21 | self.register_buffer("mean", mean) 22 | self.register_buffer("istd", istd) 23 | 24 | def forward(self, x: torch.Tensor): 25 | """ 26 | Args: 27 | x (torch.Tensor): (batch, max_len, feat_dim) 28 | 29 | Returns: 30 | (torch.Tensor): normalized feature 31 | """ 32 | x = x - self.mean 33 | if self.norm_var: 34 | x = x * self.istd 35 | return x 36 | 37 | def _load_json_cmvn(json_cmvn_file): 38 | """ Load the json format cmvn stats file and calculate cmvn 39 | 40 | Args: 41 | json_cmvn_file: cmvn stats file in json format 42 | 43 | Returns: 44 | a numpy array of [means, vars] 45 | """ 46 | with open(json_cmvn_file) as f: 47 | cmvn_stats = json.load(f) 48 | 49 | means = cmvn_stats['mean_stat'] 50 | variance = cmvn_stats['var_stat'] 51 | count = cmvn_stats['frame_num'] 52 | for i in range(len(means)): 53 | means[i] /= count 54 | variance[i] = variance[i] / count - means[i] * means[i] 55 | if variance[i] < 1.0e-20: 56 | variance[i] = 1.0e-20 57 | variance[i] = 1.0 / math.sqrt(variance[i]) 58 | cmvn = 
np.array([means, variance]) 59 | return cmvn 60 | 61 | def _load_kaldi_cmvn(kaldi_cmvn_file): 62 | """ Load the kaldi format cmvn stats file and calculate cmvn 63 | 64 | Args: 65 | kaldi_cmvn_file: kaldi text style global cmvn file, which 66 | is generated by: 67 | compute-cmvn-stats --binary=false scp:feats.scp global_cmvn 68 | 69 | Returns: 70 | a numpy array of [means, vars] 71 | """ 72 | means = [] 73 | variance = [] 74 | with open(kaldi_cmvn_file, 'r') as fid: 75 | # kaldi binary file start with '\0B' 76 | if fid.read(2) == '\0B': 77 | print('kaldi cmvn binary file is not supported, please ' 78 | 'recompute it by: compute-cmvn-stats --binary=false ' 79 | ' scp:feats.scp global_cmvn') 80 | sys.exit(1) 81 | fid.seek(0) 82 | arr = fid.read().split() 83 | assert (arr[0] == '[') 84 | assert (arr[-2] == '0') 85 | assert (arr[-1] == ']') 86 | feat_dim = int((len(arr) - 2 - 2) / 2) 87 | for i in range(1, feat_dim + 1): 88 | means.append(float(arr[i])) 89 | count = float(arr[feat_dim + 1]) 90 | for i in range(feat_dim + 2, 2 * feat_dim + 2): 91 | variance.append(float(arr[i])) 92 | 93 | for i in range(len(means)): 94 | means[i] /= count 95 | variance[i] = variance[i] / count - means[i] * means[i] 96 | if variance[i] < 1.0e-20: 97 | variance[i] = 1.0e-20 98 | variance[i] = 1.0 / math.sqrt(variance[i]) 99 | cmvn = np.array([means, variance]) 100 | return cmvn 101 | 102 | def load_cmvn(cmvn_file, is_json): 103 | if is_json: 104 | cmvn = _load_json_cmvn(cmvn_file) 105 | else: 106 | cmvn = _load_kaldi_cmvn(cmvn_file) 107 | return cmvn[0], cmvn[1] 108 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/bigvgan_wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | 5 | import librosa 6 | import torch 7 | 8 | from .vocoder.bigvgan import BigVGAN 9 | from .vocoder.utils import get_melspec, AttrDict, load_checkpoint 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class BigVGANWrapper: 15 | def __init__( 16 | self, vocoder: BigVGAN, device: torch.device, h: AttrDict, dtype=None 17 | ) -> None: 18 | self.vocoder = vocoder.to(device) 19 | if dtype is not None: 20 | self.vocoder = self.vocoder.to(dtype) 21 | self.vocoder = self.vocoder.eval() 22 | self.device = device 23 | self.h = h 24 | 25 | def to_dtype(self, dtype): 26 | self.vocoder = self.vocoder.to(dtype) 27 | 28 | def extract_mel_from_wav(self, wav_path=None, wav_data=None): 29 | """ 30 | params: 31 | wav_path: str, path of the wav, should be 24k 32 | wav_data: torch.tensor or numpy array, shape [T], wav data, should be 24k 33 | return: 34 | mel: [T, num_mels], torch.tensor 35 | """ 36 | if wav_data is None: 37 | wav_data, _ = librosa.load(wav_path, sr=self.h["sampling_rate"]) 38 | 39 | wav_data = torch.tensor(wav_data).unsqueeze(0) 40 | 41 | mel = get_melspec( 42 | y=wav_data, 43 | n_fft=self.h["n_fft"], 44 | num_mels=self.h["num_mels"], 45 | sampling_rate=self.h["sampling_rate"], 46 | hop_size=self.h["hop_size"], 47 | win_size=self.h["win_size"], 48 | fmin=self.h["fmin"], 49 | fmax=self.h["fmax"], 50 | ) 51 | return mel.squeeze(0).transpose(0, 1) 52 | 53 | @torch.inference_mode() 54 | def extract_mel_from_wav_batch(self, wav_data): 55 | """ 56 | params: 57 | wav_data: torch.tensor or numpy array, shape [Batch, T], wav data, should be 24k 58 | return: 59 | mel: [Batch, T, num_mels], torch.tensor 60 | """ 61 | 62 | wav_data = torch.tensor(wav_data) 63 | 64 | mel = 
get_melspec(
65 | y=wav_data,  # get_melspec takes the waveform as its `y` argument
66 | n_fft=self.h["n_fft"],
67 | num_mels=self.h["num_mels"],
68 | sampling_rate=self.h["sampling_rate"],
69 | hop_size=self.h["hop_size"],
70 | win_size=self.h["win_size"],
71 | fmin=self.h["fmin"],
72 | fmax=self.h["fmax"],
73 | )
74 | return mel.transpose(1, 2)
75 |
76 | def decode_mel(self, mel):
77 | """
78 | params:
79 | mel: [T, num_mels], torch.tensor
80 | return:
81 | wav: [1, T], torch.tensor
82 | """
83 | mel = mel.transpose(0, 1).unsqueeze(0).to(self.device)
84 | wav = self.vocoder(mel)
85 | return wav.squeeze(0)
86 |
87 | def decode_mel_batch(self, mel):
88 | """
89 | params:
90 | mel: [B, T, num_mels], torch.tensor
91 | return:
92 | wav: [B, 1, T], torch.tensor
93 | """
94 | mel = mel.transpose(1, 2).to(self.device)
95 | wav = self.vocoder(mel)
96 | return wav
97 |
98 | @classmethod
99 | def from_pretrained(cls, model_config, ckpt_path, device):
100 | with open(model_config) as f:
101 | data = f.read()
102 | json_config = json.loads(data)
103 | h = AttrDict(json_config)
104 | vocoder = BigVGAN(h, True)
105 | state_dict_g = load_checkpoint(ckpt_path, "cpu")
106 | vocoder.load_state_dict(state_dict_g["generator"])
107 |
108 | logger.info(">>> Load vocoder from {}".format(ckpt_path))
109 | return cls(vocoder, device, h)
110 |
-------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/transformer/label_smoothing_loss.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2019 Shigeki Karita
2 | # 2020 Mobvoi Inc (Binbin Zhang)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Label smoothing module."""
16 |
17 | import torch
18 | from torch import nn
19 |
20 |
21 | class LabelSmoothingLoss(nn.Module):
22 | """Label-smoothing loss.
23 |
24 | In a standard CE loss, the label's data distribution is:
25 | [0,1,2] ->
26 | [
27 | [1.0, 0.0, 0.0],
28 | [0.0, 1.0, 0.0],
29 | [0.0, 0.0, 1.0],
30 | ]
31 |
32 | In the smoothed version of the CE loss, some probability mass
33 | is taken from the true label prob (1.0) and divided
34 | among the other labels.
35 |
36 | e.g.
37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Unility functions for Transformer.""" 17 | 18 | from typing import List 19 | 20 | import torch 21 | 22 | IGNORE_ID = -1 23 | 24 | 25 | def pad_list(xs: List[torch.Tensor], pad_value: int): 26 | """Perform padding for the list of tensors. 27 | 28 | Args: 29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 30 | pad_value (float): Value for padding. 
31 | 32 | Returns: 33 | Tensor: Padded tensor (B, Tmax, `*`). 34 | 35 | Examples: 36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 37 | >>> x 38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 39 | >>> pad_list(x, 0) 40 | tensor([[1., 1., 1., 1.], 41 | [1., 1., 0., 0.], 42 | [1., 0., 0., 0.]]) 43 | 44 | """ 45 | max_len = max([len(item) for item in xs]) 46 | batchs = len(xs) 47 | ndim = xs[0].ndim 48 | if ndim == 1: 49 | pad_res = torch.zeros(batchs, 50 | max_len, 51 | dtype=xs[0].dtype, 52 | device=xs[0].device) 53 | elif ndim == 2: 54 | pad_res = torch.zeros(batchs, 55 | max_len, 56 | xs[0].shape[1], 57 | dtype=xs[0].dtype, 58 | device=xs[0].device) 59 | elif ndim == 3: 60 | pad_res = torch.zeros(batchs, 61 | max_len, 62 | xs[0].shape[1], 63 | xs[0].shape[2], 64 | dtype=xs[0].dtype, 65 | device=xs[0].device) 66 | else: 67 | raise ValueError(f"Unsupported ndim: {ndim}") 68 | pad_res.fill_(pad_value) 69 | for i in range(batchs): 70 | pad_res[i, :len(xs[i])] = xs[i] 71 | return pad_res 72 | 73 | 74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 75 | ignore_label: int) -> torch.Tensor: 76 | """Calculate accuracy. 77 | 78 | Args: 79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 80 | pad_targets (LongTensor): Target label tensors (B, Lmax). 81 | ignore_label (int): Ignore label id. 82 | 83 | Returns: 84 | torch.Tensor: Accuracy value (0.0 - 1.0). 85 | 86 | """ 87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 88 | pad_outputs.size(1)).argmax(2) 89 | mask = pad_targets != ignore_label 90 | numerator = torch.sum( 91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 92 | denominator = torch.sum(mask) 93 | return (numerator / denominator).detach() 94 | 95 | 96 | def get_padding(kernel_size, dilation=1): 97 | return int((kernel_size * dilation - dilation) / 2) 98 | 99 | 100 | def init_weights(m, mean=0.0, std=0.01): 101 | classname = m.__class__.__name__ 102 | if classname.find("Conv") != -1: 103 | m.weight.data.normal_(mean, std) 104 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/utils.py: -------------------------------------------------------------------------------- 1 | from librosa.filters import mel as librosa_mel_fn 2 | import torch 3 | import os 4 | 5 | mel_basis_cache = {} 6 | hann_window_cache = {} 7 | 8 | 9 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 10 | return torch.log(torch.clamp(x, min=clip_val) * C) 11 | 12 | 13 | def spectral_normalize_torch(magnitudes): 14 | return dynamic_range_compression_torch(magnitudes) 15 | 16 | 17 | def get_melspec( 18 | y: torch.Tensor, 19 | n_fft: int, 20 | num_mels: int, 21 | sampling_rate: int, 22 | hop_size: int, 23 | win_size: int, 24 | fmin: int, 25 | fmax: int = None, 26 | center: bool = False, 27 | ) -> torch.Tensor: 28 | """ 29 | Calculate the mel spectrogram of an input signal. 30 | This function uses slaney norm for the librosa mel filterbank (using librosa.filters.mel) and uses Hann window for STFT (using torch.stft). 31 | 32 | Args: 33 | y (torch.Tensor): Input signal. 34 | n_fft (int): FFT size. 35 | num_mels (int): Number of mel bins. 36 | sampling_rate (int): Sampling rate of the input signal. 37 | hop_size (int): Hop size for STFT. 38 | win_size (int): Window size for STFT. 39 | fmin (int): Minimum frequency for mel filterbank. 40 | fmax (int): Maximum frequency for mel filterbank. 
If None, defaults to half the sampling rate (fmax = sr / 2.0) inside librosa_mel_fn 41 | center (bool): Whether to pad the input to center the frames. Default is False. 42 | 43 | Returns: 44 | torch.Tensor: Mel spectrogram. 45 | """ 46 | if torch.min(y) < -1.0: 47 | print(f"[WARNING] Min value of input waveform signal is {torch.min(y)}") 48 | if torch.max(y) > 1.0: 49 | print(f"[WARNING] Max value of input waveform signal is {torch.max(y)}") 50 | 51 | device = y.device 52 | key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}" 53 | 54 | if key not in mel_basis_cache: 55 | mel = librosa_mel_fn( 56 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 57 | ) 58 | mel_basis_cache[key] = torch.from_numpy(mel).float().to(device) 59 | hann_window_cache[key] = torch.hann_window(win_size).to(device) 60 | 61 | mel_basis = mel_basis_cache[key] 62 | hann_window = hann_window_cache[key] 63 | 64 | padding = (n_fft - hop_size) // 2 65 | y = torch.nn.functional.pad( 66 | y.unsqueeze(1), (padding, padding), mode="reflect" 67 | ).squeeze(1) 68 | 69 | spec = torch.stft( 70 | y, 71 | n_fft, 72 | hop_length=hop_size, 73 | win_length=win_size, 74 | window=hann_window, 75 | center=center, 76 | pad_mode="reflect", 77 | normalized=False, 78 | onesided=True, 79 | return_complex=True, 80 | ) 81 | spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9) 82 | 83 | mel_spec = torch.matmul(mel_basis, spec) 84 | mel_spec = spectral_normalize_torch(mel_spec) 85 | 86 | return mel_spec 87 | 88 | 89 | class AttrDict(dict): 90 | def __init__(self, *args, **kwargs): 91 | super(AttrDict, self).__init__(*args, **kwargs) 92 | self.__dict__ = self 93 | 94 | 95 | def load_checkpoint(filepath, device): 96 | assert os.path.isfile(filepath) 97 | print(f"Loading '{filepath}'") 98 | checkpoint_dict = torch.load(filepath, map_location=device, weights_only=True) 99 | print("Complete.") 100 | return checkpoint_dict 101 | 102 | 103 | def init_weights(m, mean=0.0, std=0.01): 104 | classname = m.__class__.__name__ 105 | if classname.find("Conv") != -1: 106 | m.weight.data.normal_(mean, std) 107 | 108 | 109 | def get_padding(kernel_size, dilation=1): 110 | return int((kernel_size * dilation - dilation) / 2) 111 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if "sinc" in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 
19 | """ 20 | return torch.where( 21 | x == 0, 22 | torch.tensor(1.0, device=x.device, dtype=x.dtype), 23 | torch.sin(math.pi * x) / math.pi / x, 24 | ) 25 | 26 | 27 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 28 | # https://adefossez.github.io/julius/julius/lowpass.html 29 | # LICENSE is in incl_licenses directory. 30 | def kaiser_sinc_filter1d( 31 | cutoff, half_width, kernel_size 32 | ): # return filter [1,1,kernel_size] 33 | even = kernel_size % 2 == 0 34 | half_size = kernel_size // 2 35 | 36 | # For kaiser window 37 | delta_f = 4 * half_width 38 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 39 | if A > 50.0: 40 | beta = 0.1102 * (A - 8.7) 41 | elif A >= 21.0: 42 | beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0) 43 | else: 44 | beta = 0.0 45 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 46 | 47 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 48 | if even: 49 | time = torch.arange(-half_size, half_size) + 0.5 50 | else: 51 | time = torch.arange(kernel_size) - half_size 52 | if cutoff == 0: 53 | filter_ = torch.zeros_like(time) 54 | else: 55 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 56 | """ 57 | Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal. 58 | """ 59 | filter_ /= filter_.sum() 60 | filter = filter_.view(1, 1, kernel_size) 61 | 62 | return filter 63 | 64 | 65 | class LowPassFilter1d(nn.Module): 66 | def __init__( 67 | self, 68 | cutoff=0.5, 69 | half_width=0.6, 70 | stride: int = 1, 71 | padding: bool = True, 72 | padding_mode: str = "replicate", 73 | kernel_size: int = 12, 74 | ): 75 | """ 76 | kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible. 
77 | """
78 | super().__init__()
79 | if cutoff < -0.0:
80 | raise ValueError("Minimum cutoff must be larger than zero.")
81 | if cutoff > 0.5:
82 | raise ValueError("A cutoff above 0.5 does not make sense.")
83 | self.kernel_size = kernel_size
84 | self.even = kernel_size % 2 == 0
85 | self.pad_left = kernel_size // 2 - int(self.even)
86 | self.pad_right = kernel_size // 2
87 | self.stride = stride
88 | self.padding = padding
89 | self.padding_mode = padding_mode
90 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
91 | self.register_buffer("filter", filter)
92 |
93 | # Input [B, C, T]
94 | def forward(self, x):
95 | _, C, _ = x.shape
96 |
97 | if self.padding:
98 | x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
99 | out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
100 |
101 | return out
102 |
-------------------------------------------------------------------------------- /run.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export PYTHONPATH=$PWD:$PYTHONPATH
3 | export CUDA_VISIBLE_DEVICES=0,1
4 |
5 | max_memory=400
6 | save_dir="res/test"
7 |
8 | stage=1
9 | stop_stage=2
10 | eval_bsz=1
11 | save_pred_audio=False
12 |
13 | aqa_tasks="aqa-llamaqa-en,aqa-llamaqa-zh,aqa-triviaqa-en,aqa-triviaqa-zh,aqa-webq-en,aqa-webq-zh,aqa-chinesesimpleqa-zh,aqa-chinese_quiz-zh"
14 | choice_tasks="choice-agieval-zh,choice-ceval-zh"
15 | aqa_dialect_tasks="aqa-chinese_quiz-sichuanese,aqa-chinese_quiz-shanghainese,aqa-chinese_quiz-northeastern_mandarin,aqa-chinese_quiz-henan_dialect,aqa-chinese_quiz-cantonese"
16 | chitchat_dialect_tasks="follow-chitchat-sichuanese,follow-chitchat-shanghainese,follow-chitchat-northeastern_mandarin,follow-chitchat-henan_dialect,follow-chitchat-cantonese"
17 |
18 | down_tasks="aqa-livelihood_policy-zh,aqa-livelihood_policy-sichuanese,aqa-livelihood_policy-shanghainese,aqa-livelihood_policy-northeastern_mandarin,aqa-livelihood_policy-henan_dialect,aqa-livelihood_policy-cantonese"
19 | noise_tasks="aqa-babble_noise-zh,aqa-white_noise-zh,aqa-distortion-zh,aqa-single_background_speaker-zh,aqa-multi_background_speakers-zh,aqa-lowpass_filtering-zh,aqa-packet_loss-zh,aqa-reverberation_RT60-zh,aqa-complex_environments-zh,aqa-complex_environments_reverb-zh,aqa-different_distance-zh"
20 | multiturn_tasks="multiturn-memory-zh"
21 | para_tasks="aqa-para_mix300-zh"
22 | llm_judge_tasks="emotion-esd,aed-audio-instruct,acceptance-human-zh,chitchat-human-zh,care-age-zh"
23 |
24 | declare -A model_tasks
25 | model_tasks=(
26 | ["MiniCPMo2_6-audio"]="$aqa_tasks,$aqa_dialect_tasks"
27 | ["baichuan_omni_1d5"]="$aqa_tasks,$aqa_dialect_tasks"
28 | ["llama_omni"]="$aqa_tasks,$aqa_dialect_tasks"
29 | ["speechgpt2"]="$aqa_tasks,$aqa_dialect_tasks"
30 | ["freeze_omni"]="$aqa_tasks,$para_tasks,$aqa_dialect_tasks"
31 | ["glm-4-voice-9b"]="$aqa_tasks,$aqa_dialect_tasks"
32 | ["kimi-audio-7b-instruct"]="$aqa_tasks,$aqa_dialect_tasks"
33 | ["qwen2_5_omni"]="$aqa_tasks,$aqa_dialect_tasks"
34 | )
35 |
36 | gpu_list=($(echo $CUDA_VISIBLE_DEVICES | tr ',' ' '))
37 | gpu_counts=${#gpu_list[@]}
38 |
39 | get_free_gpu() {
40 | while true; do
41 | for gpu in "${gpu_list[@]}"; do
42 | used_mem=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "NR==$((gpu+1))")
43 | if [[ "$used_mem" -lt "$max_memory" ]]; then
44 | echo "$gpu"
45 | return
46 | fi
47 | done
48 | sleep 30
49 | done
50 | }
51 |
52 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
53 | for model in
"${!model_tasks[@]}"; do 54 | IFS=',' read -r -a values <<< "${model_tasks[$model]}" 55 | for task in "${values[@]}"; do 56 | gpu=$(get_free_gpu) 57 | echo "***********************************************" 58 | echo "processing model: $model using task: $task on GPU: $gpu" 59 | echo "***********************************************" 60 | CUDA_VISIBLE_DEVICES=$gpu python main.py \ 61 | --mode "infer" \ 62 | --task $task \ 63 | --save_dir $save_dir \ 64 | --save_pred_audio $save_pred_audio \ 65 | --model $model & 66 | sleep 40 # Increase sleep time appropriately according to the speed of loading the model 67 | done 68 | done 69 | wait 70 | fi 71 | 72 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 73 | for model in "${!model_tasks[@]}"; do 74 | IFS=',' read -r -a values <<< "${model_tasks[$model]}" 75 | for task in "${values[@]}"; do 76 | python main.py \ 77 | --mode "eval" \ 78 | --save_dir $save_dir \ 79 | --save_pred_audio $save_pred_audio \ 80 | --model $model \ 81 | --bsz $eval_bsz \ 82 | --task $task 83 | done 84 | done 85 | wait 86 | python tools/save_csv.py --root_dir $save_dir 87 | fi -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/moshi_modules/resample.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | 3 | from einops import rearrange 4 | import torch 5 | from torch import nn 6 | 7 | from .conv import StreamingConv1d, StreamingConvTranspose1d 8 | 9 | 10 | class ConvDownsample1d(nn.Module): 11 | """ 12 | Downsampling by some integer amount `stride` using convolutions 13 | with a kernel size of twice the stride. 14 | If `causal` is True, the output uses a causal convolution. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | stride: int, 20 | dimension: tp.Optional[int] = None, 21 | causal: bool = False, 22 | learnt: bool = False, 23 | channel_wise: bool = False, 24 | ): 25 | super().__init__() 26 | self.learnt = learnt 27 | self.channel_wise = channel_wise 28 | groups = 1 29 | if learnt: 30 | assert dimension is not None, "Dimension required for learnt convolutions." 31 | in_channels = dimension 32 | out_channels = dimension 33 | if channel_wise: 34 | groups = dimension 35 | else: 36 | in_channels = 1 37 | out_channels = 1 38 | 39 | self.conv = StreamingConv1d( 40 | in_channels, 41 | out_channels, 42 | kernel_size=2 * stride, 43 | stride=stride, 44 | causal=causal, 45 | groups=groups, 46 | bias=False, 47 | pad_mode="replicate", 48 | ) 49 | if not learnt: 50 | actual_conv = self.conv.conv.conv 51 | actual_conv.weight.requires_grad_(False) 52 | actual_conv.weight.data.fill_(1.0 / (2 * stride)) 53 | 54 | def forward(self, x: torch.Tensor): 55 | batch_size = len(x) 56 | if not self.learnt: 57 | x = rearrange(x, "b c t -> (b c) () t") 58 | y = self.conv(x) 59 | if not self.learnt: 60 | y = rearrange(y, "(b c) () t -> b c t", b=batch_size) 61 | return y 62 | 63 | 64 | class ConvTrUpsample1d(nn.Module): 65 | """ 66 | Upsample by some integer amount `stride` using transposed convolutions. 67 | """ 68 | 69 | def __init__( 70 | self, 71 | stride: int, 72 | dimension: tp.Optional[int] = None, 73 | causal: bool = False, 74 | learnt: bool = False, 75 | channel_wise: bool = False, 76 | ): 77 | super().__init__() 78 | self.learnt = learnt 79 | self.channel_wise = channel_wise 80 | groups = 1 81 | if learnt: 82 | assert dimension is not None, "Dimension required for learnt convolutions." 
83 | in_channels = dimension 84 | out_channels = dimension 85 | if channel_wise: 86 | groups = dimension 87 | else: 88 | in_channels = 1 89 | out_channels = 1 90 | 91 | self.convtr = StreamingConvTranspose1d( 92 | in_channels, 93 | out_channels, 94 | kernel_size=2 * stride, 95 | stride=stride, 96 | causal=causal, 97 | groups=groups, 98 | bias=False, 99 | ) 100 | if not learnt: 101 | actual_convtr = self.convtr.convtr.convtr 102 | actual_convtr.weight.requires_grad_(False) 103 | actual_convtr.weight.data.fill_(1.0) 104 | 105 | def forward(self, x: torch.Tensor): 106 | batch_size = len(x) 107 | if not self.learnt: 108 | x = rearrange(x, "b c t -> (b c) () t") 109 | y = self.convtr(x) 110 | if not self.learnt: 111 | x_for_normalization = torch.ones_like(x[:1]) 112 | normalization = self.convtr(x_for_normalization) 113 | y = y / normalization 114 | y = rearrange(y, "(b c) () t -> b c t", b=batch_size) 115 | return y 116 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/residual_block.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Residual block modules.""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .conv_layers import NonCausalConv1d, CausalConv1d 10 | 11 | class HiFiGANResidualBlock(nn.Module): 12 | """Causal Residual block module in HiFiGAN.""" 13 | 14 | def __init__( 15 | self, 16 | mode, 17 | kernel_size=3, 18 | channels=512, 19 | dilations=(1, 3, 5), 20 | groups=1, 21 | bias=True, 22 | use_additional_convs=True, 23 | nonlinear_activation="LeakyReLU", 24 | nonlinear_activation_params={"negative_slope": 0.1} 25 | ): 26 | """Initialize CausalResidualBlock module. 27 | 28 | Args: 29 | kernel_size (int): Kernel size of dilation convolution layer. 30 | channels (int): Number of channels for convolution layer. 31 | dilations (List[int]): List of dilation factors. 32 | use_additional_convs (bool): Whether to use additional convolution layers. 33 | groups (int): The group number of conv1d (default: 1) 34 | bias (bool): Whether to add bias parameter in convolution layers. 35 | nonlinear_activation (str): Activation function module name. 36 | nonlinear_activation_params (dict): Hyperparameters for activation function. 37 | 38 | """ 39 | super().__init__() 40 | self.mode = mode 41 | if self.mode == 'noncausal': 42 | Conv1d = NonCausalConv1d 43 | elif self.mode == 'causal': 44 | Conv1d = CausalConv1d 45 | else: 46 | raise NotImplementedError(f"Mode ({self.mode}) is not supported!") 47 | 48 | self.use_additional_convs = use_additional_convs 49 | self.convs1 = nn.ModuleList() 50 | if use_additional_convs: 51 | self.convs2 = nn.ModuleList() 52 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 53 | self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 54 | for dilation in dilations: 55 | self.convs1 += [ 56 | Conv1d( 57 | in_channels=channels, 58 | out_channels=channels, 59 | kernel_size=kernel_size, 60 | stride=1, 61 | dilation=dilation, 62 | groups=groups, 63 | bias=bias, 64 | ) 65 | ] 66 | if use_additional_convs: 67 | self.convs2 += [ 68 | Conv1d( 69 | in_channels=channels, 70 | out_channels=channels, 71 | kernel_size=kernel_size, 72 | stride=1, 73 | dilation=1, 74 | groups=groups, 75 | bias=bias, 76 | ) 77 | ] 78 | self.num_layer = len(self.convs1) 79 | 80 | def forward(self, x): 81 | """Calculate forward propagation. 
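        For each dilation, the block computes xt = convs1[idx](activation(x)), then
        xt = convs2[idx](activation(xt)) when use_additional_convs is True, and updates
        x = x + xt, so the channel and time dimensions are preserved.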
82 | 83 | Args: 84 | x (Tensor): Input tensor (B, channels, T). 85 | 86 | Returns: 87 | Tensor: Output tensor (B, channels, T). 88 | 89 | """ 90 | for idx in range(self.num_layer): 91 | xt = self.convs1[idx](self.activation(x)) 92 | if self.use_additional_convs: 93 | xt = self.convs2[idx](self.activation(xt)) 94 | x = xt + x 95 | return x 96 | 97 | def inference(self, x): 98 | for idx in range(self.num_layer): 99 | xt = self.convs1[idx].inference(self.activation(x)) 100 | if self.use_additional_convs: 101 | xt = self.convs2[idx].inference(self.activation(xt)) 102 | x = xt + x 103 | return x 104 | -------------------------------------------------------------------------------- /src/models/src_glm4/speech_tokenizer/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import glob 4 | import math 5 | import tarfile 6 | import torch 7 | import torchaudio 8 | import safetensors 9 | from .configuration_whisper import WhisperVQConfig 10 | from .modeling_whisper import WhisperVQEncoder, WhisperVQForConditionalGeneration 11 | from transformers import WhisperFeatureExtractor, WhisperTokenizerFast 12 | 13 | 14 | def load_quantize_encoder(model_path): 15 | config = WhisperVQConfig.from_pretrained(model_path) 16 | config.quantize_encoder_only = True 17 | model = WhisperVQEncoder(config) 18 | state_dict = {} 19 | for path in glob.glob(os.path.join(model_path, "model*.safetensors")): 20 | with safetensors.safe_open(path, framework="pt", device="cpu") as f: 21 | for key in f.keys(): 22 | if key.startswith("model.encoder."): 23 | new_key = key[len("model.encoder."):] 24 | if new_key.startswith("layer_norm"): 25 | continue 26 | if new_key.startswith("layers"): 27 | layer_id = int(new_key.split(".")[1]) 28 | if layer_id >= config.quantize_position: 29 | continue 30 | state_dict[new_key] = f.get_tensor(key) 31 | model.load_state_dict(state_dict) 32 | model.eval() 33 | model.cuda() 34 | return model 35 | 36 | 37 | _resample_buffer: dict[int, torchaudio.transforms.Resample] = {} 38 | 39 | 40 | def extract_speech_token(model: WhisperVQEncoder, feature_extractor: WhisperFeatureExtractor, utts): 41 | with torch.no_grad(): 42 | audios, indices = [], [] 43 | for idx, utt in enumerate(utts): 44 | if isinstance(utt, tuple): 45 | audio, sample_rate = utt 46 | else: 47 | audio, sample_rate = torchaudio.load(utt) 48 | audio = audio.cuda() 49 | if sample_rate != 16000: 50 | if sample_rate not in _resample_buffer: 51 | _resample_buffer[sample_rate] = torchaudio.transforms.Resample( 52 | orig_freq=sample_rate, 53 | new_freq=16000 54 | ).to('cuda') 55 | audio = _resample_buffer[sample_rate](audio) 56 | # if audio.shape[0] > 1: 57 | # audio = audio[:1] 58 | audio = audio[0] 59 | audio = audio.cpu().numpy() 60 | time_step = 0 61 | while time_step * 16000 < audio.shape[0]: 62 | audio_segment = audio[time_step * 16000: (time_step + 30) * 16000] 63 | audios.append(audio_segment) 64 | indices.append(idx) 65 | time_step += 30 66 | pooling_kernel_size = model.config.pooling_kernel_size or 1 67 | stride = model.conv1.stride[0] * model.conv2.stride[0] * pooling_kernel_size * feature_extractor.hop_length 68 | all_speech_tokens = [[] for _ in range(len(utts))] 69 | batch_size = 128 70 | for start in range(0, len(audios), batch_size): 71 | features = feature_extractor(audios[start: start + batch_size], sampling_rate=16000, 72 | return_attention_mask=True, return_tensors="pt", device='cuda', 73 | padding="longest", pad_to_multiple_of=stride) 74 | features = 
features.to(device="cuda") 75 | outputs = model(**features) 76 | speech_tokens = outputs.quantized_token_ids 77 | attention_mask = features.attention_mask[:, ::model.conv1.stride[0] * model.conv2.stride[0]] 78 | attention_mask = attention_mask[:, ::model.config.pooling_kernel_size] 79 | assert attention_mask.shape == speech_tokens.shape 80 | for i in range(len(speech_tokens)): 81 | idx = indices[start + i] 82 | speech_token = speech_tokens[i][attention_mask[i].bool()].tolist() 83 | all_speech_tokens[idx].extend(speech_token) 84 | return all_speech_tokens 85 | -------------------------------------------------------------------------------- /src/evaluator/dnsmos.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import librosa 3 | import numpy as np 4 | import onnxruntime as ort 5 | import soundfile as sf 6 | from src.evaluator.base import Evaluator 7 | from src.utils import parallel_batch 8 | 9 | SAMPLING_RATE = 16000 10 | INPUT_LENGTH = 9.01 11 | 12 | class ComputeScore: 13 | """ 14 | from https://github.com/microsoft/DNS-Challenge/blob/master/DNSMOS/dnsmos_local.py 15 | """ 16 | def __init__(self, primary_model_path) -> None: 17 | self.onnx_sess = ort.InferenceSession(primary_model_path) 18 | 19 | def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True): 20 | mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=frame_size+1, hop_length=hop_length, n_mels=n_mels) 21 | if to_db: 22 | mel_spec = (librosa.power_to_db(mel_spec, ref=np.max)+40)/40 23 | return mel_spec.T 24 | 25 | def get_polyfit_val(self, sig, bak, ovr): 26 | p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535]) 27 | p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439 ]) 28 | p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546]) 29 | 30 | sig_poly = p_sig(sig) 31 | bak_poly = p_bak(bak) 32 | ovr_poly = p_ovr(ovr) 33 | 34 | return sig_poly, bak_poly, ovr_poly 35 | 36 | def __call__(self, fpath, sampling_rate): 37 | aud, input_fs = sf.read(fpath) 38 | fs = sampling_rate 39 | if input_fs != fs: 40 | audio = librosa.resample(aud, orig_sr=input_fs, target_sr=fs) 41 | else: 42 | audio = aud 43 | actual_audio_len = len(audio) 44 | len_samples = int(INPUT_LENGTH*fs) 45 | while len(audio) < len_samples: 46 | audio = np.append(audio, audio) 47 | 48 | num_hops = int(np.floor(len(audio)/fs) - INPUT_LENGTH)+1 49 | hop_len_samples = fs 50 | predicted_mos_sig_seg_raw = [] 51 | predicted_mos_bak_seg_raw = [] 52 | predicted_mos_ovr_seg_raw = [] 53 | predicted_mos_sig_seg = [] 54 | predicted_mos_bak_seg = [] 55 | predicted_mos_ovr_seg = [] 56 | 57 | for idx in range(num_hops): 58 | audio_seg = audio[int(idx*hop_len_samples) : int((idx+INPUT_LENGTH)*hop_len_samples)] 59 | if len(audio_seg) < len_samples: 60 | continue 61 | 62 | input_features = np.array(audio_seg).astype('float32')[np.newaxis,:] 63 | oi = {'input_1': input_features} 64 | mos_sig_raw,mos_bak_raw,mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0] 65 | mos_sig,mos_bak,mos_ovr = self.get_polyfit_val(mos_sig_raw, mos_bak_raw, mos_ovr_raw) 66 | predicted_mos_sig_seg_raw.append(mos_sig_raw) 67 | predicted_mos_bak_seg_raw.append(mos_bak_raw) 68 | predicted_mos_ovr_seg_raw.append(mos_ovr_raw) 69 | predicted_mos_sig_seg.append(mos_sig) 70 | predicted_mos_bak_seg.append(mos_bak) 71 | predicted_mos_ovr_seg.append(mos_ovr) 72 | clip_dict = {'filename': fpath, 'len_in_sec': actual_audio_len/fs, 'sr':fs} 73 | clip_dict['num_hops'] = num_hops 74 | clip_dict['OVRL_raw'] = 
np.mean(predicted_mos_ovr_seg_raw) 75 | clip_dict['SIG_raw'] = np.mean(predicted_mos_sig_seg_raw) 76 | clip_dict['BAK_raw'] = np.mean(predicted_mos_bak_seg_raw) 77 | clip_dict['OVRL'] = np.mean(predicted_mos_ovr_seg) 78 | clip_dict['SIG'] = np.mean(predicted_mos_sig_seg) 79 | clip_dict['BAK'] = np.mean(predicted_mos_bak_seg) 80 | return clip_dict 81 | 82 | class DNSMOS(Evaluator): 83 | def __init__(self, model: str, max_workers=None): 84 | if max_workers is not None: 85 | self.max_workers = max_workers 86 | self.compute_score = ComputeScore(model) 87 | 88 | @parallel_batch(default_workers=4) 89 | def evaluate(self, pred: str, ref: str, pred_info: Dict, **kwargs): 90 | pred_audio = pred_info["pred_audio"] 91 | res = self.compute_score(pred_audio, SAMPLING_RATE) 92 | return {"key": pred_info["key"], "score": res["OVRL"]} -------------------------------------------------------------------------------- /src/models/api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import io 3 | import base64 4 | import requests 5 | import threading 6 | import itertools 7 | from typing import Dict 8 | import json 9 | import torchaudio 10 | from src.models.base import Model 11 | from src.utils import retry 12 | 13 | import sys 14 | sys.stdout.reconfigure(encoding='utf-8') 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class GPT4oAudio(Model): 19 | def __init__(self, llm_name: str, api_keys: Dict, max_workers=None): 20 | super().__init__(sample_params=None) 21 | logging.info(f"Using {llm_name} API for judgement...") 22 | assert len(api_keys) > 0 23 | self.llm_name = llm_name 24 | self.api_keys = api_keys 25 | 26 | self.urls = { 27 | key: ( 28 | f"https://{key}.openai.azure.com/" 29 | f"openai/deployments/{llm_name}/chat/completions?api-version=2025-01-01-preview" 30 | ) 31 | for key in api_keys 32 | } 33 | self.max_workers = max_workers or len(api_keys) 34 | self.key_cycle = itertools.cycle(self.api_keys.items()) 35 | self.lock = threading.Lock() 36 | 37 | def get_next_key(self): 38 | with self.lock: 39 | key_name, key_value = next(self.key_cycle) 40 | return key_name, key_value, self.urls[key_name] 41 | 42 | @retry(max_retries=8, sleep_second=3) 43 | def api_generate(self, messages, api_key, url, modalities): 44 | headers = { 45 | "Content-Type": "application/json", 46 | "Authorization": f"Bearer {api_key}" 47 | } 48 | input_data = { 49 | "model": "gpt-4o-audio-preview", 50 | "modalities": modalities, 51 | "audio": { 52 | "voice": "alloy", 53 | "format": "wav" 54 | }, 55 | "messages": messages 56 | } 57 | 58 | response= requests.post(url, headers=headers, data=json.dumps(input_data)) 59 | response.raise_for_status() 60 | response_data = response.json() 61 | response = response_data["choices"][0]["message"] 62 | 63 | if "audio" in modalities: 64 | base64_str = response["audio"]["data"] 65 | pred = response["audio"]["transcript"].strip() 66 | assert base64_str is not None 67 | else: 68 | if "content" not in response: 69 | logging.info(f"response is unique: {response}") 70 | pred = response["content"].strip() 71 | base64_str = None 72 | return base64_str, pred 73 | 74 | def generate_once(self, audio, **kwargs): 75 | save_pred_audio = kwargs.get("pred_audio", None) 76 | if save_pred_audio: 77 | modalities = ["audio", "text"] 78 | else: 79 | modalities = ["text"] 80 | 81 | with open(audio, "rb") as audio_file: 82 | audio_bytes = audio_file.read() 83 | audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") 84 | 85 | messages = [ 86 | { 87 | 
"role": "user", 88 | "content": [ 89 | { 90 | "type": "input_audio", 91 | "input_audio": { 92 | "data": audio_base64, 93 | "format": "wav" 94 | } 95 | } 96 | ] 97 | } 98 | ] 99 | 100 | key_name, api_key, url = self.get_next_key() 101 | base64_str, pred = self.api_generate(messages, api_key, url, modalities) 102 | 103 | if save_pred_audio: 104 | audio_bytes = base64.b64decode(base64_str) 105 | audio_buf = io.BytesIO(audio_bytes) 106 | waveform, sample_rate = torchaudio.load(audio_buf) 107 | torchaudio.save(save_pred_audio, waveform, sample_rate=sample_rate) 108 | 109 | return {"pred": pred, "pred_audio": kwargs.get("pred_audio")} 110 | 111 | def generate_multiturn(self, audio, user_history, assistant_history, **kwargs): 112 | return self.generate_once(audio) 113 | -------------------------------------------------------------------------------- /registry/infer_task/dialect.yaml: -------------------------------------------------------------------------------- 1 | # ------------------dialect understanding--------------------- 2 | aqa-chinese_quiz-sichuanese: 3 | class: src.config.InferTaskCfg 4 | args: 5 | dataset: chinese_quiz-sichuanese 6 | template: zeroshot-aqa 7 | model: qwen2_5_omni 8 | save_pred_audio: False 9 | eval_task: basic 10 | 11 | aqa-chinese_quiz-shanghainese: 12 | class: src.config.InferTaskCfg 13 | args: 14 | dataset: chinese_quiz-shanghainese 15 | template: zeroshot-aqa 16 | model: qwen2_5_omni 17 | save_pred_audio: False 18 | eval_task: basic 19 | 20 | aqa-chinese_quiz-northeastern_mandarin: 21 | class: src.config.InferTaskCfg 22 | args: 23 | dataset: chinese_quiz-northeastern_mandarin 24 | template: zeroshot-aqa 25 | model: qwen2_5_omni 26 | save_pred_audio: False 27 | eval_task: basic 28 | 29 | aqa-chinese_quiz-henan_dialect: 30 | class: src.config.InferTaskCfg 31 | args: 32 | dataset: chinese_quiz-henan_dialect 33 | template: zeroshot-aqa 34 | model: qwen2_5_omni 35 | save_pred_audio: False 36 | eval_task: basic 37 | 38 | aqa-chinese_quiz-cantonese: 39 | class: src.config.InferTaskCfg 40 | args: 41 | dataset: chinese_quiz-cantonese 42 | template: zeroshot-aqa 43 | model: qwen2_5_omni 44 | save_pred_audio: False 45 | eval_task: basic 46 | 47 | # -----------------dialectt understanding livelihood policy (hard)------------ 48 | aqa-livelihood_policy-sichuanese: 49 | class: src.config.InferTaskCfg 50 | args: 51 | dataset: livelihood_policy-sichuanese 52 | template: zeroshot-aqa 53 | model: qwen2_5_omni 54 | save_pred_audio: False 55 | eval_task: basic 56 | 57 | aqa-livelihood_policy-shanghainese: 58 | class: src.config.InferTaskCfg 59 | args: 60 | dataset: livelihood_policy-shanghainese 61 | template: zeroshot-aqa 62 | model: qwen2_5_omni 63 | save_pred_audio: False 64 | eval_task: basic 65 | 66 | aqa-livelihood_policy-northeastern_mandarin: 67 | class: src.config.InferTaskCfg 68 | args: 69 | dataset: livelihood_policy-northeastern_mandarin 70 | template: zeroshot-aqa 71 | model: qwen2_5_omni 72 | save_pred_audio: False 73 | eval_task: basic 74 | 75 | aqa-livelihood_policy-henan_dialect: 76 | class: src.config.InferTaskCfg 77 | args: 78 | dataset: livelihood_policy-henan_dialect 79 | template: zeroshot-aqa 80 | model: qwen2_5_omni 81 | save_pred_audio: False 82 | eval_task: basic 83 | 84 | aqa-livelihood_policy-cantonese: 85 | class: src.config.InferTaskCfg 86 | args: 87 | dataset: livelihood_policy-cantonese 88 | template: zeroshot-aqa 89 | model: qwen2_5_omni 90 | save_pred_audio: False 91 | eval_task: basic 92 | 93 | # ------------------dialect 
chitchat--------------------- 94 | follow-chitchat-sichuanese: 95 | class: src.config.InferTaskCfg 96 | args: 97 | dataset: chitchat-sichuanese 98 | template: zeroshot-aqa 99 | model: qwen2_5_omni 100 | save_pred_audio: True 101 | eval_task: dialect_follow # dialect_follow dialect_classify 102 | 103 | follow-chitchat-shanghainese: 104 | class: src.config.InferTaskCfg 105 | args: 106 | dataset: chitchat-shanghainese 107 | template: zeroshot-aqa 108 | model: qwen2_5_omni 109 | save_pred_audio: True 110 | eval_task: dialect_follow # dialect_follow dialect_classify 111 | 112 | follow-chitchat-northeastern_mandarin: 113 | class: src.config.InferTaskCfg 114 | args: 115 | dataset: chitchat-northeastern_mandarin 116 | template: zeroshot-aqa 117 | model: qwen2_5_omni 118 | save_pred_audio: True 119 | eval_task: dialect_follow # dialect_follow dialect_classify 120 | 121 | follow-chitchat-henan_dialect: 122 | class: src.config.InferTaskCfg 123 | args: 124 | dataset: chitchat-henan_dialect 125 | template: zeroshot-aqa 126 | model: qwen2_5_omni 127 | save_pred_audio: True 128 | eval_task: dialect_follow # dialect_follow dialect_classify 129 | 130 | follow-chitchat-cantonese: 131 | class: src.config.InferTaskCfg 132 | args: 133 | dataset: chitchat-cantonese 134 | template: zeroshot-aqa 135 | model: qwen2_5_omni 136 | save_pred_audio: True 137 | eval_task: dialect_follow # dialect_follow dialect_classify -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/flow/stable/stable_diffusion_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from .dit import DiffusionTransformer 4 | from .adp import UNet1d 5 | from .sampling import sample 6 | import math 7 | from model.base import BaseModule 8 | import pdb 9 | 10 | target_length = 1536 11 | def pad_and_create_mask(matrix, target_length): 12 | 13 | T = matrix.shape[2] 14 | if T > target_length: 15 | raise ValueError("The third dimension length %s should not exceed %s"%(T, target_length)) 16 | 17 | padding_size = target_length - T 18 | 19 | padded_matrix = F.pad(matrix, (0, padding_size), "constant", 0) 20 | 21 | mask = torch.ones((1, target_length)) 22 | mask[:, T:] = 0 # Set the padding part to 0 23 | 24 | return padded_matrix.to(matrix.device), mask.to(matrix.device) 25 | 26 | 27 | class Stable_Diffusion(BaseModule): 28 | def __init__(self): 29 | super(Stable_Diffusion, self).__init__() 30 | self.diffusion = DiffusionTransformer( 31 | io_channels=80, 32 | # input_concat_dim=80, 33 | embed_dim=768, 34 | # cond_token_dim=target_length, 35 | depth=24, 36 | num_heads=24, 37 | project_cond_tokens=False, 38 | transformer_type="continuous_transformer", 39 | ) 40 | # self.diffusion = UNet1d( 41 | # in_channels=80, 42 | # channels=256, 43 | # resnet_groups=16, 44 | # kernel_multiplier_downsample=2, 45 | # multipliers=[4, 4, 4, 5, 5], 46 | # factors=[1, 2, 2, 4], # 输入长度不一致卷积缩短 47 | # num_blocks=[2, 2, 2, 2], 48 | # attentions=[1, 3, 3, 3, 3], 49 | # attention_heads=16, 50 | # attention_multiplier=4, 51 | # use_nearest_upsample=False, 52 | # use_skip_scale=True, 53 | # use_context_time=True 54 | # ) 55 | self.rng = torch.quasirandom.SobolEngine(1, scramble=True) 56 | 57 | @torch.no_grad() 58 | def forward(self, mu, mask, n_timesteps): 59 | # pdb.set_trace() 60 | mask = mask.squeeze(1) 61 | # noise = torch.randn_like(mu).to(mu.device) 62 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 63 | # 
extra_args = {"cross_attn_cond": mu, "cross_attn_cond_mask": mask, "mask": mask} 64 | extra_args = {"mask": mask} 65 | fakes = sample(self.diffusion, mu, n_timesteps, 0, **extra_args) 66 | 67 | return fakes 68 | 69 | 70 | def compute_loss(self, x0, mask, mu): 71 | 72 | # pdb.set_trace() 73 | t = self.rng.draw(x0.shape[0])[:, 0].to(x0.device) 74 | alphas, sigmas = torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) 75 | 76 | alphas = alphas[:, None, None] 77 | sigmas = sigmas[:, None, None] 78 | noise = torch.randn_like(x0) 79 | noised_inputs = x0 * alphas + noise * sigmas 80 | targets = mu * alphas - x0 * sigmas 81 | mask = mask.squeeze(1) 82 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 83 | # output = self.diffusion(noised_inputs, t, cross_attn_cond=mu, 84 | # cross_attn_cond_mask=mask, mask=mask, cfg_dropout_prob=0.1) 85 | output = self.diffusion(noised_inputs, t, mask=mask, cfg_dropout_prob=0.1) 86 | 87 | return self.mse_loss(output, targets, mask), output 88 | 89 | 90 | def mse_loss(self, output, targets, mask): 91 | 92 | mse_loss = F.mse_loss(output, targets, reduction='none') 93 | 94 | if mask.ndim == 2 and mse_loss.ndim == 3: 95 | mask = mask.unsqueeze(1) 96 | 97 | if mask.shape[1] != mse_loss.shape[1]: 98 | mask = mask.repeat(1, mse_loss.shape[1], 1) 99 | 100 | mse_loss = mse_loss[mask] 101 | 102 | mse_loss = mse_loss.mean() 103 | 104 | return mse_loss -------------------------------------------------------------------------------- /src/models/src_freezeomni/encoder/subsampling.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union 2 | 3 | import torch 4 | 5 | class BaseSubsampling(torch.nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | self.right_context = 0 9 | self.subsampling_rate = 1 10 | 11 | def position_encoding(self, offset: Union[int, torch.Tensor], 12 | size: int) -> torch.Tensor: 13 | return self.pos_enc.position_encoding(offset, size) 14 | 15 | class Conv2dSubsampling4(BaseSubsampling): 16 | """Convolutional 2D subsampling (to 1/4 length). 17 | 18 | Args: 19 | idim (int): Input dimension. 20 | odim (int): Output dimension. 21 | dropout_rate (float): Dropout rate. 22 | 23 | """ 24 | def __init__(self, idim: int, odim: int, dropout_rate: float): 25 | """Construct an Conv2dSubsampling4 object.""" 26 | super().__init__() 27 | self.conv = torch.nn.Sequential( 28 | torch.nn.Conv2d(1, odim, 3, 2), 29 | torch.nn.ReLU(), 30 | torch.nn.Conv2d(odim, odim, 3, 2), 31 | torch.nn.ReLU(), 32 | ) 33 | self.out = torch.nn.Sequential( 34 | torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) 35 | # The right context for every conv layer is computed by: 36 | # (kernel_size - 1) * frame_rate_of_this_layer 37 | self.subsampling_rate = 4 38 | # 6 = (3 - 1) * 1 + (3 - 1) * 2 39 | self.right_context = 6 40 | 41 | def forward( 42 | self, 43 | x: torch.Tensor, 44 | x_mask: torch.Tensor 45 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 46 | """Subsample x. 47 | 48 | Args: 49 | x (torch.Tensor): Input tensor (#batch, time, idim). 50 | x_mask (torch.Tensor): Input mask (#batch, 1, time). 51 | 52 | Returns: 53 | torch.Tensor: Subsampled tensor (#batch, time', odim), 54 | where time' = time // 4. 55 | torch.Tensor: Subsampled mask (#batch, 1, time'), 56 | where time' = time // 4. 
57 | torch.Tensor: positional encoding 58 | 59 | """ 60 | x = x.unsqueeze(1) # (b, c=1, t, f) 61 | x = self.conv(x) 62 | b, c, t, f = x.size() 63 | x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) 64 | 65 | return x, x_mask[:, :, 2::2][:, :, 2::2] 66 | 67 | def infer(self, x, buffer, buffer_index, buffer_out): 68 | x = x.unsqueeze(1) # (b, c=1, t, f) 69 | x = self.conv(x) 70 | b, c, t, f = x.size() 71 | x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) 72 | 73 | return x, buffer, buffer_index, buffer_out 74 | 75 | class Subsampling(torch.nn.Module): 76 | @staticmethod 77 | def add_arguments(group): 78 | """Add Subsampling common arguments.""" 79 | group.add_argument('--subsampling-rate', default=4, type=int) 80 | group.add_argument('--subsampling-input-dim', default=256, type=int) 81 | group.add_argument('--subsampling-output-dim', default=256, type=int) 82 | group.add_argument('--subsampling-dropout-rate', default=0.1, type=float) 83 | 84 | return group 85 | 86 | def __init__(self, args): 87 | super().__init__() 88 | self.subsampling_rate = args.subsampling_rate 89 | self.subsampling_input_dim = args.subsampling_input_dim 90 | self.subsampling_output_dim = args.subsampling_output_dim 91 | self.subsampling_dropout_rate = args.subsampling_dropout_rate 92 | 93 | if self.subsampling_rate == 4: 94 | self.core = Conv2dSubsampling4(self.subsampling_input_dim, 95 | self.subsampling_output_dim, 96 | self.subsampling_dropout_rate) 97 | 98 | def forward(self, xs, ilens, masks): 99 | xs, masks = self.core(xs, masks) 100 | ilens = masks.squeeze(1).sum(1) 101 | return xs, ilens, masks 102 | 103 | def infer(self, x, buffer, buffer_index, buffer_out, pe_index): 104 | x, buffer, buffer_index, buffer_out = self.core.infer(x, 105 | buffer, buffer_index, buffer_out) 106 | return x, buffer, buffer_index, buffer_out, pe_index 107 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/cli/cosyvoice.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
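# Usage sketch for the CosyVoice wrapper defined below. The model path, speaker id and
# 22.05 kHz save rate are illustrative assumptions, not values pinned by this file:
#
#     cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
#     print(cosyvoice.list_avaliable_spks())
#     output = cosyvoice.inference_sft('你好,很高兴认识你。', spk_id='中文女')
#     torchaudio.save('sft.wav', output['tts_speech'], 22050)  # torchaudio assumed imported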
14 | import os 15 | import torch 16 | from hyperpyyaml import load_hyperpyyaml 17 | from modelscope import snapshot_download 18 | from cosyvoice.cli.frontend import CosyVoiceFrontEnd 19 | from cosyvoice.cli.model import CosyVoiceModel 20 | 21 | class CosyVoice: 22 | 23 | def __init__(self, model_dir): 24 | instruct = True if '-Instruct' in model_dir else False 25 | self.model_dir = model_dir 26 | if not os.path.exists(model_dir): 27 | model_dir = snapshot_download(model_dir) 28 | with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: 29 | configs = load_hyperpyyaml(f) 30 | self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], 31 | configs['feat_extractor'], 32 | '{}/campplus.onnx'.format(model_dir), 33 | '{}/speech_tokenizer_v1.onnx'.format(model_dir), 34 | '{}/spk2info.pt'.format(model_dir), 35 | instruct, 36 | configs['allowed_special']) 37 | self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 38 | self.model.load('{}/llm.pt'.format(model_dir), 39 | '{}/flow.pt'.format(model_dir), 40 | '{}/hift.pt'.format(model_dir)) 41 | del configs 42 | 43 | def list_avaliable_spks(self): 44 | spks = list(self.frontend.spk2info.keys()) 45 | return spks 46 | 47 | def inference_sft(self, tts_text, spk_id): 48 | tts_speeches = [] 49 | for i in self.frontend.text_normalize(tts_text, split=True): 50 | model_input = self.frontend.frontend_sft(i, spk_id) 51 | model_output = self.model.inference(**model_input) 52 | tts_speeches.append(model_output['tts_speech']) 53 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 54 | 55 | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): 56 | prompt_text = self.frontend.text_normalize(prompt_text, split=False) 57 | tts_speeches = [] 58 | for i in self.frontend.text_normalize(tts_text, split=True): 59 | model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) 60 | model_output = self.model.inference(**model_input) 61 | tts_speeches.append(model_output['tts_speech']) 62 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 63 | 64 | def inference_cross_lingual(self, tts_text, prompt_speech_16k): 65 | if self.frontend.instruct is True: 66 | raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) 67 | tts_speeches = [] 68 | for i in self.frontend.text_normalize(tts_text, split=True): 69 | model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) 70 | model_output = self.model.inference(**model_input) 71 | tts_speeches.append(model_output['tts_speech']) 72 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 73 | 74 | def inference_instruct(self, tts_text, spk_id, instruct_text): 75 | if self.frontend.instruct is False: 76 | raise ValueError('{} do not support instruct inference'.format(self.model_dir)) 77 | instruct_text = self.frontend.text_normalize(instruct_text, split=False) 78 | tts_speeches = [] 79 | for i in self.frontend.text_normalize(tts_text, split=True): 80 | model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) 81 | model_output = self.model.inference(**model_input) 82 | tts_speeches.append(model_output['tts_speech']) 83 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 84 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/frontend_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache 
License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') 17 | 18 | # whether contain chinese character 19 | def contains_chinese(text): 20 | return bool(chinese_char_pattern.search(text)) 21 | 22 | 23 | # replace special symbol 24 | def replace_corner_mark(text): 25 | text = text.replace('²', '平方') 26 | text = text.replace('³', '立方') 27 | return text 28 | 29 | 30 | # remove meaningless symbol 31 | def remove_bracket(text): 32 | text = text.replace('(', '').replace(')', '') 33 | text = text.replace('【', '').replace('】', '') 34 | text = text.replace('`', '').replace('`', '') 35 | text = text.replace("——", " ") 36 | return text 37 | 38 | 39 | # spell Arabic numerals 40 | def spell_out_number(text: str, inflect_parser): 41 | new_text = [] 42 | st = None 43 | for i, c in enumerate(text): 44 | if not c.isdigit(): 45 | if st is not None: 46 | num_str = inflect_parser.number_to_words(text[st: i]) 47 | new_text.append(num_str) 48 | st = None 49 | new_text.append(c) 50 | else: 51 | if st is None: 52 | st = i 53 | if st is not None and st < len(text): 54 | num_str = inflect_parser.number_to_words(text[st:]) 55 | new_text.append(num_str) 56 | return ''.join(new_text) 57 | 58 | 59 | # split paragrah logic: 60 | # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len 61 | # 2. cal sentence len according to lang 62 | # 3. 
split sentence according to puncatation 63 | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): 64 | def calc_utt_length(_text: str): 65 | if lang == "zh": 66 | return len(_text) 67 | else: 68 | return len(tokenize(_text)) 69 | 70 | def should_merge(_text: str): 71 | if lang == "zh": 72 | return len(_text) < merge_len 73 | else: 74 | return len(tokenize(_text)) < merge_len 75 | 76 | if lang == "zh": 77 | pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] 78 | else: 79 | pounc = ['.', '?', '!', ';', ':'] 80 | if comma_split: 81 | pounc.extend([',', ',']) 82 | st = 0 83 | utts = [] 84 | for i, c in enumerate(text): 85 | if c in pounc: 86 | if len(text[st: i]) > 0: 87 | utts.append(text[st: i] + c) 88 | if i + 1 < len(text) and text[i + 1] in ['"', '”']: 89 | tmp = utts.pop(-1) 90 | utts.append(tmp + text[i + 1]) 91 | st = i + 2 92 | else: 93 | st = i + 1 94 | if len(utts) == 0: 95 | if lang == "zh": 96 | utts.append(text + '。') 97 | else: 98 | utts.append(text + '.') 99 | final_utts = [] 100 | cur_utt = "" 101 | for utt in utts: 102 | if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: 103 | final_utts.append(cur_utt) 104 | cur_utt = "" 105 | cur_utt = cur_utt + utt 106 | if len(cur_utt) > 0: 107 | if should_merge(cur_utt) and len(final_utts) != 0: 108 | final_utts[-1] = final_utts[-1] + cur_utt 109 | else: 110 | final_utts.append(cur_utt) 111 | 112 | return final_utts 113 | 114 | 115 | # remove blank between chinese character 116 | def replace_blank(text: str): 117 | out_str = [] 118 | for i, c in enumerate(text): 119 | if c == " ": 120 | if ((text[i + 1].isascii() and text[i + 1] != " ") and 121 | (text[i - 1].isascii() and text[i - 1] != " ")): 122 | out_str.append(c) 123 | else: 124 | out_str.append(c) 125 | return "".join(out_str) 126 | --------------------------------------------------------------------------------
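A small end-to-end sketch (not a file in this repository) of how the text helpers in
frontend_utils.py above are typically combined; the whitespace tokenizer and the import
path are assumptions based on the tree layout:

from src.models.src_glm4.cosyvoice.utils.frontend_utils import (
    contains_chinese, remove_bracket, replace_blank, split_paragraph)


def normalize_and_split(text: str):
    tokenize = str.split  # stand-in; the real frontend passes its own tokenizer
    lang = "zh" if contains_chinese(text) else "en"
    text = remove_bracket(replace_blank(text))
    return split_paragraph(text, tokenize, lang=lang, token_max_n=80,
                           token_min_n=60, merge_len=20, comma_split=False)


if __name__ == "__main__":
    print(normalize_and_split("今天天气不错。我们去公园散步吧!之后再喝一杯咖啡。"))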