├── src ├── __init__.py ├── models │ ├── __init__.py │ ├── src_glm4 │ │ ├── cosyvoice │ │ │ ├── __init__.py │ │ │ ├── cli │ │ │ │ ├── __init__.py │ │ │ │ └── cosyvoice.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── block_mask_util.py │ │ │ │ ├── file_utils.py │ │ │ │ ├── class_utils.py │ │ │ │ ├── common.py │ │ │ │ └── frontend_utils.py │ │ │ ├── dataset │ │ │ │ └── __init__.py │ │ │ ├── transformer │ │ │ │ ├── __init__.py │ │ │ │ ├── activation.py │ │ │ │ └── label_smoothing_loss.py │ │ │ ├── flow │ │ │ │ ├── length_regulator.py │ │ │ │ └── stable │ │ │ │ │ └── stable_diffusion_test.py │ │ │ └── hifigan │ │ │ │ └── f0_predictor.py │ │ ├── speech_tokenizer │ │ │ ├── __init__.py │ │ │ ├── configuration_whisper.py │ │ │ └── utils.py │ │ └── audio_process.py │ ├── src_kimi │ │ └── kimia_infer │ │ │ ├── __init__.py │ │ │ ├── api │ │ │ └── __init__.py │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── tokenizer │ │ │ │ ├── __init__.py │ │ │ │ ├── whisper_Lv3 │ │ │ │ │ └── mel_filters.npz │ │ │ │ └── glm4_tokenizer.py │ │ │ └── detokenizer │ │ │ │ ├── vocoder │ │ │ │ ├── alias_free_activation │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cuda │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── compat.h │ │ │ │ │ │ ├── anti_alias_activation.cpp │ │ │ │ │ │ ├── activation1d.py │ │ │ │ │ │ └── load.py │ │ │ │ │ └── torch │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── act.py │ │ │ │ │ │ ├── resample.py │ │ │ │ │ │ └── filter.py │ │ │ │ └── utils.py │ │ │ │ ├── flow_matching │ │ │ │ └── scheduler.py │ │ │ │ └── bigvgan_wrapper.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── special_tokens.py │ │ │ └── data.py │ ├── src_llama_omni │ │ ├── datasets │ │ │ └── __init__.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── speech_generator │ │ │ │ └── builder.py │ │ │ ├── speech_projector │ │ │ │ ├── builder.py │ │ │ │ └── speech_projector.py │ │ │ └── speech_encoder │ │ │ │ ├── builder.py │ │ │ │ └── speech_encoder.py │ │ ├── constants.py │ │ └── arguments.py │ ├── src_speechgpt2 │ │ └── Codec │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── activation_function.py │ │ │ │ ├── projector.py │ │ │ │ ├── quantizer.py │ │ │ │ └── residual_block.py │ │ │ └── moshi_modules │ │ │ │ ├── __init__.py │ │ │ │ ├── rope.py │ │ │ │ ├── gating.py │ │ │ │ └── resample.py │ │ │ └── utils.py │ ├── src_minicpm │ │ └── ref_audios │ │ │ ├── assistant_male_voice.wav │ │ │ ├── assistant_female_voice.wav │ │ │ └── assistant_default_female_voice.wav │ ├── src_baichuan │ │ ├── constants.py │ │ └── cosy24k_vocoder │ │ │ ├── README.md │ │ │ ├── hifigan │ │ │ ├── __init__.py │ │ │ └── f0_predictor.py │ │ │ └── cosy24k_vocoder.py │ ├── src_freezeomni │ │ ├── decoder │ │ │ └── ticodec │ │ │ │ ├── vqvae_tester.py │ │ │ │ └── vqvae.py │ │ ├── utils.py │ │ └── encoder │ │ │ ├── cmvn.py │ │ │ └── subsampling.py │ ├── telechat2.py │ ├── kimi_audio.py │ ├── model_utils.py │ ├── qwen.py │ └── api.py ├── prompt │ ├── __init__.py │ └── template.py ├── evaluator │ ├── __init__.py │ ├── emo2vec.py │ ├── text_utils.py │ ├── asr.py │ ├── base.py │ ├── dialect.py │ └── dnsmos.py ├── summarizer │ ├── __init__.py │ └── summarizer.py └── config.py ├── assets └── contact.jpg ├── .gitmodules ├── registry ├── dataset │ ├── scene.yaml │ ├── multiturn.yaml │ ├── choice.yaml │ ├── human.yaml │ ├── paralinguistic.yaml │ ├── aqa.yaml │ └── dialect.yaml ├── model │ ├── api.yaml │ ├── text.yaml │ └── offline.yaml ├── evaluator │ ├── match.yaml │ ├── speech.yaml │ └── llm.yaml ├── infer_task │ ├── multiturn.yaml │ ├── scene.yaml │ 
├── choice.yaml │ ├── human.yaml │ ├── paralinguistic.yaml │ ├── aqa.yaml │ └── dialect.yaml ├── eval_task │ ├── objective.yaml │ ├── speech.yaml │ └── llm.yaml ├── summarizer │ └── base.yaml └── template │ ├── aqa.yaml │ ├── text_llm.yaml │ └── multiturn.yaml ├── requirements ├── freeze_omni_requirements.txt ├── qwen2_5_omni_requirements.txt ├── minicpm_omni_requirements.txt ├── llama_omni_requirements.txt ├── kimi_audio_requirements.txt ├── glm4voice_requirements.txt └── speechgpt2_requirements.txt ├── requirements_eval.txt ├── main.py ├── requirements_all.txt ├── tools ├── test_eval_speech.py ├── save_csv.py ├── test_api.py └── parquet2jsonl.py ├── run_text.sh └── run.sh /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/summarizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/speech_tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/contact.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/assets/contact.jpg -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/src_minicpm/ref_audios/assistant_male_voice.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/src/models/src_minicpm/ref_audios/assistant_male_voice.wav -------------------------------------------------------------------------------- /src/models/src_minicpm/ref_audios/assistant_female_voice.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/src/models/src_minicpm/ref_audios/assistant_female_voice.wav -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/models/src_glm4/third_party/Matcha-TTS"] 2 | path = src/models/src_glm4/third_party/Matcha-TTS 3 | url = git@github.com:shivammehta25/Matcha-TTS.git 4 | -------------------------------------------------------------------------------- /registry/dataset/scene.yaml: -------------------------------------------------------------------------------- 1 | aed_combine-zh: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/aed_combine-zh 5 | ref_col: answer 6 | query_col: query -------------------------------------------------------------------------------- /registry/model/api.yaml: -------------------------------------------------------------------------------- 1 | gpt4o-audio: 2 | class: src.models.api.GPT4oAudio 3 | args: 4 | llm_name: gpt-4o-audio-preview 5 | 
api_keys: 6 | key1: "xxx" 7 | max_workers: 1 -------------------------------------------------------------------------------- /src/models/src_minicpm/ref_audios/assistant_default_female_voice.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/src/models/src_minicpm/ref_audios/assistant_default_female_voice.wav -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/tokenizer/whisper_Lv3/mel_filters.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tele-AI/TELEVAL/HEAD/src/models/src_kimi/kimia_infer/models/tokenizer/whisper_Lv3/mel_filters.npz -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.omni_speech_llama import OmniSpeechLlamaForCausalLM, OmniSpeechConfig 2 | from .language_model.omni_speech2s_llama import OmniSpeech2SLlamaForCausalLM -------------------------------------------------------------------------------- /registry/dataset/multiturn.yaml: -------------------------------------------------------------------------------- 1 | 2 | multiturn_memory-zh: 3 | class: src.dataset.BatchLoader 4 | args: 5 | file: Tele-AI/TELEVAL/multiturn_memory-zh 6 | batch_size: 1 # suggest bsz=1 since multiturn may OOM 7 | tuple_decode: False -------------------------------------------------------------------------------- /src/models/src_llama_omni/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | SPEECH_TOKEN_INDEX = -200 9 | DEFAULT_SPEECH_TOKEN = "" -------------------------------------------------------------------------------- /requirements/freeze_omni_requirements.txt: -------------------------------------------------------------------------------- 1 | cryptography 2 | flask==3.0.3 3 | flask_socketio==5.3.4 4 | librosa==0.10.2.post1 5 | numpy==1.24.4 6 | silero-vad==5.1.2 7 | soundfile==0.12.1 8 | torch==2.2.0 9 | torchaudio==2.2.0 10 | transformers==4.45.2 11 | PyYAML==6.0.2 -------------------------------------------------------------------------------- /registry/evaluator/match.yaml: -------------------------------------------------------------------------------- 1 | exist_match: 2 | class: src.evaluator.base.ExistMatch 3 | args: 4 | keep_punc: False 5 | max_workers: 1 6 | 7 | single_option_match: 8 | class: src.evaluator.base.SingleOptionMatch 9 | args: 10 | max_workers: 1 11 | cushion: True 12 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * 7 | -------------------------------------------------------------------------------- /registry/infer_task/multiturn.yaml: -------------------------------------------------------------------------------- 1 | multiturn-memory-zh: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: multiturn_memory-zh 5 | template: multiturn-audio 6 | model: qwen2_5_omni 7 | save_pred_audio: False 8 | eval_task: basic 9 | reverse_spkr: False 10 | use_model_history: True 11 | save_latest_only: True 12 | -------------------------------------------------------------------------------- /registry/eval_task/objective.yaml: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------qa------------------------------------- 3 | 4 | basic: 5 | class: src.config.EvalTaskCfg 6 | args: 7 | evaluator: exist_match 8 | summarizer: AvgInfo 9 | 10 | choice: 11 | class: src.config.EvalTaskCfg 12 | args: 13 | evaluator: single_option_match 14 | summarizer: AvgInfo 15 | -------------------------------------------------------------------------------- /registry/dataset/choice.yaml: -------------------------------------------------------------------------------- 1 | ceval-zh: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/ceval-zh 5 | ref_col: answer 6 | query_col: query 7 | batch_size: 1 8 | 9 | agieval-zh: 10 | class: src.dataset.BatchLoader 11 | args: 12 | file: Tele-AI/TELEVAL/agieval-zh 13 | ref_col: answer 14 | query_col: query 15 | batch_size: 1 -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_generator/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_generator import SpeechGeneratorCTC 2 | 3 | 4 | def build_speech_generator(config): 5 | generator_type = getattr(config, 'speech_generator_type', 'ctc') 6 | if generator_type == 'ctc': 7 | return SpeechGeneratorCTC(config) 8 | 9 | raise ValueError(f'Unknown generator type: {generator_type}') 10 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_projector/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_projector import EncoderProjectorConcat 2 | 3 | 4 | def build_speech_projector(config): 5 | projector_type = getattr(config, 'speech_projector_type', 'linear') 6 | if projector_type == 'linear': 7 | return EncoderProjectorConcat(config) 8 | 9 | raise ValueError(f'Unknown projector type: {projector_type}') 10 | -------------------------------------------------------------------------------- /src/models/src_baichuan/constants.py: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "../baichuan-omni/model" 2 | COSY_VOCODER = "../third_party/cosy24k_vocoder" 3 | g_cache_dir = "../cache" 4 | sampling_rate = 24000 5 | wave_concat_overlap = int(sampling_rate * 0.01) 6 | role_prefix = { 7 | 'system': '', 8 | 'user': '', 9 | 'assistant': '', 10 | 'audiogen': '' 11 | } 12 | max_frames = 8 -------------------------------------------------------------------------------- /registry/dataset/human.yaml: -------------------------------------------------------------------------------- 1 | human_acceptance-zh: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/human_accept-zh 5 | ref_col: answer 6 | 
query_col: query 7 | batch_size: 1 8 | 9 | human_chitchat-zh: 10 | class: src.dataset.BatchLoader 11 | args: 12 | file: Tele-AI/TELEVAL/human_chitchat-zh 13 | ref_col: answer 14 | query_col: query 15 | batch_size: 1 -------------------------------------------------------------------------------- /requirements/qwen2_5_omni_requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | gradio==5.23.1 3 | gradio_client==1.8.0 4 | qwen-omni-utils==0.0.4 5 | librosa==0.11.0 6 | ffmpeg==1.4 7 | ffmpeg-python==0.2.0 8 | soundfile==0.13.1 9 | modelscope_studio==1.2.2 10 | transformers==4.52.3 11 | accelerate 12 | av 13 | 14 | # Optional dependency 15 | # Uncomment the following line if you need flash-attn 16 | flash-attn==2.7.4.post1 -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_encoder import WhisperWrappedEncoder 2 | 3 | 4 | def build_speech_encoder(config): 5 | speech_encoder_type = getattr(config, 'speech_encoder_type', None) 6 | if "whisper" in speech_encoder_type.lower(): 7 | return WhisperWrappedEncoder.load(config) 8 | 9 | raise ValueError(f'Unknown speech encoder: {speech_encoder_type}') 10 | -------------------------------------------------------------------------------- /registry/summarizer/base.yaml: -------------------------------------------------------------------------------- 1 | AvgInfo: 2 | class: src.summarizer.summarizer.AvgInfo 3 | args: {} 4 | 5 | AvgThreshold: 6 | class: src.summarizer.summarizer.AvgThreshold 7 | args: 8 | rescale: power # linear power 9 | power: 2 10 | threshold: 60 11 | 12 | AvgWER: 13 | class: src.summarizer.summarizer.AvgWER 14 | args: {} 15 | 16 | AvgMOS: 17 | class: src.summarizer.summarizer.AvgMOS 18 | args: {} -------------------------------------------------------------------------------- /registry/infer_task/scene.yaml: -------------------------------------------------------------------------------- 1 | aed-audio-instruct: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: aed_combine-zh 5 | template: zeroshot-aqa 6 | model: qwen2_5_omni 7 | eval_task: aed_instruct 8 | 9 | aed-text-instruct: # not recommended 10 | class: src.config.InferTaskCfg 11 | args: 12 | dataset: aed_combine-zh 13 | template: text-instruct-caption 14 | model: qwen2_5_omni 15 | eval_task: aed_instruct -------------------------------------------------------------------------------- /src/models/src_baichuan/cosy24k_vocoder/README.md: -------------------------------------------------------------------------------- 1 | # CosyVoice 2.0 HiFi-GAN Vocoder Module 2 | This module contains the HiFi-GAN vocoder component extracted from [CosyVoice 2.0](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B), providing high-quality speech waveform generation capabilities and optimized for ease of integration.
3 | 4 | The weights (hift.pt) are derived from [CosyVoice 2.0](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) (Apache 2.0 licensed) -------------------------------------------------------------------------------- /registry/infer_task/choice.yaml: -------------------------------------------------------------------------------- 1 | choice-ceval-zh: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: ceval-zh 5 | template: zeroshot-aqa 6 | model: qwen2_5_omni 7 | save_pred_audio: False 8 | eval_task: choice 9 | 10 | choice-agieval-zh: 11 | class: src.config.InferTaskCfg 12 | args: 13 | dataset: agieval-zh 14 | template: zeroshot-aqa 15 | model: qwen2_5_omni 16 | save_pred_audio: False 17 | eval_task: choice -------------------------------------------------------------------------------- /requirements_eval.txt: -------------------------------------------------------------------------------- 1 | jinja2 2 | tqdm 3 | requests 4 | pandas 5 | regex 6 | datasets 7 | soundfile 8 | librosa 9 | transformers 10 | WeTextProcessing==1.0.3 11 | #vllm # (choice) for llm_offline judgement 12 | jiwer 13 | funasr 14 | zhon 15 | zhconv 16 | onnxruntime==1.18.1 # (choice) for dialect classify and dnsmos 17 | torch 18 | torchaudio 19 | scipy 20 | --find-links https://csukuangfj.github.io/kaldifeat/cuda.html 21 | kaldifeat==1.25.5 # (choice) for dialect classify -------------------------------------------------------------------------------- /registry/infer_task/human.yaml: -------------------------------------------------------------------------------- 1 | acceptance-human-zh: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: human_acceptance-zh 5 | template: zeroshot-aqa 6 | model: qwen2_5_omni 7 | save_pred_audio: False 8 | eval_task: human_acceptance 9 | 10 | chitchat-human-zh: 11 | class: src.config.InferTaskCfg 12 | args: 13 | dataset: human_chitchat-zh 14 | template: zeroshot-aqa 15 | model: qwen2_5_omni 16 | save_pred_audio: False 17 | eval_task: human_likeness -------------------------------------------------------------------------------- /registry/eval_task/speech.yaml: -------------------------------------------------------------------------------- 1 | wer: 2 | class: src.config.EvalTaskCfg 3 | args: 4 | evaluator: paraformer-zh 5 | summarizer: AvgWER 6 | 7 | dnsmos: 8 | class: src.config.EvalTaskCfg 9 | args: 10 | evaluator: MS-DNSMOS 11 | summarizer: AvgMOS 12 | 13 | emotion_response: 14 | class: src.config.EvalTaskCfg 15 | args: 16 | evaluator: emo2vec-large 17 | summarizer: AvgInfo 18 | 19 | dialect_classify: 20 | class: src.config.EvalTaskCfg 21 | args: 22 | evaluator: TeleSpeech-Dialect 23 | summarizer: AvgInfo -------------------------------------------------------------------------------- /requirements/minicpm_omni_requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow==10.1.0 2 | torch==2.3.1 3 | torchaudio==2.3.1 4 | torchvision==0.18.1 5 | transformers==4.44.2 6 | sentencepiece==0.2.0 7 | vector-quantize-pytorch==1.18.5 8 | vocos==0.1.0 9 | accelerate==1.2.1 10 | timm==0.9.10 11 | soundfile==0.12.1 12 | librosa==0.9.0 13 | decord 14 | moviepy 15 | 16 | # for web demo 17 | aiofiles==23.2.1 18 | onnxruntime==1.20.1 19 | fastapi 20 | uvicorn 21 | gradio==4.44.1 22 | http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl -------------------------------------------------------------------------------- /requirements/llama_omni_requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch==2.1.2 2 | torchvision==0.16.2 3 | torchaudio==2.1.2 4 | transformers==4.43.4 5 | tokenizers==0.19.1 6 | sentencepiece==0.1.99 7 | shortuuid 8 | accelerate==0.33.0 9 | peft==0.11.1 10 | bitsandbytes==0.43.1 11 | pydantic 12 | markdown2 13 | numpy 14 | scikit-learn==1.2.2 15 | gradio==4.43.0 16 | gradio_client==1.3.0 17 | requests 18 | httpx==0.27.2 19 | uvicorn 20 | fastapi 21 | soundfile 22 | einops==0.6.1 23 | einops-exts==0.0.4 24 | timm==0.6.13 25 | openai-whisper 26 | setuptools==59.5.0 27 | omegaconf==2.0.6 28 | fairseq -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/activation_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """Activation functions.""" 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | def get_activation(nonlinear_activation, nonlinear_activation_params={}): 12 | if hasattr(nn, nonlinear_activation): 13 | return getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 14 | else: 15 | raise NotImplementedError(f"Activation {nonlinear_activation} is not supported!") -------------------------------------------------------------------------------- /requirements/kimi_audio_requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.4.1 2 | torchaudio==2.4.1 3 | packaging 4 | jinja2 5 | openai-whisper 6 | jsonlines 7 | pandas 8 | validators 9 | sty 10 | transformers 11 | librosa 12 | accelerate 13 | aiohttp 14 | colorama 15 | omegaconf==2.3.0 16 | sox 17 | six==1.16.0 18 | hyperpyyaml 19 | conformer==0.3.2 20 | diffusers 21 | pillow 22 | sentencepiece 23 | easydict 24 | fire 25 | ujson 26 | cairosvg 27 | immutabledict 28 | rich 29 | wget 30 | gdown 31 | datasets 32 | torchdyn==1.0.6 33 | huggingface_hub 34 | loguru 35 | decord 36 | blobfile 37 | timm 38 | sacrebleu==1.5.1 39 | soundfile 40 | tqdm -------------------------------------------------------------------------------- /registry/template/aqa.yaml: -------------------------------------------------------------------------------- 1 | zeroshot-aqa: 2 | class: src.prompt.template.DataTemplate 3 | args: 4 | template: 5 | - role: user 6 | content: 7 | audio: "{{audio}}" 8 | 9 | zeroshot-qa: 10 | class: src.prompt.template.DataTemplate 11 | args: 12 | template: 13 | - role: user 14 | content: 15 | text: "{{query}}" 16 | 17 | text-instruct-caption: # not recommended 18 | class: src.prompt.template.DataTemplate 19 | args: 20 | template: 21 | - role: instruct 22 | content: 23 | text: "{{query}}" 24 | - role: user 25 | content: 26 | audio: "{{audio_only}}" 27 | -------------------------------------------------------------------------------- /registry/dataset/paralinguistic.yaml: -------------------------------------------------------------------------------- 1 | esd: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/esd-zh 5 | ref_col: query_emo 6 | query_col: query 7 | extra_col: ["query_emo_zh", "answer", "answer_emo", "answer_emo_zh"] 8 | batch_size: 1 9 | 10 | para_mix300-zh: 11 | class: src.dataset.BatchLoader 12 | args: 13 | file: Tele-AI/TELEVAL/para_mix300-zh 14 | query_col: query 15 | ref_col: answer 16 | extra_col: ["para_name"] 17 | 18 | age-zh: 19 | class: src.dataset.BatchLoader 20 | args:
file: Tele-AI/TELEVAL/age-zh 22 | query_col: query 23 | ref_col: age 24 | extra_col: ["answer_age", "answer_common"] -------------------------------------------------------------------------------- /registry/model/text.yaml: -------------------------------------------------------------------------------- 1 | qwen2-7b-instruct: 2 | class: src.models.qwen.Qwen2Instruct 3 | args: 4 | path: path/to/Qwen2-7B-Instruct 5 | sample_params: 6 | gen_type: greedy 7 | 8 | qwen2.5-7b-instruct: 9 | class: src.models.qwen.Qwen2Instruct 10 | args: 11 | path: path/to/Qwen2.5-7B-Instruct 12 | sample_params: 13 | gen_type: greedy 14 | 15 | qwen3-8b-instruct: 16 | class: src.models.qwen.Qwen3Instruct 17 | args: 18 | path: path/to/Qwen3-8B 19 | sample_params: 20 | gen_type: greedy 21 | 22 | qwen2.5-72b-instruct: 23 | args: 24 | path: path/to/qwen2.5-72b-instruct 25 | sample_params: 26 | gen_type: default # as judgement -------------------------------------------------------------------------------- /src/models/src_baichuan/cosy24k_vocoder/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """HIFI-GAN""" -------------------------------------------------------------------------------- /registry/evaluator/speech.yaml: -------------------------------------------------------------------------------- 1 | paraformer-zh: 2 | class: src.evaluator.asr.ASR 3 | args: 4 | model: paraformer-zh # path/to/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch 5 | max_workers: 1 6 | 7 | MS-DNSMOS: 8 | class: src.evaluator.dnsmos.DNSMOS 9 | args: 10 | model: path/to/sig_bak_ovr.onnx 11 | max_workers: 1 12 | 13 | emo2vec-large: 14 | class: src.evaluator.emo2vec.Emo2vec 15 | args: 16 | model: iic/emotion2vec_plus_large # path/to/emotion2vec_large 17 | strict: True 18 | 19 | TeleSpeech-Dialect: 20 | class: src.evaluator.dialect.DialectClassify 21 | args: 22 | model: path/to/ecapa_tdnn-mfcc40-ch512-cls14.onnx 23 | max_workers: 1 -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/moshi_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 
10 | """Modules used for building the models.""" 11 | 12 | # flake8: noqa 13 | from .conv import ( 14 | NormConv1d, 15 | NormConvTranspose1d, 16 | StreamingConv1d, 17 | StreamingConvTranspose1d, 18 | pad_for_conv1d, 19 | pad1d, 20 | unpad1d, 21 | ) 22 | from .transformer import StreamingTransformer 23 | -------------------------------------------------------------------------------- /registry/infer_task/paralinguistic.yaml: -------------------------------------------------------------------------------- 1 | emotion-esd: 2 | class: src.config.InferTaskCfg 3 | args: 4 | dataset: esd-zh 5 | template: zeroshot-aqa 6 | model: qwen2_5_omni 7 | eval_task: emotion_understand # ["emotion_understand", "wer", "dnsmos", "emotion_response"] 8 | save_pred_audio: True 9 | 10 | aqa-para_mix300-zh: 11 | class: src.config.InferTaskCfg 12 | args: 13 | dataset: para_mix300-zh 14 | template: zeroshot-aqa 15 | model: qwen2_5_omni 16 | eval_task: para_care # ["para_care", "basic"] 17 | save_pred_audio: False 18 | 19 | care-age-zh: 20 | class: src.config.InferTaskCfg 21 | args: 22 | dataset: age-zh 23 | template: zeroshot-aqa 24 | model: qwen2_5_omni 25 | eval_task: age_care 26 | save_pred_audio: False -------------------------------------------------------------------------------- /registry/template/text_llm.yaml: -------------------------------------------------------------------------------- 1 | # for text LLM 2 | text-choice: 3 | class: src.prompt.template.DataTemplate 4 | args: 5 | template: 6 | - role: user 7 | content: 8 | text: "以下是单选题,请以'正确答案是'为引导,直接给出答案选项。例如'正确答案是C'。{{query}}" 9 | 10 | text-emo: 11 | class: src.prompt.template.DataTemplate 12 | args: 13 | template: 14 | - role: user 15 | content: 16 | text: "用户以 '{{query_emo_zh}}' 的情绪输入 '{{query}}' 。结合用户的情绪作出适当的回答。" 17 | 18 | # as judge 19 | judge-qwen2.5_72b_instruct: 20 | class: src.prompt.template.DataTemplate 21 | args: 22 | template: 23 | - role: system 24 | content: 25 | text: "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." 
26 | - role: user 27 | content: 28 | text: "{{text}}" 29 | -------------------------------------------------------------------------------- /requirements/glm4voice_requirements.txt: -------------------------------------------------------------------------------- 1 | conformer==0.3.2 2 | deepspeed==0.14.2; sys_platform == 'linux' 3 | diffusers==0.27.2 4 | fastapi==0.115.3 5 | fastapi-cli==0.0.4 6 | gdown==5.1.0 7 | gradio==5.3.0 8 | grpcio==1.57.0 9 | grpcio-tools==1.57.0 10 | huggingface_hub==0.25.2 11 | hydra-core==1.3.2 12 | HyperPyYAML==1.2.2 13 | inflect==7.3.1 14 | librosa==0.10.2 15 | lightning==2.2.4 16 | matplotlib==3.7.5 17 | modelscope==1.15.0 18 | networkx==3.1 19 | numpy==1.24.4 20 | omegaconf==2.3.0 21 | onnxruntime-gpu==1.16.0; sys_platform == 'linux' 22 | onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' 23 | openai-whisper==20231117 24 | protobuf==4.25 25 | pydantic==2.7.0 26 | rich==13.7.1 27 | Requests==2.32.3 28 | safetensors==0.4.5 29 | soundfile==0.12.1 30 | tensorboard==2.14.0 31 | transformers==4.44.1 32 | uvicorn==0.32.0 33 | wget==3.2 34 | WeTextProcessing==1.0.3 35 | torch==2.3.0 36 | torchaudio==2.3.0 -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict, List, Union, Tuple, Any 3 | from src.dataset import BatchLoader, BatchSaver 4 | 5 | TemplateStruct = Union[str, Dict[str, Any], List[Dict[str, Union[str, List[Dict[str, str]]]]]] 6 | RefType = Union[str, List["RefType"], Tuple["RefType", ...]] 7 | RefsType = List[RefType] 8 | 9 | @dataclass 10 | class EvalTaskCfg: 11 | evaluator: str 12 | summarizer: str 13 | 14 | @dataclass 15 | class InferTaskCfg: 16 | dataset: Union[str, List[str]] 17 | template: str 18 | model: str 19 | eval_task: str 20 | save_pred_audio: bool = False 21 | reverse_spkr : bool = False # for multiturn 22 | use_model_history: bool = True # for multiturn 23 | save_latest_only: bool = False # for multiturn_memory 24 | 25 | @dataclass 26 | class DatasetRuntimeCtx: 27 | name: str 28 | loader: BatchLoader 29 | saver: BatchSaver 30 | summary_file: str = None -------------------------------------------------------------------------------- /registry/eval_task/llm.yaml: -------------------------------------------------------------------------------- 1 | 2 | emotion_understand: 3 | class: src.config.EvalTaskCfg 4 | args: 5 | evaluator: emo_llm 6 | summarizer: AvgThreshold 7 | 8 | aed_instruct: 9 | class: src.config.EvalTaskCfg 10 | args: 11 | evaluator: aed_llm 12 | summarizer: AvgThreshold 13 | 14 | dialect_follow: 15 | class: src.config.EvalTaskCfg 16 | args: 17 | evaluator: dialect_llm 18 | summarizer: AvgThreshold 19 | 20 | human_acceptance: 21 | class: src.config.EvalTaskCfg 22 | args: 23 | evaluator: acceptance_llm 24 | summarizer: AvgThreshold 25 | 26 | human_likeness: 27 | class: src.config.EvalTaskCfg 28 | args: 29 | evaluator: human_likeness_llm 30 | summarizer: AvgThreshold 31 | 32 | para_care: 33 | class: src.config.EvalTaskCfg 34 | args: 35 | evaluator: para_care_llm 36 | summarizer: AvgThreshold 37 | 38 | age_care: 39 | class: src.config.EvalTaskCfg 40 | args: 41 | evaluator: age_care_llm 42 | summarizer: AvgThreshold -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/act.py: 
-------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__( 10 | self, 11 | activation, 12 | up_ratio: int = 2, 13 | down_ratio: int = 2, 14 | up_kernel_size: int = 12, 15 | down_kernel_size: int = 12, 16 | ): 17 | super().__init__() 18 | self.up_ratio = up_ratio 19 | self.down_ratio = down_ratio 20 | self.act = activation 21 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 22 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 23 | 24 | # x: [B,C,T] 25 | def forward(self, x): 26 | x = self.upsample(x) 27 | x = self.act(x) 28 | x = self.downsample(x) 29 | 30 | return x 31 | -------------------------------------------------------------------------------- /registry/template/multiturn.yaml: -------------------------------------------------------------------------------- 1 | multiturn-audio: 2 | class: src.prompt.template.DataTemplate 3 | args: 4 | template: | 5 | { 6 | "nrounds": {{ nrounds }}, 7 | "dialogue": [ 8 | {% for i in range(1, nrounds + 1) %} 9 | { 10 | "role": "A", 11 | "round": "{{ i }}", 12 | "content": { 13 | "audio": {{ getvar("user_audio" ~ i) | tojson }}, 14 | "text": {{ getvar("user_text" ~ i) | tojson }} 15 | } 16 | }, 17 | { 18 | "role": "B", 19 | "round": "{{ i }}", 20 | "content": { 21 | "audio": {{ getvar("bot_audio" ~ i) | tojson }}, 22 | "text": {% if loop.last and answer is not none %} 23 | {{ answer | tojson }} 24 | {% else %} 25 | {{ getvar("bot_text" ~ i) | tojson }} 26 | {% endif %} 27 | } 28 | }{% if not loop.last %},{% endif %} 29 | {% endfor %} 30 | ] 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied fron NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. 
*/ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_projector/speech_projector.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/projector.py 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class EncoderProjectorConcat(nn.Module): 9 | def __init__(self, config): 10 | super().__init__() 11 | self.k = config.speech_encoder_ds_rate 12 | self.encoder_dim = config.speech_encoder_hidden_size 13 | self.llm_dim = config.hidden_size 14 | self.linear1 = nn.Linear(self.encoder_dim * self.k, 2048) 15 | self.relu = nn.ReLU() 16 | self.linear2 = nn.Linear(2048, config.hidden_size) 17 | 18 | def forward(self, x): 19 | batch_size, seq_len, dim = x.size() 20 | num_frames_to_discard = seq_len % self.k 21 | if num_frames_to_discard > 0: 22 | x = x[:, :-num_frames_to_discard, :] 23 | seq_len = x.size(1) 24 | 25 | x = x.contiguous() 26 | x = x.view(batch_size, seq_len // self.k, dim * self.k) 27 | x = self.linear1(x) 28 | x = self.relu(x) 29 | x = self.linear2(x) 30 | return x -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | from src.task import Pipeline 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--mode", default="eval", choices=["infer", "eval"]) 11 | parser.add_argument("--task", default="aqa") 12 | parser.add_argument("--model", default=None) 13 | 14 | parser.add_argument("--bsz", default=None) 15 | parser.add_argument("--save_dir", default="") 16 | parser.add_argument("--eval_task", default=None) 17 | parser.add_argument("--save_pred_audio", default=None) 18 | 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | def main(): 24 | args = get_args() 25 | logging.basicConfig( 26 | level=logging.INFO, 27 | format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", 28 | handlers=[logging.StreamHandler()], 29 | encoding="utf-8" 30 | ) 31 | user_args = vars(args) 32 | logger.info(f"Processing task: \nglobal args: {user_args}") 33 | t = Pipeline.create(**user_args) 34 | t.run() 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <torch/extension.h> 18 | 19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta); 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)"); 23 | } -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/block_mask_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def create_grid_mask(seq_length, trunck_length, fill_triangle): 5 | assert seq_length > 0 6 | 7 | # first create a grid mask, ignoring seen_length for now: 8 | if fill_triangle: 9 | mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1) 10 | # lower triangle and main diagonal are all 1 11 | else: 12 | mask = torch.zeros(seq_length, seq_length) 13 | 14 | for i in range(seq_length): 15 | trunck_idx = i // trunck_length 16 | trunck_start = trunck_idx * trunck_length 17 | trunck_end = trunck_length + trunck_start 18 | mask[i][trunck_start:trunck_end] = 1 19 | 20 | return mask 21 | 22 | 23 | if __name__ == "__main__": 24 | mask = create_grid_mask(seq_length=8, trunck_length=3, fill_triangle=True).int() 25 | print(mask) 26 | # tensor([[1, 1, 1, 0, 0, 0, 0, 0], 27 | # [1, 1, 1, 0, 0, 0, 0, 0], 28 | # [1, 1, 1, 0, 0, 0, 0, 0], 29 | # [1, 1, 1, 1, 1, 1, 0, 0], 30 | # [1, 1, 1, 1, 1, 1, 0, 0], 31 | # [1, 1, 1, 1, 1, 1, 0, 0], 32 | # [1, 1, 1, 1, 1, 1, 1, 1], 33 | # [1, 1, 1, 1, 1, 1, 1, 1]] 34 | 35 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/model/speech_encoder/speech_encoder.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/encoder.py 2 | 3 | import types 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class WhisperWrappedEncoder: 10 | 11 | @classmethod 12 | def load(cls, model_config): 13 | 14 | def replace_layer_norm(module): 15 | from whisper.model import LayerNorm 16 | for name, child in module.named_children(): 17 | if isinstance(child, LayerNorm): 18 | old_params = child.state_dict() 19 | new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine) 20 | new_layer_norm.load_state_dict(old_params) 21 | setattr(module, name, new_layer_norm) 22 | else: 23 | replace_layer_norm(child) 24 | 25 | import whisper 26 | encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder 27 | replace_layer_norm(encoder) 28 | return encoder -------------------------------------------------------------------------------- /requirements_all.txt: -------------------------------------------------------------------------------- 1 | jinja2 2 | tqdm 3 | requests 4 | pandas 5 | regex 6 | datasets 7 | soundfile 8 | librosa 9 | torch==2.6.0 # average 10 | torchaudio==2.6.0 # average 11 | transformers==4.45.0 # average 12 | flash-attn==2.7.2.post1 13 | accelerate>=0.26.0 14 | 15 | # minicpm 16 | pillow 17 | vector-quantize-pytorch==1.18.5 18 | vocos==0.1.0 19 | 20 | # speech-gpt2 21 | einops 22 | 23 | # baichuan 24 | av==12.3.0 25 | fire==0.4.0 26 | ujson==5.10.0 27 | easydict==1.13 28 | diffusers==0.24.0 29 | deepspeed 30 | decord==0.6.0 31 |
opencv-python==4.10.0.84 32 | imagesize==1.4.1 33 | cairosvg==2.7.1 34 | 35 | # glm4voice 36 | hyperpyyaml 37 | conformer==0.3.2 38 | diffusers==0.27.2 39 | huggingface_hub==0.25.2 40 | lightning==2.2.4 41 | rich==13.7.1 42 | gdown==5.1.0 43 | wget==3.2 44 | matplotlib 45 | 46 | # llama-omni 47 | openai-whisper==20240930 48 | fairseq==0.12.2 49 | 50 | # qwen2.5-omni 51 | qwen-omni-utils==0.0.4 52 | # transformers==4.52.3 53 | 54 | # kimi 55 | loguru 56 | blobfile 57 | timm 58 | torchdyn==1.0.6 59 | # transformers>=4.48.3 60 | 61 | # ============ evaluate ============== 62 | WeTextProcessing==1.0.3 63 | #vllm # choice 64 | jiwer 65 | funasr 66 | zhon 67 | zhconv 68 | onnxruntime==1.18.1 69 | scipy 70 | --find-links https://csukuangfj.github.io/kaldifeat/cuda.html 71 | kaldifeat==1.25.5 # choice, for dialect classify -------------------------------------------------------------------------------- /src/models/src_freezeomni/decoder/ticodec/vqvae_tester.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import torch 5 | import torch.nn as nn 6 | 7 | from .vqvae import VQVAE 8 | 9 | class VqvaeTester(nn.Module): 10 | def __init__(self, config_path, model_path, sample_rate=24000): 11 | super().__init__() 12 | self.vqvae = VQVAE(config_path, model_path, with_encoder=True) 13 | self.sample_rate = sample_rate 14 | 15 | @torch.no_grad() 16 | def forward(self, wav_path): 17 | # mono-channel audio 18 | # wav.shape (T, ), loaded at the model's sample rate 19 | wav, sr = librosa.load(wav_path, sr=self.sample_rate) 20 | fid = os.path.basename(wav_path)[:-4] 21 | wav = torch.tensor(wav).unsqueeze(0) 22 | wav = wav.cuda() 23 | # vq_codes is acoustic token 24 | vq_codes, global_token = self.vqvae.encode(wav) 25 | # import pdb; pdb.set_trace()  # leftover debug breakpoint, disabled 26 | syn = self.vqvae(vq_codes, global_token) 27 | return fid, syn 28 | 29 | @torch.no_grad() 30 | def vq(self, wav_path): 31 | wav, sr = librosa.load(wav_path, sr=self.sample_rate) 32 | fid = os.path.basename(wav_path)[:-4] 33 | wav = torch.tensor(wav).unsqueeze(0) 34 | wav = wav.cuda() 35 | # vq_codes is acoustic token 36 | vq_codes, global_token = self.vqvae.encode(wav) 37 | return fid, vq_codes, global_token 38 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/projector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Projector modules.""" 5 | 6 | import torch 7 | 8 | from .conv_layers import NonCausalConv1d, CausalConv1d 9 | 10 | 11 | class Projector(torch.nn.Module): 12 | def __init__(self, 13 | mode, 14 | input_channels, 15 | output_channels, 16 | stride=1, 17 | bias=False, 18 | model='conv1d', 19 | ): 20 | super().__init__() 21 | self.mode = mode 22 | if self.mode == 'noncausal': 23 | Conv1d = NonCausalConv1d 24 | elif self.mode == 'causal': 25 | Conv1d = CausalConv1d 26 | else: 27 | raise NotImplementedError(f"Mode ({mode}) is not supported!") 28 | 29 | if model == 'conv1d': 30 | self.project = Conv1d(input_channels, output_channels, kernel_size=1, stride=stride, bias=bias) 31 | elif model == 'conv1d_bn': 32 | self.project = torch.nn.Sequential( 33 | Conv1d(input_channels, output_channels, kernel_size=1, stride=stride, bias=bias), 34 | torch.nn.BatchNorm1d(output_channels) 35 | ) 36 | else: 37 | raise NotImplementedError(f"Model ({model}) is not supported!") 38 | 39 | def forward(self, x): 40 | return self.project(x) 41 | 42 | def encode(self,
x): 43 | return self.project.inference(x) 44 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/tokenizer/glm4_tokenizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import librosa 3 | import os 4 | 5 | from transformers import WhisperFeatureExtractor 6 | from src.models.src_glm4.speech_tokenizer.modeling_whisper import WhisperVQEncoder 7 | from src.models.src_glm4.speech_tokenizer.utils import extract_speech_token 8 | from torch import nn 9 | 10 | 11 | class Glm4Tokenizer(nn.Module): 12 | def __init__(self, tokenizer_path): 13 | super().__init__() 14 | self.whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path).eval() 15 | self.feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path) 16 | 17 | def tokenize(self, speech=None, audio_path=None, sr=16000): 18 | if audio_path: 19 | audio, sr = librosa.load(audio_path, sr=16000) 20 | audio = torch.tensor(audio).unsqueeze(0) 21 | audio_info = (audio, sr) 22 | else: 23 | assert speech is not None 24 | assert sr 25 | if isinstance(speech, list): 26 | speech = torch.tensor(speech).unsqueeze(0) 27 | if len(speech.shape) == 1: 28 | speech = speech.unsqueeze(0) 29 | audio_info = (speech, sr) 30 | 31 | audio_tokens = extract_speech_token( 32 | self.whisper_model, self.feature_extractor, [audio_info] 33 | )[0] 34 | audio_tokens = torch.tensor(audio_tokens).unsqueeze(0) 35 | return audio_tokens 36 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import sys 4 | import debugpy 5 | import torchaudio 6 | import torch 7 | import numpy as np 8 | 9 | def set_logging(): 10 | rank = os.environ.get("RANK", 0) 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | stream=sys.stdout, 14 | format=f"%(asctime)s [RANK {rank}] (%(module)s:%(lineno)d) %(levelname)s : %(message)s", 15 | ) 16 | 17 | def waiting_for_debug(ip, port): 18 | rank = os.environ.get("RANK", "0") 19 | debugpy.listen((ip, port)) 20 | logging.info(f"[rank = {rank}] Waiting for debugger attach...") 21 | debugpy.wait_for_client() 22 | logging.info(f"[rank = {rank}] Debugger attached") 23 | 24 | def load_audio(audio_path, target_sample_rate): 25 | wav, raw_sample_rate = torchaudio.load(audio_path) # (1, T) tensor 26 | if raw_sample_rate != target_sample_rate: 27 | wav = torchaudio.functional.resample(wav, raw_sample_rate, target_sample_rate) # tensor 28 | wav = np.expand_dims(wav.squeeze(0).numpy(), axis=1) 29 | wav = torch.tensor(wav).reshape(1, 1, -1) 30 | return wav 31 | 32 | def save_audio(audio_outpath, audio_out, sample_rate): 33 | print(audio_outpath, audio_out, sample_rate) 34 | torchaudio.save( 35 | audio_outpath, 36 | audio_out, 37 | sample_rate=sample_rate, 38 | encoding='PCM_S', 39 | bits_per_sample=16 40 | ) 41 | logging.info(f"success save audio at {audio_outpath}") -------------------------------------------------------------------------------- /src/prompt/template.py: -------------------------------------------------------------------------------- 1 | """ 2 | from https://github.com/OpenBMB/UltraEval-Audio/blob/main/src/prompt/base.py 3 | """ 4 | import json 5 | from functools import singledispatch 6 | from typing import Any, Dict, List 7 | from jinja2 import StrictUndefined, Template 8 | from jinja2.exceptions import UndefinedError 9 | 
from src.config import TemplateStruct 10 | 11 | @singledispatch 12 | def _load(t: Any, **kwargs: Any) -> Any: 13 | return t 14 | 15 | 16 | @_load.register 17 | def _(t: str, **kwargs: Any) -> str: 18 | def getvar(name: str, default=None): # for multiturn 19 | return kwargs.get(name, default) 20 | 21 | template = Template(t, undefined=StrictUndefined) 22 | try: 23 | rendered = template.render(**kwargs, getvar=getvar) 24 | # add for multiturn template 25 | try: 26 | return json.loads(rendered) 27 | except json.JSONDecodeError: 28 | return rendered 29 | except UndefinedError as e: 30 | raise ValueError("{}: template is {}\ndoc is {}".format(e, t, kwargs)) 31 | 32 | 33 | @_load.register 34 | def _(t: list, **kwargs: Any) -> List[Any]: 35 | return [_load(item, **kwargs) for item in t] 36 | 37 | 38 | @_load.register 39 | def _(t: dict, **kwargs: Any) -> Dict[Any, Any]: 40 | return {k: _load(v, **kwargs) for k, v in t.items()} 41 | 42 | 43 | class DataTemplate: 44 | def __init__(self, template: TemplateStruct): 45 | self.template = template 46 | 47 | def load(self, **kwargs): 48 | return _load(self.template, **kwargs) 49 | -------------------------------------------------------------------------------- /tools/test_eval_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | 5 | from src.dataset import BatchLoader, BatchSaver 6 | from src.registry import registry 7 | 8 | dataset_name = "esd" 9 | eval_task = "dialect_classify" # wer dnsmos emotion_response dialect_classify 10 | eval_task_cfg = registry.get_eval_task(eval_task) 11 | evaluator = registry.get_evaluator(eval_task_cfg.evaluator) 12 | summarizer = registry.get_summarizer(eval_task_cfg.summarizer) 13 | 14 | pred_file = f"{dataset_name}.jsonl" 15 | save_file = f"{dataset_name}_{eval_task}.jsonl" 16 | 17 | scores = [] 18 | all_results = [] 19 | data_loader = BatchLoader(pred_file, batch_size=1) 20 | saver = BatchSaver(save_file) 21 | 22 | for batch_data in data_loader: 23 | keys, preds, refs, pred_info_list = [ 24 | list(x) for x in zip(*[ 25 | ( 26 | d["key"], 27 | d["pred"], 28 | d["ref"] if isinstance(d["ref"], list) else [d["ref"]], 29 | {k: d[k] for k in d if k not in ("pred", "ref")} 30 | ) 31 | for d in batch_data 32 | ]) 33 | ] 34 | eval_results = evaluator.evaluate(preds, refs, pred_info_list) 35 | if len(eval_results) != len(pred_info_list): 36 | raise ValueError("Lost some results...") 37 | 38 | for result, pred_info in zip(eval_results, pred_info_list): 39 | result.update(pred_info) 40 | scores.append(result["score"]) 41 | all_results.append(result) 42 | 43 | saver.save_all(all_results) 44 | stat = summarizer.statistic(scores) 45 | print(f"stage: {eval_task}, total_score: {stat}") -------------------------------------------------------------------------------- /src/models/src_freezeomni/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import re 3 | import os 4 | import yaml 5 | from .audioLLM import AudioLLM 6 | 7 | from .encoder.cmvn import GlobalCMVN, load_cmvn 8 | from .encoder.encoder import speechEncoder 9 | 10 | def load_checkpoint(model: torch.nn.Module, path: str) -> dict: 11 | if torch.cuda.is_available(): 12 | print('Checkpoint: loading from checkpoint %s for GPU' % path) 13 | checkpoint = torch.load(path) 14 | else: 15 | print('Checkpoint: loading from checkpoint %s for CPU' % path) 16 | checkpoint = 
torch.load(path, map_location='cpu') 17 | 18 | # load parm from checkpoint 19 | model.load_state_dict(checkpoint, strict=False) 20 | 21 | info_path = re.sub('.pt$', '.yaml', path) 22 | configs = {} 23 | # get configs 24 | if os.path.exists(info_path): 25 | with open(info_path, 'r') as fin: 26 | configs = yaml.safe_load(fin) 27 | return configs 28 | 29 | def init_encoder_llm(configs): 30 | if configs['cmvn_file'] is not None: 31 | # read cmvn 32 | mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) 33 | # init cmvn layer 34 | global_cmvn = GlobalCMVN( 35 | torch.from_numpy(mean).float(), 36 | torch.from_numpy(istd).float()) 37 | else: 38 | global_cmvn = None 39 | 40 | input_dim = configs['input_dim'] 41 | vocab_size = configs['output_dim'] 42 | 43 | # init speech encoder 44 | encoder = speechEncoder(input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) 45 | # init audioLLM 46 | model = AudioLLM(encoder=encoder, **configs['model_conf']) 47 | 48 | return model 49 | -------------------------------------------------------------------------------- /registry/dataset/aqa.yaml: -------------------------------------------------------------------------------- 1 | llamaqa-en: 2 | class: src.dataset.BatchLoader 3 | args: 4 | file: Tele-AI/TELEVAL/llamaqa-en 5 | ref_col: answer 6 | query_col: query 7 | batch_size: 1 8 | 9 | llamaqa-zh: 10 | class: src.dataset.BatchLoader 11 | args: 12 | file: Tele-AI/TELEVAL/llamaqa-zh 13 | ref_col: answer 14 | query_col: query 15 | batch_size: 1 16 | 17 | triviaqa-en: 18 | class: src.dataset.BatchLoader 19 | args: 20 | file: Tele-AI/TELEVAL/triviaqa-en 21 | ref_col: answer 22 | query_col: query 23 | batch_size: 1 24 | 25 | triviaqa-zh: 26 | class: src.dataset.BatchLoader 27 | args: 28 | file: Tele-AI/TELEVAL/triviaqa-zh 29 | ref_col: answer 30 | query_col: query 31 | batch_size: 1 32 | 33 | webq-en: 34 | class: src.dataset.BatchLoader 35 | args: 36 | file: Tele-AI/TELEVAL/webq-en 37 | ref_col: answer 38 | query_col: query 39 | batch_size: 1 40 | 41 | webq-zh: 42 | class: src.dataset.BatchLoader 43 | args: 44 | file: Tele-AI/TELEVAL/webq-zh 45 | ref_col: answer 46 | query_col: query 47 | batch_size: 1 48 | 49 | chinesesimpleqa-zh: 50 | class: src.dataset.BatchLoader 51 | args: 52 | file: Tele-AI/TELEVAL/chinesesimpleqa-zh 53 | ref_col: answer 54 | query_col: query 55 | batch_size: 1 56 | 57 | chinese_quiz-zh: 58 | class: src.dataset.BatchLoader 59 | args: 60 | file: Tele-AI/TELEVAL/chinese_quiz-zh 61 | ref_col: answer 62 | query_col: query 63 | batch_size: 1 64 | 65 | livelihood_policy-zh: 66 | class: src.dataset.BatchLoader 67 | args: 68 | file: Tele-AI/TELEVAL/livelihood_policy-zh 69 | ref_col: answer 70 | query_col: query 71 | batch_size: 1 -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/utils/special_tokens.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class ExtraTokens: 6 | msg_end: int 7 | user_msg_start: int 8 | assistant_msg_start: int 9 | 10 | media_begin: int 11 | media_end: int 12 | 13 | kimia_text_blank: int 14 | kimia_text_eos: int 15 | 16 | kimia_user_msg_start: int 17 | kimia_assistant_msg_start: int 18 | 19 | kimia_speech_ct_id: int 20 | kimia_speech_ctd_id: int 21 | 22 | pad: int 23 | 24 | 25 | def instantiate_extra_tokens(tokenizer): 26 | if hasattr(tokenizer, "special_tokens"): 27 | map_fn = lambda x: tokenizer.special_tokens[x] 28 | elif hasattr(tokenizer, 
"convert_tokens_to_ids"): 29 | map_fn = lambda x: tokenizer.convert_tokens_to_ids(x) 30 | else: 31 | raise ValueError(f"Invalid tokenizer type: {type(tokenizer)}") 32 | return ExtraTokens( 33 | msg_end=map_fn("<|im_msg_end|>"), # 0 34 | user_msg_start=map_fn("<|im_user_msg_start|>"), # 1 35 | assistant_msg_start=map_fn("<|im_assistant_msg_start|>"), # 2 36 | media_begin=map_fn("<|im_media_begin|>"), # 13 37 | media_end=map_fn("<|im_media_end|>"), # 15 38 | kimia_text_blank=map_fn("<|im_kimia_text_blank|>"), # 18 39 | kimia_text_eos=map_fn("<|im_kimia_text_eos|>"), # 19 40 | kimia_user_msg_start=map_fn("<|im_kimia_user_msg_start|>"), # 22 41 | kimia_assistant_msg_start=map_fn("<|im_kimia_assistant_msg_start|>"), # 23 42 | kimia_speech_ct_id=map_fn("<|im_kimia_speech_ct_id|>"), # 27 43 | kimia_speech_ctd_id=map_fn("<|im_kimia_speech_ctd_id|>"), # 28 44 | pad=tokenizer.pad_id, 45 | ) 46 | -------------------------------------------------------------------------------- /src/evaluator/emo2vec.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from src.evaluator.base import Evaluator 3 | 4 | class Emo2vec(Evaluator): 5 | def __init__(self, model: str, strict: bool = True): 6 | from funasr import AutoModel 7 | self.model = AutoModel(model=model, hub="ms", disable_update=True) 8 | self.strict = strict 9 | 10 | def evaluate(self, preds, refs, pred_info_list: List[Dict], **kwargs): 11 | # emo2vec model support batch generate 12 | pred_audios = [info["pred_audio"] for info in pred_info_list] 13 | model_outputs = self.model.generate( 14 | pred_audios, output_dir=None, granularity="utterance", extract_embedding=False 15 | ) 16 | 17 | results = [] 18 | for output, info in zip(model_outputs, pred_info_list): 19 | label_scores = { 20 | label.split("/")[-1].lower(): score 21 | for label, score in zip(output["labels"], output["scores"]) 22 | } 23 | ref_emotions = [emo.lower() for emo in info["answer_emo"]] 24 | 25 | if self.strict: 26 | neutral_count = sum(1 for emo in ref_emotions if emo == "neutral") 27 | if neutral_count <= len(ref_emotions) // 2: 28 | # remove "neutral" 29 | filtered_ref_emotions = [emo for emo in ref_emotions if emo != "neutral"] 30 | else: 31 | filtered_ref_emotions = ref_emotions 32 | else: 33 | filtered_ref_emotions = ref_emotions 34 | 35 | score = max((label_scores.get(emo, 0) for emo in filtered_ref_emotions), default=0) 36 | results.append({"key": info["key"], "score": score}) 37 | return results 38 | -------------------------------------------------------------------------------- /src/models/src_baichuan/cosy24k_vocoder/cosy24k_vocoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .hifigan.generator import HiFTGenerator 4 | from .hifigan.f0_predictor import ConvRNNF0Predictor 5 | 6 | 7 | class Cosy24kVocoder(nn.Module): 8 | def __init__(self): 9 | super().__init__() 10 | self.hifigan_generator = HiFTGenerator( 11 | in_channels=80, 12 | base_channels=512, 13 | nb_harmonics=8, 14 | sampling_rate=24000, 15 | nsf_alpha=0.1, 16 | nsf_sigma=0.003, 17 | nsf_voiced_threshold=10, 18 | upsample_rates=[8, 5, 3], 19 | upsample_kernel_sizes=[16, 11, 7], 20 | resblock_kernel_sizes=[3, 7, 11], 21 | resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], 22 | source_resblock_kernel_sizes=[7, 7, 11], 23 | source_resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], 24 | lrelu_slope=0.1, 25 | audio_limit=0.99, 26 | 
f0_predictor=ConvRNNF0Predictor( 27 | num_class=1, 28 | in_channels=80, 29 | cond_channels=512, 30 | ), 31 | ) 32 | 33 | def decode(self, mel, device="cuda"): 34 | """ 35 | Args: mel: (batch_size, n_frames, n_mel) 36 | """ 37 | generated_speech, f0 = self.hifigan_generator.forward( 38 | {"speech_feat": mel.transpose(1, 2)}, device=device 39 | ) 40 | return generated_speech 41 | 42 | @classmethod 43 | def from_pretrained(cls, model_path: str): 44 | """Load a pretrained model from a checkpoint.""" 45 | model = cls() 46 | model.hifigan_generator.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))) 47 | model.eval() 48 | return model 49 | -------------------------------------------------------------------------------- /src/evaluator/text_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import zhconv 4 | from zhon.hanzi import punctuation as zh_punct 5 | 6 | from tn.chinese.normalizer import Normalizer as ZhNormalizer 7 | from tn.english.normalizer import Normalizer as EnNormalizer 8 | 9 | 10 | class TextProcessor: 11 | RE_PUNCTUATION = re.compile(rf'[{re.escape(zh_punct + string.punctuation)}]') 12 | RE_SPACES = re.compile(r'[\s\u3000]+') 13 | 14 | def __init__(self, language: str = "zh"): 15 | self.language = language 16 | if self.language == "zh": 17 | self.normalizer = ZhNormalizer() 18 | elif self.language == "en": 19 | self.normalizer = EnNormalizer() 20 | else: 21 | raise ValueError(f"Unsupported language: {self.language}") 22 | 23 | @staticmethod 24 | def clean_text(text: str, remove_punct: bool = True, remove_space: bool = True) -> str: 25 | if remove_punct: 26 | text = TextProcessor.RE_PUNCTUATION.sub('', text) 27 | if remove_space: 28 | text = TextProcessor.RE_SPACES.sub('', text) 29 | return text 30 | 31 | @staticmethod 32 | def convert_cn(text: str) -> str: 33 | return zhconv.convert(text, 'zh-cn') 34 | 35 | def normalize_text(self, text: str) -> str: 36 | return self.normalizer.normalize(text) 37 | 38 | def normalize_and_clean(self, text: str, 39 | do_normalize: bool = True, 40 | simplified_zh: bool = True, 41 | remove_punct: bool = True, 42 | remove_space: bool = True) -> str: 43 | if simplified_zh: 44 | text = self.convert_cn(text) 45 | if do_normalize: 46 | text = self.normalize_text(text) 47 | return self.clean_text(text, remove_punct=remove_punct, remove_space=remove_space) -------------------------------------------------------------------------------- /registry/model/offline.yaml: -------------------------------------------------------------------------------- 1 | kimi-audio-7b-instruct: 2 | class: src.models.kimi_audio.Kimi 3 | args: 4 | path: path/to/Kimi-Audio-7B-Instruct 5 | whisper_path: path/to/whisper-large-v3 6 | glm4_tokenizer: path/to/glm-4-voice-tokenizer 7 | sample_params: 8 | gen_type: greedy 9 | 10 | qwen2_5_omni: 11 | class: src.models.qwen2_omni.Qwen2Omni 12 | args: 13 | path: path/to/Qwen2.5-Omni-7B 14 | sample_params: 15 | gen_type: greedy 16 | 17 | glm-4-voice-9b: 18 | class: src.models.glm4voice.GLM4voice 19 | args: 20 | path: path/to/glm-4-voice-9b 21 | speech_tokenizer_path: path/to/glm-4-voice-tokenizer 22 | flow_path: path/to/glm-4-voice-decoder 23 | sample_params: 24 | gen_type: greedy 25 | 26 | MiniCPMo2_6-audio: 27 | class: src.models.mini_cpm.MiniCPMoAudio 28 | args: 29 | path: path/to/MiniCPM-o-2_6 30 | sample_params: 31 | gen_type: greedy 32 | 33 | baichuan_omni_1d5: 34 | class: src.models.baichuan.BaichuanOmni 35 | args: 36 | path: 
path/to/Baichuan-Omni-1d5 37 | cosy_vocoder_path: path/to/Baichuan-Omni-1d5/hift.pt # third_party/cosy24k_vocoder/hift.pt 38 | sample_params: 39 | gen_type: greedy 40 | 41 | llama_omni: 42 | class: src.models.llama_omni.LlamaOmni 43 | args: 44 | path: path/to/Llama-3.1-8B-Omni 45 | vocoder_path: path/to/Llama-3.1-8B-Omni/vocoder 46 | sample_params: 47 | gen_type: greedy 48 | 49 | speechgpt2: 50 | class: src.models.speechgpt2.SpeechGPT2 51 | args: 52 | path: path/to/SpeechGPT-2-preview-7B 53 | codec_ckpt_path: path/to/SpeechGPT-2.0-preview-Codec/sg2_codec_ckpt.pkl 54 | sample_params: 55 | gen_type: greedy 56 | 57 | freeze_omni: 58 | class: src.models.freeze_omni.FreezeOmni 59 | args: 60 | path: path/to/Freeze-Omni/checkpoints 61 | llm_path: path/to/Qwen2-7B-Instruct 62 | sample_params: 63 | gen_type: greedy -------------------------------------------------------------------------------- /src/models/src_glm4/speech_tokenizer/configuration_whisper.py: -------------------------------------------------------------------------------- 1 | from transformers import WhisperConfig 2 | 3 | 4 | class WhisperVQConfig(WhisperConfig): 5 | def __init__(self, 6 | pooling_kernel_size=None, 7 | pooling_type="max", 8 | pooling_position=0, 9 | quantize_vocab_size=None, 10 | quantize_position=16, 11 | quantize_commit_coefficient=0.25, 12 | quantize_loss_scale=1.0, 13 | quantize_ema_decay=None, 14 | quantize_restart_interval=None, 15 | quantize_encoder_only=False, 16 | quantize_causal_encoder=False, 17 | quantize_causal_block_size=None, 18 | skip_language_detection=False, 19 | encoder_causal_attention=False, 20 | encoder_causal_convolution=False, 21 | **kwargs): 22 | self.pooling_kernel_size = pooling_kernel_size 23 | self.pooling_type = pooling_type 24 | self.pooling_position = pooling_position 25 | self.quantize_vocab_size = quantize_vocab_size 26 | self.quantize_position = quantize_position 27 | self.quantize_commit_coefficient = quantize_commit_coefficient 28 | self.quantize_loss_scale = quantize_loss_scale 29 | self.quantize_ema_decay = quantize_ema_decay 30 | self.quantize_restart_interval = quantize_restart_interval 31 | self.quantize_encoder_only = quantize_encoder_only 32 | self.quantize_causal_encoder = quantize_causal_encoder 33 | self.quantize_causal_block_size = quantize_causal_block_size 34 | self.skip_language_detection = skip_language_detection 35 | self.encoder_causal_attention = encoder_causal_attention 36 | self.encoder_causal_convolution = encoder_causal_convolution 37 | super().__init__(**kwargs) 38 | -------------------------------------------------------------------------------- /src/evaluator/asr.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from jiwer import compute_measures 3 | from src.evaluator.base import Evaluator 4 | from src.utils import parallel_batch 5 | from src.evaluator.text_utils import TextProcessor 6 | 7 | class ASR(Evaluator): 8 | """ 9 | Part from https://github.com/BytedanceSpeech/seed-tts-eval/tree/main 10 | """ 11 | def __init__(self, model: str, max_workers=None): 12 | if max_workers is not None: 13 | self.max_workers = max_workers 14 | from funasr import AutoModel 15 | self.model = AutoModel(model=model, disable_update=True) 16 | self.text_processor = TextProcessor(language="zh") 17 | 18 | @parallel_batch(default_workers=4) 19 | def evaluate(self, pred: str, ref: str, pred_info: Dict, **kwargs): 20 | pred_audio = pred_info["pred_audio"] 21 | res = self.model.generate(input=pred_audio, 
batch_size_s=300) 22 | transcription = res[0]["text"] 23 | 24 | clean_truth, clean_hypo, wer, subs, dele, inse, ref_len = self.compute_wer(hypo=transcription, truth=pred) 25 | score = { 26 | "ref_len": ref_len, 27 | "subs": subs, 28 | "dele": dele, 29 | "inse": inse, 30 | "wer": wer 31 | } 32 | return {"key": pred_info["key"], "clean_trans": clean_hypo, "clean_text": clean_truth, "score": score} 33 | 34 | def compute_wer(self, hypo, truth): 35 | truth = self.text_processor.normalize_and_clean(truth) 36 | hypo = self.text_processor.normalize_and_clean(hypo) 37 | 38 | truth_chars = " ".join(truth) 39 | hypo_chars = " ".join(hypo) 40 | measures = compute_measures(truth_chars, hypo_chars) 41 | ref_len = len(truth) 42 | 43 | wer = measures["wer"] 44 | subs = measures["substitutions"] 45 | dele = measures["deletions"] 46 | inse = measures["insertions"] 47 | 48 | return truth_chars, hypo_chars, wer, subs, dele, inse, ref_len -------------------------------------------------------------------------------- /src/evaluator/base.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, Union, List 2 | from src.evaluator.process import SimpleTokenizer, OptionExtractor 3 | from src.config import RefType, RefsType 4 | from src.utils import parallel_batch 5 | 6 | class Evaluator: 7 | def evaluate(self, pred, label, **kwargs) -> Dict[str, Any]: 8 | raise NotImplementedError 9 | 10 | class ExistMatch(Evaluator): 11 | """ 12 | referred to https://github.com/DevSinghSachan/emdr2/blob/main/tasks/openqa/dense_retriever/evaluation/qa_validation.py 13 | """ 14 | 15 | def __init__(self, keep_punc=False, max_workers=None): 16 | self.keep_punc = keep_punc 17 | if max_workers is not None: 18 | self.max_workers = max_workers 19 | 20 | @parallel_batch(default_workers=4) 21 | def evaluate(self, pred: str, ref: RefsType, pred_info: Dict, **kwargs): 22 | # NOTE (TTTdas): If strict sequential matching is required, set keep_punc=False and simply put the ref into a string 23 | if not isinstance(ref, List): 24 | raise ValueError(f"Need List type ref for ExistMatch, but got {type(ref)} instead") 25 | match = SimpleTokenizer.has_answer(ref, str(pred), uncased=True, keep_punc=self.keep_punc) 26 | return {"key": pred_info["key"], "pred": pred, "ref": ref, "score": 1 if match else 0} 27 | 28 | 29 | class SingleOptionMatch(Evaluator): 30 | def __init__(self, max_workers=None, cushion=False): 31 | self.cushion = cushion 32 | if max_workers is not None: 33 | self.max_workers = max_workers 34 | 35 | @parallel_batch(default_workers=4) 36 | def evaluate(self, pred: str, ref: Union[str, List], pred_info: Dict, **kwargs): 37 | if isinstance(ref, list): 38 | assert len(ref) == 1 39 | ref = ref[0] 40 | match_dict = OptionExtractor.has_answer(ref, str(pred), pred_info.get("query", None), cushion=self.cushion) 41 | return {"key": pred_info["key"], "pred": pred, "ref": ref, "score": match_dict} -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /registry/evaluator/llm.yaml: -------------------------------------------------------------------------------- 1 | acceptance_llm: 2 | class: src.evaluator.llm.LLMScorer 3 | args: 4 | llm_name: gpt4o 5 | judge_task: value_align 6 | api_keys: 7 | key1: "xxx" 8 | key2: "xxx" 9 | key3: "xxx" 10 | max_workers: 3 11 | 12 | human_likeness_llm: 13 | class: src.evaluator.llm.LLMScorer 14 | args: 15 | llm_name: gpt4o 16 | judge_task: humanlike 17 | api_keys: 18 | key1: "xxx" 19 | key2: "xxx" 20 | key3: "xxx" 21 | max_workers: 3 22 | 23 | emo_llm: 24 | class: src.evaluator.llm.LLMScorer 25 | args: 26 | llm_name: gpt4o 27 | judge_task: emotion_understand 28 | api_keys: 29 | key1: "xxx" 30 | key2: "xxx" 31 | key3: "xxx" 32 | max_workers: 3 33 | 34 | aed_llm: 35 | class: src.evaluator.llm.LLMScorer 36 | args: 37 | llm_name: gpt4o 38 | judge_task: aed 39 | api_keys: 40 | key1: "xxx" 41 | key2: "xxx" 42 | key3: "xxx" 43 | max_workers: 3 44 | 45 | dialect_llm: 46 | class: src.evaluator.llm.LLMScorer 47 | args: 48 | llm_name: gpt4o 49 | judge_task: dialect_follow 50 | api_keys: 51 | key1: "xxx" 52 | key2: "xxx" 53 | key3: "xxx" 54 | max_workers: 3 55 | 56 | para_care_llm: 57 | class: src.evaluator.llm.LLMScorer 58 | args: 59 | llm_name: gpt4o 60 | judge_task: para_care 61 | api_keys: 62 | key1: "xxx" 63 | key2: "xxx" 64 | key3: "xxx" 65 | max_workers: 3 66 | 67 | age_care_llm: 68 | class: src.evaluator.llm.LLMScorer 69 | args: 70 | llm_name: gpt4o 71 | judge_task: age_care 72 | api_keys: 73 | key1: "xxx" 74 | key2: "xxx" 75 | key3: "xxx" 76 | max_workers: 3 77 | 78 | llm_offline: 79 | class: src.evaluator.llm.LLMOfflineScorer 80 | args: 81 | llm_name: qwen2.3-72b-instruct 82 | template: judge-qwen2.3_72b_instruct 83 | judge_task: emotion_understand 84 | generate_params: 85 | ngpus: 8 86 | max_tokens: 1024 87 | temperature: 0.7 88 | top_p: 0.8 89 | repetition_penalty: 
1.03 90 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import json 17 | import torchaudio 18 | 19 | 20 | def read_lists(list_file): 21 | lists = [] 22 | with open(list_file, 'r', encoding='utf8') as fin: 23 | for line in fin: 24 | lists.append(line.strip()) 25 | return lists 26 | 27 | def read_json_lists(list_file): 28 | lists = read_lists(list_file) 29 | results = {} 30 | for fn in lists: 31 | with open(fn, 'r', encoding='utf8') as fin: 32 | results.update(json.load(fin)) 33 | return results 34 | 35 | def load_wav(wav, target_sr): 36 | speech, sample_rate = torchaudio.load(wav) 37 | speech = speech.mean(dim=0, keepdim=True) 38 | if sample_rate != target_sr: 39 | assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) 40 | speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) 41 | return speech 42 | 43 | def speed_change(waveform, sample_rate, speed_factor: str): 44 | effects = [ 45 | ["tempo", speed_factor], # speed_factor 46 | ["rate", f"{sample_rate}"] 47 | ] 48 | augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( 49 | waveform, 50 | sample_rate, 51 | effects 52 | ) 53 | return augmented_waveform, new_sample_rate 54 | -------------------------------------------------------------------------------- /registry/infer_task/aqa.yaml: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------audio qa------------------------------------- 3 | aqa-llamaqa-en: 4 | class: src.config.InferTaskCfg 5 | args: 6 | dataset: llamaqa-en 7 | template: zeroshot-aqa 8 | model: qwen2_5_omni 9 | save_pred_audio: False 10 | eval_task: basic 11 | 12 | aqa-llamaqa-zh: 13 | class: src.config.InferTaskCfg 14 | args: 15 | dataset: llamaqa-zh 16 | template: zeroshot-aqa 17 | model: qwen2_5_omni 18 | save_pred_audio: False 19 | eval_task: basic 20 | 21 | aqa-triviaqa-en: 22 | class: src.config.InferTaskCfg 23 | args: 24 | dataset: triviaqa-en 25 | template: zeroshot-aqa 26 | model: qwen2_5_omni 27 | save_pred_audio: False 28 | eval_task: basic 29 | 30 | aqa-triviaqa-zh: 31 | class: src.config.InferTaskCfg 32 | args: 33 | dataset: triviaqa-zh 34 | template: zeroshot-aqa 35 | model: qwen2_5_omni 36 | save_pred_audio: False 37 | eval_task: basic 38 | 39 | aqa-webq-en: 40 | class: src.config.InferTaskCfg 41 | args: 42 | dataset: webq-en 43 | template: zeroshot-aqa 44 | model: qwen2_5_omni 45 | save_pred_audio: False 46 | eval_task: basic 47 | 48 | aqa-webq-zh: 49 | class: src.config.InferTaskCfg 50 | args: 51 | dataset: webq-zh 52 | template: zeroshot-aqa 53 | model: qwen2_5_omni 
54 | save_pred_audio: False 55 | eval_task: basic 56 | 57 | aqa-chinesesimpleqa-zh: 58 | class: src.config.InferTaskCfg 59 | args: 60 | dataset: chinesesimpleqa-zh 61 | template: zeroshot-aqa 62 | model: qwen2_5_omni 63 | save_pred_audio: False 64 | eval_task: basic 65 | 66 | aqa-chinese_quiz-zh: 67 | class: src.config.InferTaskCfg 68 | args: 69 | dataset: chinese_quiz-zh 70 | template: zeroshot-aqa 71 | model: qwen2_5_omni 72 | save_pred_audio: False 73 | eval_task: basic 74 | 75 | aqa-livelihood_policy-zh: 76 | class: src.config.InferTaskCfg 77 | args: 78 | dataset: livelihood_policy-zh 79 | template: zeroshot-aqa 80 | model: qwen2_5_omni 81 | save_pred_audio: False 82 | eval_task: basic 83 | 84 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = ( 15 | int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 16 | ) 17 | self.stride = ratio 18 | self.pad = self.kernel_size // ratio - 1 19 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 20 | self.pad_right = ( 21 | self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 22 | ) 23 | filter = kaiser_sinc_filter1d( 24 | cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size 25 | ) 26 | self.register_buffer("filter", filter) 27 | 28 | # x: [B, C, T] 29 | def forward(self, x): 30 | _, C, _ = x.shape 31 | 32 | x = F.pad(x, (self.pad, self.pad), mode="replicate") 33 | x = self.ratio * F.conv_transpose1d( 34 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C 35 | ) 36 | x = x[..., self.pad_left : -self.pad_right] 37 | 38 | return x 39 | 40 | 41 | class DownSample1d(nn.Module): 42 | def __init__(self, ratio=2, kernel_size=None): 43 | super().__init__() 44 | self.ratio = ratio 45 | self.kernel_size = ( 46 | int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 47 | ) 48 | self.lowpass = LowPassFilter1d( 49 | cutoff=0.5 / ratio, 50 | half_width=0.6 / ratio, 51 | stride=ratio, 52 | kernel_size=self.kernel_size, 53 | ) 54 | 55 | def forward(self, x): 56 | xx = self.lowpass(x) 57 | 58 | return xx 59 | -------------------------------------------------------------------------------- /tools/save_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import shutil 5 | import pandas as pd 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--root_dir", default="res") 10 | parser.add_argument("--transpose", default=True) 11 | return parser.parse_args() 12 | 13 | def main(): 14 | results = dict() 15 | args = get_args() 16 | summary_dir = os.path.join(args.root_dir, "summary") 17 | column_order = [] 18 | for model_name in os.listdir(summary_dir): 19 | model_path = os.path.join(summary_dir, model_name) 20 | if not os.path.isdir(model_path): 21 | continue 22 | 23 | results[model_name] = 
dict() 24 | 25 | jsonl_files = [f for f in os.listdir(model_path) if f.endswith(".jsonl")] 26 | jsonl_files.sort() 27 | 28 | 29 | for jsonl_file in jsonl_files: 30 | dataset_name = os.path.splitext(jsonl_file)[0] 31 | if dataset_name not in column_order: 32 | column_order.append(dataset_name) 33 | 34 | file_path = os.path.join(model_path, jsonl_file) 35 | 36 | try: 37 | with open(file_path, "r", encoding="utf-8") as f: 38 | score_str = "" 39 | line = f.readline().strip() 40 | data = json.loads(line) 41 | for key, value in data.items(): 42 | score_str += str(value) + " " 43 | results[model_name][dataset_name] = score_str 44 | except Exception as e: 45 | print(f"fail to read {file_path}: {e}") 46 | raise e 47 | 48 | df = pd.DataFrame.from_dict(results, orient="index") 49 | df = df.reindex(columns=column_order) 50 | 51 | if args.transpose: 52 | df = df.T 53 | df.to_csv(f"{args.root_dir}/results.csv", encoding="utf-8") 54 | 55 | print("========================== results ==========================", flush=True) 56 | terminal_width = shutil.get_terminal_size().columns 57 | pd.set_option("display.max_columns", None) 58 | pd.set_option("display.width", terminal_width) 59 | print(df) 60 | 61 | if __name__ == "__main__": 62 | main() -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /src/models/src_baichuan/cosy24k_vocoder/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /src/models/telechat2.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | from typing import Dict, Any 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from src.models.base import Model 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | class TeleChat2(Model): 10 | def __init__(self, path: str, sample_params: Dict[str, Any] = None): 11 | super().__init__(sample_params) 12 | logger.info("start load model from {}".format(path)) 13 | self.model = AutoModelForCausalLM.from_pretrained( 14 | path, 15 | device_map="auto", 16 | trust_remote_code=True, 17 | torch_dtype=torch.float16 18 | ).eval() 19 | logger.info("successfully load model from {}".format(path)) 20 | 21 | self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 22 | config = { 23 | "greedy": { 24 | "do_sample": False, 25 | "max_new_tokens": 1024, 26 | "top_k": None, 27 | "num_beams": 1, 28 | "temperature": None, 29 | "top_p": None 30 | } 31 | } 32 | self.generation_config = config.get(self.sample_params.get("gen_type", "greedy"), None) 33 | logger.info("generation_config: {}".format(self.generation_config)) 34 | 35 | 36 | def generate_once(self, audio, **kwargs): 37 | content = kwargs["query"] 38 | 39 | messages = [ 40 | {"role": "user", "content": content} 41 | ] 42 | text = self.tokenizer.apply_chat_template( 43 | messages, 44 | tokenize=False, 45 | add_generation_prompt=True 46 | ) 47 | model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) 48 | 49 | generated_ids = self.model.generate( 50 | **model_inputs, 51 | **self.generation_config 52 | ) 53 | 54 | generated_ids = [ 55 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 56 | ] 57 | response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 58 | 59 | return {"pred": response} 60 | 61 | -------------------------------------------------------------------------------- /src/models/src_freezeomni/decoder/ticodec/vqvae.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from .models import Encoder 7 | from .models import Generator 8 | 
from .models import Quantizer 9 | 10 | class AttrDict(dict): 11 | def __init__(self, *args, **kwargs): 12 | super(AttrDict, self).__init__(*args, **kwargs) 13 | self.__dict__ = self 14 | 15 | class VQVAE(nn.Module): 16 | def __init__(self, 17 | config_path, 18 | ckpt_path, 19 | with_encoder=False): 20 | super(VQVAE, self).__init__() 21 | ckpt = torch.load(ckpt_path) 22 | with open(config_path) as f: 23 | data = f.read() 24 | json_config = json.loads(data) 25 | self.h = AttrDict(json_config) 26 | # self.gst = GST() 27 | # self.gst = Proposed(n_specs=128, token_num=10, E=128, n_layers=4) 28 | self.quantizer = Quantizer(self.h) 29 | self.generator = Generator(self.h) 30 | self.generator.load_state_dict(ckpt['generator']) 31 | self.quantizer.load_state_dict(ckpt['quantizer']) 32 | # self.gst.load_state_dict(ckpt['gst']) 33 | if with_encoder: 34 | self.encoder = Encoder(self.h) 35 | self.encoder.load_state_dict(ckpt['encoder']) 36 | 37 | def forward(self, x, global_style_token): 38 | # x is the codebook 39 | # x.shape (B, T, Nq) 40 | quant_emb = self.quantizer.embed(x) 41 | global_style_quantized_emb = self.quantizer.embed_gst(global_style_token).squeeze(-1) 42 | return self.generator(quant_emb, global_style_quantized_emb) 43 | 44 | def encode(self, x): 45 | batch_size = x.size(0) 46 | if len(x.shape) == 3 and x.shape[-1] == 1: 47 | x = x.squeeze(-1) 48 | # print(x.shape) 49 | 50 | c, global_features = self.encoder(x.unsqueeze(1)) 51 | # mid = mid.transpose(1, 2).unsqueeze(1) 52 | # global_style = self.gst(mid) 53 | q, loss_q, local_token, g, global_style_token = self.quantizer(c, global_features) 54 | local_token = [code.reshape(batch_size, -1) for code in local_token] 55 | global_style_token = torch.stack(global_style_token, -1).unsqueeze(1) 56 | # shape: [N, T, 4] 57 | return torch.stack(local_token, -1), global_style_token 58 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/quantizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import torch 4 | import logging 5 | 6 | from .vq_module import ResidualVQ 7 | 8 | class Quantizer(torch.nn.Module): 9 | def __init__( 10 | self, 11 | train_codebook, 12 | code_dim, 13 | codebook_num, 14 | codebook_size, 15 | kmeans_init, 16 | kmeans_iters, 17 | decay, 18 | threshold_ema_dead_code, 19 | model, 20 | ): 21 | self.quantizer_type = model 22 | super().__init__() 23 | # speech 24 | if model == 'residual_vq': 25 | self.codebook = ResidualVQ( 26 | train_codebook=train_codebook, 27 | dim=code_dim, 28 | num_quantizers=codebook_num, 29 | codebook_size=codebook_size, 30 | kmeans_init=kmeans_init, 31 | kmeans_iters=kmeans_iters, 32 | decay=decay, 33 | threshold_ema_dead_code=threshold_ema_dead_code 34 | ) 35 | else: 36 | raise NotImplementedError(f"Model ({model}) is not supported!") 37 | 38 | def patch_accelerator(self, accelerator): 39 | logging.info(f"[in models/melvqgan/modules/quantizer.py/ Quantizer] patch accelerator !") 40 | self.codebook.patch_accelerator(accelerator) 41 | 42 | def initial(self): 43 | self.codebook.initial() 44 | 45 | def forward(self, z): 46 | zq, embed_nums, vqloss, perplexity, all_layers_output = self.codebook(z.transpose(2, 1)) 47 | all_layers_output = [output.transpose(2, 1) for output in all_layers_output] 48 | zq = zq.transpose(2, 1) 49 | return zq, embed_nums, vqloss, perplexity, all_layers_output 50 | 51 | def inference(self, z): 52 | zq, indices = 
self.codebook.forward_index(z.transpose(2, 1)) 53 | zq = zq.transpose(2, 1) 54 | return zq, indices 55 | 56 | def encode(self, z): # for the model 57 | indices = self.codebook.encode(z.transpose(2, 1)) 58 | return indices # (num_layers, bs, len) 59 | 60 | def decode(self, indices): # for the model (num_layers, bs, len) 61 | zq = self.codebook.decode(indices) 62 | zq = zq.transpose(1, 2) 63 | return zq # (bs, length, dim) 64 | -------------------------------------------------------------------------------- /tools/test_api.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import os 4 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 5 | 6 | from src.dataset import BatchLoader, BatchSaver 7 | from src.registry import registry 8 | 9 | 10 | if __name__ == "__main__": 11 | static_only = False 12 | model_name = "freeze_omni" 13 | evaluator_name = "emo_llm" 14 | judge_task = "emotion" 15 | jsonl_files = ["esd"] 16 | 17 | summarizer = registry.get_summarizer("AvgThreshold") 18 | 19 | for file in jsonl_files: 20 | input_file = f"res/prediction/{model_name}/{file}.jsonl" 21 | save_file = f"res/result/{model_name}/{file}_{judge_task}.jsonl" 22 | 23 | print("processing file: ", input_file) 24 | if static_only: 25 | scores = [] 26 | with open(save_file, "r", encoding="utf-8") as f: 27 | for line in f: 28 | data = json.loads(line) 29 | scores.append(int(data["score"])) 30 | stat = summarizer.statistic(scores) 31 | print(f"file: {file}, total_score: {stat}") 32 | raise RuntimeError 33 | 34 | scores = [] 35 | dataloader = BatchLoader(input_file, batch_size=4) 36 | saver = BatchSaver(save_file) 37 | evaluator = registry.get_evaluator(evaluator_name) 38 | 39 | for idx, batch_data in enumerate(dataloader): 40 | keys, preds, refs, pred_info_list = [ 41 | list(x) for x in zip(*[ 42 | ( 43 | d["key"], 44 | d["pred"], 45 | d["ref"] if isinstance(d["ref"], list) else [d["ref"]], 46 | {k: d[k] for k in d if k not in ("pred", "ref")} 47 | ) 48 | for d in batch_data 49 | ]) 50 | ] 51 | 52 | eval_results = evaluator.evaluate(preds, refs, pred_info_list) 53 | if len(eval_results) != len(pred_info_list): 54 | raise ValueError("Lost some results...") 55 | 56 | for result, pred_info in zip(eval_results, pred_info_list): 57 | scores.append(result["score"]) 58 | result.update(pred_info) 59 | saver.save_one(result) 60 | 61 | stat = summarizer.statistic(scores) 62 | print(f"file: {file}, total_score: {stat}") -------------------------------------------------------------------------------- /tools/parquet2jsonl.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 5 | 6 | import json 7 | import glob 8 | import yaml 9 | from src.utils import load_and_process_parquet_dataset 10 | 11 | def collect_yaml_file_info(yaml_dir): 12 | result = {} 13 | yaml_files = glob.glob(os.path.join(yaml_dir, "*.yaml")) 14 | 15 | for yaml_file in yaml_files: 16 | with open(yaml_file, "r", encoding="utf-8") as f: 17 | data = yaml.safe_load(f) 18 | 19 | for name, config in data.items(): 20 | file_path = config.get("args", {}).get("file", None) 21 | result[name] = file_path 22 | 23 | return result 24 | 25 | def export_parquet_to_jsonl(repo_or_path="Tele-AI/TELEVAL", data_dir_pattern="llamaqa-zh", save_root_dir="./", is_local=False): 26 | print(f"processing {repo_or_path}, {data_dir_pattern} data from huggingface, 
saving to {save_root_dir}") 27 | if "*.parquet" in data_dir_pattern: 28 | base_subdir = os.path.normpath(os.path.dirname(data_dir_pattern)) 29 | else: 30 | base_subdir = os.path.normpath(data_dir_pattern) 31 | 32 | jsonl_filename = os.path.basename(base_subdir) + ".jsonl" 33 | jsonl_path = os.path.join(save_root_dir, base_subdir, jsonl_filename) 34 | audio_output_dir = os.path.join(save_root_dir, "audios", base_subdir) 35 | 36 | os.makedirs(os.path.dirname(jsonl_path), exist_ok=True) 37 | 38 | if os.path.exists(jsonl_path): 39 | print(f"JSONL already exists and will be overwritten: {jsonl_path}") 40 | 41 | records = load_and_process_parquet_dataset( 42 | repo_or_path, data_dir_pattern, audio_output_dir, key_col="key", is_local=is_local, tuple_decode=False 43 | ) 44 | 45 | with open(jsonl_path, "w", encoding="utf-8") as fout: 46 | for record in records: 47 | print(json.dumps(record, ensure_ascii=False), file=fout) 48 | 49 | print(f"JSONL saved to: {jsonl_path}") 50 | print(f"Audio files saved under: {audio_output_dir}") 51 | 52 | if __name__ == "__main__": 53 | save_root_dir = "audiobench_data" 54 | all_dataset = collect_yaml_file_info("registry/dataset") 55 | for dataset, repo_data_dir in all_dataset.items(): 56 | parts = repo_data_dir.split("/", 2) 57 | repo_or_path, data_dir_pattern = "/".join(parts[:2]), parts[-1] 58 | export_parquet_to_jsonl(repo_or_path, data_dir_pattern, save_root_dir) -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/utils/data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class KimiAContent: 5 | def __init__( 6 | self, audio_token_ids=None, text_token_ids=None, is_continuous_mask=None 7 | ): 8 | self.audio_token_ids: list[int] = audio_token_ids or [] 9 | self.text_token_ids: list[int] = text_token_ids or [] 10 | self.is_continuous_mask: list[int] = is_continuous_mask or [] 11 | 12 | self.continuous_feature = [] 13 | 14 | def audio_append(self, index: int, is_continuous: bool = False): 15 | self.audio_token_ids.append(index) 16 | self.is_continuous_mask.append(is_continuous) 17 | 18 | def text_append(self, index: int): 19 | self.text_token_ids.append(index) 20 | 21 | def audio_extend(self, ids: list[int], is_continuous: bool = False): 22 | self.audio_token_ids.extend(ids) 23 | self.is_continuous_mask.extend([is_continuous] * len(ids)) 24 | 25 | def text_extend(self, ids: list[int]): 26 | self.text_token_ids.extend(ids) 27 | 28 | def audio_prepend(self, index: int, is_continuous: bool = False): 29 | self.audio_token_ids = [index] + self.audio_token_ids 30 | self.is_continuous_mask = [is_continuous] + self.is_continuous_mask 31 | 32 | def text_prepend(self, index: int): 33 | self.text_token_ids = [index] + self.text_token_ids 34 | 35 | def audio_pretend(self, ids: list[int], is_continuous: bool = False): 36 | self.audio_token_ids = ids + self.audio_token_ids 37 | self.is_continuous_mask = [is_continuous] * len(ids) + self.is_continuous_mask 38 | 39 | def text_pretend(self, ids: list[int]): 40 | self.text_token_ids = ids + self.text_token_ids 41 | 42 | def merge(self, other: "KimiAContent"): 43 | self.audio_token_ids.extend(other.audio_token_ids) 44 | self.text_token_ids.extend(other.text_token_ids) 45 | self.is_continuous_mask.extend(other.is_continuous_mask) 46 | self.continuous_feature.extend(other.continuous_feature) 47 | 48 | def to_tensor(self) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 49 | return ( 50 | 
torch.tensor([self.audio_token_ids], dtype=torch.long), 51 | torch.tensor([self.text_token_ids], dtype=torch.long), 52 | torch.tensor([self.is_continuous_mask], dtype=torch.bool), 53 | ) 54 | 55 | def is_valid(self): 56 | return ( 57 | len(self.audio_token_ids) 58 | == len(self.text_token_ids) 59 | == len(self.is_continuous_mask) 60 | ) 61 | -------------------------------------------------------------------------------- /src/models/kimi_audio.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import soundfile as sf 4 | from typing import Dict, Any 5 | 6 | from src.models.base import Model 7 | from src.models.src_kimi.kimia_infer.api.kimia import KimiAudio 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class Kimi(Model): 12 | def __init__(self, path: str, whisper_path: str, glm4_tokenizer: str, sample_params: Dict[str, Any] = None): 13 | super().__init__(sample_params) 14 | self.model = KimiAudio( 15 | model_path=path, 16 | whisper_path=whisper_path, 17 | glm4_tokenizer=glm4_tokenizer, 18 | load_detokenizer=True, 19 | split_device=False, # split needs 4.48.3 20 | ) 21 | 22 | config = { 23 | "default": { 24 | "audio_temperature": 0.8, 25 | "audio_top_k": 10, 26 | "text_temperature": 0.0, 27 | "text_top_k": 5, 28 | "audio_repetition_penalty": 1.0, 29 | "audio_repetition_window_size": 64, 30 | "text_repetition_penalty": 1.0, 31 | "text_repetition_window_size": 16 32 | }, 33 | "greedy": { 34 | "audio_temperature": 1e-7, 35 | "text_temperature": 1e-7, 36 | "audio_repetition_penalty": 1.0, 37 | "text_repetition_penalty": 1.0 38 | } # NOTE (TTTdas): temperature > 1e-6 will do sampling 39 | } 40 | self.generation_config = config.get(self.sample_params.get("gen_type", "greedy"), None) 41 | logger.info("generation_config: {}".format(self.generation_config)) 42 | 43 | def generate_once(self, audio, **kwargs): 44 | messages = [] 45 | instruction = kwargs.get("instruct", "") 46 | if len(instruction) > 0: 47 | messages.append({"role": "user", "message_type": "text", "content": instruction}) 48 | 49 | messages.append({"role": "user", "message_type": "audio", "content": audio}) 50 | wav, text = self.model.generate(messages, **self.generation_config, output_type="both") 51 | if kwargs.get("pred_audio"): 52 | sf.write( 53 | kwargs["pred_audio"], 54 | wav.detach().cpu().view(-1).numpy(), 55 | 24000, 56 | ) 57 | 58 | return {"pred": text, "pred_audio": kwargs.get("pred_audio")} 59 | 60 | def generate_multiturn(self, audio, user_history, assistant_history, **kwargs): 61 | raise NotImplementedError("Waiting for Kimi-audio debug...") -------------------------------------------------------------------------------- /requirements/speechgpt2_requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.1.0 2 | accelerate==1.3.0 3 | aiofiles==23.2.1 4 | annotated-types==0.7.0 5 | anyio==4.8.0 6 | audioread==3.0.1 7 | certifi==2024.12.14 8 | cffi==1.17.1 9 | charset-normalizer==3.4.1 10 | click==8.1.8 11 | cramjam==2.9.1 12 | debugpy==1.8.12 13 | decorator==5.1.1 14 | einops==0.8.0 15 | fastapi==0.115.7 16 | fastparquet==2024.11.0 17 | ffmpy==0.5.0 18 | filelock==3.17.0 19 | fsspec==2024.12.0 20 | gradio==5.13.1 21 | gradio_client==1.6.0 22 | grpcio==1.70.0 23 | h11==0.14.0 24 | httpcore==1.0.7 25 | httpx==0.28.1 26 | huggingface-hub==0.27.1 27 | idna==3.10 28 | Jinja2==3.1.5 29 | jiwer==3.0.5 30 | joblib==1.4.2 31 | lazy_loader==0.4 32 | librosa==0.10.2.post1 33 | llvmlite==0.44.0 34 | 
Markdown==3.7 35 | markdown-it-py==3.0.0 36 | MarkupSafe==2.1.5 37 | mdurl==0.1.2 38 | mpmath==1.3.0 39 | msgpack==1.1.0 40 | networkx==3.4.2 41 | numba==0.61.0 42 | numpy==2.1.3 43 | nvidia-cublas-cu12==12.4.5.8 44 | nvidia-cuda-cupti-cu12==12.4.127 45 | nvidia-cuda-nvrtc-cu12==12.4.127 46 | nvidia-cuda-runtime-cu12==12.4.127 47 | nvidia-cudnn-cu12==9.1.0.70 48 | nvidia-cufft-cu12==11.2.1.3 49 | nvidia-curand-cu12==10.3.5.147 50 | nvidia-cusolver-cu12==11.6.1.9 51 | nvidia-cusparse-cu12==12.3.1.170 52 | nvidia-nccl-cu12==2.21.5 53 | nvidia-nvjitlink-cu12==12.4.127 54 | nvidia-nvtx-cu12==12.4.127 55 | orjson==3.10.15 56 | packaging==24.2 57 | pandas==2.2.3 58 | pillow==11.1.0 59 | platformdirs==4.3.6 60 | pooch==1.8.2 61 | protobuf==5.29.3 62 | psutil==6.1.1 63 | pycparser==2.22 64 | pydantic==2.10.6 65 | pydantic_core==2.27.2 66 | pydub==0.25.1 67 | Pygments==2.19.1 68 | python-dateutil==2.9.0.post0 69 | python-multipart==0.0.20 70 | pytz==2024.2 71 | PyYAML==6.0.2 72 | RapidFuzz==3.11.0 73 | regex==2024.11.6 74 | requests==2.32.3 75 | rich==13.9.4 76 | ruff==0.9.3 77 | safehttpx==0.1.6 78 | safetensors==0.5.2 79 | scikit-learn==1.6.1 80 | scipy==1.15.1 81 | semantic-version==2.10.0 82 | setuptools==75.1.0 83 | shellingham==1.5.4 84 | six==1.17.0 85 | sniffio==1.3.1 86 | sounddevice==0.5.1 87 | soundfile==0.13.1 88 | soxr==0.5.0.post1 89 | starlette==0.45.3 90 | sympy==1.13.1 91 | tensorboard==2.18.0 92 | tensorboard-data-server==0.7.2 93 | tensorboardX==2.6.2.2 94 | threadpoolctl==3.5.0 95 | tokenizers==0.20.3 96 | tomlkit==0.13.2 97 | torch==2.5.1 98 | torchaudio==2.5.1 99 | torchvision==0.20.1 100 | tqdm==4.67.1 101 | transformers==4.46.1 102 | triton==3.1.0 103 | typer==0.15.1 104 | typing_extensions==4.12.2 105 | tzdata==2025.1 106 | urllib3==2.3.0 107 | uroman==1.3.1.1 108 | uvicorn==0.34.0 109 | websockets==14.2 110 | Werkzeug==3.1.3 111 | wheel==0.44.0 112 | -------------------------------------------------------------------------------- /src/models/src_llama_omni/arguments.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | from dataclasses import dataclass, field 4 | from typing import Optional 5 | 6 | 7 | @dataclass 8 | class ModelArguments: 9 | model_name_or_path: Optional[str] = field(default="facebook/opt-125m") 10 | version: Optional[str] = field(default="v0") 11 | freeze_backbone: bool = field(default=False) 12 | tune_speech_projector: bool = field(default=False) 13 | tune_speech_encoder: bool = field(default=False) 14 | tune_speech_generator_only: bool = field(default=False) 15 | speech_encoder_type: Optional[str] = field(default=None) 16 | speech_encoder: Optional[str] = field(default=None) 17 | pretrain_speech_projector: Optional[str] = field(default=None) 18 | speech_projector_type: Optional[str] = field(default='linear') 19 | speech_generator_type: Optional[str] = field(default='ctc') 20 | ctc_decoder_config: str = "(2,4096,32,11008)" 21 | ctc_upsample_factor: int = 1 22 | ctc_loss_weight: float = 1.0 23 | unit_vocab_size: int = 1000 24 | speech_encoder_ds_rate: int = 5 25 | speech_encoder_hidden_size: int = 1280 26 | 27 | 28 | @dataclass 29 | class DataArguments: 30 | data_path: str = field(default=None, 31 | metadata={"help": "Path to the training data."}) 32 | is_multimodal: bool = False 33 | input_type: str = field(default="mel") 34 | speech_normalize: bool = False 35 | mel_size: int = 128 36 | has_tgt_units: bool = False 37 | 38 | 39 | @dataclass 40 | class 
TrainingArguments(transformers.TrainingArguments): 41 | cache_dir: Optional[str] = field(default=None) 42 | optim: str = field(default="adamw_torch") 43 | freeze_speech_projector: bool = field(default=False) 44 | model_max_length: int = field( 45 | default=512, 46 | metadata={ 47 | "help": 48 | "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 49 | }, 50 | ) 51 | double_quant: bool = field( 52 | default=True, 53 | metadata={"help": "Compress the quantization statistics through double quantization."} 54 | ) 55 | quant_type: str = field( 56 | default="nf4", 57 | metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} 58 | ) 59 | bits: int = field( 60 | default=16, 61 | metadata={"help": "How many bits to use."} 62 | ) 63 | lora_enable: bool = False 64 | lora_r: int = 64 65 | lora_alpha: int = 16 66 | lora_dropout: float = 0.05 67 | lora_weight_path: str = "" 68 | lora_bias: str = "none" 69 | speech_projector_lr: Optional[float] = None 70 | group_by_modality_length: bool = field(default=False) -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/moshi_modules/rope.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import math 3 | import torch 4 | 5 | 6 | def apply_rope( 7 | q: torch.Tensor, 8 | k: torch.Tensor, 9 | offset: torch.Tensor, 10 | max_period: float = 10_000, 11 | time_before_heads: bool = False, 12 | ): 13 | """ 14 | Args: 15 | q (torch.Tensor): queries, shape `[B, T, H, D]`. 16 | k (torch.Tensor): keys, shape `[B, T, H, D]`. 17 | offset (int): current offset, e.g. when streaming. 18 | max_period (float): maximum period for the cos and sin. 19 | time_before_heads (bool): if True, expected [B, T, H, D], else [B, H, T ,D] 20 | """ 21 | 22 | if time_before_heads: 23 | B, T, H, D = q.shape 24 | else: 25 | B, H, T, D = q.shape 26 | assert k.shape == q.shape 27 | assert D > 0 28 | assert D % 2 == 0 29 | assert max_period > 0 30 | 31 | ds = torch.arange(D // 2, device=q.device, dtype=torch.float32) 32 | freqs = torch.exp(ds * (-math.log(max_period) * 2 / D)) 33 | ts = offset.float() + torch.arange(T, device=q.device, dtype=torch.float32) 34 | if time_before_heads: 35 | ts = ts.view(-1, 1, 1) 36 | else: 37 | ts = ts.view(1, -1, 1) 38 | 39 | dims = q.shape[:-1] 40 | q = q.view(*dims, D // 2, 2) 41 | k = k.view(*dims, D // 2, 2) 42 | 43 | # convention is `r` suffix is real part, `i` is imaginary. 44 | qr = q[..., 0].float() 45 | qi = q[..., 1].float() 46 | 47 | kr = k[..., 0].float() 48 | ki = k[..., 1].float() 49 | 50 | rotr = torch.cos(freqs * ts) 51 | roti = torch.sin(freqs * ts) 52 | qor = qr * rotr - qi * roti 53 | qoi = qr * roti + qi * rotr 54 | 55 | kor = kr * rotr - ki * roti 56 | koi = kr * roti + ki * rotr 57 | 58 | dtype = q.dtype 59 | qo = torch.stack([qor.to(dtype), qoi.to(dtype)], dim=-1) 60 | ko = torch.stack([kor.to(dtype), koi.to(dtype)], dim=-1) 61 | 62 | return qo.view(*dims, D), ko.view(*dims, D) 63 | 64 | 65 | class RotaryEmbedding(nn.Module): 66 | """Rotary positional embedding (RoPE) from [Su et al 2022](https://arxiv.org/abs/2104.09864). 67 | 68 | Args: 69 | max_period (float): Maximum period of the rotation frequencies. 
70 | """ 71 | 72 | def __init__(self, max_period: float = 10000.0): 73 | super().__init__() 74 | self.max_period = max_period 75 | 76 | def forward( 77 | self, 78 | q: torch.Tensor, 79 | k: torch.Tensor, 80 | offset: torch.Tensor, 81 | time_before_heads: bool = False, 82 | ): 83 | """Apply rope rotation to query or key tensor.""" 84 | return apply_rope(q, k, offset, self.max_period, time_before_heads) 85 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/moshi_modules/gating.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | def gating_forward_kernel( 6 | weight_in: torch.Tensor, weight_out: torch.Tensor, activation, x: torch.Tensor 7 | ): 8 | x = F.linear(x, weight_in) 9 | B, T, _ = x.shape 10 | x = x.view(B, T, 2, -1) 11 | x = activation(x[..., 0, :]) * x[..., 1, :] 12 | x = F.linear(x, weight_out) 13 | return x 14 | 15 | 16 | class ActivationGating(nn.Module): 17 | """ 18 | Gating FFN layer, using the given activation. 19 | Args: 20 | dim (int): dimension of the input and output of the transformer. 21 | activation (any callable Tensor to Tensor): activation function to use. 22 | **factory_kwargs: other kwargs passed to the linear layer, in particular device and dtype. 23 | """ 24 | 25 | _fsdp_final = True 26 | 27 | def __init__(self, dim: int, dim_feedforward: int, activation, **factory_kwargs): 28 | super().__init__() 29 | # We should have 8 d^2 param, instead we will have 30 | # 2 * h * d + h * d = 3 h * d = 8 d^2 31 | # so h = 8 d / 3 but following Hervé's advice we use 21 / 8 as an approx. 32 | if dim_feedforward == 4 * dim: 33 | hidden = (21 * dim) // 8 34 | else: 35 | hidden = (2 * dim_feedforward) // 3 36 | self.linear_in = nn.Linear(dim, 2 * hidden, bias=False, **factory_kwargs) 37 | self.linear_out = nn.Linear(hidden, dim, bias=False, **factory_kwargs) 38 | self.activation = activation 39 | 40 | def forward(self, x: torch.Tensor): 41 | return gating_forward_kernel( 42 | self.linear_in.weight, self.linear_out.weight, self.activation, x 43 | ) 44 | 45 | 46 | def _get_activation(name: str): 47 | if name in ["sigmoid", "tanh", "relu"]: 48 | return getattr(torch, name) 49 | elif name in ["leaky_relu", "elu", "gelu", "silu", "mish", "softsign"]: 50 | return getattr(torch.nn.functional, name) 51 | elif name == "identity": 52 | return torch.nn.Identity() 53 | else: 54 | raise ValueError(f"Unknown activation {name}") 55 | 56 | 57 | def _make_gating( 58 | name: str, dim: int, dim_feedforward: int, **factory_kwargs 59 | ) -> nn.Module: 60 | return ActivationGating( 61 | dim, dim_feedforward, _get_activation(name), **factory_kwargs 62 | ) 63 | 64 | 65 | def make_gating( 66 | name: str, dim: int, dim_feedforward: int, **factory_kwargs 67 | ) -> nn.Module: 68 | gating = _make_gating(name, dim, dim_feedforward, **factory_kwargs) 69 | max_params = 2 * dim * dim_feedforward 70 | params = sum(p.numel() for p in gating.parameters()) 71 | assert ( 72 | params <= max_params 73 | ), f"{name} gating has {params} params, max is {max_params}" 74 | return gating 75 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/activation1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 
3 | 4 | import torch 5 | import torch.nn as nn 6 | from ..torch.resample import UpSample1d, DownSample1d 7 | 8 | # load fused CUDA kernel: this enables importing anti_alias_activation_cuda 9 | from . import load 10 | 11 | anti_alias_activation_cuda = load.load() 12 | 13 | 14 | class FusedAntiAliasActivation(torch.autograd.Function): 15 | """ 16 | Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs. 17 | The hyperparameters are hard-coded in the kernel to maximize speed. 18 | NOTE: The fused kenrel is incorrect for Activation1d with different hyperparameters. 19 | """ 20 | 21 | @staticmethod 22 | def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta): 23 | activation_results = anti_alias_activation_cuda.forward( 24 | inputs, up_ftr, down_ftr, alpha, beta 25 | ) 26 | 27 | return activation_results 28 | 29 | @staticmethod 30 | def backward(ctx, output_grads): 31 | raise NotImplementedError 32 | return output_grads, None, None 33 | 34 | 35 | class Activation1d(nn.Module): 36 | def __init__( 37 | self, 38 | activation, 39 | up_ratio: int = 2, 40 | down_ratio: int = 2, 41 | up_kernel_size: int = 12, 42 | down_kernel_size: int = 12, 43 | fused: bool = True, 44 | ): 45 | super().__init__() 46 | self.up_ratio = up_ratio 47 | self.down_ratio = down_ratio 48 | self.act = activation 49 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 50 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 51 | 52 | self.fused = fused # Whether to use fused CUDA kernel or not 53 | 54 | def forward(self, x): 55 | if not self.fused: 56 | x = self.upsample(x) 57 | x = self.act(x) 58 | x = self.downsample(x) 59 | return x 60 | else: 61 | if self.act.__class__.__name__ == "Snake": 62 | beta = self.act.alpha.data # Snake uses same params for alpha and beta 63 | else: 64 | beta = ( 65 | self.act.beta.data 66 | ) # Snakebeta uses different params for alpha and beta 67 | alpha = self.act.alpha.data 68 | if ( 69 | not self.act.alpha_logscale 70 | ): # Exp baked into cuda kernel, cancel it out with a log 71 | alpha = torch.log(alpha) 72 | beta = torch.log(beta) 73 | 74 | x = FusedAntiAliasActivation.apply( 75 | x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta 76 | ) 77 | return x 78 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/load.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import pathlib 6 | import subprocess 7 | 8 | from torch.utils import cpp_extension 9 | 10 | """ 11 | Setting this param to a list has a problem of generating different compilation commands (with diferent order of architectures) and leading to recompilation of fused kernels. 
12 | Set it to empty stringo avoid recompilation and assign arch flags explicity in extra_cuda_cflags below 13 | """ 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(): 18 | # Check if cuda 11 is installed for compute capability 8.0 19 | cc_flag = [] 20 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 21 | if int(bare_metal_major) >= 11: 22 | cc_flag.append("-gencode") 23 | cc_flag.append("arch=compute_80,code=sm_80") 24 | 25 | # Build path 26 | srcpath = pathlib.Path(__file__).parent.absolute() 27 | buildpath = srcpath / "build" 28 | _create_build_dir(buildpath) 29 | 30 | # Helper function to build the kernels. 31 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 32 | return cpp_extension.load( 33 | name=name, 34 | sources=sources, 35 | build_directory=buildpath, 36 | extra_cflags=[ 37 | "-O3", 38 | ], 39 | extra_cuda_cflags=[ 40 | "-O3", 41 | "-gencode", 42 | "arch=compute_70,code=sm_70", 43 | "--use_fast_math", 44 | ] 45 | + extra_cuda_flags 46 | + cc_flag, 47 | verbose=True, 48 | ) 49 | 50 | extra_cuda_flags = [ 51 | "-U__CUDA_NO_HALF_OPERATORS__", 52 | "-U__CUDA_NO_HALF_CONVERSIONS__", 53 | "--expt-relaxed-constexpr", 54 | "--expt-extended-lambda", 55 | ] 56 | 57 | sources = [ 58 | srcpath / "anti_alias_activation.cpp", 59 | srcpath / "anti_alias_activation_cuda.cu", 60 | ] 61 | anti_alias_activation_cuda = _cpp_extention_load_helper( 62 | "anti_alias_activation_cuda", sources, extra_cuda_flags 63 | ) 64 | 65 | return anti_alias_activation_cuda 66 | 67 | 68 | def _get_cuda_bare_metal_version(cuda_dir): 69 | raw_output = subprocess.check_output( 70 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 71 | ) 72 | output = raw_output.split() 73 | release_idx = output.index("release") + 1 74 | release = output[release_idx].split(".") 75 | bare_metal_major = release[0] 76 | bare_metal_minor = release[1][0] 77 | 78 | return raw_output, bare_metal_major, bare_metal_minor 79 | 80 | 81 | def _create_build_dir(buildpath): 82 | try: 83 | os.mkdir(buildpath) 84 | except OSError: 85 | if not os.path.isdir(buildpath): 86 | print(f"Creation of the build directory {buildpath} failed") 87 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
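# These registries map the string names used in CosyVoice configs to the transformer building
# blocks imported below (activations, subsampling layers, positional embeddings, attention).
# A minimal lookup sketch, assuming the keys defined in this file:
#   act_cls = COSYVOICE_ACTIVATION_CLASSES["swish"]   # resolves to torch.nn.SiLU (or the Swish fallback)
#   act = act_cls()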
15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention, 33 | BlockRelPositionMultiHeadedAttention) 34 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 35 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 36 | 37 | 38 | COSYVOICE_ACTIVATION_CLASSES = { 39 | "hardtanh": torch.nn.Hardtanh, 40 | "tanh": torch.nn.Tanh, 41 | "relu": torch.nn.ReLU, 42 | "selu": torch.nn.SELU, 43 | "swish": getattr(torch.nn, "SiLU", Swish), 44 | "gelu": torch.nn.GELU, 45 | } 46 | 47 | COSYVOICE_SUBSAMPLE_CLASSES = { 48 | "linear": LinearNoSubsampling, 49 | "linear_legacy": LegacyLinearNoSubsampling, 50 | "embed": EmbedinigNoSubsampling, 51 | "conv1d2": Conv1dSubsampling2, 52 | "conv2d": Conv2dSubsampling4, 53 | "conv2d6": Conv2dSubsampling6, 54 | "conv2d8": Conv2dSubsampling8, 55 | 'paraformer_dummy': torch.nn.Identity 56 | } 57 | 58 | COSYVOICE_EMB_CLASSES = { 59 | "embed": PositionalEncoding, 60 | "abs_pos": PositionalEncoding, 61 | "rel_pos": RelPositionalEncoding, 62 | "rel_pos_espnet": EspnetRelPositionalEncoding, 63 | "no_pos": NoPositionalEncoding, 64 | "abs_pos_whisper": WhisperPositionalEncoding, 65 | "embed_learnable_pe": LearnablePositionalEncoding, 66 | } 67 | 68 | COSYVOICE_ATTENTION_CLASSES = { 69 | "selfattn": MultiHeadedAttention, 70 | "rel_selfattn": RelPositionMultiHeadedAttention, 71 | "block_rel_selfattn": BlockRelPositionMultiHeadedAttention, 72 | } 73 | -------------------------------------------------------------------------------- /src/models/model_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import functools 3 | import logging 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from accelerate import infer_auto_device_map 6 | from accelerate.utils import get_balanced_memory 7 | from collections import Counter 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | def get_no_split_module_candidates(model): 12 | class_counter = Counter() 13 | 14 | def count_module_classes(module): 15 | for child in module.children(): 16 | class_name = child.__class__.__name__ 17 | class_counter[class_name] += 1 18 | count_module_classes(child) 19 | 20 | count_module_classes(model) 21 | 22 | candidates = {name for name, count in class_counter.items() if count > 1} 23 | return candidates 24 | 25 | def load_model_with_auto_device_map( 26 | model_name: str, 27 | max_memory: dict = None, 28 | no_split_module_classes: list = [], 29 | dtype=torch.float16, 30 | return_tokenizer=False 31 | ): 32 | """ 33 | Automatically infer device_map and load a multi-GPU model. 34 | 35 | Args: 36 | model_name (str): Model name or path. 37 | max_memory (dict, optional): Max memory per GPU, e.g., {0: "20GiB", 1: "20GiB"}. 38 | If None, get balance memory. 39 | no_split_module_classes (list, optional): List of module class names that must not be split. 40 | dtype (torch.dtype, optional): Model precision (default: float16). 
41 | return_tokenizer (bool, optional): Whether to return tokenizer along with model. 42 | 43 | Returns: 44 | model or (model, tokenizer) 45 | """ 46 | # load to CPU first 47 | model = AutoModelForCausalLM.from_pretrained( 48 | model_name, 49 | torch_dtype=dtype, 50 | trust_remote_code=True, 51 | device_map=None 52 | ) 53 | candidates = get_no_split_module_candidates(model) 54 | logger.info(f"Folloing modules can be split: {candidates}") 55 | 56 | illegal = [cls for cls in no_split_module_classes if cls not in candidates] 57 | if illegal: 58 | raise ValueError(f"{illegal} not in allowed no_split_module_classes: {candidates}") 59 | 60 | if max_memory is None: 61 | max_memory = get_balanced_memory(model, dtype=dtype) 62 | # n_gpus = torch.cuda.device_count() 63 | # if n_gpus == 0: 64 | # raise ValueError("No CUDA GPUs detected for max_memory auto-inference.") 65 | # max_memory = {i: "20GiB" for i in range(n_gpus)} 66 | 67 | device_map = infer_auto_device_map( 68 | model, 69 | max_memory=max_memory, 70 | no_split_module_classes=no_split_module_classes 71 | ) 72 | 73 | model = AutoModelForCausalLM.from_pretrained( 74 | model_name, 75 | torch_dtype=dtype, 76 | trust_remote_code=True, 77 | device_map=device_map 78 | ) 79 | 80 | if return_tokenizer: 81 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 82 | return model, tokenizer 83 | 84 | return model -------------------------------------------------------------------------------- /run_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=$PWD:$PYTHONPATH 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | 5 | max_memory=400 6 | save_dir="res/test" 7 | 8 | stage=1 9 | stop_stage=1 10 | eval_bsz=1 11 | 12 | text_qa_tasks="text-llamaqa-en,text-llamaqa-zh,text-triviaqa-en,text-triviaqa-zh,text-webq-en,text-webq-zh,text-chinesesimpleqa-zh" 13 | text_choice_tasks="text-agieval-zh,text-ceval-zh" 14 | text_dialect_tasks="text-sichuanese,text-shanghainese,text-northeastern_mandarin,text-henan_dialect,text-cantonese" 15 | text_chitchat_dialect_tasks="text-chitchat-sichuanese,text-chitchat-shanghainese,text-chitchat-northeastern_mandarin,text-chitchat-henan_dialect,text-chitchat-cantonese" 16 | 17 | text_down_tasks="text-chinese_quiz-zh,text-livelihood_policy-zh" 18 | text_down_dialect_tasks="text-livelihood_policy-sichuanese,text-livelihood_policy-shanghainese,text-livelihood_policy-northeastern_mandarin,text-livelihood_policy-henan_dialect,text-livelihood_policy-cantonese" 19 | 20 | text_emo_tasks="text-emo" 21 | 22 | 23 | declare -A model_tasks 24 | model_tasks=( 25 | ["qwen3-8b-instruct"]="$text_down_tasks,$text_dialect_tasks,$text_down_tasks" 26 | ) 27 | 28 | gpu_list=($(echo $CUDA_VISIBLE_DEVICES | tr ',' ' ')) 29 | gpu_counts=${#gpu_list[@]} 30 | 31 | get_free_gpu() { 32 | while true; do 33 | for gpu in "${gpu_list[@]}"; do 34 | used_mem=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "NR==$((gpu+1))") 35 | if [[ "$used_mem" -lt "$max_memory" ]]; then 36 | echo "$gpu" 37 | return 38 | fi 39 | done 40 | sleep 30 41 | done 42 | } 43 | 44 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 45 | for model in "${!model_tasks[@]}"; do 46 | IFS=',' read -r -a values <<< "${model_tasks[$model]}" 47 | for task in "${values[@]}"; do 48 | gpu=$(get_free_gpu) 49 | echo "***********************************************" 50 | echo "processing model: $model using task: $task on GPU: $gpu" 51 | echo "***********************************************" 52 | 
CUDA_VISIBLE_DEVICES=$gpu python main.py \ 53 | --mode "infer" \ 54 | --save_dir $save_dir \ 55 | --model $model \ 56 | --task $task & 57 | sleep 40 # Increase sleep time appropriately according to the speed of loading the model 58 | done 59 | done 60 | wait 61 | fi 62 | 63 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 64 | for model in "${!model_tasks[@]}"; do 65 | IFS=',' read -r -a values <<< "${model_tasks[$model]}" 66 | # read -a values <<< "${model_tasks[$model]}" 67 | for task in "${values[@]}"; do 68 | python main.py \ 69 | --mode "eval" \ 70 | --save_dir $save_dir \ 71 | --model $model \ 72 | --bsz $eval_bsz \ 73 | --task $task 74 | done 75 | done 76 | wait 77 | python tools/save_csv.py --root_dir $save_dir 78 | fi -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/flow_matching/scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from abc import abstractmethod, ABC 3 | 4 | try: 5 | from torchdyn.core import NeuralODE 6 | 7 | NEURALODE_INSTALLED = True 8 | except ImportError: 9 | NEURALODE_INSTALLED = False 10 | 11 | 12 | class SchedulerBase(ABC): 13 | def __init__(self) -> None: 14 | pass 15 | 16 | @abstractmethod 17 | def set_timesteps(self): 18 | pass 19 | 20 | @abstractmethod 21 | def step(self): 22 | pass 23 | 24 | @abstractmethod 25 | def add_noise(self): 26 | pass 27 | 28 | 29 | class StreamingFlowMatchingScheduler(SchedulerBase): 30 | def __init__( 31 | self, 32 | timesteps=1000, 33 | sigma_min=1e-4, 34 | ) -> None: 35 | super().__init__() 36 | 37 | self.sigma_min = sigma_min 38 | self.timesteps = timesteps 39 | self.t_min = 0 40 | self.t_max = 1 - self.sigma_min 41 | 42 | self.neural_ode = None 43 | 44 | def set_timesteps(self, timesteps=15): 45 | self.timesteps = timesteps 46 | 47 | def step(self, xt, predicted_v): 48 | 49 | h = (self.t_max - self.t_min) / self.timesteps 50 | h = h * torch.ones(xt.shape[0], dtype=xt.dtype, device=xt.device) 51 | 52 | xt = xt + h * predicted_v 53 | return xt 54 | 55 | def sample(self, ode_wrapper, time_steps, xt, verbose=False, x0=None): 56 | h = (self.t_max - self.t_min) / self.timesteps 57 | h = h * torch.ones(xt.shape[0], dtype=xt.dtype, device=xt.device) 58 | 59 | if verbose: 60 | gt_v = x0 - xt 61 | 62 | for t in time_steps: 63 | predicted_v = ode_wrapper(t, xt) 64 | if verbose: 65 | dist = torch.mean(torch.nn.functional.l1_loss(gt_v, predicted_v)) 66 | print("Time: {}, Distance: {}".format(t, dist)) 67 | xt = xt + h * predicted_v 68 | return xt 69 | 70 | def sample_by_neuralode(self, ode_wrapper, time_steps, xt, verbose=False, x0=None): 71 | if not NEURALODE_INSTALLED: 72 | raise ImportError("NeuralODE is not installed, please install it first.") 73 | 74 | if self.neural_ode is None: 75 | self.neural_ode = NeuralODE( 76 | ode_wrapper, 77 | solver="euler", 78 | sensitivity="adjoint", 79 | atol=self.sigma_min, 80 | rtol=self.sigma_min, 81 | ) 82 | 83 | eval_points, traj = self.neural_ode(xt, time_steps) 84 | return traj[-1] 85 | 86 | def add_noise( 87 | self, 88 | original_samples: torch.FloatTensor, 89 | noise: torch.FloatTensor, 90 | timesteps: torch.IntTensor, 91 | ): 92 | ut = original_samples - (1 - self.sigma_min) * noise # 和ut的梯度没关系 93 | t_unsqueeze = timesteps.unsqueeze(1).unsqueeze(1).float() / self.timesteps 94 | x_noisy = ( 95 | t_unsqueeze * original_samples 96 | + (1.0 - (1 - self.sigma_min) * t_unsqueeze) * noise 97 | ) 98 | return x_noisy, ut 99 | 
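# Minimal usage sketch, assuming `ode_wrapper` is any callable (t, xt) -> predicted_v supplied by
# the caller; the zero-velocity stand-in below only demonstrates the Euler loop in `sample`, and
# the shapes are illustrative.
if __name__ == "__main__":
    scheduler = StreamingFlowMatchingScheduler()
    scheduler.set_timesteps(15)
    xt = torch.randn(1, 80, 100)  # illustrative (batch, num_mels, frames) noise to start from
    time_steps = torch.linspace(scheduler.t_min, scheduler.t_max, scheduler.timesteps)

    def placeholder_velocity(t, x):
        # Stands in for the flow-matching model that predicts the velocity field.
        return torch.zeros_like(x)

    x0_hat = scheduler.sample(placeholder_velocity, time_steps, xt)
    print(x0_hat.shape)  # torch.Size([1, 80, 100])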
-------------------------------------------------------------------------------- /registry/dataset/dialect.yaml: -------------------------------------------------------------------------------- 1 | 2 | # -----------------chinese quiz dialect------------------ 3 | chinese_quiz-sichuanese: 4 | class: src.dataset.BatchLoader 5 | args: 6 | file: Tele-AI/TELEVAL/chinese_quiz-sichuanese 7 | ref_col: answer 8 | query_col: query 9 | 10 | chinese_quiz-shanghainese: 11 | class: src.dataset.BatchLoader 12 | args: 13 | file: Tele-AI/TELEVAL/chinese_quiz-shanghainese 14 | ref_col: answer 15 | query_col: query 16 | 17 | chinese_quiz-northeastern_mandarin: 18 | class: src.dataset.BatchLoader 19 | args: 20 | file: Tele-AI/TELEVAL/chinese_quiz-northeastern_mandarin 21 | ref_col: answer 22 | query_col: query 23 | 24 | chinese_quiz-henan_dialect: 25 | class: src.dataset.BatchLoader 26 | args: 27 | file: Tele-AI/TELEVAL/chinese_quiz-henan_dialect 28 | ref_col: answer 29 | query_col: query 30 | 31 | chinese_quiz-cantonese: 32 | class: src.dataset.BatchLoader 33 | args: 34 | file: Tele-AI/TELEVAL/chinese_quiz-cantonese 35 | ref_col: answer 36 | query_col: query 37 | 38 | # -----------------livelihood policy dialect------------------ 39 | livelihood_policy-sichuanese: 40 | class: src.dataset.BatchLoader 41 | args: 42 | file: Tele-AI/TELEVAL/livelihood_policy-sichuanese 43 | ref_col: answer 44 | query_col: query 45 | 46 | livelihood_policy-shanghainese: 47 | class: src.dataset.BatchLoader 48 | args: 49 | file: Tele-AI/TELEVAL/livelihood_policy-shanghainese 50 | ref_col: answer 51 | query_col: query 52 | 53 | livelihood_policy-northeastern_mandarin: 54 | class: src.dataset.BatchLoader 55 | args: 56 | file: Tele-AI/TELEVAL/livelihood_policy-northeastern_mandarin 57 | ref_col: answer 58 | query_col: query 59 | 60 | livelihood_policy-henan_dialect: 61 | class: src.dataset.BatchLoader 62 | args: 63 | file: Tele-AI/TELEVAL/livelihood_policy-henan_dialect 64 | ref_col: answer 65 | query_col: query 66 | 67 | livelihood_policy-cantonese: 68 | class: src.dataset.BatchLoader 69 | args: 70 | file: Tele-AI/TELEVAL/livelihood_policy-cantonese 71 | ref_col: answer 72 | query_col: query 73 | 74 | # -----------------chitchat dialect------------------ 75 | chitchat-sichuanese: 76 | class: src.dataset.BatchLoader 77 | args: 78 | file: Tele-AI/TELEVAL/chitchat-sichuanese 79 | ref_col: dialect 80 | query_col: query 81 | 82 | chitchat-shanghainese: 83 | class: src.dataset.BatchLoader 84 | args: 85 | file: Tele-AI/TELEVAL/chitchat-shanghainese 86 | ref_col: dialect 87 | query_col: query 88 | 89 | chitchat-northeastern_mandarin: 90 | class: src.dataset.BatchLoader 91 | args: 92 | file: Tele-AI/TELEVAL/chitchat-northeastern_mandarin 93 | ref_col: dialect 94 | query_col: query 95 | 96 | chitchat-henan_dialect: 97 | class: src.dataset.BatchLoader 98 | args: 99 | file: Tele-AI/TELEVAL/chitchat-henan_dialect 100 | ref_col: dialect 101 | query_col: query 102 | 103 | chitchat-cantonese: 104 | class: src.dataset.BatchLoader 105 | args: 106 | file: Tele-AI/TELEVAL/chitchat-cantonese 107 | ref_col: dialect 108 | query_col: query 109 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang 
Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /src/models/src_glm4/audio_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import librosa 3 | import soundfile as sf 4 | import numpy as np 5 | from pathlib import Path 6 | import io 7 | 8 | # Split audio stream at silence points to prevent playback stuttering issues 9 | # caused by AAC encoder frame padding when streaming audio through Gradio audio components. 
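# Minimal usage sketch (illustrative): feed streaming chunks in and emit a WAV segment whenever
# a long-enough silence is found, e.g.
#   processor = AudioStreamProcessor(sr=22050)
#   for i, chunk in enumerate(chunks):  # `chunks`: float32 numpy arrays at the processor's sample rate
#       wav_bytes = processor.process(chunk, last=(i == len(chunks) - 1))
#       if wav_bytes is not None:
#           sink(wav_bytes)  # hypothetical consumer, e.g. a Gradio streaming audio component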
10 | class AudioStreamProcessor: 11 | def __init__(self, sr=22050, min_silence_duration=0.1, threshold_db=-40): 12 | self.sr = sr 13 | self.min_silence_duration = min_silence_duration 14 | self.threshold_db = threshold_db 15 | self.buffer = np.array([]) 16 | 17 | 18 | def process(self, audio_data, last=False): 19 | """ 20 | Add audio data and process it 21 | params: 22 | audio_data: audio data in numpy array 23 | last: whether this is the last chunk of data 24 | returns: 25 | Processed audio data, returns None if no split point is found 26 | """ 27 | 28 | # Add new data to buffer 29 | self.buffer = np.concatenate([self.buffer, audio_data]) if len(self.buffer) > 0 else audio_data 30 | 31 | if last: 32 | result = self.buffer 33 | self.buffer = np.array([]) 34 | return self._to_wav_bytes(result) 35 | 36 | # Find silence boundary 37 | split_point = self._find_silence_boundary(self.buffer) 38 | 39 | if split_point is not None: 40 | # Modified: Extend split point to the end of silence 41 | silence_end = self._find_silence_end(split_point) 42 | result = self.buffer[:silence_end] 43 | self.buffer = self.buffer[silence_end:] 44 | return self._to_wav_bytes(result) 45 | 46 | return None 47 | 48 | def _find_silence_boundary(self, audio): 49 | """ 50 | Find the starting point of silence boundary in audio 51 | """ 52 | # Convert audio to decibels 53 | db = librosa.amplitude_to_db(np.abs(audio), ref=np.max) 54 | 55 | # Find points below threshold 56 | silence_points = np.where(db < self.threshold_db)[0] 57 | 58 | if len(silence_points) == 0: 59 | return None 60 | 61 | # Calculate minimum silence samples 62 | min_silence_samples = int(self.min_silence_duration * self.sr) 63 | 64 | # Search backwards for continuous silence segment starting point 65 | for i in range(len(silence_points) - min_silence_samples, -1, -1): 66 | if i < 0: 67 | break 68 | if np.all(np.diff(silence_points[i:i+min_silence_samples]) == 1): 69 | return silence_points[i] 70 | 71 | return None 72 | 73 | def _find_silence_end(self, start_point): 74 | """ 75 | Find the end point of silence segment 76 | """ 77 | db = librosa.amplitude_to_db(np.abs(self.buffer[start_point:]), ref=np.max) 78 | silence_points = np.where(db >= self.threshold_db)[0] 79 | 80 | if len(silence_points) == 0: 81 | return len(self.buffer) 82 | 83 | return start_point + silence_points[0] 84 | 85 | def _to_wav_bytes(self, audio_data): 86 | """ 87 | trans_to_wav_bytes 88 | """ 89 | wav_buffer = io.BytesIO() 90 | sf.write(wav_buffer, audio_data, self.sr, format='WAV') 91 | return wav_buffer.getvalue() 92 | 93 | 94 | -------------------------------------------------------------------------------- /src/evaluator/dialect.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import kaldifeat 3 | import logging 4 | import numpy as np 5 | import onnxruntime as ort 6 | from scipy.special import softmax 7 | 8 | from src.evaluator.base import Evaluator 9 | from src.utils import parallel_batch, preprocess_audio 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class DialectSession: 14 | """ 15 | from https://github.com/Tele-AI/TeleSpeech-DialectIdentify 16 | """ 17 | def __init__(self, onnx_file: str, device: str = "cpu"): 18 | self.session = self._init_session(onnx_file, device) 19 | self.mfcc_extractor = self._init_mfcc_extractor() 20 | self.sr = 16000 21 | self.DIALECT_TOKENS = { 22 | 0: "ct", 1: "kej", 2: "mand", 3: "min", 4: "wuy", 5: "zha", 6: "zhc", 23 | 7: "zhd", 8: "zhg", 9: "zhj", 10: "zhs", 11: "zhu", 
12: "zhw", 13: "zhx" 24 | } 25 | logger.info(f"Loading dialect classify model: {onnx_file} Successfully") 26 | 27 | def _init_session(self, onnx_file, device): 28 | sess_options = ort.SessionOptions() 29 | sess_options.intra_op_num_threads = 1 30 | sess_options.inter_op_num_threads = 1 31 | sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL 32 | sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL 33 | sess_options.add_session_config_entry("session.intra_op.allow_spinning", "1") 34 | provider = "CPUExecutionProvider" if device == "cpu" else "CUDAExecutionProvider" 35 | return ort.InferenceSession( 36 | onnx_file, 37 | providers=[provider], 38 | sess_options=sess_options, 39 | ) 40 | 41 | def _init_mfcc_extractor(self, sr: int = 16000): 42 | opts = kaldifeat.MfccOptions() 43 | opts.device = "cpu" 44 | opts.frame_opts.dither = 0 45 | opts.frame_opts.snip_edges = False 46 | opts.frame_opts.samp_freq = sr 47 | opts.use_energy = False 48 | opts.mel_opts.num_bins = 40 49 | opts.mel_opts.low_freq = 40 50 | opts.mel_opts.high_freq = -200 51 | opts.num_ceps = 40 52 | return kaldifeat.Mfcc(opts) 53 | 54 | def classify(self, wav_file: str) -> str: 55 | wav = preprocess_audio(wav_file, target_sr=self.sr) 56 | wav = wav * (1 << 15) 57 | feats = self.mfcc_extractor(wav.squeeze()) 58 | out = self.session.run( 59 | input_feed={"feats": feats.unsqueeze(0).numpy()}, 60 | output_names=["labels"] 61 | )[0] 62 | pred = np.argmax(softmax(out, axis=1)) 63 | return self.DIALECT_TOKENS[int(pred)] 64 | 65 | class DialectClassify(Evaluator): 66 | def __init__(self, model: str, max_workers=None): 67 | if max_workers is not None: 68 | self.max_workers = max_workers 69 | self.onnx_sess = DialectSession(model) 70 | self.dialect_mapping = { 71 | "ct": "粤语", "zhs": "河南话", "zhc": "四川话", 72 | "zhd": "东北话", "wuy": "上海话", "mand": "普通话" 73 | } 74 | 75 | @parallel_batch(default_workers=4) 76 | def evaluate(self, pred: str, ref: str, pred_info: Dict, **kwargs): 77 | pred_audio = pred_info["pred_audio"] 78 | res = self.onnx_sess.classify(pred_audio) 79 | mapped_dialect = self.dialect_mapping.get(res, None) 80 | logger.info(f"key: {pred_info['key']} recognition dialect: {mapped_dialect}") 81 | 82 | score = int(mapped_dialect == ref) if mapped_dialect else 0 83 | return {"key": pred_info["key"], "score": score} -------------------------------------------------------------------------------- /src/models/qwen.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, Any 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from src.models.base import Model 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | class Qwen2Instruct(Model): 9 | def __init__(self, path: str, sample_params: Dict[str, Any] = None): 10 | super().__init__(sample_params) 11 | logger.info("start load model from {}".format(path)) 12 | self.model = AutoModelForCausalLM.from_pretrained( 13 | path, 14 | torch_dtype="auto", 15 | device_map="auto", 16 | ).eval() 17 | logger.info("successfully load model from {}".format(path)) 18 | 19 | self.tokenizer = AutoTokenizer.from_pretrained(path) 20 | config = { 21 | "greedy": { 22 | "do_sample": False, 23 | "max_new_tokens": 1024, 24 | "top_k": None, 25 | "num_beams": 1, 26 | "temperature": None, 27 | "top_p": None 28 | } 29 | } 30 | self.generation_config = config.get(self.sample_params.get("gen_type", "greedy"), None) 31 | logger.info("generation_config: {}".format(self.generation_config)) 32 | 
self.system_prompt_qwen2 = "You are a helpful assistant." 33 | self.system_prompt_qwen2d5 = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." 34 | 35 | def generate_once(self, audio, **kwargs): 36 | system_prompt = self.system_prompt_qwen2d5 37 | content = kwargs.get("instruct", "") + kwargs["query"] 38 | 39 | messages = [ 40 | {"role": "system", "content": system_prompt}, 41 | {"role": "user", "content": content} 42 | ] 43 | 44 | text = self.tokenizer.apply_chat_template( 45 | messages, 46 | tokenize=False, 47 | add_generation_prompt=True 48 | ) 49 | model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) 50 | 51 | generated_ids = self.model.generate( 52 | **model_inputs, 53 | **self.generation_config 54 | ) 55 | generated_ids = [ 56 | output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 57 | ] 58 | response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 59 | return {"pred": response} 60 | 61 | class Qwen3Instruct(Qwen2Instruct): 62 | def __init__(self, path: str, sample_params: Dict[str, Any] = None): 63 | # transformers>=4.51.0 64 | super().__init__(path, sample_params) 65 | 66 | def generate_once(self, audio, **kwargs): 67 | content = kwargs.get("instruct", "") + kwargs["query"] 68 | 69 | messages = [ 70 | {"role": "user", "content": content} 71 | ] 72 | 73 | text = self.tokenizer.apply_chat_template( 74 | messages, 75 | tokenize=False, 76 | add_generation_prompt=True, 77 | enable_thinking=False # Switches between thinking and non-thinking modes. Default is True. 78 | ) 79 | model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) 80 | 81 | generated_ids = self.model.generate( 82 | **model_inputs, 83 | **self.generation_config 84 | ) 85 | output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 86 | response = self.tokenizer.decode(output_ids, skip_special_tokens=True) 87 | return {"pred": response} -------------------------------------------------------------------------------- /src/summarizer/summarizer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, List, Union, Any 3 | from collections import Counter 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | class Summarizer: 8 | def __init__(self, rescale="base", power=2): 9 | rescale_map = { 10 | "base": lambda x: x, 11 | "linear": self.linear_rescale, 12 | "power": lambda x: self.power_rescale(x, power=power) 13 | } 14 | logger.info(f"Using rescale type: {rescale}") 15 | self.rescale_func = rescale_map[rescale] 16 | 17 | def _check_scores(self, scores: List[Any]): 18 | if any(s is None for s in scores if not isinstance(s, dict)): 19 | raise ValueError("Scores list contains None values, need re-run evaluator.") 20 | 21 | @staticmethod 22 | def linear_rescale(score): 23 | return score * 20 24 | 25 | @staticmethod 26 | def power_rescale(score, power): 27 | return ((score / 5) ** power) * 100 28 | 29 | # @staticmethod 30 | # def sigmoid_rescale(score, scale=10): 31 | # x = (score - 2.5) # move to 3 32 | # return (1 / (1 + np.exp(-scale * x / 5))) * 100 33 | 34 | def statistic(self, scores: List[Any], **kwargs) -> Dict[str, Any]: 35 | raise NotImplementedError 36 | 37 | class AvgInfo(Summarizer): 38 | def statistic(self, scores: List[Union[float, Dict[str, float]]], **kwargs): 39 | if isinstance(scores[0], dict): 40 | keys = scores[0].keys() 41 | result = {} 42 | for key in keys: 43 | values = 
[float(s[key]) for s in scores if key in s] 44 | avg = sum(values) / len(values) * 100 45 | result[key] = "{}: {:.2f}%".format(key, avg) 46 | return result 47 | 48 | # common 49 | avg = sum(map(float, scores)) / len(scores) * 100 50 | return {"score": "AVG: {:.2f}%".format(avg)} 51 | 52 | class AvgThreshold(Summarizer): 53 | def __init__(self, rescale, threshold=60, power=2): 54 | super().__init__(rescale, power) 55 | self.threshold = threshold 56 | 57 | def statistic(self, scores: List[float], **kwargs): 58 | self._check_scores(scores) 59 | scores = list(map(lambda x: self.rescale_func(float(x)), scores)) 60 | score_count = Counter(scores) 61 | 62 | avg = sum(scores) / len(scores) 63 | above_threshold = sum(count for score, count in score_count.items() if score > self.threshold) 64 | return {"score": "AVG: {:.2f}".format(avg), "above_threshold": "above{}: {}".format(self.threshold, above_threshold)} 65 | 66 | class AvgMOS(Summarizer): 67 | def statistic(self, scores: List[float], **kwargs): 68 | avg = sum(map(float, scores)) / len(scores) 69 | return {"score": "DNSMOS: {:.2f}".format(avg)} 70 | 71 | class AvgWER(Summarizer): 72 | def statistic(self, scores: List[Dict], **kwargs): 73 | """ 74 | score = {"ref_len": ref_len, "subs": subs, "dele": dele, "inse": inse, "wer": wer} 75 | """ 76 | total_ref_len = 0 77 | total_subs = 0.0 78 | total_dele = 0.0 79 | total_inse = 0.0 80 | 81 | for score in scores: 82 | total_ref_len += score.get("ref_len", 0) 83 | total_subs += score.get("subs", 0.0) 84 | total_dele += score.get("dele", 0.0) 85 | total_inse += score.get("inse", 0.0) 86 | 87 | if total_ref_len == 0: 88 | raise ValueError("Not enough ref_len to static") 89 | 90 | avg_wer = (total_subs + total_dele + total_inse) / total_ref_len * 100 91 | return {"score": "WER: {:.2f}%".format(avg_wer)} -------------------------------------------------------------------------------- /src/models/src_freezeomni/encoder/cmvn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import math 4 | 5 | import numpy as np 6 | 7 | class GlobalCMVN(torch.nn.Module): 8 | def __init__(self, 9 | mean: torch.Tensor, 10 | istd: torch.Tensor, 11 | norm_var: bool = True): 12 | """ 13 | Args: 14 | mean (torch.Tensor): mean stats 15 | istd (torch.Tensor): inverse std, std which is 1.0 / std 16 | """ 17 | super().__init__() 18 | assert mean.shape == istd.shape 19 | self.norm_var = norm_var 20 | # The buffer can be accessed from this module using self.mean 21 | self.register_buffer("mean", mean) 22 | self.register_buffer("istd", istd) 23 | 24 | def forward(self, x: torch.Tensor): 25 | """ 26 | Args: 27 | x (torch.Tensor): (batch, max_len, feat_dim) 28 | 29 | Returns: 30 | (torch.Tensor): normalized feature 31 | """ 32 | x = x - self.mean 33 | if self.norm_var: 34 | x = x * self.istd 35 | return x 36 | 37 | def _load_json_cmvn(json_cmvn_file): 38 | """ Load the json format cmvn stats file and calculate cmvn 39 | 40 | Args: 41 | json_cmvn_file: cmvn stats file in json format 42 | 43 | Returns: 44 | a numpy array of [means, vars] 45 | """ 46 | with open(json_cmvn_file) as f: 47 | cmvn_stats = json.load(f) 48 | 49 | means = cmvn_stats['mean_stat'] 50 | variance = cmvn_stats['var_stat'] 51 | count = cmvn_stats['frame_num'] 52 | for i in range(len(means)): 53 | means[i] /= count 54 | variance[i] = variance[i] / count - means[i] * means[i] 55 | if variance[i] < 1.0e-20: 56 | variance[i] = 1.0e-20 57 | variance[i] = 1.0 / math.sqrt(variance[i]) 58 | cmvn = 
np.array([means, variance]) 59 | return cmvn 60 | 61 | def _load_kaldi_cmvn(kaldi_cmvn_file): 62 | """ Load the kaldi format cmvn stats file and calculate cmvn 63 | 64 | Args: 65 | kaldi_cmvn_file: kaldi text style global cmvn file, which 66 | is generated by: 67 | compute-cmvn-stats --binary=false scp:feats.scp global_cmvn 68 | 69 | Returns: 70 | a numpy array of [means, vars] 71 | """ 72 | means = [] 73 | variance = [] 74 | with open(kaldi_cmvn_file, 'r') as fid: 75 | # kaldi binary file start with '\0B' 76 | if fid.read(2) == '\0B': 77 | print('kaldi cmvn binary file is not supported, please ' 78 | 'recompute it by: compute-cmvn-stats --binary=false ' 79 | ' scp:feats.scp global_cmvn') 80 | sys.exit(1) 81 | fid.seek(0) 82 | arr = fid.read().split() 83 | assert (arr[0] == '[') 84 | assert (arr[-2] == '0') 85 | assert (arr[-1] == ']') 86 | feat_dim = int((len(arr) - 2 - 2) / 2) 87 | for i in range(1, feat_dim + 1): 88 | means.append(float(arr[i])) 89 | count = float(arr[feat_dim + 1]) 90 | for i in range(feat_dim + 2, 2 * feat_dim + 2): 91 | variance.append(float(arr[i])) 92 | 93 | for i in range(len(means)): 94 | means[i] /= count 95 | variance[i] = variance[i] / count - means[i] * means[i] 96 | if variance[i] < 1.0e-20: 97 | variance[i] = 1.0e-20 98 | variance[i] = 1.0 / math.sqrt(variance[i]) 99 | cmvn = np.array([means, variance]) 100 | return cmvn 101 | 102 | def load_cmvn(cmvn_file, is_json): 103 | if is_json: 104 | cmvn = _load_json_cmvn(cmvn_file) 105 | else: 106 | cmvn = _load_kaldi_cmvn(cmvn_file) 107 | return cmvn[0], cmvn[1] 108 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/bigvgan_wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | 5 | import librosa 6 | import torch 7 | 8 | from .vocoder.bigvgan import BigVGAN 9 | from .vocoder.utils import get_melspec, AttrDict, load_checkpoint 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class BigVGANWrapper: 15 | def __init__( 16 | self, vocoder: BigVGAN, device: torch.device, h: AttrDict, dtype=None 17 | ) -> None: 18 | self.vocoder = vocoder.to(device) 19 | if dtype is not None: 20 | self.vocoder = self.vocoder.to(dtype) 21 | self.vocoder = self.vocoder.eval() 22 | self.device = device 23 | self.h = h 24 | 25 | def to_dtype(self, dtype): 26 | self.vocoder = self.vocoder.to(dtype) 27 | 28 | def extract_mel_from_wav(self, wav_path=None, wav_data=None): 29 | """ 30 | params: 31 | wav_path: str, path of the wav, should be 24k 32 | wav_data: torch.tensor or numpy array, shape [T], wav data, should be 24k 33 | return: 34 | mel: [T, num_mels], torch.tensor 35 | """ 36 | if wav_data is None: 37 | wav_data, _ = librosa.load(wav_path, sr=self.h["sampling_rate"]) 38 | 39 | wav_data = torch.tensor(wav_data).unsqueeze(0) 40 | 41 | mel = get_melspec( 42 | y=wav_data, 43 | n_fft=self.h["n_fft"], 44 | num_mels=self.h["num_mels"], 45 | sampling_rate=self.h["sampling_rate"], 46 | hop_size=self.h["hop_size"], 47 | win_size=self.h["win_size"], 48 | fmin=self.h["fmin"], 49 | fmax=self.h["fmax"], 50 | ) 51 | return mel.squeeze(0).transpose(0, 1) 52 | 53 | @torch.inference_mode() 54 | def extract_mel_from_wav_batch(self, wav_data): 55 | """ 56 | params: 57 | wav_data: torch.tensor or numpy array, shape [Batch, T], wav data, should be 24k 58 | return: 59 | mel: [Batch, T, num_mels], torch.tensor 60 | """ 61 | 62 | wav_data = torch.tensor(wav_data) 63 | 64 | mel = 
get_melspec(
65 | y=wav_data,  # get_melspec takes the waveform as its `y` argument
66 | n_fft=self.h["n_fft"],
67 | num_mels=self.h["num_mels"],
68 | sampling_rate=self.h["sampling_rate"],
69 | hop_size=self.h["hop_size"],
70 | win_size=self.h["win_size"],
71 | fmin=self.h["fmin"],
72 | fmax=self.h["fmax"],
73 | )
74 | return mel.transpose(1, 2)
75 |
76 | def decode_mel(self, mel):
77 | """
78 | params:
79 | mel: [T, num_mels], torch.tensor
80 | return:
81 | wav: [1, T], torch.tensor
82 | """
83 | mel = mel.transpose(0, 1).unsqueeze(0).to(self.device)
84 | wav = self.vocoder(mel)
85 | return wav.squeeze(0)
86 |
87 | def decode_mel_batch(self, mel):
88 | """
89 | params:
90 | mel: [B, T, num_mels], torch.tensor
91 | return:
92 | wav: [B, 1, T], torch.tensor
93 | """
94 | mel = mel.transpose(1, 2).to(self.device)
95 | wav = self.vocoder(mel)
96 | return wav
97 |
98 | @classmethod
99 | def from_pretrained(cls, model_config, ckpt_path, device):
100 | with open(model_config) as f:
101 | data = f.read()
102 | json_config = json.loads(data)
103 | h = AttrDict(json_config)
104 | vocoder = BigVGAN(h, True)
105 | state_dict_g = load_checkpoint(ckpt_path, "cpu")
106 | vocoder.load_state_dict(state_dict_g["generator"])
107 |
108 | logger.info(">>> Load vocoder from {}".format(ckpt_path))
109 | return cls(vocoder, device, h)
110 |
-------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/transformer/label_smoothing_loss.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2019 Shigeki Karita
2 | # 2020 Mobvoi Inc (Binbin Zhang)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Label smoothing module."""
16 |
17 | import torch
18 | from torch import nn
19 |
20 |
21 | class LabelSmoothingLoss(nn.Module):
22 | """Label-smoothing loss.
23 |
24 | In a standard CE loss, the label's data distribution is:
25 | [0,1,2] ->
26 | [
27 | [1.0, 0.0, 0.0],
28 | [0.0, 1.0, 0.0],
29 | [0.0, 0.0, 1.0],
30 | ]
31 |
32 | In the smoothed version of the CE loss, some probability mass
33 | is taken from the true label prob (1.0) and divided
34 | among the other labels.
35 |
36 | e.g.
37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Unility functions for Transformer.""" 17 | 18 | from typing import List 19 | 20 | import torch 21 | 22 | IGNORE_ID = -1 23 | 24 | 25 | def pad_list(xs: List[torch.Tensor], pad_value: int): 26 | """Perform padding for the list of tensors. 27 | 28 | Args: 29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 30 | pad_value (float): Value for padding. 
31 | 32 | Returns: 33 | Tensor: Padded tensor (B, Tmax, `*`). 34 | 35 | Examples: 36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 37 | >>> x 38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 39 | >>> pad_list(x, 0) 40 | tensor([[1., 1., 1., 1.], 41 | [1., 1., 0., 0.], 42 | [1., 0., 0., 0.]]) 43 | 44 | """ 45 | max_len = max([len(item) for item in xs]) 46 | batchs = len(xs) 47 | ndim = xs[0].ndim 48 | if ndim == 1: 49 | pad_res = torch.zeros(batchs, 50 | max_len, 51 | dtype=xs[0].dtype, 52 | device=xs[0].device) 53 | elif ndim == 2: 54 | pad_res = torch.zeros(batchs, 55 | max_len, 56 | xs[0].shape[1], 57 | dtype=xs[0].dtype, 58 | device=xs[0].device) 59 | elif ndim == 3: 60 | pad_res = torch.zeros(batchs, 61 | max_len, 62 | xs[0].shape[1], 63 | xs[0].shape[2], 64 | dtype=xs[0].dtype, 65 | device=xs[0].device) 66 | else: 67 | raise ValueError(f"Unsupported ndim: {ndim}") 68 | pad_res.fill_(pad_value) 69 | for i in range(batchs): 70 | pad_res[i, :len(xs[i])] = xs[i] 71 | return pad_res 72 | 73 | 74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 75 | ignore_label: int) -> torch.Tensor: 76 | """Calculate accuracy. 77 | 78 | Args: 79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 80 | pad_targets (LongTensor): Target label tensors (B, Lmax). 81 | ignore_label (int): Ignore label id. 82 | 83 | Returns: 84 | torch.Tensor: Accuracy value (0.0 - 1.0). 85 | 86 | """ 87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 88 | pad_outputs.size(1)).argmax(2) 89 | mask = pad_targets != ignore_label 90 | numerator = torch.sum( 91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 92 | denominator = torch.sum(mask) 93 | return (numerator / denominator).detach() 94 | 95 | 96 | def get_padding(kernel_size, dilation=1): 97 | return int((kernel_size * dilation - dilation) / 2) 98 | 99 | 100 | def init_weights(m, mean=0.0, std=0.01): 101 | classname = m.__class__.__name__ 102 | if classname.find("Conv") != -1: 103 | m.weight.data.normal_(mean, std) 104 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/utils.py: -------------------------------------------------------------------------------- 1 | from librosa.filters import mel as librosa_mel_fn 2 | import torch 3 | import os 4 | 5 | mel_basis_cache = {} 6 | hann_window_cache = {} 7 | 8 | 9 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 10 | return torch.log(torch.clamp(x, min=clip_val) * C) 11 | 12 | 13 | def spectral_normalize_torch(magnitudes): 14 | return dynamic_range_compression_torch(magnitudes) 15 | 16 | 17 | def get_melspec( 18 | y: torch.Tensor, 19 | n_fft: int, 20 | num_mels: int, 21 | sampling_rate: int, 22 | hop_size: int, 23 | win_size: int, 24 | fmin: int, 25 | fmax: int = None, 26 | center: bool = False, 27 | ) -> torch.Tensor: 28 | """ 29 | Calculate the mel spectrogram of an input signal. 30 | This function uses slaney norm for the librosa mel filterbank (using librosa.filters.mel) and uses Hann window for STFT (using torch.stft). 31 | 32 | Args: 33 | y (torch.Tensor): Input signal. 34 | n_fft (int): FFT size. 35 | num_mels (int): Number of mel bins. 36 | sampling_rate (int): Sampling rate of the input signal. 37 | hop_size (int): Hop size for STFT. 38 | win_size (int): Window size for STFT. 39 | fmin (int): Minimum frequency for mel filterbank. 40 | fmax (int): Maximum frequency for mel filterbank. 
If None, defaults to half the sampling rate (fmax = sr / 2.0) inside librosa_mel_fn 41 | center (bool): Whether to pad the input to center the frames. Default is False. 42 | 43 | Returns: 44 | torch.Tensor: Mel spectrogram. 45 | """ 46 | if torch.min(y) < -1.0: 47 | print(f"[WARNING] Min value of input waveform signal is {torch.min(y)}") 48 | if torch.max(y) > 1.0: 49 | print(f"[WARNING] Max value of input waveform signal is {torch.max(y)}") 50 | 51 | device = y.device 52 | key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}" 53 | 54 | if key not in mel_basis_cache: 55 | mel = librosa_mel_fn( 56 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 57 | ) 58 | mel_basis_cache[key] = torch.from_numpy(mel).float().to(device) 59 | hann_window_cache[key] = torch.hann_window(win_size).to(device) 60 | 61 | mel_basis = mel_basis_cache[key] 62 | hann_window = hann_window_cache[key] 63 | 64 | padding = (n_fft - hop_size) // 2 65 | y = torch.nn.functional.pad( 66 | y.unsqueeze(1), (padding, padding), mode="reflect" 67 | ).squeeze(1) 68 | 69 | spec = torch.stft( 70 | y, 71 | n_fft, 72 | hop_length=hop_size, 73 | win_length=win_size, 74 | window=hann_window, 75 | center=center, 76 | pad_mode="reflect", 77 | normalized=False, 78 | onesided=True, 79 | return_complex=True, 80 | ) 81 | spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9) 82 | 83 | mel_spec = torch.matmul(mel_basis, spec) 84 | mel_spec = spectral_normalize_torch(mel_spec) 85 | 86 | return mel_spec 87 | 88 | 89 | class AttrDict(dict): 90 | def __init__(self, *args, **kwargs): 91 | super(AttrDict, self).__init__(*args, **kwargs) 92 | self.__dict__ = self 93 | 94 | 95 | def load_checkpoint(filepath, device): 96 | assert os.path.isfile(filepath) 97 | print(f"Loading '{filepath}'") 98 | checkpoint_dict = torch.load(filepath, map_location=device, weights_only=True) 99 | print("Complete.") 100 | return checkpoint_dict 101 | 102 | 103 | def init_weights(m, mean=0.0, std=0.01): 104 | classname = m.__class__.__name__ 105 | if classname.find("Conv") != -1: 106 | m.weight.data.normal_(mean, std) 107 | 108 | 109 | def get_padding(kernel_size, dilation=1): 110 | return int((kernel_size * dilation - dilation) / 2) 111 | -------------------------------------------------------------------------------- /src/models/src_kimi/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if "sinc" in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 
19 | """ 20 | return torch.where( 21 | x == 0, 22 | torch.tensor(1.0, device=x.device, dtype=x.dtype), 23 | torch.sin(math.pi * x) / math.pi / x, 24 | ) 25 | 26 | 27 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 28 | # https://adefossez.github.io/julius/julius/lowpass.html 29 | # LICENSE is in incl_licenses directory. 30 | def kaiser_sinc_filter1d( 31 | cutoff, half_width, kernel_size 32 | ): # return filter [1,1,kernel_size] 33 | even = kernel_size % 2 == 0 34 | half_size = kernel_size // 2 35 | 36 | # For kaiser window 37 | delta_f = 4 * half_width 38 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 39 | if A > 50.0: 40 | beta = 0.1102 * (A - 8.7) 41 | elif A >= 21.0: 42 | beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0) 43 | else: 44 | beta = 0.0 45 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 46 | 47 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 48 | if even: 49 | time = torch.arange(-half_size, half_size) + 0.5 50 | else: 51 | time = torch.arange(kernel_size) - half_size 52 | if cutoff == 0: 53 | filter_ = torch.zeros_like(time) 54 | else: 55 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 56 | """ 57 | Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal. 58 | """ 59 | filter_ /= filter_.sum() 60 | filter = filter_.view(1, 1, kernel_size) 61 | 62 | return filter 63 | 64 | 65 | class LowPassFilter1d(nn.Module): 66 | def __init__( 67 | self, 68 | cutoff=0.5, 69 | half_width=0.6, 70 | stride: int = 1, 71 | padding: bool = True, 72 | padding_mode: str = "replicate", 73 | kernel_size: int = 12, 74 | ): 75 | """ 76 | kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible. 
77 | """
78 | super().__init__()
79 | if cutoff < -0.0:
80 | raise ValueError("Minimum cutoff must be larger than zero.")
81 | if cutoff > 0.5:
82 | raise ValueError("A cutoff above 0.5 does not make sense.")
83 | self.kernel_size = kernel_size
84 | self.even = kernel_size % 2 == 0
85 | self.pad_left = kernel_size // 2 - int(self.even)
86 | self.pad_right = kernel_size // 2
87 | self.stride = stride
88 | self.padding = padding
89 | self.padding_mode = padding_mode
90 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
91 | self.register_buffer("filter", filter)
92 |
93 | # Input [B, C, T]
94 | def forward(self, x):
95 | _, C, _ = x.shape
96 |
97 | if self.padding:
98 | x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
99 | out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
100 |
101 | return out
102 |
-------------------------------------------------------------------------------- /run.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export PYTHONPATH=$PWD:$PYTHONPATH
3 | export CUDA_VISIBLE_DEVICES=0,1
4 |
5 | max_memory=400
6 | save_dir="res/test"
7 |
8 | stage=1
9 | stop_stage=2
10 | eval_bsz=1
11 | save_pred_audio=False
12 |
13 | aqa_tasks="aqa-llamaqa-en,aqa-llamaqa-zh,aqa-triviaqa-en,aqa-triviaqa-zh,aqa-webq-en,aqa-webq-zh,aqa-chinesesimpleqa-zh,aqa-chinese_quiz-zh"
14 | choice_tasks="choice-agieval-zh,choice-ceval-zh"
15 | aqa_dialect_tasks="aqa-chinese_quiz-sichuanese,aqa-chinese_quiz-shanghainese,aqa-chinese_quiz-northeastern_mandarin,aqa-chinese_quiz-henan_dialect,aqa-chinese_quiz-cantonese"
16 | chitchat_dialect_tasks="follow-chitchat-sichuanese,follow-chitchat-shanghainese,follow-chitchat-northeastern_mandarin,follow-chitchat-henan_dialect,follow-chitchat-cantonese"
17 |
18 | down_tasks="aqa-livelihood_policy-zh,aqa-livelihood_policy-sichuanese,aqa-livelihood_policy-shanghainese,aqa-livelihood_policy-northeastern_mandarin,aqa-livelihood_policy-henan_dialect,aqa-livelihood_policy-cantonese"
19 | noise_tasks="aqa-babble_noise-zh,aqa-white_noise-zh,aqa-distortion-zh,aqa-single_background_speaker-zh,aqa-multi_background_speakers-zh,aqa-lowpass_filtering-zh,aqa-packet_loss-zh,aqa-reverberation_RT60-zh,aqa-complex_environments-zh,aqa-complex_environments_reverb-zh,aqa-different_distance-zh"
20 | multiturn_tasks="multiturn-memory-zh"
21 | para_tasks="aqa-para_mix300-zh"
22 | llm_judge_tasks="emotion-esd,aed-audio-instruct,acceptance-human-zh,chitchat-human-zh,care-age-zh"
23 |
24 | declare -A model_tasks
25 | model_tasks=(
26 | ["MiniCPMo2_6-audio"]="$aqa_tasks,$aqa_dialect_tasks"
27 | ["baichuan_omni_1d5"]="$aqa_tasks,$aqa_dialect_tasks"
28 | ["llama_omni"]="$aqa_tasks,$aqa_dialect_tasks"
29 | ["speechgpt2"]="$aqa_tasks,$aqa_dialect_tasks"
30 | ["freeze_omni"]="$aqa_tasks,$para_tasks,$aqa_dialect_tasks"
31 | ["glm-4-voice-9b"]="$aqa_tasks,$aqa_dialect_tasks"
32 | ["kimi-audio-7b-instruct"]="$aqa_tasks,$aqa_dialect_tasks"
33 | ["qwen2_5_omni"]="$aqa_tasks,$aqa_dialect_tasks"
34 | )
35 |
36 | gpu_list=($(echo $CUDA_VISIBLE_DEVICES | tr ',' ' '))
37 | gpu_counts=${#gpu_list[@]}
38 |
39 | get_free_gpu() {
40 | while true; do
41 | for gpu in "${gpu_list[@]}"; do
42 | used_mem=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "NR==$((gpu+1))")
43 | if [[ "$used_mem" -lt "$max_memory" ]]; then
44 | echo "$gpu"
45 | return
46 | fi
47 | done
48 | sleep 30
49 | done
50 | }
51 |
52 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
53 | for model in
"${!model_tasks[@]}"; do 54 | IFS=',' read -r -a values <<< "${model_tasks[$model]}" 55 | for task in "${values[@]}"; do 56 | gpu=$(get_free_gpu) 57 | echo "***********************************************" 58 | echo "processing model: $model using task: $task on GPU: $gpu" 59 | echo "***********************************************" 60 | CUDA_VISIBLE_DEVICES=$gpu python main.py \ 61 | --mode "infer" \ 62 | --task $task \ 63 | --save_dir $save_dir \ 64 | --save_pred_audio $save_pred_audio \ 65 | --model $model & 66 | sleep 40 # Increase sleep time appropriately according to the speed of loading the model 67 | done 68 | done 69 | wait 70 | fi 71 | 72 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 73 | for model in "${!model_tasks[@]}"; do 74 | IFS=',' read -r -a values <<< "${model_tasks[$model]}" 75 | for task in "${values[@]}"; do 76 | python main.py \ 77 | --mode "eval" \ 78 | --save_dir $save_dir \ 79 | --save_pred_audio $save_pred_audio \ 80 | --model $model \ 81 | --bsz $eval_bsz \ 82 | --task $task 83 | done 84 | done 85 | wait 86 | python tools/save_csv.py --root_dir $save_dir 87 | fi -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/moshi_modules/resample.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | 3 | from einops import rearrange 4 | import torch 5 | from torch import nn 6 | 7 | from .conv import StreamingConv1d, StreamingConvTranspose1d 8 | 9 | 10 | class ConvDownsample1d(nn.Module): 11 | """ 12 | Downsampling by some integer amount `stride` using convolutions 13 | with a kernel size of twice the stride. 14 | If `causal` is True, the output uses a causal convolution. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | stride: int, 20 | dimension: tp.Optional[int] = None, 21 | causal: bool = False, 22 | learnt: bool = False, 23 | channel_wise: bool = False, 24 | ): 25 | super().__init__() 26 | self.learnt = learnt 27 | self.channel_wise = channel_wise 28 | groups = 1 29 | if learnt: 30 | assert dimension is not None, "Dimension required for learnt convolutions." 31 | in_channels = dimension 32 | out_channels = dimension 33 | if channel_wise: 34 | groups = dimension 35 | else: 36 | in_channels = 1 37 | out_channels = 1 38 | 39 | self.conv = StreamingConv1d( 40 | in_channels, 41 | out_channels, 42 | kernel_size=2 * stride, 43 | stride=stride, 44 | causal=causal, 45 | groups=groups, 46 | bias=False, 47 | pad_mode="replicate", 48 | ) 49 | if not learnt: 50 | actual_conv = self.conv.conv.conv 51 | actual_conv.weight.requires_grad_(False) 52 | actual_conv.weight.data.fill_(1.0 / (2 * stride)) 53 | 54 | def forward(self, x: torch.Tensor): 55 | batch_size = len(x) 56 | if not self.learnt: 57 | x = rearrange(x, "b c t -> (b c) () t") 58 | y = self.conv(x) 59 | if not self.learnt: 60 | y = rearrange(y, "(b c) () t -> b c t", b=batch_size) 61 | return y 62 | 63 | 64 | class ConvTrUpsample1d(nn.Module): 65 | """ 66 | Upsample by some integer amount `stride` using transposed convolutions. 67 | """ 68 | 69 | def __init__( 70 | self, 71 | stride: int, 72 | dimension: tp.Optional[int] = None, 73 | causal: bool = False, 74 | learnt: bool = False, 75 | channel_wise: bool = False, 76 | ): 77 | super().__init__() 78 | self.learnt = learnt 79 | self.channel_wise = channel_wise 80 | groups = 1 81 | if learnt: 82 | assert dimension is not None, "Dimension required for learnt convolutions." 
83 | in_channels = dimension 84 | out_channels = dimension 85 | if channel_wise: 86 | groups = dimension 87 | else: 88 | in_channels = 1 89 | out_channels = 1 90 | 91 | self.convtr = StreamingConvTranspose1d( 92 | in_channels, 93 | out_channels, 94 | kernel_size=2 * stride, 95 | stride=stride, 96 | causal=causal, 97 | groups=groups, 98 | bias=False, 99 | ) 100 | if not learnt: 101 | actual_convtr = self.convtr.convtr.convtr 102 | actual_convtr.weight.requires_grad_(False) 103 | actual_convtr.weight.data.fill_(1.0) 104 | 105 | def forward(self, x: torch.Tensor): 106 | batch_size = len(x) 107 | if not self.learnt: 108 | x = rearrange(x, "b c t -> (b c) () t") 109 | y = self.convtr(x) 110 | if not self.learnt: 111 | x_for_normalization = torch.ones_like(x[:1]) 112 | normalization = self.convtr(x_for_normalization) 113 | y = y / normalization 114 | y = rearrange(y, "(b c) () t -> b c t", b=batch_size) 115 | return y 116 | -------------------------------------------------------------------------------- /src/models/src_speechgpt2/Codec/models/modules/residual_block.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Residual block modules.""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .conv_layers import NonCausalConv1d, CausalConv1d 10 | 11 | class HiFiGANResidualBlock(nn.Module): 12 | """Causal Residual block module in HiFiGAN.""" 13 | 14 | def __init__( 15 | self, 16 | mode, 17 | kernel_size=3, 18 | channels=512, 19 | dilations=(1, 3, 5), 20 | groups=1, 21 | bias=True, 22 | use_additional_convs=True, 23 | nonlinear_activation="LeakyReLU", 24 | nonlinear_activation_params={"negative_slope": 0.1} 25 | ): 26 | """Initialize CausalResidualBlock module. 27 | 28 | Args: 29 | kernel_size (int): Kernel size of dilation convolution layer. 30 | channels (int): Number of channels for convolution layer. 31 | dilations (List[int]): List of dilation factors. 32 | use_additional_convs (bool): Whether to use additional convolution layers. 33 | groups (int): The group number of conv1d (default: 1) 34 | bias (bool): Whether to add bias parameter in convolution layers. 35 | nonlinear_activation (str): Activation function module name. 36 | nonlinear_activation_params (dict): Hyperparameters for activation function. 37 | 38 | """ 39 | super().__init__() 40 | self.mode = mode 41 | if self.mode == 'noncausal': 42 | Conv1d = NonCausalConv1d 43 | elif self.mode == 'causal': 44 | Conv1d = CausalConv1d 45 | else: 46 | raise NotImplementedError(f"Mode ({self.mode}) is not supported!") 47 | 48 | self.use_additional_convs = use_additional_convs 49 | self.convs1 = nn.ModuleList() 50 | if use_additional_convs: 51 | self.convs2 = nn.ModuleList() 52 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 53 | self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 54 | for dilation in dilations: 55 | self.convs1 += [ 56 | Conv1d( 57 | in_channels=channels, 58 | out_channels=channels, 59 | kernel_size=kernel_size, 60 | stride=1, 61 | dilation=dilation, 62 | groups=groups, 63 | bias=bias, 64 | ) 65 | ] 66 | if use_additional_convs: 67 | self.convs2 += [ 68 | Conv1d( 69 | in_channels=channels, 70 | out_channels=channels, 71 | kernel_size=kernel_size, 72 | stride=1, 73 | dilation=1, 74 | groups=groups, 75 | bias=bias, 76 | ) 77 | ] 78 | self.num_layer = len(self.convs1) 79 | 80 | def forward(self, x): 81 | """Calculate forward propagation. 
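        For each dilation, the block computes xt = convs1[idx](activation(x)), then
        xt = convs2[idx](activation(xt)) when use_additional_convs is True, and updates
        x = x + xt, so the channel and time dimensions are preserved.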
82 | 83 | Args: 84 | x (Tensor): Input tensor (B, channels, T). 85 | 86 | Returns: 87 | Tensor: Output tensor (B, channels, T). 88 | 89 | """ 90 | for idx in range(self.num_layer): 91 | xt = self.convs1[idx](self.activation(x)) 92 | if self.use_additional_convs: 93 | xt = self.convs2[idx](self.activation(xt)) 94 | x = xt + x 95 | return x 96 | 97 | def inference(self, x): 98 | for idx in range(self.num_layer): 99 | xt = self.convs1[idx].inference(self.activation(x)) 100 | if self.use_additional_convs: 101 | xt = self.convs2[idx].inference(self.activation(xt)) 102 | x = xt + x 103 | return x 104 | -------------------------------------------------------------------------------- /src/models/src_glm4/speech_tokenizer/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import glob 4 | import math 5 | import tarfile 6 | import torch 7 | import torchaudio 8 | import safetensors 9 | from .configuration_whisper import WhisperVQConfig 10 | from .modeling_whisper import WhisperVQEncoder, WhisperVQForConditionalGeneration 11 | from transformers import WhisperFeatureExtractor, WhisperTokenizerFast 12 | 13 | 14 | def load_quantize_encoder(model_path): 15 | config = WhisperVQConfig.from_pretrained(model_path) 16 | config.quantize_encoder_only = True 17 | model = WhisperVQEncoder(config) 18 | state_dict = {} 19 | for path in glob.glob(os.path.join(model_path, "model*.safetensors")): 20 | with safetensors.safe_open(path, framework="pt", device="cpu") as f: 21 | for key in f.keys(): 22 | if key.startswith("model.encoder."): 23 | new_key = key[len("model.encoder."):] 24 | if new_key.startswith("layer_norm"): 25 | continue 26 | if new_key.startswith("layers"): 27 | layer_id = int(new_key.split(".")[1]) 28 | if layer_id >= config.quantize_position: 29 | continue 30 | state_dict[new_key] = f.get_tensor(key) 31 | model.load_state_dict(state_dict) 32 | model.eval() 33 | model.cuda() 34 | return model 35 | 36 | 37 | _resample_buffer: dict[int, torchaudio.transforms.Resample] = {} 38 | 39 | 40 | def extract_speech_token(model: WhisperVQEncoder, feature_extractor: WhisperFeatureExtractor, utts): 41 | with torch.no_grad(): 42 | audios, indices = [], [] 43 | for idx, utt in enumerate(utts): 44 | if isinstance(utt, tuple): 45 | audio, sample_rate = utt 46 | else: 47 | audio, sample_rate = torchaudio.load(utt) 48 | audio = audio.cuda() 49 | if sample_rate != 16000: 50 | if sample_rate not in _resample_buffer: 51 | _resample_buffer[sample_rate] = torchaudio.transforms.Resample( 52 | orig_freq=sample_rate, 53 | new_freq=16000 54 | ).to('cuda') 55 | audio = _resample_buffer[sample_rate](audio) 56 | # if audio.shape[0] > 1: 57 | # audio = audio[:1] 58 | audio = audio[0] 59 | audio = audio.cpu().numpy() 60 | time_step = 0 61 | while time_step * 16000 < audio.shape[0]: 62 | audio_segment = audio[time_step * 16000: (time_step + 30) * 16000] 63 | audios.append(audio_segment) 64 | indices.append(idx) 65 | time_step += 30 66 | pooling_kernel_size = model.config.pooling_kernel_size or 1 67 | stride = model.conv1.stride[0] * model.conv2.stride[0] * pooling_kernel_size * feature_extractor.hop_length 68 | all_speech_tokens = [[] for _ in range(len(utts))] 69 | batch_size = 128 70 | for start in range(0, len(audios), batch_size): 71 | features = feature_extractor(audios[start: start + batch_size], sampling_rate=16000, 72 | return_attention_mask=True, return_tensors="pt", device='cuda', 73 | padding="longest", pad_to_multiple_of=stride) 74 | features = 
features.to(device="cuda") 75 | outputs = model(**features) 76 | speech_tokens = outputs.quantized_token_ids 77 | attention_mask = features.attention_mask[:, ::model.conv1.stride[0] * model.conv2.stride[0]] 78 | attention_mask = attention_mask[:, ::model.config.pooling_kernel_size] 79 | assert attention_mask.shape == speech_tokens.shape 80 | for i in range(len(speech_tokens)): 81 | idx = indices[start + i] 82 | speech_token = speech_tokens[i][attention_mask[i].bool()].tolist() 83 | all_speech_tokens[idx].extend(speech_token) 84 | return all_speech_tokens 85 | -------------------------------------------------------------------------------- /src/evaluator/dnsmos.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import librosa 3 | import numpy as np 4 | import onnxruntime as ort 5 | import soundfile as sf 6 | from src.evaluator.base import Evaluator 7 | from src.utils import parallel_batch 8 | 9 | SAMPLING_RATE = 16000 10 | INPUT_LENGTH = 9.01 11 | 12 | class ComputeScore: 13 | """ 14 | from https://github.com/microsoft/DNS-Challenge/blob/master/DNSMOS/dnsmos_local.py 15 | """ 16 | def __init__(self, primary_model_path) -> None: 17 | self.onnx_sess = ort.InferenceSession(primary_model_path) 18 | 19 | def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True): 20 | mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=frame_size+1, hop_length=hop_length, n_mels=n_mels) 21 | if to_db: 22 | mel_spec = (librosa.power_to_db(mel_spec, ref=np.max)+40)/40 23 | return mel_spec.T 24 | 25 | def get_polyfit_val(self, sig, bak, ovr): 26 | p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535]) 27 | p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439 ]) 28 | p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546]) 29 | 30 | sig_poly = p_sig(sig) 31 | bak_poly = p_bak(bak) 32 | ovr_poly = p_ovr(ovr) 33 | 34 | return sig_poly, bak_poly, ovr_poly 35 | 36 | def __call__(self, fpath, sampling_rate): 37 | aud, input_fs = sf.read(fpath) 38 | fs = sampling_rate 39 | if input_fs != fs: 40 | audio = librosa.resample(aud, orig_sr=input_fs, target_sr=fs) 41 | else: 42 | audio = aud 43 | actual_audio_len = len(audio) 44 | len_samples = int(INPUT_LENGTH*fs) 45 | while len(audio) < len_samples: 46 | audio = np.append(audio, audio) 47 | 48 | num_hops = int(np.floor(len(audio)/fs) - INPUT_LENGTH)+1 49 | hop_len_samples = fs 50 | predicted_mos_sig_seg_raw = [] 51 | predicted_mos_bak_seg_raw = [] 52 | predicted_mos_ovr_seg_raw = [] 53 | predicted_mos_sig_seg = [] 54 | predicted_mos_bak_seg = [] 55 | predicted_mos_ovr_seg = [] 56 | 57 | for idx in range(num_hops): 58 | audio_seg = audio[int(idx*hop_len_samples) : int((idx+INPUT_LENGTH)*hop_len_samples)] 59 | if len(audio_seg) < len_samples: 60 | continue 61 | 62 | input_features = np.array(audio_seg).astype('float32')[np.newaxis,:] 63 | oi = {'input_1': input_features} 64 | mos_sig_raw,mos_bak_raw,mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0] 65 | mos_sig,mos_bak,mos_ovr = self.get_polyfit_val(mos_sig_raw, mos_bak_raw, mos_ovr_raw) 66 | predicted_mos_sig_seg_raw.append(mos_sig_raw) 67 | predicted_mos_bak_seg_raw.append(mos_bak_raw) 68 | predicted_mos_ovr_seg_raw.append(mos_ovr_raw) 69 | predicted_mos_sig_seg.append(mos_sig) 70 | predicted_mos_bak_seg.append(mos_bak) 71 | predicted_mos_ovr_seg.append(mos_ovr) 72 | clip_dict = {'filename': fpath, 'len_in_sec': actual_audio_len/fs, 'sr':fs} 73 | clip_dict['num_hops'] = num_hops 74 | clip_dict['OVRL_raw'] = 
np.mean(predicted_mos_ovr_seg_raw) 75 | clip_dict['SIG_raw'] = np.mean(predicted_mos_sig_seg_raw) 76 | clip_dict['BAK_raw'] = np.mean(predicted_mos_bak_seg_raw) 77 | clip_dict['OVRL'] = np.mean(predicted_mos_ovr_seg) 78 | clip_dict['SIG'] = np.mean(predicted_mos_sig_seg) 79 | clip_dict['BAK'] = np.mean(predicted_mos_bak_seg) 80 | return clip_dict 81 | 82 | class DNSMOS(Evaluator): 83 | def __init__(self, model: str, max_workers=None): 84 | if max_workers is not None: 85 | self.max_workers = max_workers 86 | self.compute_score = ComputeScore(model) 87 | 88 | @parallel_batch(default_workers=4) 89 | def evaluate(self, pred: str, ref: str, pred_info: Dict, **kwargs): 90 | pred_audio = pred_info["pred_audio"] 91 | res = self.compute_score(pred_audio, SAMPLING_RATE) 92 | return {"key": pred_info["key"], "score": res["OVRL"]} -------------------------------------------------------------------------------- /src/models/api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import io 3 | import base64 4 | import requests 5 | import threading 6 | import itertools 7 | from typing import Dict 8 | import json 9 | import torchaudio 10 | from src.models.base import Model 11 | from src.utils import retry 12 | 13 | import sys 14 | sys.stdout.reconfigure(encoding='utf-8') 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class GPT4oAudio(Model): 19 | def __init__(self, llm_name: str, api_keys: Dict, max_workers=None): 20 | super().__init__(sample_params=None) 21 | logging.info(f"Using {llm_name} API for judgement...") 22 | assert len(api_keys) > 0 23 | self.llm_name = llm_name 24 | self.api_keys = api_keys 25 | 26 | self.urls = { 27 | key: ( 28 | f"https://{key}.openai.azure.com/" 29 | f"openai/deployments/{llm_name}/chat/completions?api-version=2025-01-01-preview" 30 | ) 31 | for key in api_keys 32 | } 33 | self.max_workers = max_workers or len(api_keys) 34 | self.key_cycle = itertools.cycle(self.api_keys.items()) 35 | self.lock = threading.Lock() 36 | 37 | def get_next_key(self): 38 | with self.lock: 39 | key_name, key_value = next(self.key_cycle) 40 | return key_name, key_value, self.urls[key_name] 41 | 42 | @retry(max_retries=8, sleep_second=3) 43 | def api_generate(self, messages, api_key, url, modalities): 44 | headers = { 45 | "Content-Type": "application/json", 46 | "Authorization": f"Bearer {api_key}" 47 | } 48 | input_data = { 49 | "model": "gpt-4o-audio-preview", 50 | "modalities": modalities, 51 | "audio": { 52 | "voice": "alloy", 53 | "format": "wav" 54 | }, 55 | "messages": messages 56 | } 57 | 58 | response= requests.post(url, headers=headers, data=json.dumps(input_data)) 59 | response.raise_for_status() 60 | response_data = response.json() 61 | response = response_data["choices"][0]["message"] 62 | 63 | if "audio" in modalities: 64 | base64_str = response["audio"]["data"] 65 | pred = response["audio"]["transcript"].strip() 66 | assert base64_str is not None 67 | else: 68 | if "content" not in response: 69 | logging.info(f"response is unique: {response}") 70 | pred = response["content"].strip() 71 | base64_str = None 72 | return base64_str, pred 73 | 74 | def generate_once(self, audio, **kwargs): 75 | save_pred_audio = kwargs.get("pred_audio", None) 76 | if save_pred_audio: 77 | modalities = ["audio", "text"] 78 | else: 79 | modalities = ["text"] 80 | 81 | with open(audio, "rb") as audio_file: 82 | audio_bytes = audio_file.read() 83 | audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") 84 | 85 | messages = [ 86 | { 87 | 
"role": "user", 88 | "content": [ 89 | { 90 | "type": "input_audio", 91 | "input_audio": { 92 | "data": audio_base64, 93 | "format": "wav" 94 | } 95 | } 96 | ] 97 | } 98 | ] 99 | 100 | key_name, api_key, url = self.get_next_key() 101 | base64_str, pred = self.api_generate(messages, api_key, url, modalities) 102 | 103 | if save_pred_audio: 104 | audio_bytes = base64.b64decode(base64_str) 105 | audio_buf = io.BytesIO(audio_bytes) 106 | waveform, sample_rate = torchaudio.load(audio_buf) 107 | torchaudio.save(save_pred_audio, waveform, sample_rate=sample_rate) 108 | 109 | return {"pred": pred, "pred_audio": kwargs.get("pred_audio")} 110 | 111 | def generate_multiturn(self, audio, user_history, assistant_history, **kwargs): 112 | return self.generate_once(audio) 113 | -------------------------------------------------------------------------------- /registry/infer_task/dialect.yaml: -------------------------------------------------------------------------------- 1 | # ------------------dialect understanding--------------------- 2 | aqa-chinese_quiz-sichuanese: 3 | class: src.config.InferTaskCfg 4 | args: 5 | dataset: chinese_quiz-sichuanese 6 | template: zeroshot-aqa 7 | model: qwen2_5_omni 8 | save_pred_audio: False 9 | eval_task: basic 10 | 11 | aqa-chinese_quiz-shanghainese: 12 | class: src.config.InferTaskCfg 13 | args: 14 | dataset: chinese_quiz-shanghainese 15 | template: zeroshot-aqa 16 | model: qwen2_5_omni 17 | save_pred_audio: False 18 | eval_task: basic 19 | 20 | aqa-chinese_quiz-northeastern_mandarin: 21 | class: src.config.InferTaskCfg 22 | args: 23 | dataset: chinese_quiz-northeastern_mandarin 24 | template: zeroshot-aqa 25 | model: qwen2_5_omni 26 | save_pred_audio: False 27 | eval_task: basic 28 | 29 | aqa-chinese_quiz-henan_dialect: 30 | class: src.config.InferTaskCfg 31 | args: 32 | dataset: chinese_quiz-henan_dialect 33 | template: zeroshot-aqa 34 | model: qwen2_5_omni 35 | save_pred_audio: False 36 | eval_task: basic 37 | 38 | aqa-chinese_quiz-cantonese: 39 | class: src.config.InferTaskCfg 40 | args: 41 | dataset: chinese_quiz-cantonese 42 | template: zeroshot-aqa 43 | model: qwen2_5_omni 44 | save_pred_audio: False 45 | eval_task: basic 46 | 47 | # -----------------dialectt understanding livelihood policy (hard)------------ 48 | aqa-livelihood_policy-sichuanese: 49 | class: src.config.InferTaskCfg 50 | args: 51 | dataset: livelihood_policy-sichuanese 52 | template: zeroshot-aqa 53 | model: qwen2_5_omni 54 | save_pred_audio: False 55 | eval_task: basic 56 | 57 | aqa-livelihood_policy-shanghainese: 58 | class: src.config.InferTaskCfg 59 | args: 60 | dataset: livelihood_policy-shanghainese 61 | template: zeroshot-aqa 62 | model: qwen2_5_omni 63 | save_pred_audio: False 64 | eval_task: basic 65 | 66 | aqa-livelihood_policy-northeastern_mandarin: 67 | class: src.config.InferTaskCfg 68 | args: 69 | dataset: livelihood_policy-northeastern_mandarin 70 | template: zeroshot-aqa 71 | model: qwen2_5_omni 72 | save_pred_audio: False 73 | eval_task: basic 74 | 75 | aqa-livelihood_policy-henan_dialect: 76 | class: src.config.InferTaskCfg 77 | args: 78 | dataset: livelihood_policy-henan_dialect 79 | template: zeroshot-aqa 80 | model: qwen2_5_omni 81 | save_pred_audio: False 82 | eval_task: basic 83 | 84 | aqa-livelihood_policy-cantonese: 85 | class: src.config.InferTaskCfg 86 | args: 87 | dataset: livelihood_policy-cantonese 88 | template: zeroshot-aqa 89 | model: qwen2_5_omni 90 | save_pred_audio: False 91 | eval_task: basic 92 | 93 | # ------------------dialect 
chitchat--------------------- 94 | follow-chitchat-sichuanese: 95 | class: src.config.InferTaskCfg 96 | args: 97 | dataset: chitchat-sichuanese 98 | template: zeroshot-aqa 99 | model: qwen2_5_omni 100 | save_pred_audio: True 101 | eval_task: dialect_follow # dialect_follow dialect_classify 102 | 103 | follow-chitchat-shanghainese: 104 | class: src.config.InferTaskCfg 105 | args: 106 | dataset: chitchat-shanghainese 107 | template: zeroshot-aqa 108 | model: qwen2_5_omni 109 | save_pred_audio: True 110 | eval_task: dialect_follow # dialect_follow dialect_classify 111 | 112 | follow-chitchat-northeastern_mandarin: 113 | class: src.config.InferTaskCfg 114 | args: 115 | dataset: chitchat-northeastern_mandarin 116 | template: zeroshot-aqa 117 | model: qwen2_5_omni 118 | save_pred_audio: True 119 | eval_task: dialect_follow # dialect_follow dialect_classify 120 | 121 | follow-chitchat-henan_dialect: 122 | class: src.config.InferTaskCfg 123 | args: 124 | dataset: chitchat-henan_dialect 125 | template: zeroshot-aqa 126 | model: qwen2_5_omni 127 | save_pred_audio: True 128 | eval_task: dialect_follow # dialect_follow dialect_classify 129 | 130 | follow-chitchat-cantonese: 131 | class: src.config.InferTaskCfg 132 | args: 133 | dataset: chitchat-cantonese 134 | template: zeroshot-aqa 135 | model: qwen2_5_omni 136 | save_pred_audio: True 137 | eval_task: dialect_follow # dialect_follow dialect_classify -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/flow/stable/stable_diffusion_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from .dit import DiffusionTransformer 4 | from .adp import UNet1d 5 | from .sampling import sample 6 | import math 7 | from model.base import BaseModule 8 | import pdb 9 | 10 | target_length = 1536 11 | def pad_and_create_mask(matrix, target_length): 12 | 13 | T = matrix.shape[2] 14 | if T > target_length: 15 | raise ValueError("The third dimension length %s should not exceed %s"%(T, target_length)) 16 | 17 | padding_size = target_length - T 18 | 19 | padded_matrix = F.pad(matrix, (0, padding_size), "constant", 0) 20 | 21 | mask = torch.ones((1, target_length)) 22 | mask[:, T:] = 0 # Set the padding part to 0 23 | 24 | return padded_matrix.to(matrix.device), mask.to(matrix.device) 25 | 26 | 27 | class Stable_Diffusion(BaseModule): 28 | def __init__(self): 29 | super(Stable_Diffusion, self).__init__() 30 | self.diffusion = DiffusionTransformer( 31 | io_channels=80, 32 | # input_concat_dim=80, 33 | embed_dim=768, 34 | # cond_token_dim=target_length, 35 | depth=24, 36 | num_heads=24, 37 | project_cond_tokens=False, 38 | transformer_type="continuous_transformer", 39 | ) 40 | # self.diffusion = UNet1d( 41 | # in_channels=80, 42 | # channels=256, 43 | # resnet_groups=16, 44 | # kernel_multiplier_downsample=2, 45 | # multipliers=[4, 4, 4, 5, 5], 46 | # factors=[1, 2, 2, 4], # 输入长度不一致卷积缩短 47 | # num_blocks=[2, 2, 2, 2], 48 | # attentions=[1, 3, 3, 3, 3], 49 | # attention_heads=16, 50 | # attention_multiplier=4, 51 | # use_nearest_upsample=False, 52 | # use_skip_scale=True, 53 | # use_context_time=True 54 | # ) 55 | self.rng = torch.quasirandom.SobolEngine(1, scramble=True) 56 | 57 | @torch.no_grad() 58 | def forward(self, mu, mask, n_timesteps): 59 | # pdb.set_trace() 60 | mask = mask.squeeze(1) 61 | # noise = torch.randn_like(mu).to(mu.device) 62 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 63 | # 
extra_args = {"cross_attn_cond": mu, "cross_attn_cond_mask": mask, "mask": mask} 64 | extra_args = {"mask": mask} 65 | fakes = sample(self.diffusion, mu, n_timesteps, 0, **extra_args) 66 | 67 | return fakes 68 | 69 | 70 | def compute_loss(self, x0, mask, mu): 71 | 72 | # pdb.set_trace() 73 | t = self.rng.draw(x0.shape[0])[:, 0].to(x0.device) 74 | alphas, sigmas = torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) 75 | 76 | alphas = alphas[:, None, None] 77 | sigmas = sigmas[:, None, None] 78 | noise = torch.randn_like(x0) 79 | noised_inputs = x0 * alphas + noise * sigmas 80 | targets = mu * alphas - x0 * sigmas 81 | mask = mask.squeeze(1) 82 | # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) 83 | # output = self.diffusion(noised_inputs, t, cross_attn_cond=mu, 84 | # cross_attn_cond_mask=mask, mask=mask, cfg_dropout_prob=0.1) 85 | output = self.diffusion(noised_inputs, t, mask=mask, cfg_dropout_prob=0.1) 86 | 87 | return self.mse_loss(output, targets, mask), output 88 | 89 | 90 | def mse_loss(self, output, targets, mask): 91 | 92 | mse_loss = F.mse_loss(output, targets, reduction='none') 93 | 94 | if mask.ndim == 2 and mse_loss.ndim == 3: 95 | mask = mask.unsqueeze(1) 96 | 97 | if mask.shape[1] != mse_loss.shape[1]: 98 | mask = mask.repeat(1, mse_loss.shape[1], 1) 99 | 100 | mse_loss = mse_loss[mask] 101 | 102 | mse_loss = mse_loss.mean() 103 | 104 | return mse_loss -------------------------------------------------------------------------------- /src/models/src_freezeomni/encoder/subsampling.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union 2 | 3 | import torch 4 | 5 | class BaseSubsampling(torch.nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | self.right_context = 0 9 | self.subsampling_rate = 1 10 | 11 | def position_encoding(self, offset: Union[int, torch.Tensor], 12 | size: int) -> torch.Tensor: 13 | return self.pos_enc.position_encoding(offset, size) 14 | 15 | class Conv2dSubsampling4(BaseSubsampling): 16 | """Convolutional 2D subsampling (to 1/4 length). 17 | 18 | Args: 19 | idim (int): Input dimension. 20 | odim (int): Output dimension. 21 | dropout_rate (float): Dropout rate. 22 | 23 | """ 24 | def __init__(self, idim: int, odim: int, dropout_rate: float): 25 | """Construct an Conv2dSubsampling4 object.""" 26 | super().__init__() 27 | self.conv = torch.nn.Sequential( 28 | torch.nn.Conv2d(1, odim, 3, 2), 29 | torch.nn.ReLU(), 30 | torch.nn.Conv2d(odim, odim, 3, 2), 31 | torch.nn.ReLU(), 32 | ) 33 | self.out = torch.nn.Sequential( 34 | torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) 35 | # The right context for every conv layer is computed by: 36 | # (kernel_size - 1) * frame_rate_of_this_layer 37 | self.subsampling_rate = 4 38 | # 6 = (3 - 1) * 1 + (3 - 1) * 2 39 | self.right_context = 6 40 | 41 | def forward( 42 | self, 43 | x: torch.Tensor, 44 | x_mask: torch.Tensor 45 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 46 | """Subsample x. 47 | 48 | Args: 49 | x (torch.Tensor): Input tensor (#batch, time, idim). 50 | x_mask (torch.Tensor): Input mask (#batch, 1, time). 51 | 52 | Returns: 53 | torch.Tensor: Subsampled tensor (#batch, time', odim), 54 | where time' = time // 4. 55 | torch.Tensor: Subsampled mask (#batch, 1, time'), 56 | where time' = time // 4. 
57 | torch.Tensor: positional encoding 58 | 59 | """ 60 | x = x.unsqueeze(1) # (b, c=1, t, f) 61 | x = self.conv(x) 62 | b, c, t, f = x.size() 63 | x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) 64 | 65 | return x, x_mask[:, :, 2::2][:, :, 2::2] 66 | 67 | def infer(self, x, buffer, buffer_index, buffer_out): 68 | x = x.unsqueeze(1) # (b, c=1, t, f) 69 | x = self.conv(x) 70 | b, c, t, f = x.size() 71 | x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) 72 | 73 | return x, buffer, buffer_index, buffer_out 74 | 75 | class Subsampling(torch.nn.Module): 76 | @staticmethod 77 | def add_arguments(group): 78 | """Add Subsampling common arguments.""" 79 | group.add_argument('--subsampling-rate', default=4, type=int) 80 | group.add_argument('--subsampling-input-dim', default=256, type=int) 81 | group.add_argument('--subsampling-output-dim', default=256, type=int) 82 | group.add_argument('--subsampling-dropout-rate', default=0.1, type=float) 83 | 84 | return group 85 | 86 | def __init__(self, args): 87 | super().__init__() 88 | self.subsampling_rate = args.subsampling_rate 89 | self.subsampling_input_dim = args.subsampling_input_dim 90 | self.subsampling_output_dim = args.subsampling_output_dim 91 | self.subsampling_dropout_rate = args.subsampling_dropout_rate 92 | 93 | if self.subsampling_rate == 4: 94 | self.core = Conv2dSubsampling4(self.subsampling_input_dim, 95 | self.subsampling_output_dim, 96 | self.subsampling_dropout_rate) 97 | 98 | def forward(self, xs, ilens, masks): 99 | xs, masks = self.core(xs, masks) 100 | ilens = masks.squeeze(1).sum(1) 101 | return xs, ilens, masks 102 | 103 | def infer(self, x, buffer, buffer_index, buffer_out, pe_index): 104 | x, buffer, buffer_index, buffer_out = self.core.infer(x, 105 | buffer, buffer_index, buffer_out) 106 | return x, buffer, buffer_index, buffer_out, pe_index 107 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/cli/cosyvoice.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
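# Usage sketch for the CosyVoice wrapper defined below. The model path, speaker id and
# 22.05 kHz save rate are illustrative assumptions, not values pinned by this file:
#
#     cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
#     print(cosyvoice.list_avaliable_spks())
#     output = cosyvoice.inference_sft('你好,很高兴认识你。', spk_id='中文女')
#     torchaudio.save('sft.wav', output['tts_speech'], 22050)  # torchaudio assumed imported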
14 | import os 15 | import torch 16 | from hyperpyyaml import load_hyperpyyaml 17 | from modelscope import snapshot_download 18 | from cosyvoice.cli.frontend import CosyVoiceFrontEnd 19 | from cosyvoice.cli.model import CosyVoiceModel 20 | 21 | class CosyVoice: 22 | 23 | def __init__(self, model_dir): 24 | instruct = True if '-Instruct' in model_dir else False 25 | self.model_dir = model_dir 26 | if not os.path.exists(model_dir): 27 | model_dir = snapshot_download(model_dir) 28 | with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: 29 | configs = load_hyperpyyaml(f) 30 | self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], 31 | configs['feat_extractor'], 32 | '{}/campplus.onnx'.format(model_dir), 33 | '{}/speech_tokenizer_v1.onnx'.format(model_dir), 34 | '{}/spk2info.pt'.format(model_dir), 35 | instruct, 36 | configs['allowed_special']) 37 | self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 38 | self.model.load('{}/llm.pt'.format(model_dir), 39 | '{}/flow.pt'.format(model_dir), 40 | '{}/hift.pt'.format(model_dir)) 41 | del configs 42 | 43 | def list_avaliable_spks(self): 44 | spks = list(self.frontend.spk2info.keys()) 45 | return spks 46 | 47 | def inference_sft(self, tts_text, spk_id): 48 | tts_speeches = [] 49 | for i in self.frontend.text_normalize(tts_text, split=True): 50 | model_input = self.frontend.frontend_sft(i, spk_id) 51 | model_output = self.model.inference(**model_input) 52 | tts_speeches.append(model_output['tts_speech']) 53 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 54 | 55 | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): 56 | prompt_text = self.frontend.text_normalize(prompt_text, split=False) 57 | tts_speeches = [] 58 | for i in self.frontend.text_normalize(tts_text, split=True): 59 | model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) 60 | model_output = self.model.inference(**model_input) 61 | tts_speeches.append(model_output['tts_speech']) 62 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 63 | 64 | def inference_cross_lingual(self, tts_text, prompt_speech_16k): 65 | if self.frontend.instruct is True: 66 | raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) 67 | tts_speeches = [] 68 | for i in self.frontend.text_normalize(tts_text, split=True): 69 | model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) 70 | model_output = self.model.inference(**model_input) 71 | tts_speeches.append(model_output['tts_speech']) 72 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 73 | 74 | def inference_instruct(self, tts_text, spk_id, instruct_text): 75 | if self.frontend.instruct is False: 76 | raise ValueError('{} do not support instruct inference'.format(self.model_dir)) 77 | instruct_text = self.frontend.text_normalize(instruct_text, split=False) 78 | tts_speeches = [] 79 | for i in self.frontend.text_normalize(tts_text, split=True): 80 | model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) 81 | model_output = self.model.inference(**model_input) 82 | tts_speeches.append(model_output['tts_speech']) 83 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 84 | -------------------------------------------------------------------------------- /src/models/src_glm4/cosyvoice/utils/frontend_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache 
License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') 17 | 18 | # whether contain chinese character 19 | def contains_chinese(text): 20 | return bool(chinese_char_pattern.search(text)) 21 | 22 | 23 | # replace special symbol 24 | def replace_corner_mark(text): 25 | text = text.replace('²', '平方') 26 | text = text.replace('³', '立方') 27 | return text 28 | 29 | 30 | # remove meaningless symbol 31 | def remove_bracket(text): 32 | text = text.replace('(', '').replace(')', '') 33 | text = text.replace('【', '').replace('】', '') 34 | text = text.replace('`', '').replace('`', '') 35 | text = text.replace("——", " ") 36 | return text 37 | 38 | 39 | # spell Arabic numerals 40 | def spell_out_number(text: str, inflect_parser): 41 | new_text = [] 42 | st = None 43 | for i, c in enumerate(text): 44 | if not c.isdigit(): 45 | if st is not None: 46 | num_str = inflect_parser.number_to_words(text[st: i]) 47 | new_text.append(num_str) 48 | st = None 49 | new_text.append(c) 50 | else: 51 | if st is None: 52 | st = i 53 | if st is not None and st < len(text): 54 | num_str = inflect_parser.number_to_words(text[st:]) 55 | new_text.append(num_str) 56 | return ''.join(new_text) 57 | 58 | 59 | # split paragrah logic: 60 | # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len 61 | # 2. cal sentence len according to lang 62 | # 3. 
split sentence according to puncatation 63 | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): 64 | def calc_utt_length(_text: str): 65 | if lang == "zh": 66 | return len(_text) 67 | else: 68 | return len(tokenize(_text)) 69 | 70 | def should_merge(_text: str): 71 | if lang == "zh": 72 | return len(_text) < merge_len 73 | else: 74 | return len(tokenize(_text)) < merge_len 75 | 76 | if lang == "zh": 77 | pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] 78 | else: 79 | pounc = ['.', '?', '!', ';', ':'] 80 | if comma_split: 81 | pounc.extend([',', ',']) 82 | st = 0 83 | utts = [] 84 | for i, c in enumerate(text): 85 | if c in pounc: 86 | if len(text[st: i]) > 0: 87 | utts.append(text[st: i] + c) 88 | if i + 1 < len(text) and text[i + 1] in ['"', '”']: 89 | tmp = utts.pop(-1) 90 | utts.append(tmp + text[i + 1]) 91 | st = i + 2 92 | else: 93 | st = i + 1 94 | if len(utts) == 0: 95 | if lang == "zh": 96 | utts.append(text + '。') 97 | else: 98 | utts.append(text + '.') 99 | final_utts = [] 100 | cur_utt = "" 101 | for utt in utts: 102 | if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: 103 | final_utts.append(cur_utt) 104 | cur_utt = "" 105 | cur_utt = cur_utt + utt 106 | if len(cur_utt) > 0: 107 | if should_merge(cur_utt) and len(final_utts) != 0: 108 | final_utts[-1] = final_utts[-1] + cur_utt 109 | else: 110 | final_utts.append(cur_utt) 111 | 112 | return final_utts 113 | 114 | 115 | # remove blank between chinese character 116 | def replace_blank(text: str): 117 | out_str = [] 118 | for i, c in enumerate(text): 119 | if c == " ": 120 | if ((text[i + 1].isascii() and text[i + 1] != " ") and 121 | (text[i - 1].isascii() and text[i - 1] != " ")): 122 | out_str.append(c) 123 | else: 124 | out_str.append(c) 125 | return "".join(out_str) 126 | --------------------------------------------------------------------------------
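A small end-to-end sketch (not a file in this repository) of how the text helpers in
frontend_utils.py above are typically combined; the whitespace tokenizer and the import
path are assumptions based on the tree layout:

from src.models.src_glm4.cosyvoice.utils.frontend_utils import (
    contains_chinese, remove_bracket, replace_blank, split_paragraph)


def normalize_and_split(text: str):
    tokenize = str.split  # stand-in; the real frontend passes its own tokenizer
    lang = "zh" if contains_chinese(text) else "en"
    text = remove_bracket(replace_blank(text))
    return split_paragraph(text, tokenize, lang=lang, token_max_n=80,
                           token_min_n=60, merge_len=20, comma_split=False)


if __name__ == "__main__":
    print(normalize_and_split("今天天气不错。我们去公园散步吧!之后再喝一杯咖啡。"))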