├── .gitignore ├── .pre-commit-config.yaml ├── FAQ.md ├── LICENSE ├── README.md ├── README_en.md ├── assets ├── audio_understanding_leaderboard.png ├── dataset_distribute.png ├── default.wav ├── img_1.png ├── leaderboard.md ├── logo.png ├── performance.png ├── s2s_leaderboard.png ├── s2s_semantic_leaderboard.png └── utmos.png ├── audio_evals ├── __init__.py ├── agg │ ├── __init__.py │ ├── air_chat.py │ └── base.py ├── base.py ├── constants.py ├── dataset │ ├── __init__.py │ ├── dataset.py │ ├── giga.py │ ├── huggingface.py │ └── resume.py ├── eval_task.py ├── evaluator │ ├── __init__.py │ ├── air_chat.py │ ├── alpaca_eval.py │ ├── alpaca_eval.txt │ ├── base.py │ ├── bbh.py │ ├── bleu.py │ ├── coco.py │ ├── dict_match.py │ ├── dnsmos.py │ ├── ensemble.py │ ├── harm.py │ ├── ifeval.py │ ├── mcq.py │ ├── qa_eval.py │ ├── qa_exact_match.py │ ├── ref_qa_geval.py │ ├── ref_qa_geval.txt │ ├── simo.py │ ├── string_match.py │ ├── utmos.py │ ├── voice_bench.py │ └── wer.py ├── isolate.py ├── lib │ ├── DNSMOS │ │ ├── README.md │ │ ├── dnsmos_single.py │ │ ├── main.py │ │ └── requirements.txt │ ├── SenseVoice │ │ ├── main.py │ │ └── requirements.txt │ ├── Spark-TTS │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── cli │ │ │ ├── SparkTTS.py │ │ │ └── inference.py │ │ ├── encodec.py │ │ ├── example │ │ │ └── infer.sh │ │ ├── main.py │ │ ├── requirements.txt │ │ ├── sparktts │ │ │ ├── models │ │ │ │ ├── audio_tokenizer.py │ │ │ │ └── bicodec.py │ │ │ ├── modules │ │ │ │ ├── blocks │ │ │ │ │ ├── layers.py │ │ │ │ │ ├── samper.py │ │ │ │ │ └── vocos.py │ │ │ │ ├── encoder_decoder │ │ │ │ │ ├── feat_decoder.py │ │ │ │ │ ├── feat_encoder.py │ │ │ │ │ └── wave_generator.py │ │ │ │ ├── fsq │ │ │ │ │ ├── finite_scalar_quantization.py │ │ │ │ │ └── residual_fsq.py │ │ │ │ ├── speaker │ │ │ │ │ ├── ecapa_tdnn.py │ │ │ │ │ ├── perceiver_encoder.py │ │ │ │ │ ├── pooling_layers.py │ │ │ │ │ └── speaker_encoder.py │ │ │ │ └── vq │ │ │ │ │ └── factorized_vector_quantize.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── audio.py │ │ │ │ ├── file.py │ │ │ │ ├── parse_options.sh │ │ │ │ └── token_parser.py │ │ ├── src │ │ │ ├── figures │ │ │ │ ├── gradio_TTS.png │ │ │ │ ├── gradio_control.png │ │ │ │ ├── infer_control.png │ │ │ │ └── infer_voice_cloning.png │ │ │ └── logo │ │ │ │ ├── HKUST.jpg │ │ │ │ ├── NPU.jpg │ │ │ │ ├── NTU.jpg │ │ │ │ ├── SJU.jpg │ │ │ │ ├── SparkAudio.jpg │ │ │ │ ├── SparkAudio2.jpg │ │ │ │ ├── SparkTTS.jpg │ │ │ │ ├── SparkTTS.png │ │ │ │ ├── mobvoi.jpg │ │ │ │ └── mobvoi.png │ │ └── webui.py │ ├── WavTokenizer │ │ ├── LICENSE │ │ ├── README.md │ │ ├── configs │ │ │ ├── wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml │ │ │ └── wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml │ │ ├── data │ │ │ └── demo.txt │ │ ├── decoder │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ ├── discriminator_dac.py │ │ │ ├── discriminators.py │ │ │ ├── experiment.py │ │ │ ├── feature_extractors.py │ │ │ ├── heads.py │ │ │ ├── helpers.py │ │ │ ├── loss.py │ │ │ ├── models.py │ │ │ ├── modules.py │ │ │ ├── pretrained.py │ │ │ ├── pretrained_model.py │ │ │ └── spectral_ops.py │ │ ├── encoder │ │ │ ├── __init__.py │ │ │ ├── distrib.py │ │ │ ├── model.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── conv.py │ │ │ │ ├── lstm.py │ │ │ │ ├── norm.py │ │ │ │ ├── seanet.py │ │ │ │ └── transformer.py │ │ │ ├── msstftd.py │ │ │ ├── quantization │ │ │ │ ├── __init__.py │ │ │ │ ├── ac.py │ │ │ │ ├── core_vq.py │ │ │ │ └── vq.py │ │ │ └── utils.py │ │ ├── 
infer.py │ │ ├── metrics │ │ │ ├── UTMOS.py │ │ │ ├── infer.py │ │ │ └── periodicity.py │ │ ├── requirements.txt │ │ ├── result.png │ │ └── train.py │ ├── __init__.py │ ├── chattts.py │ ├── coco.py │ ├── cpm_tts │ │ ├── __init__.py │ │ ├── chattts.py │ │ ├── config.py │ │ ├── dvae.py │ │ ├── gpt.py │ │ ├── minicpmv26_resampler.py │ │ └── processor.py │ ├── doubao │ │ ├── simplex_websocket_demo.py │ │ └── stream_asr.py │ ├── encodec │ │ ├── main.py │ │ └── requirements.txt │ ├── evaluate_tokenizer.py │ ├── mimi │ │ ├── main.py │ │ ├── requirements.txt │ │ └── stream.py │ ├── minicpm │ │ ├── main.py │ │ └── requirements.txt │ ├── minicpm_0_5B │ │ ├── main.py │ │ └── requirements.txt │ ├── paraformer │ │ ├── main.py │ │ └── requirements.txt │ ├── qwen2-5omni │ │ ├── main.py │ │ └── requirements.txt │ ├── sensevoicelib.py │ ├── simo │ │ ├── models_ecapa_tdnn.py │ │ ├── requirements.txt │ │ └── simo.py │ ├── ssnact │ │ └── ssnact.py │ ├── streaming_asr_demo.py │ ├── text_normalization │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── cn_tn.py │ │ ├── en.py │ │ └── english.json │ ├── utmos │ │ ├── lightning_module.py │ │ ├── main.py │ │ ├── model.py │ │ └── requirements.txt │ ├── wer.py │ └── whisper │ │ ├── main.py │ │ └── requirements.txt ├── main.py ├── models │ ├── AudioEncoder │ │ ├── __init__.py │ │ ├── chattts.py │ │ ├── cosyvoice.py │ │ ├── cosyvoice_adv.py │ │ ├── encodec.py │ │ ├── mimi.py │ │ ├── spark.py │ │ ├── vocos_encode.py │ │ └── wav_tokenizer.py │ ├── TTS │ │ ├── __init__.py │ │ ├── amphion.py │ │ ├── indextts.py │ │ ├── megatts.py │ │ ├── melotts.py │ │ ├── spark.py │ │ └── stabletts.py │ ├── UltraVOX.py │ ├── __init__.py │ ├── ali.py │ ├── asr │ │ ├── __init__.py │ │ ├── ali.py │ │ ├── baidu.py │ │ ├── fireredasr.py │ │ ├── huawei.py │ │ ├── huoshan.py │ │ ├── paraformer.py │ │ ├── sensevoice.py │ │ ├── sherpa.py │ │ ├── tencent.py │ │ └── xfyun.py │ ├── bytedance │ │ ├── __init__.py │ │ └── doubao.py │ ├── dnsmos.py │ ├── glm4audio.py │ ├── glm4voice.py │ ├── google.py │ ├── llama_omni.py │ ├── llmcenter.py │ ├── mini_cpm.py │ ├── mini_omni.py │ ├── model.py │ ├── moonshot.py │ ├── offline_model.py │ ├── ola.py │ ├── openai.py │ ├── openai_realtime.py │ ├── qwen.py │ ├── qwen2_5.py │ ├── sp_gemini.py │ ├── step_audio.py │ ├── utmos.py │ ├── wavlm.py │ └── whisper.py ├── process │ ├── __init__.py │ ├── base.py │ ├── eliminate.py │ ├── firstoption.py │ ├── normalization.py │ ├── qwen.py │ └── speech.py ├── prompt │ ├── __init__.py │ └── base.py ├── recorder.py ├── registry.py └── utils.py ├── docs ├── Procedures for Restarting an Incomplete Evaluation.md ├── how add a dataset.md ├── how eval your model.md ├── how launch a custom eval task.md └── how use UTMOS, DNSMOS eval speech quality.md ├── registry ├── agg │ ├── air-bench.yaml │ └── naive.yaml ├── dataset │ ├── AudioCaps.yaml │ ├── COVID-recognizer.yaml │ ├── CatDog.yaml │ ├── ClothoAQA.yaml │ ├── CommonVoice.yaml │ ├── DESEDpublic_eval.yaml │ ├── GTZAN.yaml │ ├── GigaSpeech.yaml │ ├── KeSpeech.yaml │ ├── MELD.yaml │ ├── MMAU.yaml │ ├── Nsynth.yaml │ ├── RAVDESS.yaml │ ├── RespiratorySound.yaml │ ├── TESS.yaml │ ├── VSC.yaml │ ├── VoxCeleb.yaml │ ├── WavCaps.yaml │ ├── WenetSpeech.yaml │ ├── air.yaml │ ├── aishell.yaml │ ├── alpaca_eval.yaml │ ├── audio-MNIST.yaml │ ├── chord_recoganition.yaml │ ├── covost2.yaml │ ├── fleurs.yaml │ ├── heart_beat.yaml │ ├── librispeech.yaml │ ├── llama_questions.yaml │ ├── multilingual_librispeech.yaml │ ├── peoples_speech.yaml │ ├── sample.yaml │ ├── tedlium.yaml │ ├── triviaqa.yaml │ ├── 
voxpopuli.yaml │ └── webQ.yaml ├── eval_task │ ├── acoustics.yaml │ ├── air.yaml │ ├── alpaca.yaml │ ├── aqa.yaml │ ├── asr.yaml │ ├── caption.yaml │ ├── digit.yaml │ ├── emo.yaml │ ├── gender.yaml │ ├── inference.yaml │ ├── medicine.yaml │ ├── music.yaml │ ├── sound_identify.yaml │ ├── stt.yaml │ └── vsc.yaml ├── evaluator │ ├── air-bench.yaml │ ├── alpaca.yaml │ ├── choice-with-ans.yaml │ ├── common.yaml │ ├── dnsmos.yaml │ ├── llama-speech.yaml │ ├── qa.yaml │ ├── simo.yaml │ ├── speech_qulity.yaml │ └── utmos.yaml ├── model │ ├── ali.yaml │ ├── dnsmos.yaml │ ├── gemini.yaml │ ├── minicpmo.yaml │ ├── moonshot.yaml │ ├── offline.yaml │ ├── ola.yaml │ ├── paraformer.yaml │ ├── qwen2.5.yaml │ ├── speechLLM.yaml │ ├── step.yaml │ ├── tencent.yaml │ ├── ultravox.yaml │ ├── utmos.yaml │ └── wavlm.yaml ├── process │ ├── base.yaml │ ├── choice.yaml │ └── speech_model_output.yaml ├── prompt │ ├── 3o.yaml │ ├── aqa.yaml │ ├── asr.yaml │ ├── caption.yaml │ ├── chatbot.yaml │ ├── choice.yaml │ ├── digit.yaml │ ├── emotion_anlysis.yaml │ ├── gender_anlysis.yaml │ ├── geval.yaml │ ├── kimi-audio.yaml │ ├── medicine.yaml │ ├── mini-cpm-omni.yaml │ ├── music.yaml │ ├── ola.yaml │ ├── qa.yaml │ ├── qwen-audio-pretrain.yaml │ ├── qwen-omni.yaml │ ├── qwen2-audio-pretrain.yaml │ ├── sound_identify.yaml │ ├── stt.yaml │ └── whisper-pretrain.yaml └── recorder │ └── local.yaml ├── requirments-offline-model.txt ├── requirments.txt ├── requirments └── minicpm_o2_6.txt └── tests ├── test_audio_evals_registry.py └── test_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | cremote_registry/ 2 | cyb_registry/ 3 | temp* 4 | # 忽略操作系统生成的文件 5 | init_model/ 6 | anna/ 7 | .vscode/ 8 | envs/ 9 | env/ 10 | tests/* 11 | *.DS_Store 12 | .DS_Store 13 | ._* 14 | *.wav 15 | 16 | local_registry/ 17 | cyb_dev_registry/ 18 | res/ 19 | log/ 20 | script/ 21 | raw_data/ 22 | tmp/ 23 | synthetic_data/ 24 | .run/ 25 | 26 | *.xlsx 27 | 28 | 29 | # 忽略编辑器和IDE生成的文件 30 | .idea/ 31 | *.sublime-workspace 32 | *.sublime-project 33 | *.swp 34 | *.swo 35 | 36 | # 忽略构建和编译生成的文件 37 | __pycache__/ 38 | *.pyc 39 | *.pyo 40 | *.pyd 41 | dist/ 42 | build/ 43 | *.egg-info/ 44 | node_modules/ 45 | *.log 46 | *.tmp 47 | *.bak 48 | *.swp 49 | 50 | # 忽略配置文件 51 | *.env 52 | 53 | # 忽略数据库文件 54 | *.sqlite 55 | *.sqlite3 56 | *.db 57 | 58 | # 忽略压缩文件 59 | *.zip 60 | *.tar 61 | *.gz 62 | *.bz2 63 | *.7z 64 | 65 | # 忽略临时文件 66 | *.tmp 67 | *.temp 68 | 69 | 70 | # 忽略虚拟环境 71 | venv/ 72 | env/ 73 | 74 | # 忽略测试生成的文件 75 | coverage/ 76 | .coverage 77 | 78 | # 忽略其他可能的敏感信息 79 | *.pem 80 | *.key 81 | *.crt 82 | *.p12 83 | *.pfx 84 | *.der 85 | 86 | # 忽略其他可能的临时文件 87 | *.log 88 | *.out 89 | *.pid 90 | *.pid.lock 91 | 92 | # 忽略其他可能的缓存文件 93 | *.cache 94 | *.tmp 95 | 96 | # 忽略其他可能的备份文件 97 | *.bak 98 | *.old 99 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-yaml 7 | - id: end-of-file-fixer 8 | - id: requirements-txt-fixer 9 | - id: check-merge-conflict 10 | - id: fix-encoding-pragma 11 | args: ["--remove"] 12 | - id: mixed-line-ending 13 | args: ["--fix=lf"] 14 | - repo: https://github.com/psf/black 15 | rev: 24.4.2 16 | hooks: 17 | - id: black 18 | -------------------------------------------------------------------------------- /FAQ.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | ## 1. ./nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkAddData_12_1, version libnvJitLink.so.12 4 | 5 | ref: 6 | https://github.com/pytorch/pytorch/issues/111469 7 | 8 | Two solutions: 9 | - update your NVIDIA libraries to a version that matches your torch build 10 | - use the nvidia libraries bundled in your Python environment instead of the system ones, e.g.: `export LD_LIBRARY_PATH=$HOME/path/to/my/venv3115/lib64/ 11 | python3.11/site-packages/nvidia/nvjitlink/lib` or `export LD_LIBRARY_PATH=env/lib/python3.10/site-packages/nvidia/nvjitlink/lib` 12 | 13 | ## 2. ConnectionError: Couldn't reach 'TwinkStart/xx' on the Hub (LocalEntryNotFoundError) 14 | 15 | Make sure you can access the Hugging Face Hub; you may need to use a mirror or proxy: 16 | 17 | > export HF_ENDPOINT=https://hf-mirror.com 18 | 19 | 20 | ## 3. gigaspeech: 'NoneType' object is not callable 21 | 22 | GigaSpeech is not a directly accessible dataset; you need to request permission from the authors. 23 | https://huggingface.co/datasets/speechcolab/gigaspeech 24 | 25 | When you attempt to download it, you will encounter a login page. If you do not have permission, there will be an HF link prompting you to apply for access. 26 | 27 | If the above does not appear, enter the following code in the Python interactive shell: 28 | 29 | ```python 30 | from datasets import load_dataset 31 | gs_test = load_dataset("speechcolab/gigaspeech", "test") 32 | ``` 33 | 34 | If this code runs successfully, you can proceed with the evaluation. 35 | 36 | ## 4. The official evaluation prompts for MiniCPM-O 2.6 37 | (An example evaluation command using these prompts is sketched further below.) 38 | 1. ASR zh: --prompt mini-cpm-omni-asr-zh 39 | 2. ASR en: --prompt mini-cpm-omni-asr-en 40 | 3. AST 2zh: --prompt mini-cpm-omni-asr-zh 41 | 4. AST 2en: --prompt mini-cpm-omni-ast-en 42 | 5.
emotion analysis: --prompt mini-cpm-omni-emotion_analysis 43 | -------------------------------------------------------------------------------- /assets/audio_understanding_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/audio_understanding_leaderboard.png -------------------------------------------------------------------------------- /assets/dataset_distribute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/dataset_distribute.png -------------------------------------------------------------------------------- /assets/default.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/default.wav -------------------------------------------------------------------------------- /assets/img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/img_1.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/logo.png -------------------------------------------------------------------------------- /assets/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/performance.png -------------------------------------------------------------------------------- /assets/s2s_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/s2s_leaderboard.png -------------------------------------------------------------------------------- /assets/s2s_semantic_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/s2s_semantic_leaderboard.png -------------------------------------------------------------------------------- /assets/utmos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/utmos.png -------------------------------------------------------------------------------- /audio_evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/__init__.py -------------------------------------------------------------------------------- /audio_evals/agg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/agg/__init__.py 
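The prompt names listed in FAQ §4 above are meant to be passed to the evaluation entry point together with a registered dataset and model. Below is a minimal sketch of such a run; it assumes the `audio_evals/main.py` entry point shown in the tree above, and the dataset and model identifiers are placeholders — check the files under `registry/dataset/` and `registry/model/minicpmo.yaml` for the names that are actually registered.

```bash
# Hypothetical example: Chinese ASR with the official MiniCPM-O 2.6 prompt.
# "aishell" and "MiniCPM-o-2.6" are placeholder identifiers; substitute the
# names defined under registry/dataset/ and registry/model/minicpmo.yaml.
python audio_evals/main.py \
    --dataset aishell \
    --model MiniCPM-o-2.6 \
    --prompt mini-cpm-omni-asr-zh
```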
-------------------------------------------------------------------------------- /audio_evals/agg/air_chat.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | from audio_evals.agg.base import AggPolicy 3 | 4 | 5 | class AirChat(AggPolicy): 6 | 7 | def _agg(self, score_detail: List[Dict[str, any]]) -> Dict[str, float]: 8 | predl, refl = [item["pred_score"] for item in score_detail], [ 9 | item["ref_score"] for item in score_detail 10 | ] 11 | win_count = sum([1 for i in range(len(predl)) if predl[i] > refl[i]]) 12 | return { 13 | "win(%)": win_count / len(predl) * 100, 14 | "ref_score": sum(refl) / len(refl), 15 | "pred_score": sum(predl) / len(predl), 16 | } 17 | -------------------------------------------------------------------------------- /audio_evals/base.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Dict, List, Union 3 | 4 | 5 | class EarlyStop(Exception): 6 | pass 7 | 8 | 9 | """ 10 | request format 11 | eg1: how are you 12 | eg2: [{'role': 'user', 'content': 'how are you'}] 13 | eg3: [{'role': 'user', 'contents': [{'type':'text', 'content': 'how are you'}, {'type':'image', 'content': '/mnt/a.git'}]] 14 | """ 15 | PromptStruct = Union[str, Dict[str, any], List[Dict[str, Union[str, List[Dict[str, str]]]]]] 16 | 17 | ScoreUnit = Dict[str, Union[int, float]] 18 | 19 | 20 | @dataclass 21 | class EvalTaskCfg: 22 | dataset: str 23 | prompt: str 24 | model: str 25 | agg: str = "dump" 26 | evaluator: str = "dump" 27 | post_process: List[str] = field(default_factory=list) 28 | -------------------------------------------------------------------------------- /audio_evals/constants.py: -------------------------------------------------------------------------------- 1 | DEFAULT_MODEL_PATH = "init_model/" 2 | -------------------------------------------------------------------------------- /audio_evals/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/dataset/__init__.py -------------------------------------------------------------------------------- /audio_evals/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os.path 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Dict, Generator, List 5 | import pandas as pd 6 | from tqdm import tqdm 7 | 8 | tqdm.pandas() 9 | 10 | 11 | class Dataset(ABC): 12 | def __init__(self, default_task: str, ref_col: str, col_aliases=None): 13 | if col_aliases is None: 14 | col_aliases = {} 15 | self.col_aliases = col_aliases 16 | self.task_name = default_task 17 | self.ref_col = ref_col 18 | 19 | def reset_ref_col(self, ref_col: str): 20 | self.ref_col = ref_col 21 | 22 | @abstractmethod 23 | def load(self, limit=0) -> List[Dict[str, any]]: 24 | raise NotImplementedError() 25 | 26 | def resume_from(self, f_name: str): 27 | from audio_evals.dataset.resume import ResumeDataset 28 | 29 | return ResumeDataset(self, f_name) 30 | 31 | def load_inf_file(self, f_name: str): 32 | from audio_evals.dataset.resume import ResumeDataset 33 | 34 | return ResumeDataset(self, f_name, save_type=["prompt", "inference"]) 35 | 36 | 37 | class JsonlFile(Dataset): 38 | def __init__(self, f_name: str, default_task: str, ref_col: str, col_aliases=None): 39 | 
super().__init__(default_task, ref_col, col_aliases) 40 | self.f_name = f_name 41 | 42 | def add_col_alias(self, df): 43 | for k, v in self.col_aliases.items(): 44 | if v in df.columns: 45 | raise ValueError(f"Column alias {v} already exists in the dataframe") 46 | df[v] = df[k] 47 | return df 48 | 49 | def load(self, limit=0) -> List[Dict[str, any]]: 50 | df = pd.read_json(self.f_name, lines=True) 51 | if limit > 0: 52 | df = df[:limit] 53 | df = self.add_col_alias(df) 54 | return df.to_dict(orient="records") 55 | 56 | 57 | class RelativePath(JsonlFile): 58 | def __init__( 59 | self, 60 | f_name: str, 61 | default_task: str, 62 | ref_col: str, 63 | file_path_prefix: str, 64 | col_aliases=None, 65 | ): 66 | super().__init__(f_name, default_task, ref_col, col_aliases) 67 | if not file_path_prefix.endswith("/"): 68 | file_path_prefix += "/" 69 | self.file_path = file_path_prefix 70 | 71 | def load(self, limit=0) -> List[Dict[str, any]]: 72 | df = pd.read_json(self.f_name, lines=True) 73 | if limit > 0: 74 | df = df[:limit] 75 | 76 | def abs_path(x): 77 | temp = os.path.join(self.file_path, str(x)) 78 | if os.path.exists(temp) and os.path.isfile(temp): 79 | return temp 80 | return x 81 | 82 | for item in df.columns: 83 | df[item] = df[item].progress_apply(abs_path) 84 | df = self.add_col_alias(df) 85 | return df.to_dict(orient="records") 86 | -------------------------------------------------------------------------------- /audio_evals/dataset/giga.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Dict 3 | from audio_evals.dataset.huggingface import Huggingface, load_audio_hf_dataset 4 | from huggingface_hub import login 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | conversational_filler = [ 9 | "UH", 10 | "UHH", 11 | "UM", 12 | "EH", 13 | "MM", 14 | "HM", 15 | "AH", 16 | "HUH", 17 | "HA", 18 | "ER", 19 | "OOF", 20 | "HEE", 21 | "ACH", 22 | "EEE", 23 | "EW", 24 | ] 25 | unk_tags = ["", ""] 26 | gigaspeech_punctuations = [ 27 | "", 28 | "", 29 | "", 30 | "", 31 | ] 32 | gigaspeech_garbage_utterance_tags = ["", "", "", ""] 33 | non_scoring_words = ( 34 | conversational_filler 35 | + unk_tags 36 | + gigaspeech_punctuations 37 | + gigaspeech_garbage_utterance_tags 38 | ) 39 | 40 | 41 | def asr_text_post_processing(text): 42 | # 1. convert to uppercase 43 | text = text.upper() 44 | 45 | # 2. remove hyphen 46 | # "E-COMMERCE" -> "E COMMERCE", "STATE-OF-THE-ART" -> "STATE OF THE ART" 47 | text = text.replace("-", " ") 48 | 49 | # 3. remove non-scoring words from evaluation 50 | remaining_words = [] 51 | for word in text.split(): 52 | if word in non_scoring_words: 53 | continue 54 | remaining_words.append(word) 55 | 56 | return " ".join(remaining_words) 57 | 58 | 59 | class GigaSpeechDataset(Huggingface): 60 | def __init__(self, **kwargs): 61 | super().__init__(**kwargs) 62 | logger.info(f"very import!!! 
GigaSpeech need to login to huggingface hub") 63 | login() 64 | 65 | def load(self, limit=0) -> List[Dict[str, any]]: 66 | logger.info( 67 | "start load data, it will take a while for download dataset when first load dataset" 68 | ) 69 | raw = load_audio_hf_dataset( 70 | self.name, self.subset, self.split, self.local_path, self.col_aliases 71 | ) 72 | res = [] 73 | for item in raw: 74 | item["text"] = asr_text_post_processing(item["text"]) 75 | if item["text"]: 76 | res.append(item) 77 | if limit > 0: 78 | res = res[:limit] 79 | return res 80 | -------------------------------------------------------------------------------- /audio_evals/dataset/resume.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | from typing import List, Dict, Union 5 | 6 | from audio_evals.dataset.dataset import Dataset 7 | from tqdm import tqdm 8 | 9 | 10 | class ResumeDataset(Dataset): 11 | def __init__( 12 | self, 13 | raw_dataset: Union[str, Dataset], 14 | resume_file: str, 15 | save_type: List[str] = None, 16 | ): 17 | if isinstance(raw_dataset, str): 18 | from audio_evals.registry import registry 19 | 20 | raw_dataset = registry.get_dataset(raw_dataset) 21 | super().__init__( 22 | raw_dataset.task_name, raw_dataset.ref_col, raw_dataset.col_aliases 23 | ) 24 | self.raw_dataset = raw_dataset 25 | path, base_name = os.path.split(resume_file) 26 | base_name = "temp_{}".format(base_name) 27 | # in case resume file be delete before read 28 | temp_file = os.path.join(path, base_name) 29 | shutil.copy2(resume_file, temp_file) 30 | self.resume_file = temp_file 31 | self.save_type = save_type 32 | 33 | def load(self, limit=0) -> List[Dict[str, any]]: 34 | data = self.raw_dataset.load(limit) 35 | with open(self.resume_file, "r") as f: 36 | for line in tqdm(f): 37 | doc = json.loads(line) 38 | if doc["type"] == "error": 39 | continue 40 | if self.save_type is not None and doc["type"] not in self.save_type: 41 | continue 42 | idx = int(doc["id"]) 43 | if "eval_info" not in data[idx]: 44 | data[idx]["eval_info"] = {} 45 | data[idx]["eval_info"].update({doc["type"]: doc["data"]}) 46 | os.remove(self.resume_file) 47 | return data 48 | -------------------------------------------------------------------------------- /audio_evals/evaluator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/evaluator/__init__.py -------------------------------------------------------------------------------- /audio_evals/evaluator/air_chat.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | prompt = ( 7 | "You are a helpful and precise assistant for checking the quality of the answer.\n" 8 | "[Detailed Audio Description]\n{meta_info}\n[Question]\n{question}\n" 9 | "[The Start of Assistant 1s Answer]\n{label}\n[The End of Assistant 1s Answer]\n" 10 | "[The Start of Assistant 2s Answer]\n{pred}\n[The End of Assistant 2s Answer]\n[System]\n" 11 | "We would like to request your feedback on the performance of two AI assistants in response to the user question " 12 | "and audio description displayed above. AI assistants are provided with detailed audio descriptions and questions.\n" 13 | "Please rate the helpfulness, relevance, accuracy, and comprehensiveness of their responses. 
" 14 | "Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance. " 15 | "Please output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. " 16 | "The two scores are separated by a space." 17 | ) 18 | 19 | 20 | class AIRChatEvaluator(Evaluator): 21 | def __init__(self, model_name: str): 22 | self.model_name = model_name 23 | 24 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 25 | from audio_evals.registry import registry 26 | 27 | model = registry.get_model(self.model_name) 28 | p = prompt.format( 29 | meta_info=kwargs["meta_info"], 30 | question=kwargs["question"], 31 | label=label, 32 | pred=pred, 33 | ) 34 | res = model.inference(p) 35 | ref_score, pred_score = res.split(" ")[0], res.split(" ")[1] 36 | return { 37 | "pred_score": float(pred_score), 38 | "ref_score": float(ref_score), 39 | "pred": pred, 40 | "ref": label, 41 | } 42 | -------------------------------------------------------------------------------- /audio_evals/evaluator/alpaca_eval.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os.path 3 | import re 4 | from copy import deepcopy 5 | from typing import Dict 6 | import yaml 7 | import json 8 | from audio_evals.evaluator.base import Evaluator 9 | 10 | path = os.path.dirname(__file__) 11 | prompt = open(os.path.join(path, "alpaca_eval.txt"), "r").read() 12 | 13 | 14 | class AlpacaEvaluator(Evaluator): 15 | def __init__(self, model_name: str): 16 | self.model_name = model_name 17 | 18 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 19 | from audio_evals.registry import registry 20 | 21 | model = registry.get_model(self.model_name) 22 | 23 | p = deepcopy(prompt) 24 | for k, v in {"instruction": kwargs["instruction"], 25 | "output_1": pred, 26 | "output_2": label}.items(): 27 | p = p.replace(f"{{{k}}}", v) 28 | 29 | # with open("/Users/a1/project/alpaca_eval-main/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml", "r", encoding="utf-8") as f: 30 | # d = yaml.safe_load(f.read()) 31 | res = model.inference(p, temperature=0, maxTokens=100) 32 | 33 | # res_d = re.search(r"```json(.*?)```", res, re.DOTALL) 34 | # if res_d: 35 | # d = json.loads(res_d.group(1)) 36 | if res.startswith("```python"): 37 | res = res[9:-3].strip() 38 | elif res.startswith("```"): 39 | res = res[3:-3].strip() 40 | try: 41 | res = ast.literal_eval(res) 42 | if isinstance(res, dict): 43 | for k in res: 44 | res = res[k] 45 | break 46 | return { 47 | "acc": 1 if res[0]["model"] == "model_1" else 0, 48 | "pred": pred, 49 | "ref": label, 50 | } 51 | except Exception as e: 52 | print(f"output is {res}\nError: {e}") 53 | raise e 54 | 55 | 56 | class ChatbotEvaluator(Evaluator): 57 | def __init__(self, model_name: str): 58 | self.model_name = model_name 59 | 60 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 61 | from audio_evals.registry import registry 62 | 63 | model = registry.get_model(self.model_name) 64 | prompt = registry.get_prompt("chatbot-eval") 65 | 66 | p = prompt.load(instruction=kwargs["instruction"], response=pred) 67 | res = model.inference(p, temperature=0, maxTokens=2048) 68 | 69 | # res_d = re.search(r"```json(.*?)```", res, re.DOTALL) 70 | d = re.search(r'\[\[(\d+)\]\]', res) 71 | return { 72 | "geval": int(d.group(1)), 73 | "pred": pred, 74 | "ref": label, 75 | } -------------------------------------------------------------------------------- 
/audio_evals/evaluator/alpaca_eval.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>system 2 | You are a helpful assistant, that ranks models by the quality of their answers. 3 | <|im_end|> 4 | <|im_start|>user 5 | I want you to create a leaderboard of different of large-language models. To do so, I will give you the instructions (prompts) given to the models, and the responses of two models. Please rank the models based on which responses would be preferred by humans. All inputs and outputs should be python dictionaries. 6 | 7 | Here is the prompt: 8 | { 9 | "instruction": """{instruction}""", 10 | } 11 | 12 | Here are the outputs of the models: 13 | [ 14 | { 15 | "model": "model_1", 16 | "answer": """{output_1}""" 17 | }, 18 | { 19 | "model": "model_2", 20 | "answer": """{output_2}""" 21 | } 22 | ] 23 | 24 | Now please rank the models by the quality of their answers, so that the model with rank 1 has the best output. Then return a list of the model names and ranks, i.e., produce the following output: 25 | [ 26 | {'model': , 'rank': }, 27 | {'model': , 'rank': } 28 | ] 29 | 30 | Your response must be a valid Python dictionary and should contain nothing else because we will directly execute it in Python. Please provide the ranking that the majority of humans would give. 31 | <|im_end|> -------------------------------------------------------------------------------- /audio_evals/evaluator/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict 3 | 4 | 5 | class Evaluator(ABC): 6 | 7 | @abstractmethod 8 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 9 | raise NotImplementedError() 10 | 11 | def __call__(self, pred, ref, **kwargs) -> Dict[str, any]: 12 | res = {"pred": pred, "ref": ref} 13 | eval_kwargs = {k: v for k, v in kwargs.items() if k not in ["pred", "label"]} 14 | res.update(self._eval(pred, ref, **eval_kwargs)) 15 | return res 16 | 17 | 18 | class Dump(Evaluator): 19 | 20 | def _eval(self, pred, label, **kwargs): 21 | return {} 22 | 23 | 24 | class EM(Evaluator): 25 | 26 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 27 | if type(label) in [int, float]: 28 | try: 29 | pred, label = float(pred), float(label) 30 | except: 31 | return {"match": 0, "pred": pred, "ref": label} 32 | elif isinstance(label, str): 33 | pred, label = str(pred).strip(), label.strip() 34 | 35 | return {"match": 1 if pred == label else 0, "pred": pred, "ref": label} 36 | 37 | 38 | class ExistMatch(Evaluator): 39 | 40 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 41 | if isinstance(label, list): 42 | for item in label: 43 | ans = self._eval(pred, item, **kwargs) 44 | if ans["match"] == 1: 45 | return ans 46 | return {"match": 0, "pred": pred, "ref": label} 47 | 48 | if type(label) in [int, float]: 49 | pred, label = float(pred), float(label) 50 | elif isinstance(label, str): 51 | pred, label = str(pred).strip().lower(), label.strip().lower() 52 | 53 | match = 0 54 | if label in pred: 55 | match = 1 56 | 57 | return {"match": match, "pred": label if match else pred, "ref": label} 58 | 59 | 60 | class PrefixMatch(Evaluator): 61 | 62 | def __init__(self, ignore_case: bool = True): 63 | self.ignore_case = ignore_case 64 | 65 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 66 | if self.ignore_case: 67 | pred = pred.lower().strip() 68 | label = str(label).lower().strip() 69 | n = len(label) 70 | return { 71 | 
"match": 1 if pred[:n] == label else 0, 72 | "pred": pred[:n], 73 | "ref": label, 74 | } 75 | -------------------------------------------------------------------------------- /audio_evals/evaluator/bbh.py: -------------------------------------------------------------------------------- 1 | from .base import Evaluator 2 | from typing import Dict, List 3 | import json 4 | 5 | 6 | class BBH(Evaluator): 7 | def __init__(self, ignore_case: bool = True): 8 | self.ignore_case = ignore_case 9 | 10 | def _extract_answer(self, response: str) -> str: 11 | response = response.lower() if self.ignore_case else response 12 | 13 | # 尝试从 JSON 格式中提取答案 14 | try: 15 | data = json.loads(response) 16 | if isinstance(data, dict) and "answer" in data: 17 | return data["answer"] 18 | except: 19 | pass 20 | 21 | # 尝试从文本中提取答案 22 | for line in response.split("\n"): 23 | line = line.strip() 24 | if line.startswith("answer:") or line.startswith("Answer:"): 25 | return line.split(":", 1)[1].strip() 26 | if line.startswith("the answer is") or line.startswith("The answer is"): 27 | return line.split("is", 1)[1].strip() 28 | 29 | return None 30 | 31 | def _eval(self, pred: str, label: str, **kwargs) -> Dict[str, any]: 32 | pred = str(pred) 33 | label = str(label) 34 | 35 | if self.ignore_case: 36 | pred = pred.lower() 37 | label = label.lower() 38 | 39 | extracted_answer = self._extract_answer(pred) 40 | if extracted_answer is None: 41 | return {"match": 0, "pred": pred, "ref": label, "fail": 1} 42 | 43 | return { 44 | "match": 1 if extracted_answer == label else 0, 45 | "pred": extracted_answer, 46 | "ref": label, 47 | "fail": 0, 48 | } 49 | -------------------------------------------------------------------------------- /audio_evals/evaluator/bleu.py: -------------------------------------------------------------------------------- 1 | import sacrebleu 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | class BLEU(Evaluator): 7 | def __init__(self, lang: str = "13a"): 8 | self.lang = "13a" 9 | if lang == "zh": 10 | self.lang = "zh" 11 | elif lang == "ja": 12 | self.lang = "ja-mecab" 13 | 14 | def _eval(self, pred: str, label: str, **kwargs): 15 | res = sacrebleu.corpus_bleu([pred], [[label]], tokenize=self.lang) 16 | return {"bleu": res.score} 17 | -------------------------------------------------------------------------------- /audio_evals/evaluator/coco.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | from audio_evals.lib.coco import compute_caption 5 | 6 | 7 | class Coco(Evaluator): 8 | 9 | def _eval(self, pred: str, label: Union[str, List[str]], **kwargs): 10 | pred = str(pred) 11 | if isinstance(label, str): 12 | label = [label] 13 | return compute_caption([label], [pred]) 14 | -------------------------------------------------------------------------------- /audio_evals/evaluator/dict_match.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | class DictEM(Evaluator): 7 | 8 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 9 | assert isinstance(label, dict), "label must be dictionaries, but {}".format( 10 | type(label) 11 | ) 12 | return {"match": 1 if pred == label else 0, "pred": pred, "ref": label} 13 | -------------------------------------------------------------------------------- /audio_evals/evaluator/dnsmos.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict 3 | 4 | from audio_evals.evaluator.base import Evaluator 5 | 6 | 7 | class DNSMOS(Evaluator): 8 | def __init__(self, model_name: str = "DNSMOS"): 9 | from audio_evals.registry import registry 10 | 11 | self.model = registry.get_model(model_name) 12 | 13 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 14 | pred = {"audio": str(pred)} 15 | res = self.model.inference(pred) 16 | res = json.loads(res) 17 | res["pred"] = pred 18 | res["ref"] = label 19 | return res 20 | -------------------------------------------------------------------------------- /audio_evals/evaluator/ensemble.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | class Ensemble(Evaluator): 7 | def __init__(self, components: List[str]): 8 | from audio_evals.registry import registry 9 | 10 | self.es = [] 11 | for item in components: 12 | e = registry.get_evaluator(item) 13 | if e is None: 14 | raise ValueError(f"Invalid component: {item}") 15 | self.es.append(e) 16 | 17 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 18 | res = {} 19 | for e in self.es: 20 | res.update(e(pred, label, **kwargs)) 21 | return res 22 | -------------------------------------------------------------------------------- /audio_evals/evaluator/qa_eval.py: -------------------------------------------------------------------------------- 1 | from .base import Evaluator 2 | import numpy as np 3 | from typing import Dict, List 4 | from qa_metrics.pedant import PEDANT 5 | 6 | 7 | def majority_vote(scores: List[str]) -> bool: 8 | scores = [item.lower() for item in scores] 9 | final_answer = max(set(scores), key=scores.count) 10 | return True if final_answer == "yes" else False 11 | 12 | 13 | class QAEval(Evaluator): 14 | def __init__(self): 15 | self.pedant = PEDANT() 16 | 17 | def _eval(self, pred: str, label: str, **kwargs) -> Dict[str, any]: 18 | pred = str(pred) 19 | label = str(label) 20 | 21 | # 使用 PEDANT 进行评测 22 | panda_score = self.pedant.evaluate( 23 | [label.lower()], pred.lower(), kwargs.get("prompt", "").lower() 24 | ) 25 | 26 | # 使用多数投票机制 27 | gpt_score = majority_vote([pred]) 28 | 29 | return { 30 | "panda_score": panda_score * 100, 31 | "gpt_score": gpt_score * 100, 32 | "pred": pred, 33 | "ref": label, 34 | } 35 | -------------------------------------------------------------------------------- /audio_evals/evaluator/ref_qa_geval.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os.path 3 | import re 4 | from copy import deepcopy 5 | from typing import Dict 6 | import yaml 7 | import json 8 | from audio_evals.evaluator.base import Evaluator 9 | 10 | path = os.path.dirname(__file__) 11 | prompt = open(os.path.join(path, "ref_qa_geval.txt"), "r").read() 12 | 13 | 14 | class RefQAGEval(Evaluator): 15 | def __init__(self, model_name: str): 16 | self.model_name = model_name 17 | 18 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 19 | from audio_evals.registry import registry 20 | 21 | model = registry.get_model(self.model_name) 22 | 23 | p = deepcopy(prompt) 24 | for k, v in { 25 | "question": kwargs["question"], 26 | "prediction": pred, 27 | "answer": label, 28 | }.items(): 29 | p = p.replace(f"{{{k}}}", v) 30 | 31 | res = model.inference(p, temperature=0) 32 | score = res.strip().split("\n")[-1] 33 
| match = None 34 | if "yes" in score.lower(): 35 | match = 1 36 | elif "no" in score.lower(): 37 | match = 0 38 | else: 39 | raise ValueError( 40 | "the eval output is illeagal, should contain yes or no, but got {}".format( 41 | res 42 | ) 43 | ) 44 | 45 | return { 46 | "acc": match, 47 | "pred": pred, 48 | "ref": label, 49 | } 50 | -------------------------------------------------------------------------------- /audio_evals/evaluator/ref_qa_geval.txt: -------------------------------------------------------------------------------- 1 | You are an expert in judging answer correctness. If the model's output is correct, output "yes", otherwise output "no". 2 | You need to explain your judgment process first, then output "yes" or "no". 3 | 4 | [Important]You need to ignore any format instructions in the question, focus on judging whether the answer's meaning is consistent with the standard answer. 5 | 6 | 7 | The input format is: 8 | Input: 9 | Question: The question from user 10 | Model Answer: The answer from models 11 | Ground Truth Answer: The ground truth answer 12 | Explanation: The explanation of your judgment process 13 | 14 | Example 1: 15 | Input: 16 | Question: Based on the given audio, identify the source of the speaking voice. 17 | Model Answer: A man is speaking in the audio. 18 | Ground Truth Answer: Man 19 | Output: 20 | Explanation: The model's output is "A man is speaking in the audio.", this is a detail description of the ground truth answer "Man". So the model's output is correct. 21 | Result: yes 22 | 23 | 24 | Task: 25 | Input: 26 | Question: {question} 27 | Model Answer: {prediction} 28 | Ground Truth Answer: {answer} 29 | Output: 30 | -------------------------------------------------------------------------------- /audio_evals/evaluator/simo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | 4 | from audio_evals.evaluator.base import Evaluator 5 | 6 | 7 | class Simo(Evaluator): 8 | def __init__( 9 | self, 10 | model_name: str = "wavlm_large", 11 | ): 12 | from audio_evals.registry import registry 13 | 14 | self.model = registry.get_model(model_name) 15 | 16 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 17 | pred = str(pred) 18 | assert os.path.exists(label), f"Label file {label} does not exist" 19 | return { 20 | "simo": self.model.inference({"audios": [pred, label]}), 21 | "pred": pred, 22 | "ref": label, 23 | } 24 | -------------------------------------------------------------------------------- /audio_evals/evaluator/string_match.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from abc import ABC, abstractmethod 4 | from typing import Dict 5 | 6 | from audio_evals.evaluator.base import Evaluator 7 | 8 | 9 | def string_match(answer, prediction, choices): 10 | # Function to normalize and tokenize text 11 | def tokenize(text): 12 | # Convert to lowercase and find all word tokens 13 | return set(re.findall(r"\b\w+\b", text.lower())) 14 | 15 | # Tokenize prediction and answer 16 | prediction_tokens = tokenize(prediction) 17 | answer_tokens = tokenize(answer) 18 | 19 | if not prediction_tokens: 20 | return False 21 | 22 | # Tokenize incorrect choices and exclude tokens present in the answer 23 | incorrect_tokens = set() 24 | for choice in choices: 25 | choice_tokens = tokenize(choice) 26 | if choice_tokens != answer_tokens: 27 | incorrect_tokens.update(choice_tokens - answer_tokens) 28 | 29 | # Condition 1: 
All tokens of the answer are in the prediction 30 | cond1 = answer_tokens.issubset(prediction_tokens) 31 | 32 | # Condition 2: Prediction does not contain any tokens from incorrect choices (excluding shared words) 33 | cond2 = prediction_tokens.isdisjoint(incorrect_tokens) 34 | 35 | return cond1 and cond2 36 | 37 | 38 | class ChoiceStringMatch(Evaluator): 39 | 40 | def _eval(self, pred, label, choices, **kwargs) -> Dict[str, any]: 41 | pred = str(pred) 42 | match = string_match(label, pred, choices) 43 | return { 44 | "match": 1 if match else 0, 45 | "pred": label if match else pred, 46 | "ref": label, 47 | } 48 | -------------------------------------------------------------------------------- /audio_evals/evaluator/utmos.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | class UTMOS(Evaluator): 7 | def __init__(self, model_name: str = "utmos-en"): 8 | from audio_evals.registry import registry 9 | 10 | self.model = registry.get_model(model_name) 11 | 12 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 13 | pred = str(pred) 14 | return { 15 | "utmos": self.model.inference(pred), 16 | "pred": pred, 17 | "ref": label, 18 | } 19 | -------------------------------------------------------------------------------- /audio_evals/evaluator/wer.py: -------------------------------------------------------------------------------- 1 | from audio_evals.evaluator.base import Evaluator 2 | from audio_evals.lib.wer import compute_wer 3 | 4 | 5 | class WER(Evaluator): 6 | def __init__(self, ignore_case: bool = False, lang="en"): 7 | self.ignore_case = ignore_case 8 | self.lang = lang 9 | 10 | def _eval(self, pred: str, label: str, **kwargs): 11 | pred, label = str(pred), str(label) 12 | if self.ignore_case: 13 | pred, label = pred.lower(), label.lower() 14 | return { 15 | "wer%": compute_wer([label], [pred], language=self.lang) * 100, 16 | } 17 | 18 | 19 | class CER(Evaluator): 20 | def __init__(self, ignore_case: bool = False): 21 | self.ignore_case = ignore_case 22 | 23 | def _eval(self, pred: str, label: str, **kwargs): 24 | pred, label = str(pred), str(label) 25 | if self.ignore_case: 26 | pred, label = pred.lower(), label.lower() 27 | return {"cer%": compute_wer([label], [pred], language="zh") * 100} 28 | -------------------------------------------------------------------------------- /audio_evals/lib/DNSMOS/README.md: -------------------------------------------------------------------------------- 1 | # DNSMOS: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors 2 | 3 | Human subjective evaluation is the ”gold standard” to evaluate speech quality optimized for human perception. Perceptual objective metrics serve as a proxy for subjective scores. The conventional and widely used metrics require a reference clean speech signal, which is unavailable in real recordings. The no-reference approaches correlate poorly with human ratings and are not widely adopted in the research community. One of the biggest use cases of these perceptual objective metrics is to evaluate noise suppression algorithms. DNSMOS generalizes well in challenging test conditions with a high correlation to human ratings in stack ranking noise suppression methods. More details can be found in [DNSMOS paper](https://arxiv.org/pdf/2010.15258.pdf). 4 | 5 | ## Evaluation methodology: 6 | Use the **dnsmos_local.py** script. 7 | 1. 
To compute a personalized MOS score (where interfering speaker is penalized) provide the '-p' argument 8 | Ex: python dnsmos_local.py -t C:\temp\SampleClips -o sample.csv -p 9 | 2. To compute a regular MOS score omit the '-p' argument. 10 | Ex: python dnsmos_local.py -t C:\temp\SampleClips -o sample.csv 11 | 12 | ## Citation: 13 | If you have used the API for your research and development purpose, please cite the [DNSMOS paper](https://arxiv.org/pdf/2010.15258.pdf): 14 | ```BibTex 15 | @inproceedings{reddy2021dnsmos, 16 | title={Dnsmos: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors}, 17 | author={Reddy, Chandan KA and Gopal, Vishak and Cutler, Ross}, 18 | booktitle={ICASSP 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 19 | pages={6493--6497}, 20 | year={2021}, 21 | organization={IEEE} 22 | } 23 | ``` 24 | 25 | If you used DNSMOS P.835 please cite the [DNSMOS P.835](https://arxiv.org/pdf/2110.01763.pdf) paper: 26 | 27 | ```BibTex 28 | @inproceedings{reddy2022dnsmos, 29 | title={DNSMOS P.835: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors}, 30 | author={Reddy, Chandan KA and Gopal, Vishak and Cutler, Ross}, 31 | booktitle={ICASSP 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 32 | year={2022}, 33 | organization={IEEE} 34 | } 35 | ``` 36 | -------------------------------------------------------------------------------- /audio_evals/lib/DNSMOS/requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.1 2 | configparser==5.3.0 3 | librosa==0.8.1 4 | numpy==1.22.4 5 | onnxruntime==1.13.1 6 | -------------------------------------------------------------------------------- /audio_evals/lib/SenseVoice/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import select 5 | import sys 6 | from funasr import AutoModel 7 | from funasr.utils.postprocess_utils import rich_transcription_postprocess 8 | import torch 9 | 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument( 17 | "--path", type=str, required=True, help="Path to SenseVoice model" 18 | ) 19 | config = parser.parse_args() 20 | 21 | # Initialize model 22 | model = AutoModel( 23 | model=config.path, 24 | vad_model="fsmn-vad", 25 | vad_kwargs={"max_single_segment_time": 30000}, 26 | device="cuda" if torch.cuda.is_available() else "cpu", 27 | ) 28 | logger.info(f"Using SenseVoice model from: {config.path}") 29 | 30 | while True: 31 | try: 32 | prompt = input() 33 | anchor = prompt.find("->") 34 | if anchor == -1: 35 | print( 36 | "Error: Invalid conversation format, must contains ->, but {}".format( 37 | prompt 38 | ), 39 | flush=True, 40 | ) 41 | continue 42 | prefix = prompt[:anchor].strip() + "->" 43 | x = json.loads(prompt[anchor + 2 :]) 44 | 45 | # Process input 46 | res = model.generate( 47 | input=x["audio"], 48 | cache={}, 49 | language=x.get("language", "auto"), 50 | use_itn=True, 51 | batch_size_s=30000, 52 | merge_vad=True, 53 | merge_length_s=15, 54 | ) 55 | text = rich_transcription_postprocess(res[0]["text"]) 56 | while True: 57 | print(f"{prefix}{text}", flush=True) 58 | rlist, _, _ = select.select([sys.stdin], [], [], 1) 59 | if rlist: 60 | finish = sys.stdin.readline().strip() 
61 | if finish == "{}close".format(prefix): 62 | break 63 | print("not found close signal, will emit again", flush=True) 64 | 65 | except Exception as e: 66 | print(f"Error: {str(e)}", flush=True) 67 | -------------------------------------------------------------------------------- /audio_evals/lib/SenseVoice/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.111.1 2 | funasr 3 | funasr>=1.1.3 4 | gradio 5 | huggingface 6 | huggingface_hub 7 | modelscope 8 | numpy<=1.26.4 9 | torch<=2.3 10 | torchaudio 11 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/encodec.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tempfile 3 | 4 | import soundfile as sf 5 | from sparktts.models.audio_tokenizer import BiCodecTokenizer 6 | import torch 7 | import logging 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | device = "cuda" if torch.cuda.is_available() else "cpu" 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--path", type=str, required=True, help="Path to checkpoint file" 19 | ) 20 | config = parser.parse_args() 21 | 22 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 23 | logger.info(f"Using device: {device}") 24 | logger.info(f"Loading tokenizer from {config.path}") 25 | tokenizer = BiCodecTokenizer( 26 | model_dir=config.path, 27 | device=device, 28 | ) 29 | logger.info(f"successfully loaded tokenizer") 30 | 31 | while True: 32 | try: 33 | prompt = input() 34 | global_tokens, semantic_tokens = tokenizer.tokenize(prompt) 35 | wav_rec = tokenizer.detokenize(global_tokens.squeeze(0), semantic_tokens) 36 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 37 | sf.write(f.name, wav_rec, 16000) 38 | print("Result:" + f.name) 39 | except Exception as e: 40 | print("Error:{}".format(e)) 41 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/example/infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 SparkAudio 4 | # 2025 Xinsheng Wang (w.xinshawn@gmail.com) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | 19 | # Get the absolute path of the script's directory 20 | script_dir=$(dirname "$(realpath "$0")") 21 | 22 | # Get the root directory 23 | root_dir=$(dirname "$script_dir") 24 | 25 | # Set default parameters 26 | device=0 27 | save_dir='example/results' 28 | model_dir="pretrained_models/Spark-TTS-0.5B" 29 | text="身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。" 30 | prompt_text="吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。" 31 | prompt_speech_path="example/prompt_audio.wav" 32 | 33 | # Change directory to the root directory 34 | cd "$root_dir" || exit 35 | 36 | source sparktts/utils/parse_options.sh 37 | 38 | # Run inference 39 | python -m cli.inference \ 40 | --text "${text}" \ 41 | --device "${device}" \ 42 | --save_dir "${save_dir}" \ 43 | --model_dir "${model_dir}" \ 44 | --prompt_text "${prompt_text}" \ 45 | --prompt_speech_path "${prompt_speech_path}" 46 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | einops==0.8.1 2 | einx==0.3.0 3 | gradio==5.18.0 4 | numpy==2.2.3 5 | omegaconf==2.3.0 6 | packaging==24.2 7 | safetensors==0.5.2 8 | soundfile==0.12.1 9 | soxr==0.5.0.post1 10 | torch==2.5.1 11 | torchaudio==2.5.1 12 | tqdm==4.66.5 13 | transformers==4.46.2 14 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/sparktts/modules/blocks/layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 SparkAudio 2 | # 2025 Xinsheng Wang (w.xinshawn@gmail.com) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # Adapted from https://github.com/descriptinc/descript-audio-codec under the Apache License 2.0 17 | 18 | 19 | import torch 20 | import torch.nn as nn 21 | from torch.nn.utils import weight_norm 22 | 23 | 24 | def WNConv1d(*args, **kwargs): 25 | return weight_norm(nn.Conv1d(*args, **kwargs)) 26 | 27 | 28 | def WNConvTranspose1d(*args, **kwargs): 29 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 30 | 31 | 32 | # Scripting this brings model speed up 1.4x 33 | @torch.jit.script 34 | def snake(x, alpha): 35 | shape = x.shape 36 | x = x.reshape(shape[0], shape[1], -1) 37 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 38 | x = x.reshape(shape) 39 | return x 40 | 41 | 42 | class Snake1d(nn.Module): 43 | def __init__(self, channels): 44 | super().__init__() 45 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 46 | 47 | def forward(self, x): 48 | return snake(x, self.alpha) 49 | 50 | 51 | class ResidualUnit(nn.Module): 52 | def __init__(self, dim: int = 16, dilation: int = 1): 53 | super().__init__() 54 | pad = ((7 - 1) * dilation) // 2 55 | self.block = nn.Sequential( 56 | Snake1d(dim), 57 | WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad), 58 | Snake1d(dim), 59 | WNConv1d(dim, dim, kernel_size=1), 60 | ) 61 | 62 | def forward(self, x): 63 | y = self.block(x) 64 | pad = (x.shape[-1] - y.shape[-1]) // 2 65 | if pad > 0: 66 | x = x[..., pad:-pad] 67 | return x + y 68 | 69 | 70 | def init_weights(m): 71 | if isinstance(m, nn.Conv1d): 72 | nn.init.trunc_normal_(m.weight, std=0.02) 73 | nn.init.constant_(m.bias, 0) 74 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/sparktts/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/sparktts/utils/__init__.py -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/figures/gradio_TTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/figures/gradio_TTS.png -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/figures/gradio_control.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/figures/gradio_control.png -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/figures/infer_control.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/figures/infer_control.png -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/figures/infer_voice_cloning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/figures/infer_voice_cloning.png 
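A note on the Spark-TTS `encodec.py` script shown earlier under `audio_evals/lib/Spark-TTS/`: it is written as a long-lived worker that reads one audio path per line from stdin and answers on stdout with either `Result:<path to the reconstructed wav>` or `Error:<message>`; in this repo such workers are presumably driven by `audio_evals/isolate.py` (see the tree above). The sketch below is only an interactive smoke test under assumed paths — the checkpoint directory and the test wav are placeholders.

```bash
# Start the codec worker (placeholder checkpoint path). It keeps reading audio
# paths from stdin, one per line, and prints "Result:<reconstructed wav>" or
# "Error:<message>" for each; stop it with Ctrl-C.
python audio_evals/lib/Spark-TTS/encodec.py --path pretrained_models/Spark-TTS-0.5B
# then type a wav path at the prompt, e.g.:
#   assets/default.wav
```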
-------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/HKUST.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/HKUST.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/NPU.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/NPU.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/NTU.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/NTU.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SJU.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SJU.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SparkAudio.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SparkAudio.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SparkAudio2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SparkAudio2.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SparkTTS.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SparkTTS.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SparkTTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SparkTTS.png -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/mobvoi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/mobvoi.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/mobvoi.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/mobvoi.png -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 jishengpeng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/data/demo.txt: -------------------------------------------------------------------------------- 1 | ./example1.wav 2 | ./example2.wav 3 | ./example3.mp3 4 | ./example4.flac 5 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/decoder/__init__.py: -------------------------------------------------------------------------------- 1 | from decoder.pretrained import WavTokenizer 2 | 3 | 4 | __version__ = "0.0.3" 5 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/decoder/helpers.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import torch 4 | from matplotlib import pyplot as plt 5 | from pytorch_lightning import Callback 6 | 7 | matplotlib.use("Agg") 8 | 9 | 10 | def save_figure_to_numpy(fig: plt.Figure) -> np.ndarray: 11 | """ 12 | Save a matplotlib figure to a numpy array. 13 | 14 | Args: 15 | fig (Figure): Matplotlib figure object. 16 | 17 | Returns: 18 | ndarray: Numpy array representing the figure. 19 | """ 20 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") 21 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 22 | return data 23 | 24 | 25 | def plot_spectrogram_to_numpy(spectrogram: np.ndarray) -> np.ndarray: 26 | """ 27 | Plot a spectrogram and convert it to a numpy array. 28 | 29 | Args: 30 | spectrogram (ndarray): Spectrogram data. 31 | 32 | Returns: 33 | ndarray: Numpy array representing the plotted spectrogram. 
34 | """ 35 | spectrogram = spectrogram.astype(np.float32) 36 | fig, ax = plt.subplots(figsize=(12, 3)) 37 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 38 | plt.colorbar(im, ax=ax) 39 | plt.xlabel("Frames") 40 | plt.ylabel("Channels") 41 | plt.tight_layout() 42 | 43 | fig.canvas.draw() 44 | data = save_figure_to_numpy(fig) 45 | plt.close() 46 | return data 47 | 48 | 49 | class GradNormCallback(Callback): 50 | """ 51 | Callback to log the gradient norm. 52 | """ 53 | 54 | def on_after_backward(self, trainer, model): 55 | model.log("grad_norm", gradient_norm(model)) 56 | 57 | 58 | def gradient_norm(model: torch.nn.Module, norm_type: float = 2.0) -> torch.Tensor: 59 | """ 60 | Compute the gradient norm. 61 | 62 | Args: 63 | model (Module): PyTorch model. 64 | norm_type (float, optional): Type of the norm. Defaults to 2.0. 65 | 66 | Returns: 67 | Tensor: Gradient norm. 68 | """ 69 | grads = [p.grad for p in model.parameters() if p.grad is not None] 70 | total_norm = torch.norm( 71 | torch.stack([torch.norm(g.detach(), norm_type) for g in grads]), norm_type 72 | ) 73 | return total_norm 74 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # flake8: noqa 7 | 8 | """EnCodec neural audio codec.""" 9 | 10 | __version__ = "0.1.2a3" 11 | 12 | from .model import EncodecModel 13 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Torch modules.""" 8 | 9 | # flake8: noqa 10 | from .conv import ( 11 | pad1d, 12 | unpad1d, 13 | NormConv1d, 14 | NormConvTranspose1d, 15 | NormConv2d, 16 | NormConvTranspose2d, 17 | SConv1d, 18 | SConvTranspose1d, 19 | ) 20 | from .lstm import SLSTM 21 | from .seanet import SEANetEncoder, SEANetDecoder 22 | from .transformer import StreamingTransformerEncoder 23 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/modules/lstm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """LSTM layers module.""" 8 | 9 | from torch import nn 10 | 11 | 12 | class SLSTM(nn.Module): 13 | """ 14 | LSTM without worrying about the hidden state, nor the layout of the data. 15 | Expects input as convolutional layout. 
16 | """ 17 | 18 | def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True): 19 | super().__init__() 20 | self.skip = skip 21 | self.lstm = nn.LSTM(dimension, dimension, num_layers) 22 | 23 | # def forward(self, x): 24 | # x = x.permute(2, 0, 1) 25 | # y, _ = self.lstm(x) 26 | # if self.skip: 27 | # y = y + x 28 | # y = y.permute(1, 2, 0) 29 | # return y 30 | 31 | # 修改transpose顺序 32 | def forward(self, x): 33 | # # 插入reshape 34 | # x = x.reshape(x.shape) 35 | x1 = x.permute(2, 0, 1) 36 | y, _ = self.lstm(x1) 37 | y = y.permute(1, 2, 0) 38 | if self.skip: 39 | y = y + x 40 | return y 41 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/modules/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Normalization modules.""" 8 | 9 | import typing as tp 10 | 11 | import einops 12 | import torch 13 | from torch import nn 14 | 15 | 16 | class ConvLayerNorm(nn.LayerNorm): 17 | """ 18 | Convolution-friendly LayerNorm that moves channels to last dimensions 19 | before running the normalization and moves them back to original position right after. 20 | """ 21 | 22 | def __init__( 23 | self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs 24 | ): 25 | super().__init__(normalized_shape, **kwargs) 26 | 27 | def forward(self, x): 28 | x = einops.rearrange(x, "b ... t -> b t ...") 29 | x = super().forward(x) 30 | x = einops.rearrange(x, "b t ... -> b ... t") 31 | return 32 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # flake8: noqa 8 | from .vq import QuantizedResult, ResidualVectorQuantizer 9 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/infer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from encoder.utils import convert_audio 4 | import torchaudio 5 | import torch 6 | from decoder.pretrained import WavTokenizer 7 | 8 | import time 9 | 10 | import logging 11 | 12 | device1 = torch.device("cuda:0") 13 | # device2=torch.device('cpu') 14 | 15 | input_path = "./WavTokenizer/data/infer/lirbitts_testclean" 16 | out_folder = "./WavTokenizer/result/infer" 17 | # os.system("rm -r %s"%(out_folder)) 18 | # os.system("mkdir -p %s"%(out_folder)) 19 | # ll="libritts_testclean500_large" 20 | ll = "wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn_testclean_epoch34" 21 | 22 | tmptmp = out_folder + "/" + ll 23 | 24 | os.system("rm -r %s" % (tmptmp)) 25 | os.system("mkdir -p %s" % (tmptmp)) 26 | 27 | # 自己数据模型加载 28 | config_path = "./WavTokenizer/configs/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml" 29 | model_path = "./WavTokenizer/result/train/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn/lightning_logs/version_3/checkpoints/wavtokenizer_checkpoint_epoch=24_step=137150_val_loss=5.6731.ckpt" 30 | wavtokenizer = WavTokenizer.from_pretrained0802(config_path, model_path) 31 | wavtokenizer = wavtokenizer.to(device1) 32 | # wavtokenizer = wavtokenizer.to(device2) 33 | 34 | with open(input_path, "r") as fin: 35 | x = fin.readlines() 36 | 37 | x = [i.strip() for i in x] 38 | 39 | # 完成一些加速处理 40 | 41 | features_all = [] 42 | 43 | for i in range(len(x)): 44 | 45 | wav, sr = torchaudio.load(x[i]) 46 | # print("***:",x[i]) 47 | # wav = convert_audio(wav, sr, 24000, 1) # (1,131040) 48 | bandwidth_id = torch.tensor([0]) 49 | wav = wav.to(device1) 50 | print(i) 51 | 52 | features, discrete_code = wavtokenizer.encode_infer(wav, bandwidth_id=bandwidth_id) 53 | features_all.append(features) 54 | 55 | # wavtokenizer = wavtokenizer.to(device2) 56 | 57 | for i in range(len(x)): 58 | 59 | bandwidth_id = torch.tensor([0]) 60 | 61 | bandwidth_id = bandwidth_id.to(device1) 62 | 63 | print(i) 64 | audio_out = wavtokenizer.decode(features_all[i], bandwidth_id=bandwidth_id) 65 | # print(i,time.time()) 66 | # breakpoint() # (1, 131200) 67 | audio_path = out_folder + "/" + ll + "/" + x[i].split("/")[-1] 68 | # os.makedirs(out_folder + '/' + ll, exist_ok=True) 69 | torchaudio.save( 70 | audio_path, 71 | audio_out.cpu(), 72 | sample_rate=24000, 73 | encoding="PCM_S", 74 | bits_per_sample=16, 75 | ) 76 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/requirements.txt: -------------------------------------------------------------------------------- 1 | einops==0.6.1 2 | encodec==0.1.1 3 | fairseq 4 | huggingface_hub==0.23.0 5 | jsonargparse[signatures]>=4.15.2 6 | librosa 7 | matplotlib==3.7.1 8 | numpy==1.23.5 9 | pesq 10 | pytorch-lightning==1.8.6 11 | pyyaml==6.0 12 | scipy==1.10.1 13 | soundfile==0.12.1 14 | tensorboardX==2.6 15 | torch==2.0.0 16 | torchaudio==2.0.1 17 | torchcrepe 18 | transformers==4.28.1 19 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/result.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/WavTokenizer/result.png -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 4 | 5 | from pytorch_lightning.cli import LightningCLI, ArgsType 6 | 7 | 8 | def cli_main(args: ArgsType = None): 9 | # breakpoint() 10 | cli = LightningCLI(args=args) 11 | # breakpoint() 12 | cli.trainer.fit(model=cli.model, datamodule=cli.datamodule) 13 | 14 | 15 | if __name__ == "__main__": 16 | cli_main() 17 | -------------------------------------------------------------------------------- /audio_evals/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/__init__.py -------------------------------------------------------------------------------- /audio_evals/lib/cpm_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/cpm_tts/__init__.py -------------------------------------------------------------------------------- /audio_evals/lib/encodec/requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | numpy 3 | soundfile 4 | torch 5 | transformers 6 | -------------------------------------------------------------------------------- /audio_evals/lib/evaluate_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The OFA-Sys Team. All rights reserved. 2 | # This source code is licensed under the Apache 2.0 license 3 | # found in the LICENSE file in the root directory. 4 | 5 | import unicodedata 6 | 7 | from sacrebleu.tokenizers import TOKENIZERS 8 | 9 | 10 | class EvaluationTokenizer(object): 11 | """A generic evaluation-time tokenizer, which leverages built-in tokenizers 12 | in sacreBLEU (https://github.com/mjpost/sacrebleu). It additionally provides 13 | lowercasing, punctuation removal and character tokenization, which are 14 | applied after sacreBLEU tokenization. 15 | 16 | Args: 17 | tokenizer_type (str): the type of sacreBLEU tokenizer to apply. 18 | lowercase (bool): lowercase the text. 19 | punctuation_removal (bool): remove punctuation (based on unicode 20 | category) from text. 21 | character_tokenization (bool): tokenize the text to characters. 
22 | """ 23 | 24 | SPACE = chr(32) 25 | SPACE_ESCAPE = chr(9601) 26 | # ALL_TOKENIZER_TYPES = ChoiceEnum(["none", "13a", "intl", "zh", "ja-mecab"]) 27 | 28 | def __init__( 29 | self, 30 | tokenizer_type: str = "13a", 31 | lowercase: bool = False, 32 | punctuation_removal: bool = False, 33 | character_tokenization: bool = False, 34 | ): 35 | 36 | assert tokenizer_type in TOKENIZERS, f"{tokenizer_type}, {TOKENIZERS}" 37 | self.lowercase = lowercase 38 | self.punctuation_removal = punctuation_removal 39 | self.character_tokenization = character_tokenization 40 | self.tokenizer = TOKENIZERS[tokenizer_type] 41 | 42 | @classmethod 43 | def remove_punctuation(cls, sent: str): 44 | """Remove punctuation based on Unicode category.""" 45 | return cls.SPACE.join( 46 | t 47 | for t in sent.split(cls.SPACE) 48 | if not all(unicodedata.category(c)[0] == "P" for c in t) 49 | ) 50 | 51 | def tokenize(self, sent: str): 52 | tokenized = self.tokenizer()(sent) 53 | 54 | if self.punctuation_removal: 55 | tokenized = self.remove_punctuation(tokenized) 56 | 57 | if self.character_tokenization: 58 | tokenized = self.SPACE.join( 59 | list(tokenized.replace(self.SPACE, self.SPACE_ESCAPE)) 60 | ) 61 | 62 | if self.lowercase: 63 | tokenized = tokenized.lower() 64 | 65 | return tokenized 66 | -------------------------------------------------------------------------------- /audio_evals/lib/mimi/requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | numpy 3 | soundfile 4 | torch 5 | torchaudio 6 | transformers==4.49.0 7 | -------------------------------------------------------------------------------- /audio_evals/lib/minicpm/requirements.txt: -------------------------------------------------------------------------------- 1 | decord 2 | librosa==0.9.0 3 | moviepy 4 | numpy==1.26 5 | Pillow==10.1.0 6 | soundfile==0.12.1 7 | torch==2.2.0 8 | torchaudio==2.2.0 9 | torchvision==0.17.0 10 | transformers==4.44.2 11 | vector-quantize-pytorch==1.18.5 12 | vocos==0.1.0 13 | -------------------------------------------------------------------------------- /audio_evals/lib/minicpm_0_5B/requirements.txt: -------------------------------------------------------------------------------- 1 | decord 2 | librosa==0.9.0 3 | moviepy 4 | numpy==1.26 5 | Pillow==10.1.0 6 | soundfile==0.12.1 7 | torch==2.2.0 8 | torchaudio==2.2.0 9 | torchvision==0.17.0 10 | transformers==4.44.2 11 | vector-quantize-pytorch==1.18.5 12 | vocos==0.1.0 13 | -------------------------------------------------------------------------------- /audio_evals/lib/paraformer/requirements.txt: -------------------------------------------------------------------------------- 1 | funasr 2 | soundfile 3 | torch 4 | torchaudio 5 | -------------------------------------------------------------------------------- /audio_evals/lib/qwen2-5omni/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | flash-attn 3 | git+https://github.com/huggingface/transformers@3a1ead0aabed473eafe527915eea8c197d424356 4 | qwen-omni-utils[decord] 5 | torchvision 6 | -------------------------------------------------------------------------------- /audio_evals/lib/simo/requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | omegaconf 3 | pyyaml 4 | s3prl 5 | torch==2.2.0 6 | torchaudio 7 | tqdm 8 | transformers 9 | -------------------------------------------------------------------------------- 
/audio_evals/lib/text_normalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/text_normalization/__init__.py -------------------------------------------------------------------------------- /audio_evals/lib/text_normalization/basic.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | import regex 5 | 6 | # non-ASCII letters that are not separated by "NFKD" normalization 7 | ADDITIONAL_DIACRITICS = { 8 | "œ": "oe", 9 | "Œ": "OE", 10 | "ø": "o", 11 | "Ø": "O", 12 | "æ": "ae", 13 | "Æ": "AE", 14 | "ß": "ss", 15 | "ẞ": "SS", 16 | "đ": "d", 17 | "Đ": "D", 18 | "ð": "d", 19 | "Ð": "D", 20 | "þ": "th", 21 | "Þ": "th", 22 | "ł": "l", 23 | "Ł": "L", 24 | } 25 | 26 | 27 | def remove_symbols_and_diacritics(s: str, keep=""): 28 | """ 29 | Replace any other markers, symbols, and punctuations with a space, 30 | and drop any diacritics (category 'Mn' and some manual mappings) 31 | """ 32 | return "".join( 33 | ( 34 | c 35 | if c in keep 36 | else ( 37 | ADDITIONAL_DIACRITICS[c] 38 | if c in ADDITIONAL_DIACRITICS 39 | else ( 40 | "" 41 | if unicodedata.category(c) == "Mn" 42 | else " " if unicodedata.category(c)[0] in "MSP" else c 43 | ) 44 | ) 45 | ) 46 | for c in unicodedata.normalize("NFKD", s) 47 | ) 48 | 49 | 50 | def remove_symbols(s: str): 51 | """ 52 | Replace any other markers, symbols, punctuations with a space, keeping diacritics 53 | """ 54 | return "".join( 55 | " " if unicodedata.category(c)[0] in "MSP" else c 56 | for c in unicodedata.normalize("NFKC", s) 57 | ) 58 | 59 | 60 | class BasicTextNormalizer: 61 | def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): 62 | self.clean = ( 63 | remove_symbols_and_diacritics if remove_diacritics else remove_symbols 64 | ) 65 | self.split_letters = split_letters 66 | 67 | def __call__(self, s: str): 68 | s = s.lower() 69 | s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets 70 | s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis 71 | s = self.clean(s).lower() 72 | 73 | if self.split_letters: 74 | s = " ".join(regex.findall(r"\X", s, regex.U)) 75 | 76 | s = re.sub( 77 | r"\s+", " ", s 78 | ) # replace any successive whitespace characters with a space 79 | 80 | return s 81 | -------------------------------------------------------------------------------- /audio_evals/lib/utmos/lightning_module.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import torch 3 | import torch.nn as nn 4 | from model import load_ssl_model, DomainEmbedding, LDConditioner, Projection 5 | import os 6 | 7 | 8 | class BaselineLightningModule(pl.LightningModule): 9 | def __init__(self, cfg): 10 | super().__init__() 11 | self.cfg = cfg 12 | self.construct_model() 13 | self.save_hyperparameters() 14 | 15 | def construct_model(self, path=None): 16 | if path is None: 17 | assert ( 18 | os.environ.get("SSL_MODEL_PATH") is not None 19 | ), "SSL_MODEL_PATH is not set" 20 | path = os.environ.get("SSL_MODEL_PATH") 21 | 22 | self.feature_extractors = nn.ModuleList( 23 | [ 24 | load_ssl_model(cp_path=path), 25 | DomainEmbedding(3, 128), 26 | ] 27 | ) 28 | output_dim = sum( 29 | [ 30 | feature_extractor.get_output_dim() 31 | for feature_extractor in self.feature_extractors 32 | ] 33 | ) 34 | output_layers = [ 35 | 
LDConditioner(judge_dim=128, num_judges=3000, input_dim=output_dim) 36 | ] 37 | output_dim = output_layers[-1].get_output_dim() 38 | output_layers.append( 39 | Projection( 40 | hidden_dim=2048, 41 | activation=torch.nn.ReLU(), 42 | range_clipping=False, 43 | input_dim=output_dim, 44 | ) 45 | ) 46 | 47 | self.output_layers = nn.ModuleList(output_layers) 48 | 49 | def forward(self, inputs): 50 | outputs = {} 51 | for feature_extractor in self.feature_extractors: 52 | outputs.update(feature_extractor(inputs)) 53 | x = outputs 54 | for output_layer in self.output_layers: 55 | x = output_layer(x, inputs) 56 | return x 57 | -------------------------------------------------------------------------------- /audio_evals/lib/utmos/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.0.0 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | analytics-python==1.4.0 5 | antlr4-python3-runtime==4.8 6 | anyio==3.5.0 7 | asgiref==3.5.0 8 | async-timeout==4.0.2 9 | attrs==21.4.0 10 | backoff==1.10.0 11 | bcrypt==3.2.0 12 | bitarray==2.4.0 13 | cachetools==5.0.0 14 | certifi==2021.10.8 15 | cffi==1.15.0 16 | charset-normalizer==2.0.12 17 | click==8.0.4 18 | colorama==0.4.4 19 | cryptography==36.0.1 20 | cycler==0.11.0 21 | Cython==0.29.28 22 | fairseq @ git+https://github.com/pytorch/fairseq.git@d03f4e771484a433f025f47744017c2eb6e9c6bc 23 | fastapi==0.75.0 24 | ffmpy==0.3.0 25 | fonttools==4.30.0 26 | frozenlist==1.3.0 27 | fsspec==2022.2.0 28 | future==0.18.2 29 | google-auth==2.6.0 30 | google-auth-oauthlib==0.4.6 31 | gradio==2.8.10 32 | grpcio==1.44.0 33 | h11==0.13.0 34 | hydra-core==1.0.7 35 | idna==3.3 36 | importlib-metadata==4.11.3 37 | Jinja2==3.0.3 38 | kiwisolver==1.3.2 39 | linkify-it-py==1.0.3 40 | Markdown==3.3.6 41 | markdown-it-py==2.0.1 42 | MarkupSafe==2.1.0 43 | matplotlib==3.5.1 44 | mdit-py-plugins==0.3.0 45 | mdurl==0.1.0 46 | monotonic==1.6 47 | multidict==6.0.2 48 | numpy==1.22.3 49 | oauthlib==3.2.0 50 | omegaconf==2.0.6 51 | orjson==3.6.7 52 | packaging==21.3 53 | pandas==1.4.1 54 | paramiko==2.10.1 55 | Pillow==9.0.1 56 | protobuf==3.19.4 57 | pyasn1==0.4.8 58 | pyasn1-modules==0.2.8 59 | pycparser==2.21 60 | pycryptodome==3.14.1 61 | pydantic==1.9.0 62 | pyDeprecate==0.3.1 63 | pydub==0.25.1 64 | PyNaCl==1.5.0 65 | pyparsing==3.0.7 66 | python-dateutil==2.8.2 67 | python-multipart==0.0.5 68 | pytorch-lightning==1.5.10 69 | pytz==2021.3 70 | PyYAML==6.0 71 | regex==2022.3.2 72 | requests==2.27.1 73 | requests-oauthlib==1.3.1 74 | rsa==4.8 75 | sacrebleu==2.0.0 76 | six==1.16.0 77 | sniffio==1.2.0 78 | starlette==0.17.1 79 | tabulate==0.8.9 80 | tensorboard==2.8.0 81 | tensorboard-data-server==0.6.1 82 | tensorboard-plugin-wit==1.8.1 83 | torch==1.11.0 84 | torchaudio==0.11.0 85 | torchmetrics==0.7.2 86 | tqdm==4.63.0 87 | typing-extensions==4.1.1 88 | uc-micro-py==1.0.1 89 | urllib3==1.26.8 90 | uvicorn==0.17.6 91 | Werkzeug==2.0.3 92 | yarl==1.7.2 93 | zipp==3.7.0 94 | -------------------------------------------------------------------------------- /audio_evals/lib/wer.py: -------------------------------------------------------------------------------- 1 | import editdistance as ed 2 | import zhconv 3 | 4 | from audio_evals.lib.evaluate_tokenizer import EvaluationTokenizer 5 | from audio_evals.lib.text_normalization.basic import BasicTextNormalizer 6 | from audio_evals.lib.text_normalization.cn_tn import TextNorm 7 | from audio_evals.lib.text_normalization.en import EnglishTextNormalizer 8 | 9 | english_normalizer = 
EnglishTextNormalizer() 10 | chinese_normalizer = TextNorm( 11 | to_banjiao=False, 12 | to_upper=False, 13 | to_lower=False, 14 | remove_fillers=False, 15 | remove_erhua=False, 16 | check_chars=False, 17 | remove_space=False, 18 | cc_mode="", 19 | ) 20 | basic_normalizer = BasicTextNormalizer() 21 | 22 | 23 | def compute_wer(refs, hyps, language="en"): 24 | distance = 0 25 | ref_length = 0 26 | tokenizer = EvaluationTokenizer( 27 | tokenizer_type="none", 28 | lowercase=True, 29 | punctuation_removal=False, 30 | character_tokenization=False, 31 | ) 32 | for i in range(len(refs)): 33 | ref = refs[i] 34 | pred = hyps[i] 35 | 36 | ref = english_normalizer(ref) 37 | pred = english_normalizer(pred) 38 | if language in ["zh"]: 39 | ref = chinese_normalizer(ref) 40 | pred = chinese_normalizer(pred) 41 | if language in ["yue"]: 42 | ref = zhconv.convert(ref, "zh-cn") 43 | pred = zhconv.convert(pred, "zh-cn") 44 | 45 | ref_items = tokenizer.tokenize(ref).split() 46 | pred_items = tokenizer.tokenize(pred).split() 47 | 48 | if language in ["zh", "yue"]: 49 | ref_items = [x for x in "".join(ref_items)] 50 | pred_items = [x for x in "".join(pred_items)] 51 | if len(refs) > 1 and i == 0: 52 | print(f"ref: {ref}") 53 | print(f"pred: {pred}") 54 | print(f"ref_items:\n{ref_items}\n{len(ref_items)}\n{ref_items[0]}") 55 | print(f"pred_items:\n{pred_items}\n{len(pred_items)}\n{pred_items[0]}") 56 | distance += ed.eval(ref_items, pred_items) 57 | ref_length += len(ref_items) 58 | return distance / ref_length 59 | -------------------------------------------------------------------------------- /audio_evals/lib/whisper/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import select 5 | import sys 6 | import torch 7 | from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--path", type=str, required=True, help="Path to Whisper model") 15 | config = parser.parse_args() 16 | 17 | # Initialize model 18 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 19 | torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 20 | 21 | model = AutoModelForSpeechSeq2Seq.from_pretrained( 22 | config.path, 23 | torch_dtype=torch_dtype, 24 | low_cpu_mem_usage=True, 25 | use_safetensors=True, 26 | ).eval() 27 | model.to(device) 28 | 29 | processor = AutoProcessor.from_pretrained(config.path) 30 | logger.info(f"Using Whisper model from: {config.path} on device: {device}") 31 | pipe = pipeline( 32 | "automatic-speech-recognition", 33 | model=model, 34 | tokenizer=processor.tokenizer, 35 | feature_extractor=processor.feature_extractor, 36 | torch_dtype=torch_dtype, 37 | device=device, 38 | ) 39 | while True: 40 | try: 41 | prompt = input() 42 | anchor = prompt.find("->") 43 | if anchor == -1: 44 | print( 45 | "Error: Invalid conversation format, must contain '->', but got {}".format( 46 | prompt 47 | ), 48 | flush=True, 49 | ) 50 | continue 51 | prefix = prompt[:anchor].strip() + "->" 52 | x = json.loads(prompt[anchor + 2 :]) 53 | 54 | # Process input 55 | 56 | kwargs = x.pop("kwargs", {}) 57 | x.update(kwargs) 58 | if "return_timestamps" not in x: 59 | x["return_timestamps"] = True 60 | 61 | logger.info(f"Received input: {x}") 62 | 63 | result = pipe(x.pop("audio"), **x) 64 | retry = 3 65 | while retry: 66 |
print(f"{prefix}{result['text']}", flush=True) 67 | rlist, _, _ = select.select([sys.stdin], [], [], 1) 68 | if rlist: 69 | finish = sys.stdin.readline().strip() 70 | if finish == "{}close".format(prefix): 71 | break 72 | print("not found close signal, will emit again", flush=True) 73 | retry -= 1 74 | except Exception as e: 75 | print(f"Error: {str(e)}", flush=True) 76 | -------------------------------------------------------------------------------- /audio_evals/lib/whisper/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.26.0 2 | torchaudio==2.5.1 3 | transformers==4.49.0 4 | -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/models/AudioEncoder/__init__.py -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/chattts.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import tempfile 4 | from dataclasses import asdict 5 | import torch 6 | import soundfile as sf 7 | import librosa 8 | from typing import Dict 9 | from vocos import Vocos 10 | from vocos.pretrained import instantiate_class 11 | 12 | from audio_evals.base import PromptStruct 13 | from audio_evals.lib.chattts import VocosConfig, DVAEConfig, DVAE 14 | from audio_evals.models.model import Model 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class ChatTTSModel(Model): 21 | def __init__(self, model_path: str, sample_params: Dict[str, any] = None): 22 | super().__init__(True, sample_params) 23 | self.device = "cuda:0" if torch.cuda.is_available() else "cpu" 24 | 25 | vocos_ckpt_path = os.path.join(model_path, "Vocos.pt") 26 | dvae_ckpt_path = os.path.join(model_path, "DVAE_full.pt") 27 | 28 | vocos_config = VocosConfig() 29 | feature_extractor = instantiate_class( 30 | args=(), init=asdict(vocos_config.feature_extractor) 31 | ) 32 | backbone = instantiate_class(args=(), init=asdict(vocos_config.backbone)) 33 | head = instantiate_class(args=(), init=asdict(vocos_config.head)) 34 | vocos = ( 35 | Vocos(feature_extractor=feature_extractor, backbone=backbone, head=head) 36 | .to(self.device) 37 | .eval() 38 | ) 39 | vocos.load_state_dict(torch.load(vocos_ckpt_path)) 40 | self.vocos = vocos 41 | 42 | dvae_config = DVAEConfig() 43 | dvae = DVAE( 44 | decoder_config=asdict(dvae_config.decoder), 45 | encoder_config=asdict(dvae_config.encoder), 46 | vq_config=asdict(dvae_config.vq), 47 | dim=dvae_config.decoder.idim, 48 | coef=None, 49 | device=self.device, 50 | ) 51 | dvae.load_pretrained(dvae_ckpt_path, self.device) 52 | 53 | self.dvae = dvae.eval() 54 | logger.info("model loaded successfully") 55 | 56 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 57 | audio_path = prompt["audio"] 58 | logger.debug(f"Processing audio file: {audio_path}") 59 | 60 | y, sr = librosa.load(audio_path, sr=24000, mono=True) 61 | waveform = torch.tensor(y).to(self.device) 62 | x = self.dvae(waveform, "encode") 63 | reconstructed_mel = self.dvae(x, "decode") 64 | reconstructed_waveform = self.vocos.decode(reconstructed_mel).cpu().numpy() 65 | 66 | waveform_mono = reconstructed_waveform.squeeze() 67 | # 保存生成的音频到临时文件 68 | with tempfile.NamedTemporaryFile(suffix=".wav", 
delete=False) as f: 69 | sf.write(f.name, waveform_mono, samplerate=24000, subtype="PCM_16") 70 | return f.name 71 | -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/cosyvoice.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | sys.path.append("/DATA/disk1/home/shiqundong/project/CosyVoice/third_party/Matcha-TTS") 5 | sys.path.append( 6 | "/DATA/disk1/home/shiqundong/project/CosyVoice/env/lib/python3.10/site-packages/" 7 | ) 8 | sys.path.append("/DATA/disk1/home/shiqundong/project/CosyVoice") 9 | 10 | import logging 11 | import os 12 | import tempfile 13 | import torch 14 | import soundfile as sf 15 | import librosa 16 | from typing import Dict 17 | import s3tokenizer 18 | from audio_evals.base import PromptStruct 19 | from audio_evals.models.model import OfflineModel 20 | from cosyvoice.cli.cosyvoice import CosyVoice2 21 | from cosyvoice.utils.file_utils import load_wav 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class CosyVoiceEncoder(OfflineModel): 27 | def __init__(self, model_path: str, sample_params: Dict[str, any] = None): 28 | super().__init__(True, sample_params) 29 | self.device = "cuda:0" if torch.cuda.is_available() else "cpu" 30 | self.tokenizer = s3tokenizer.load_model("speech_tokenizer_v2_25hz") 31 | self.tokenizer.to(self.device) 32 | 33 | self.model = CosyVoice2(model_path, load_jit=False, load_trt=False, fp16=False) 34 | logger.info("model loaded successfully") 35 | 36 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 37 | audio_path = prompt["audio"] 38 | logger.debug(f"Processing audio file: {audio_path}") 39 | 40 | x = load_wav(audio_path, 16_000) 41 | mel = s3tokenizer.log_mel_spectrogram(x.squeeze(0)) 42 | mels, mels_lens = s3tokenizer.padding([mel]) 43 | audio_tokens = self.tokenizer.quantize( 44 | mels.to(self.device), mels_lens.to(self.device) 45 | )[0] 46 | 47 | waveform = x.to(self.device) 48 | sr = torch.tensor(self.model.sample_rate).to(self.device) 49 | model_input = self.model.frontend.frontend_token2wav(waveform, sr) 50 | wav_out = self.model.model.token2wav( 51 | token=audio_tokens, 52 | prompt_token=model_input["flow_prompt_speech_token"], 53 | prompt_feat=model_input["prompt_speech_feat"], 54 | embedding=model_input["flow_embedding"], 55 | uuid=None, 56 | token_offset=0, 57 | speed=1.0, 58 | ) 59 | wav_out = wav_out.squeeze() 60 | # 保存生成的音频到临时文件 61 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 62 | sf.write(f.name, wav_out.cpu().numpy(), samplerate=self.model.sample_rate) 63 | return f.name 64 | -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/vocos_encode.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tempfile 3 | 4 | import torch 5 | import soundfile as sf 6 | import librosa 7 | from typing import Dict 8 | from vocos import Vocos 9 | from vocos.pretrained import instantiate_class 10 | 11 | from audio_evals.base import PromptStruct 12 | from audio_evals.models.model import Model 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class VocosModel(Model): 18 | def __init__( 19 | self, 20 | model_path: str, 21 | feature_extractor: Dict[str, any], 22 | backbone: Dict[str, any], 23 | head: Dict[str, any], 24 | sample_params: Dict[str, any] = None, 25 | ): 26 | super().__init__(True, sample_params) # 作为非聊天模型 27 | 
self.device = "cuda:0" if torch.cuda.is_available() else "cpu" 28 | logger.info(f"Loading Vocos model from {model_path} to device {self.device}") 29 | feature_extractor = instantiate_class(args=(), init=feature_extractor) 30 | backbone = instantiate_class(args=(), init=backbone) 31 | head = instantiate_class(args=(), init=head) 32 | self.model = Vocos( 33 | feature_extractor=feature_extractor, backbone=backbone, head=head 34 | ) 35 | self.model.to(self.device) 36 | self.model.eval() 37 | self.model.load_state_dict(torch.load(model_path, weights_only=True, mmap=True)) 38 | logger.info("Vocos model loaded successfully") 39 | 40 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 41 | audio_path = prompt["audio"] 42 | logger.debug(f"Processing audio file: {audio_path}") 43 | y, sr = librosa.load(audio_path, sr=None) 44 | waveform = torch.tensor(y).unsqueeze(0).to(self.device) 45 | generated_audio = self.model(waveform) 46 | 47 | # 保存生成的音频到临时文件 48 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 49 | sf.write(f.name, generated_audio.squeeze().cpu().numpy(), samplerate=22050) 50 | return f.name 51 | -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/wav_tokenizer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("audio_evals/lib/WavTokenizer") 4 | import logging 5 | import os.path 6 | import tempfile 7 | 8 | 9 | from typing import Dict 10 | from audio_evals.lib.WavTokenizer.encoder.utils import convert_audio 11 | import torchaudio 12 | import torch 13 | from audio_evals.lib.WavTokenizer.decoder.pretrained import WavTokenizer 14 | from audio_evals.base import PromptStruct 15 | from audio_evals.models.model import OfflineModel 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class WavTokenizerEncoder(OfflineModel): 21 | 22 | def __init__( 23 | self, 24 | config_name: str, 25 | model_path: str, 26 | sample_params: Dict[str, any] = None, 27 | ): 28 | super().__init__(is_chat=True, sample_params=sample_params) 29 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 30 | self.config_name = os.path.join( 31 | "audio_evals/lib/WavTokenizer/configs", config_name 32 | ) 33 | 34 | logger.info(f"Loading WavTokenizer from {model_path}") 35 | self.model = WavTokenizer.from_pretrained0802(self.config_name, model_path) 36 | self.model = self.model.to(self.device) 37 | logger.info(f"WavTokenizer loaded on {self.device}") 38 | 39 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 40 | audio_path = prompt["audio"] 41 | 42 | wav, sr = torchaudio.load(audio_path) 43 | wav = convert_audio(wav, sr, 24000, 1) 44 | bandwidth_id = torch.tensor([0]) 45 | wav = wav.to(self.device) 46 | features, discrete_code = self.model.encode_infer( 47 | wav, bandwidth_id=bandwidth_id 48 | ) 49 | audio_out = self.model.decode( 50 | features, bandwidth_id=bandwidth_id.to(self.device) 51 | ) 52 | 53 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 54 | torchaudio.save( 55 | f.name, 56 | audio_out.cpu(), 57 | sample_rate=24000, 58 | encoding="PCM_S", 59 | bits_per_sample=16, 60 | ) 61 | return f.name 62 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/models/TTS/__init__.py -------------------------------------------------------------------------------- /audio_evals/models/TTS/amphion.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/Amphion/main.py") 13 | class Amphion(OfflineModel): 14 | def __init__(self, path: str, sample_params: Dict = None, *args, **kwargs): 15 | self.command_args = { 16 | "path": path, 17 | } 18 | super().__init__(is_chat=True, sample_params=sample_params) 19 | 20 | def _inference(self, prompt: PromptStruct, **kwargs): 21 | import uuid 22 | 23 | uid = str(uuid.uuid4()) 24 | prefix = f"{uid}->" 25 | 26 | while True: 27 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 28 | if wlist: 29 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 30 | self.process.stdin.flush() 31 | print("already write in") 32 | break 33 | while True: 34 | rlist, _, _ = select.select( 35 | [self.process.stdout, self.process.stderr], [], [], 60 36 | ) 37 | if not rlist: 38 | err_msg = "Read timeout after 60 seconds" 39 | logger.error(err_msg) 40 | raise RuntimeError(err_msg) 41 | 42 | try: 43 | for stream in rlist: 44 | if stream == self.process.stdout: 45 | result = self.process.stdout.readline().strip() 46 | if not result: 47 | continue 48 | if result.startswith(prefix): 49 | self.process.stdin.write("{}close\n".format(prefix)) 50 | self.process.stdin.flush() 51 | return result[len(prefix) :] 52 | elif result.startswith("Error:"): 53 | raise RuntimeError("Amphion failed: {}".format(result)) 54 | else: 55 | logger.info(result) 56 | elif stream == self.process.stderr: 57 | err = self.process.stderr.readline().strip() 58 | if err: 59 | logger.error(f"Process stderr: {err}") 60 | except BlockingIOError as e: 61 | logger.error(f"BlockingIOError occurred: {str(e)}") 62 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/megatts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/MegaTTS3/main.py") 13 | class MegaTTS(OfflineModel): 14 | def __init__(self, path: str, sample_params: Dict = None, *args, **kwargs): 15 | self.command_args = { 16 | "path": path, 17 | } 18 | super().__init__(is_chat=True, sample_params=sample_params) 19 | 20 | def _inference(self, prompt: PromptStruct, **kwargs): 21 | import uuid 22 | 23 | uid = str(uuid.uuid4()) 24 | prefix = f"{uid}->" 25 | 26 | while True: 27 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 28 | if wlist: 29 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 30 | self.process.stdin.flush() 31 | print("already write in") 32 | break 33 | while True: 34 | rlist, _, _ = select.select( 35 | [self.process.stdout, self.process.stderr], [], [], 60 36 | ) 37 | if not rlist: 38 | err_msg = "Read timeout after 60 
seconds" 39 | logger.error(err_msg) 40 | raise RuntimeError(err_msg) 41 | 42 | try: 43 | for stream in rlist: 44 | if stream == self.process.stdout: 45 | result = self.process.stdout.readline().strip() 46 | if not result: 47 | continue 48 | if result.startswith(prefix): 49 | self.process.stdin.write("{}close\n".format(prefix)) 50 | self.process.stdin.flush() 51 | return result[len(prefix) :] 52 | elif result.startswith("Error:"): 53 | raise RuntimeError("MegaTTS failed: {}".format(result)) 54 | else: 55 | logger.info(result) 56 | elif stream == self.process.stderr: 57 | err = self.process.stderr.readline().strip() 58 | if err: 59 | logger.error(f"Process stderr: {err}") 60 | except BlockingIOError as e: 61 | logger.error(f"BlockingIOError occurred: {str(e)}") 62 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/melotts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/MeloTTS/main.py") 13 | class MeloTTS(OfflineModel): 14 | def __init__( 15 | self, path: str, lang: str, sample_params: Dict = None, *args, **kwargs 16 | ): 17 | self.command_args = { 18 | "path": path, 19 | "lang": lang, 20 | } 21 | super().__init__(is_chat=True, sample_params=sample_params) 22 | 23 | def _inference(self, prompt: PromptStruct, **kwargs): 24 | import uuid 25 | 26 | uid = str(uuid.uuid4()) 27 | prefix = f"{uid}->" 28 | prompt.update(kwargs) 29 | 30 | while True: 31 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 32 | if wlist: 33 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 34 | self.process.stdin.flush() 35 | print("already write in") 36 | break 37 | while True: 38 | rlist, _, _ = select.select( 39 | [self.process.stdout, self.process.stderr], [], [], 60 40 | ) 41 | if not rlist: 42 | err_msg = "Read timeout after 60 seconds" 43 | logger.error(err_msg) 44 | raise RuntimeError(err_msg) 45 | 46 | try: 47 | for stream in rlist: 48 | if stream == self.process.stdout: 49 | result = self.process.stdout.readline().strip() 50 | if not result: 51 | continue 52 | if result.startswith(prefix): 53 | self.process.stdin.write("{}close\n".format(prefix)) 54 | self.process.stdin.flush() 55 | return result[len(prefix) :] 56 | elif result.startswith("Error:"): 57 | raise RuntimeError("MeloTTS failed: {}".format(result)) 58 | else: 59 | logger.info(result) 60 | elif stream == self.process.stderr: 61 | err = self.process.stderr.readline().strip() 62 | if err: 63 | logger.error(f"Process stderr: {err}") 64 | except BlockingIOError as e: 65 | logger.error(f"BlockingIOError occurred: {str(e)}") 66 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/spark.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/Spark-TTS/main.py") 13 | class SparkVoiceClone(OfflineModel): 14 | def __init__( 15 | self, path: str, 
vc_mode: bool, sample_params: Dict = None, *args, **kwargs 16 | ): 17 | self.command_args = { 18 | "path": path, 19 | } 20 | if vc_mode: 21 | self.command_args["vc_mode"] = "" 22 | super().__init__(is_chat=True, sample_params=sample_params) 23 | 24 | def _inference(self, prompt: PromptStruct, **kwargs): 25 | import uuid 26 | 27 | uid = str(uuid.uuid4()) 28 | prefix = f"{uid}->" 29 | 30 | while True: 31 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 32 | if wlist: 33 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 34 | self.process.stdin.flush() 35 | print("already write in") 36 | break 37 | while True: 38 | rlist, _, _ = select.select( 39 | [self.process.stdout, self.process.stderr], [], [], 60 40 | ) 41 | if not rlist: 42 | err_msg = "Read timeout after 60 seconds" 43 | logger.error(err_msg) 44 | raise RuntimeError(err_msg) 45 | 46 | try: 47 | for stream in rlist: 48 | if stream == self.process.stdout: 49 | result = self.process.stdout.readline().strip() 50 | if not result: 51 | continue 52 | if result.startswith(prefix): 53 | self.process.stdin.write("{}close\n".format(prefix)) 54 | self.process.stdin.flush() 55 | return result[len(prefix) :] 56 | elif result.startswith("Error:"): 57 | raise RuntimeError("Spark failed: {}".format(result)) 58 | else: 59 | logger.info(result) 60 | elif stream == self.process.stderr: 61 | err = self.process.stderr.readline().strip() 62 | if err: 63 | logger.error(f"Process stderr: {err}") 64 | except BlockingIOError as e: 65 | logger.error(f"BlockingIOError occurred: {str(e)}") 66 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/stabletts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/StableTTS/main.py") 13 | class StableTTS(OfflineModel): 14 | def __init__( 15 | self, 16 | tts_path: str, 17 | vocoder_path: str, 18 | vocoder_type: str, 19 | sample_params: Dict = None, 20 | *args, 21 | **kwargs, 22 | ): 23 | self.command_args = { 24 | "tts_path": tts_path, 25 | "vocoder_path": vocoder_path, 26 | "vocoder_type": vocoder_type, 27 | } 28 | super().__init__(is_chat=True, sample_params=sample_params) 29 | 30 | def _inference(self, prompt: PromptStruct, **kwargs): 31 | import uuid 32 | 33 | uid = str(uuid.uuid4()) 34 | prefix = f"{uid}->" 35 | 36 | while True: 37 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 38 | if wlist: 39 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 40 | self.process.stdin.flush() 41 | print("already write in") 42 | break 43 | while True: 44 | rlist, _, _ = select.select( 45 | [self.process.stdout, self.process.stderr], [], [], 60 46 | ) 47 | if not rlist: 48 | err_msg = "Read timeout after 60 seconds" 49 | logger.error(err_msg) 50 | raise RuntimeError(err_msg) 51 | 52 | try: 53 | for stream in rlist: 54 | if stream == self.process.stdout: 55 | result = self.process.stdout.readline().strip() 56 | if not result: 57 | continue 58 | if result.startswith(prefix): 59 | self.process.stdin.write("{}close\n".format(prefix)) 60 | self.process.stdin.flush() 61 | return result[len(prefix) :] 62 | elif result.startswith("Error:"): 63 | raise RuntimeError("StableTTS failed: 
{}".format(result)) 64 | else: 65 | logger.info(result) 66 | elif stream == self.process.stderr: 67 | err = self.process.stderr.readline().strip() 68 | if err: 69 | logger.error(f"Process stderr: {err}") 70 | except BlockingIOError as e: 71 | logger.error(f"BlockingIOError occurred: {str(e)}") 72 | -------------------------------------------------------------------------------- /audio_evals/models/UltraVOX.py: -------------------------------------------------------------------------------- 1 | # pip install transformers peft librosa 2 | import logging 3 | from typing import List, Dict, Tuple 4 | 5 | import transformers 6 | import librosa 7 | 8 | from audio_evals.base import PromptStruct 9 | from audio_evals.models.model import Model 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class UltraVOX(Model): 15 | def __init__(self, path: str, sample_params: Dict[str, any] = None): 16 | super().__init__(True, sample_params) # as a chat model 17 | logger.debug("start load model from {}".format(path)) 18 | self.pipe = transformers.pipeline(model=path, trust_remote_code=True, device=0) 19 | logger.debug("model loaded") 20 | self.max_new_tokens = 30 21 | 22 | @staticmethod 23 | def _conv_prompt(prompt: PromptStruct) -> Tuple[str, str, List[Dict[str, str]]]: 24 | audio, sr = "", "" 25 | turns = [ 26 | { 27 | "role": "system", 28 | "content": "You are a friendly and helpful character. You love to answer questions for people.", 29 | }, 30 | ] 31 | for line in prompt: 32 | role = line["role"] 33 | for c in line["contents"]: 34 | if c["type"] == "audio": 35 | audio, sr = librosa.load(c["value"], sr=16000) 36 | if c["type"] == "text": 37 | turns.append({"role": role, "content": c["value"] + " <|audio|>"}) 38 | return audio, sr, turns 39 | 40 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 41 | audio, sr, turns = self._conv_prompt(prompt) 42 | logger.debug("turns: {}".format(turns)) 43 | return self.pipe( 44 | {"audio": audio, "turns": turns, "sampling_rate": sr}, **kwargs 45 | ) 46 | -------------------------------------------------------------------------------- /audio_evals/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/models/__init__.py -------------------------------------------------------------------------------- /audio_evals/models/ali.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from http import HTTPStatus 4 | from typing import Dict 5 | 6 | from dashscope import MultiModalConversation 7 | 8 | from audio_evals.base import PromptStruct 9 | from audio_evals.models.model import APIModel 10 | 11 | 12 | class AliApi(APIModel): 13 | 14 | def __init__( 15 | self, 16 | model_name: str = "qwen2-audio-instruct", 17 | sample_params: Dict[str, any] = None, 18 | ): 19 | super().__init__(True, sample_params) 20 | self.model = model_name 21 | assert "DASHSCOPE_API_KEY" in os.environ, ValueError( 22 | "not found DASHSCOPE_API_KEY in your ENV" 23 | ) 24 | 25 | def _inference(self, prompt: PromptStruct, **kwargs): 26 | messages = [] 27 | for content in deepcopy(prompt): 28 | for i, line in enumerate(content["contents"]): 29 | if line["type"] == "text": 30 | content["contents"][i] = {"text": line["value"]} 31 | else: 32 | content["contents"][i] = { 33 | line["type"]: "file://{}".format(line["value"]) 34 | } 35 | 36 | content["content"] = 
content["contents"] 37 | del content["contents"] 38 | messages.append(content) 39 | 40 | response = MultiModalConversation.call(model=self.model, messages=messages) 41 | if response.status_code == HTTPStatus.OK: 42 | return response.output.choices[0].message.content[0]["text"] 43 | raise Exception("{}: {}".format(response.code, response.message)) 44 | -------------------------------------------------------------------------------- /audio_evals/models/asr/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict 3 | 4 | import requests 5 | 6 | from audio_evals.base import PromptStruct 7 | from audio_evals.models.model import APIModel 8 | from audio_evals.utils import get_base64_from_file 9 | 10 | 11 | class AsrServer(APIModel): 12 | def __init__(self, url: str, sample_params: Dict[str, any] = None): 13 | super().__init__(True, sample_params) 14 | self.url = url 15 | 16 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 17 | 18 | audio_file = prompt["audio"] 19 | audio_base64 = get_base64_from_file(audio_file) 20 | headers = {"Content-Type": "application/json"} 21 | data = {"audio": audio_base64} 22 | response = requests.post( 23 | self.url, headers=headers, data=json.dumps(data), stream=True 24 | ) 25 | if response.status_code == 200: 26 | return response.text 27 | else: 28 | raise Exception(f"Error: {response.status_code} - {response.text}") 29 | -------------------------------------------------------------------------------- /audio_evals/models/asr/fireredasr.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | from audio_evals.base import PromptStruct 4 | from audio_evals.models.model import OfflineModel 5 | from audio_evals.isolate import isolated 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | @isolated( 12 | "audio_evals/lib/FireRedASR/main.py", 13 | pre_command="export PYTHONPATH=$PWD/:$PYTHONPATH", 14 | ) 15 | class FireRedASR(OfflineModel): 16 | def __init__(self, path: str, sample_params: Dict = None, *args, **kwargs): 17 | self.command_args = { 18 | "path": path, 19 | } 20 | super().__init__(is_chat=False, sample_params=sample_params) 21 | 22 | def _inference(self, prompt: PromptStruct, **kwargs) -> float: 23 | audio = prompt["audio"] 24 | self.process.stdin.write(f"{audio}\n") 25 | self.process.stdin.flush() 26 | import select 27 | 28 | while True: 29 | reads, _, _ = select.select( 30 | [self.process.stdout, self.process.stderr], [], [], 1.0 31 | ) 32 | for read in reads: 33 | if read is self.process.stdout: 34 | result = self.process.stdout.readline().strip() 35 | if result: 36 | if result.startswith("Result:"): 37 | return result[7:] 38 | elif result.startswith("Error:"): 39 | raise RuntimeError("FireRedASR failed: {}".format(result)) 40 | else: 41 | logger.info(result) 42 | if read is self.process.stderr: 43 | error_output = self.process.stderr.readline() 44 | if error_output: 45 | print(f"stderr: {error_output.strip()}") 46 | -------------------------------------------------------------------------------- /audio_evals/models/asr/paraformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | 10 | logger = 
logging.getLogger(__name__) 11 | 12 | 13 | @isolated("audio_evals/lib/paraformer/main.py") 14 | class Paraformer(OfflineModel): 15 | def __init__(self, path: str, sample_params: Dict = None, *args, **kwargs): 16 | if not os.path.exists(path): 17 | path = self._download_model(path) 18 | 19 | self.command_args = { 20 | "path": path, 21 | } 22 | super().__init__(is_chat=False, sample_params=sample_params) 23 | 24 | def _inference(self, prompt: PromptStruct, **kwargs) -> float: 25 | audio = prompt["audio"] 26 | import uuid 27 | 28 | uid = str(uuid.uuid4()) 29 | prefix = f"{uid}->" 30 | while True: 31 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 32 | if wlist: 33 | self.process.stdin.write(f"{prefix}{audio}\n") 34 | self.process.stdin.flush() 35 | print("already write in") 36 | break 37 | print("waiting for write") 38 | while True: 39 | reads, _, _ = select.select( 40 | [self.process.stdout, self.process.stderr], [], [], 1.0 41 | ) 42 | for read in reads: 43 | if read is self.process.stdout: 44 | result = self.process.stdout.readline().strip() 45 | if result: 46 | if result.startswith(prefix): 47 | self.process.stdin.write("{}close\n".format(prefix)) 48 | self.process.stdin.flush() 49 | return result[len(prefix) :] 50 | elif result.startswith("Error:"): 51 | raise RuntimeError("FireRedASR failed: {}".format(result)) 52 | else: 53 | logger.info(result) 54 | if read is self.process.stderr: 55 | error_output = self.process.stderr.readline() 56 | if error_output: 57 | print(f"stderr: {error_output.strip()}") 58 | -------------------------------------------------------------------------------- /audio_evals/models/asr/sherpa.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import select 4 | import uuid 5 | from typing import Dict 6 | 7 | from audio_evals.base import PromptStruct 8 | from audio_evals.models.model import OfflineModel 9 | from audio_evals.isolate import isolated 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @isolated("audio_evals/lib/sherpa-onnx/main.py") 15 | class SherpaOnnx(OfflineModel): 16 | def __init__(self, tokens: str, sample_params: Dict = None, *args, **kwargs): 17 | self.command_args = { 18 | "tokens": tokens, 19 | } 20 | for k, v in kwargs.items(): 21 | if k == "offline": 22 | if v: 23 | v = "" 24 | else: 25 | continue 26 | 27 | self.command_args[k] = v 28 | super().__init__(is_chat=True, sample_params=sample_params) 29 | 30 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 31 | audio = prompt["audio"] 32 | uid = str(uuid.uuid4()) 33 | prefix = f"{uid}->" 34 | 35 | # Send request 36 | while True: 37 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 38 | if wlist: 39 | request = json.dumps({"audio": audio}) 40 | self.process.stdin.write(f"{prefix}{request}\n") 41 | self.process.stdin.flush() 42 | break 43 | 44 | # Receive response 45 | while True: 46 | reads, _, _ = select.select( 47 | [self.process.stdout, self.process.stderr], [], [], 1.0 48 | ) 49 | for read in reads: 50 | if read is self.process.stdout: 51 | result = self.process.stdout.readline().strip() 52 | if result: 53 | if result.startswith(prefix): 54 | # Close the request 55 | self.process.stdin.write(f"{prefix}close\n") 56 | self.process.stdin.flush() 57 | 58 | # Parse and return the result 59 | response = json.loads(result[len(prefix) :]) 60 | return response["text"] 61 | elif result.startswith("Error:"): 62 | raise RuntimeError(f"SherpaOnnx failed: {result}") 63 | else: 64 | 
logger.info(result) 65 | if read is self.process.stderr: 66 | error_output = self.process.stderr.readline() 67 | if error_output: 68 | logger.error(f"stderr: {error_output.strip()}") 69 | -------------------------------------------------------------------------------- /audio_evals/models/asr/tencent.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import subprocess 4 | import tempfile 5 | from typing import Dict 6 | from tencentcloud.common import credential 7 | from tencentcloud.common.profile.client_profile import ClientProfile 8 | from tencentcloud.common.profile.http_profile import HttpProfile 9 | from tencentcloud.asr.v20190614 import asr_client, models 10 | from audio_evals.base import PromptStruct 11 | from audio_evals.models.model import APIModel 12 | import logging 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class TencentASRModel(APIModel): 19 | def __init__( 20 | self, 21 | secret_id: str, 22 | secret_key: str, 23 | region: str = "ap-guangzhou", 24 | sample_params: Dict[str, any] = None, 25 | ): 26 | super().__init__(False, sample_params) 27 | self.secret_id = secret_id 28 | self.secret_key = secret_key 29 | self.region = region 30 | 31 | # 初始化认证对象 32 | self.cred = credential.Credential(self.secret_id, self.secret_key) 33 | 34 | # 配置 HTTP 选项 35 | self.http_profile = HttpProfile() 36 | self.http_profile.endpoint = "asr.tencentcloudapi.com" 37 | 38 | # 配置客户端参数 39 | self.client_profile = ClientProfile() 40 | self.client_profile.httpProfile = self.http_profile 41 | 42 | # 初始化 ASR 客户端 43 | self.client = asr_client.AsrClient(self.cred, self.region, self.client_profile) 44 | 45 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 46 | audio = prompt["audio"] 47 | logger.debug(f"Processing audio file: {audio}") 48 | 49 | with tempfile.NamedTemporaryFile(suffix=".wav") as tmp_file: 50 | audio_path = tmp_file.name 51 | subprocess.run( 52 | ["ffmpeg", "-y", "-i", audio, "-ar", "16000", "-ac", "1", audio_path], 53 | capture_output=True, 54 | text=True, 55 | check=True, 56 | ) 57 | # 读取音频文件并进行 base64 编码 58 | with open(audio_path, "rb") as f: 59 | audio_data = f.read() 60 | audio_base64 = base64.b64encode(audio_data).decode("utf-8") 61 | 62 | # 创建请求对象 63 | req = models.SentenceRecognitionRequest() 64 | params = { 65 | "ProjectId": 0, 66 | "SubServiceType": 2, 67 | "SourceType": 1, 68 | "VoiceFormat": "wav", 69 | "UsrAudioKey": "session-123", 70 | "Data": audio_base64, 71 | "DataLen": len(audio_data), 72 | **kwargs, 73 | } 74 | req.from_json_string(json.dumps(params)) 75 | 76 | # 发送请求并获取响应 77 | resp = self.client.SentenceRecognition(req) 78 | return resp.Result 79 | -------------------------------------------------------------------------------- /audio_evals/models/bytedance/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/models/bytedance/__init__.py -------------------------------------------------------------------------------- /audio_evals/models/bytedance/doubao.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Any 3 | from openai import OpenAI 4 | 5 | from audio_evals.models.model import APIModel 6 | from audio_evals.base import PromptStruct 7 | 8 | 9 | API_KEY = os.getenv("DOUBAO_API_KEY") 10 | URL = os.getenv("DOUBAO_URL") 11 | 12 | 13 | class 
Doubao(APIModel): 14 | def __init__( 15 | self, model_name: str, api_key: str = None, sample_params: Dict[str, Any] = None 16 | ): 17 | super().__init__(True, sample_params) 18 | self.model_name = model_name 19 | assert "DOUBAO_API_KEY" in os.environ or api_key is not None, ValueError( 20 | "not found DOUBAO_API_KEY in your ENV" 21 | ) 22 | if api_key is None: 23 | api_key = os.environ.get("DOUBAO_API_KEY") 24 | self.client = OpenAI( 25 | # 此为默认路径,您可根据业务所在地域进行配置 26 | base_url="https://ark.cn-beijing.volces.com/api/v3", 27 | # 从环境变量中获取您的 API Key 28 | api_key=api_key, 29 | ) 30 | 31 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 32 | 33 | messages = [] 34 | for item in prompt: 35 | messages.append( 36 | {"role": item["role"], "content": item["contents"][0]["value"]} 37 | ) 38 | 39 | response = self.client.chat.completions.create( 40 | model=self.model_name, messages=messages, **kwargs 41 | ) 42 | 43 | return response.choices[0].message.content 44 | 45 | 46 | class DoubaoAudioPipeline(APIModel): 47 | def __init__(self, asr: str, llm: str): 48 | super().__init__(True) 49 | from audio_evals.registry import registry 50 | 51 | self.asr = registry.get_model(asr) 52 | self.llm = registry.get_model(llm) 53 | 54 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 55 | text = self.asr.inference(prompt) 56 | res = self.llm.inference(text) 57 | return res 58 | -------------------------------------------------------------------------------- /audio_evals/models/mini_cpm.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | import select 5 | from typing import Dict 6 | from audio_evals.base import PromptStruct 7 | from audio_evals.models.model import OfflineModel 8 | from audio_evals.isolate import isolated 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @isolated("audio_evals/lib/minicpm/main.py") 15 | class MiniCPMo(OfflineModel): 16 | def __init__( 17 | self, 18 | path: str, 19 | speech: bool = False, 20 | sample_params: Dict = None, 21 | *args, 22 | **kwargs, 23 | ): 24 | if path == "openbmb/MiniCPM-o-2_6" and not os.path.exists(path): 25 | path = self._download_model(path) 26 | 27 | self.command_args = { 28 | "path": path, 29 | } 30 | if speech: 31 | self.command_args["speech"] = "" 32 | super().__init__(is_chat=True, sample_params=sample_params) 33 | 34 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 35 | import uuid 36 | 37 | uid = str(uuid.uuid4()) 38 | prefix = f"{uid}->" 39 | 40 | input_o = {"prompt": prompt} 41 | input_o.update(kwargs) 42 | 43 | while True: 44 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 45 | if wlist: 46 | self.process.stdin.write(f"{prefix}{json.dumps(input_o)}\n") 47 | self.process.stdin.flush() 48 | print("already write in") 49 | break 50 | 51 | while True: 52 | reads, _, _ = select.select( 53 | [self.process.stdout, self.process.stderr], [], [], 1.0 54 | ) 55 | for read in reads: 56 | if read is self.process.stdout: 57 | result = self.process.stdout.readline() 58 | if result: 59 | if result.startswith(prefix): 60 | self.process.stdin.write("{}close\n".format(prefix)) 61 | self.process.stdin.flush() 62 | res = json.loads(result[len(prefix) :]) 63 | if len(res) == 1: 64 | return res["text"] 65 | return json.dumps(res, ensure_ascii=False) 66 | elif result.startswith("Error:"): 67 | raise RuntimeError( 68 | "mimicpm-o 2.6 failed: {}".format(result) 69 | ) 70 | else: 71 | logger.info(result) 72 | if read is 
self.process.stderr: 73 | error_output = self.process.stderr.readline() 74 | if error_output: 75 | print(f"stderr: {error_output.strip()}") 76 | -------------------------------------------------------------------------------- /audio_evals/models/mini_omni.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | from typing import Dict 4 | 5 | import requests 6 | 7 | from audio_evals.base import PromptStruct 8 | from audio_evals.models.model import APIModel 9 | from audio_evals.utils import get_base64_from_file 10 | import wave 11 | import numpy as np 12 | 13 | 14 | OUT_CHANNELS = 1 15 | 16 | 17 | def save_audio_response(response, output_file): 18 | """保存服务器返回的音频流为文件""" 19 | if response.status_code == 200: 20 | text = "" 21 | with wave.open(output_file, 'wb') as wf: 22 | wf.setnchannels(OUT_CHANNELS) 23 | wf.setsampwidth(2) # 2 bytes per sample (16-bit audio) 24 | wf.setframerate(24000) 25 | 26 | for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): 27 | if chunk: 28 | data = json.loads(chunk.decode()) 29 | text = data["text"] 30 | audio_data = np.frombuffer(bytes.fromhex(data["audio"]), dtype=np.int16) 31 | audio_data = audio_data.reshape(-1, OUT_CHANNELS) 32 | wf.writeframes(audio_data.tobytes()) 33 | return output_file, text 34 | else: 35 | raise Exception(f"下载失败,状态码: {response.status_code}") 36 | 37 | 38 | class MiniOmni(APIModel): 39 | def __init__( 40 | self, url: str, sample_params: Dict[str, any] = None 41 | ): 42 | super().__init__(True, sample_params) 43 | self.url = url 44 | 45 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 46 | 47 | audio_file = "" 48 | for content in prompt: 49 | if content["role"] == "user": 50 | for line in content["contents"]: 51 | if line["type"] == "audio": 52 | audio_file = line["value"] 53 | break 54 | 55 | audio_base64 = get_base64_from_file(audio_file) 56 | headers = { 57 | 'Content-Type': 'application/json', 58 | 'Connection': 'keep-alive', 59 | 'Upgrade-Insecure-Requests': '1' 60 | } 61 | data = { 62 | 'audio': audio_base64 63 | } 64 | response = requests.post(self.url, headers=headers, data=json.dumps(data), stream=True) 65 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 66 | audio, text = save_audio_response(response, f.name) 67 | return json.dumps({"audio": audio, "text": text}, ensure_ascii=False) 68 | 69 | -------------------------------------------------------------------------------- /audio_evals/models/openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Any 3 | from openai import OpenAI, AzureOpenAI 4 | from azure.core.credentials import AzureKeyCredential 5 | 6 | 7 | from audio_evals.models.model import APIModel 8 | from audio_evals.base import PromptStruct 9 | 10 | 11 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 12 | OPENAI_URL = os.getenv("OPENAI_URL", "https://api.openai.com") 13 | 14 | 15 | class GPT(APIModel): 16 | def __init__( 17 | self, 18 | model_name: str, 19 | is_azure: bool = False, 20 | sample_params: Dict[str, Any] = None, 21 | ): 22 | super().__init__(True, sample_params) 23 | self.model_name = model_name 24 | assert "OPENAI_API_KEY" in os.environ, ValueError( 25 | "not found OPENAI_API_KEY in your ENV" 26 | ) 27 | if is_azure: 28 | key = os.environ["AZURE_OPENAI_KEY"] 29 | endpoint = os.environ["AZURE_OPENAI_BASE"] 30 | print(f"Using Azure OpenAI with key {key} and endpoint {endpoint}") 31 | self.client = AzureOpenAI( 
32 | api_version="2025-03-01-preview", api_key=key, azure_endpoint=endpoint 33 | ) 34 | else: 35 | self.client = OpenAI() 36 | 37 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 38 | 39 | messages = [] 40 | for item in prompt: 41 | messages.append( 42 | {"role": item["role"], "content": item["contents"][0]["value"]} 43 | ) 44 | 45 | response = self.client.chat.completions.create( 46 | model=self.model_name, messages=messages, **kwargs 47 | ) 48 | 49 | return response.choices[0].message.content 50 | 51 | 52 | class AudioTranscribe(GPT): 53 | """ 54 | This model is used to transcribe audio to text. 55 | """ 56 | 57 | def _inference(self, prompt, **kwargs): 58 | audio_file = open(prompt["audio"], "rb") 59 | transcript = self.client.audio.transcriptions.create( 60 | model=self.model_name, file=audio_file 61 | ) 62 | return transcript["text"] 63 | -------------------------------------------------------------------------------- /audio_evals/models/step_audio.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import tempfile 4 | from typing import Dict 5 | 6 | import requests 7 | 8 | from audio_evals.base import PromptStruct 9 | from audio_evals.models.model import APIModel 10 | from audio_evals.utils import get_base64_from_file 11 | 12 | 13 | def save_audio_response(response, output_file): 14 | """保存服务器返回的音频流为文件""" 15 | if response.status_code == 200: 16 | audio_data = b"" 17 | text_response = "" 18 | 19 | for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): 20 | if chunk: 21 | try: 22 | event = json.loads(chunk.decode("utf-8")) 23 | text_response += event.get("text", "") 24 | if output_file: 25 | audio_data += base64.b64decode(event.get("audio", "")) 26 | except json.JSONDecodeError: 27 | continue 28 | if output_file: 29 | with open(output_file, "wb") as f: 30 | f.write(audio_data) 31 | return output_file, text_response 32 | else: 33 | raise Exception(f"下载失败,状态码: {response.status_code}") 34 | 35 | 36 | class StepAudioChat(APIModel): 37 | def __init__(self, url: str, s2t: bool, sample_params: Dict[str, any] = None): 38 | super().__init__(True, sample_params) 39 | self.url = url 40 | self.s2t = s2t 41 | 42 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 43 | text, audio_file = "", "" 44 | for content in prompt: 45 | if content["role"] == "user": 46 | for line in content["contents"]: 47 | if line["type"] == "audio": 48 | audio_file = line["value"] 49 | if line["type"] == "text": 50 | text = line["value"] 51 | endfix = audio_file.split(".")[-1] 52 | audio_base64 = get_base64_from_file(audio_file) 53 | headers = {"Content-Type": "application/json"} 54 | data = { 55 | "text": text, 56 | "audio": audio_base64, 57 | "audio_format": endfix, 58 | } 59 | response = requests.post( 60 | self.url, headers=headers, data=json.dumps(data), stream=True 61 | ) 62 | if self.s2t: 63 | _, text = save_audio_response(response, None) 64 | return text 65 | 66 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 67 | audio, text = save_audio_response(response, f.name) 68 | return json.dumps({"audio": audio, "text": text}, ensure_ascii=False) 69 | -------------------------------------------------------------------------------- /audio_evals/models/utmos.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import 
OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | @isolated( 14 | "audio_evals/lib/utmos/main.py", 15 | pre_command="pip install pip==24.0 &&export SACREBLEU_ROOT=envs/utmos/.sacrebleu", 16 | ) 17 | class UTMOS(OfflineModel): 18 | def __init__( 19 | self, 20 | path: str = "sarulab-speech/UTMOS-demo", 21 | sample_params: Dict = None, 22 | *args, 23 | **kwargs, 24 | ): 25 | if path == "sarulab-speech/UTMOS-demo" and not os.path.exists(path): 26 | path = self._download_model(path, repo_type="space") 27 | 28 | self.command_args = { 29 | "path": path, 30 | } 31 | super().__init__(is_chat=False, sample_params=sample_params) 32 | 33 | def _inference(self, prompt: PromptStruct, **kwargs) -> float: 34 | import uuid 35 | 36 | uid = str(uuid.uuid4()) 37 | prefix = f"{uid}->" 38 | 39 | while True: 40 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 41 | if wlist: 42 | self.process.stdin.write(f"{prefix}{prompt}\n") 43 | self.process.stdin.flush() 44 | logger.info("already write in") 45 | break 46 | 47 | while True: 48 | reads, _, _ = select.select( 49 | [self.process.stdout, self.process.stderr], [], [], 1.0 50 | ) 51 | for read in reads: 52 | if read is self.process.stdout: 53 | result = self.process.stdout.readline() 54 | if result: 55 | if result.startswith(prefix): 56 | self.process.stdin.write("{}close\n".format(prefix)) 57 | self.process.stdin.flush() 58 | return float(result[len(prefix) :]) 59 | elif result.startswith("Error:"): 60 | raise RuntimeError("utmos failed: {}".format(result)) 61 | else: 62 | logger.info(result) 63 | if read is self.process.stderr: 64 | error_output = self.process.stderr.readline() 65 | if error_output: 66 | print(f"stderr: {error_output.strip()}") 67 | -------------------------------------------------------------------------------- /audio_evals/models/whisper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import select 5 | from typing import Dict 6 | 7 | from audio_evals.base import PromptStruct 8 | from audio_evals.models.model import OfflineModel 9 | from audio_evals.constants import DEFAULT_MODEL_PATH 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | from audio_evals.isolate import isolated 15 | 16 | 17 | @isolated("audio_evals/lib/whisper/main.py") 18 | class WhisperModel(OfflineModel): 19 | def __init__( 20 | self, 21 | path: str = "openai/whisper-large-v3", 22 | sample_params: Dict[str, any] = None, 23 | ): 24 | if path.startswith("openai/") and not os.path.exists(path): 25 | path = self._download_model(path) 26 | 27 | self.command_args = { 28 | "path": path, 29 | } 30 | super().__init__(is_chat=True, sample_params=sample_params) 31 | 32 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 33 | import uuid 34 | 35 | uid = str(uuid.uuid4()) 36 | prefix = f"{uid}->" 37 | 38 | while True: 39 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 40 | if wlist: 41 | prompt["kwargs"] = kwargs 42 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 43 | self.process.stdin.flush() 44 | print("already write in") 45 | break 46 | while True: 47 | rlist, _, _ = select.select( 48 | [self.process.stdout, self.process.stderr], [], [], 1 49 | ) 50 | 51 | try: 52 | for stream in rlist: 53 | if stream == self.process.stdout: 54 | result = self.process.stdout.readline().strip() 55 | if not result: 56 | continue 57 | if result.startswith(prefix): 
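# Note on the handshake used here: the worker subprocess echoes the request's uuid prefix on the line that carries its answer; once that tagged line arrives, the caller writes "<prefix>close" so the worker can finish the request, and everything after the prefix is returned as the payload.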
58 | self.process.stdin.write("{}close\n".format(prefix)) 59 | self.process.stdin.flush() 60 | return result[len(prefix) :] 61 | elif result.startswith("Error:"): 62 | raise RuntimeError("WhisperModel failed: {}".format(result)) 63 | else: 64 | logger.info(result) 65 | elif stream == self.process.stderr: 66 | err = self.process.stderr.readline().strip() 67 | if err: 68 | logger.error(f"Process stderr: {err}") 69 | except BlockingIOError as e: 70 | logger.error(f"BlockingIOError occurred: {str(e)}") 71 | -------------------------------------------------------------------------------- /audio_evals/process/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/process/__init__.py -------------------------------------------------------------------------------- /audio_evals/process/base.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | import logging 4 | from abc import ABC, abstractmethod 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Process(ABC): 10 | @abstractmethod 11 | def __call__(self, answer: str) -> str: 12 | raise NotImplementedError() 13 | 14 | 15 | class ContentExtract(Process): 16 | 17 | def __call__(self, answer: str) -> str: 18 | try: 19 | answer = answer.strip() 20 | if answer.startswith("```json"): 21 | answer = answer[7:-3].strip() 22 | elif answer.startswith("```"): 23 | answer = answer[3:-3].strip() 24 | return json.loads(answer)["content"] 25 | except Exception as e: 26 | try: 27 | return ast.literal_eval(answer)["content"] 28 | except Exception as e: 29 | logger.warning(f"process {answer} fail: {str(e)}") 30 | return answer 31 | 32 | 33 | class JsonExtract(Process): 34 | """ 35 | Extract a specific key from a json string. 36 | the key is specified by the `extract_key` parameter. 37 | if the key is not found, return the `default_value` if specified, 38 | otherwise raise a KeyError. 39 | """ 40 | 41 | def __init__(self, extract_key: str = None, default_value: str = None): 42 | """ 43 | Initialize the JsonExtract process. 44 | Args: 45 | extract_key: required, the key to extract from the json string. 46 | default_value: optional, the default value to return if the key is not found. 47 | 48 | Returns: JsonExtract object. 49 | 50 | """ 51 | self.extract_key = extract_key 52 | self.default_value = default_value 53 | 54 | def __call__(self, answer: str) -> any: 55 | """ 56 | Extract the value of the `extract_key` from the json string `answer`. 57 | Args: 58 | answer: required, the json string to extract the value from. 59 | 60 | Returns: any, the value of the `extract_key` in the json string `answer`. 
61 | 62 | """ 63 | if isinstance(answer, str): 64 | try: 65 | d = json.loads(answer.strip()) 66 | except Exception as e: 67 | logger.warning(f"load json `{answer}` fail: {str(e)}") 68 | return answer 69 | elif isinstance(answer, dict): 70 | d = answer 71 | else: 72 | raise ValueError(f"Unsupported answer type: {type(answer)}") 73 | if self.extract_key is None: 74 | return d 75 | 76 | if self.default_value is not None: 77 | return d.get(self.extract_key, self.default_value) 78 | return d[self.extract_key] 79 | -------------------------------------------------------------------------------- /audio_evals/process/eliminate.py: -------------------------------------------------------------------------------- 1 | from audio_evals.process.base import Process 2 | 3 | 4 | class Eliminate(Process): 5 | 6 | def __init__(self, target: str): 7 | self.target = target 8 | 9 | def __call__(self, answer: str) -> str: 10 | return answer.replace(self.target, "") 11 | 12 | 13 | class ForceStop(Process): 14 | def __init__(self, target: str): 15 | self.target = target 16 | 17 | def __call__(self, answer: str) -> str: 18 | return answer.split(self.target)[0] 19 | 20 | 21 | class ExtractResponse(Process): 22 | def __init__(self, target: str): 23 | self.target = target 24 | 25 | def __call__(self, answer: str) -> str: 26 | return answer.split(self.target)[1] 27 | -------------------------------------------------------------------------------- /audio_evals/process/normalization.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from audio_evals.lib.text_normalization.basic import BasicTextNormalizer 4 | from audio_evals.lib.text_normalization.cn_tn import TextNorm 5 | from audio_evals.lib.text_normalization.en import EnglishTextNormalizer 6 | from audio_evals.process.base import Process 7 | 8 | 9 | class TextNormalization(Process): 10 | 11 | def __init__(self, lang: str = ""): 12 | if lang == "en": 13 | self.normalizer = EnglishTextNormalizer() 14 | elif lang == "zh": 15 | self.normalizer = TextNorm( 16 | to_banjiao=False, 17 | to_upper=False, 18 | to_lower=False, 19 | remove_fillers=False, 20 | remove_erhua=False, 21 | check_chars=False, 22 | remove_space=False, 23 | cc_mode="", 24 | ) 25 | else: 26 | self.normalizer = BasicTextNormalizer() 27 | 28 | def __call__(self, answer: str) -> str: 29 | return self.normalizer(answer) 30 | -------------------------------------------------------------------------------- /audio_evals/process/qwen.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from audio_evals.process.base import Process 4 | 5 | 6 | class QwenAudioASRExtract(Process): 7 | PUNCS = "!,.?;:" 8 | 9 | def __init__(self, lang: str): 10 | self.lang = lang 11 | 12 | def __call__(self, answer: str) -> str: 13 | gt = re.sub(r"<\|.*?\|>", " ", answer) 14 | gt = re.sub(rf"\s+", r" ", gt) # 将文本中的连续空格替换为单个空格 15 | gt = re.sub(f" ?([{self.PUNCS}])", r"\1", gt) 16 | gt = gt.lstrip(" ") 17 | if self.lang == "zh": 18 | gt = re.sub(rf"\s+", r"", gt) 19 | return gt 20 | -------------------------------------------------------------------------------- /audio_evals/process/speech.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from audio_evals.process.base import Process 3 | 4 | 5 | class Speech2text(Process): 6 | 7 | def __init__(self, model_name: str = "whisper", prompt_name: str = "whisper-asr"): 8 | from audio_evals.registry import registry 9 | 10 | 
self.model = registry.get_model(model_name) 11 | self.prompt = registry.get_prompt(prompt_name) 12 | 13 | def __call__(self, answer: str) -> str: 14 | assert os.path.exists(answer), "must be a valid audio file, but got {}".format( 15 | answer 16 | ) 17 | real_prompt = self.prompt.load(WavPath=answer) 18 | return self.model.inference(real_prompt) 19 | -------------------------------------------------------------------------------- /audio_evals/prompt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/prompt/__init__.py -------------------------------------------------------------------------------- /audio_evals/prompt/base.py: -------------------------------------------------------------------------------- 1 | from functools import singledispatch 2 | from typing import Any, Dict, List 3 | 4 | from jinja2 import StrictUndefined, Template 5 | from jinja2.exceptions import UndefinedError 6 | 7 | from audio_evals.base import PromptStruct 8 | 9 | 10 | @singledispatch 11 | def _load(t: Any, **kwargs: Any) -> Any: 12 | return t 13 | 14 | 15 | @_load.register 16 | def _(t: str, **kwargs: Any) -> str: 17 | template = Template(t, undefined=StrictUndefined) 18 | try: 19 | return template.render(**kwargs) 20 | except UndefinedError as e: 21 | raise ValueError("{}: template is {}\ndoc is {}".format(e, t, kwargs)) 22 | 23 | 24 | @_load.register 25 | def _(t: list, **kwargs: Any) -> List[Any]: 26 | return [_load(item, **kwargs) for item in t] 27 | 28 | 29 | @_load.register 30 | def _(t: dict, **kwargs: Any) -> Dict[Any, Any]: 31 | return {k: _load(v, **kwargs) for k, v in t.items()} 32 | 33 | 34 | class Prompt: 35 | def __init__(self, template: PromptStruct): 36 | self.prompt = template 37 | 38 | def load(self, **kwargs): 39 | return _load(self.prompt, **kwargs) 40 | -------------------------------------------------------------------------------- /audio_evals/recorder.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing 4 | 5 | 6 | class Recorder: 7 | def __init__(self, f_name: str): 8 | self.name = f_name 9 | directory = os.path.dirname(f_name) 10 | os.makedirs(directory, exist_ok=True) 11 | if os.path.exists(f_name): 12 | print(f"File {f_name} already exists, overwriting it.") 13 | os.remove(f_name) 14 | 15 | def add(self, data: typing.Dict[str, typing.Any]): 16 | with open(self.name, "a+") as f: 17 | f.write(json.dumps(data, ensure_ascii=False) + "\n") 18 | -------------------------------------------------------------------------------- /docs/Procedures for Restarting an Incomplete Evaluation.md: -------------------------------------------------------------------------------- 1 | # Resume Evaluation 2 | 3 | In practice, evaluation processes may occasionally fail due to various technical issues, such as model request network interruptions, system failures, or unexpected errors. To ensure the continuity and integrity of the evaluation, follow these steps to effectively restart and complete the process. 4 | 5 | **Example Scenario:** 6 | 7 | 8 | If the evaluation process for the `GPT-4o-Audio` model with the dataset `my_dataset` fails due to a model request network interruption, the last checkpoint is saved in the `res/gpt4o_audio/last_res.jsonl` file. 
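Before resuming, you can optionally inspect the checkpoint to see how far the previous run got. The recorder appends one JSON object per line, so counting lines gives a rough progress figure; the sketch below only assumes the checkpoint path from the scenario above and makes no assumption about the fields inside each record.

```python
import json

# Peek at the checkpoint before resuming. The recorder appends one JSON
# object per line; the exact fields depend on the eval task, so we only
# count records and show the keys of the most recent one.
ckpt = "res/gpt4o_audio/last_res.jsonl"  # checkpoint path from the scenario above

with open(ckpt, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f"{len(records)} records already saved")
if records:
    print("last record keys:", list(records[-1].keys()))
```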
9 | 10 | To restart the evaluation, run: 11 | 12 | ```shell 13 | python audio_evals/main.py --dataset my_dataset --model gpt4o_audio -r 14 | ``` 15 | which is equivalent to: 16 | 17 | ```shell 18 | python audio_evals/main.py --dataset my_dataset --model gpt4o_audio --resume res/gpt4o_audio/last_res.jsonl 19 | ``` 20 | 21 | This command will resume the evaluation from the last saved checkpoint, ensuring that the process continues seamlessly. 22 | -------------------------------------------------------------------------------- /docs/how add a dataset.md: -------------------------------------------------------------------------------- 1 | # how to add a dataset in AudioEvals? 2 | 3 | 4 | In practice, you may need to evaluate your custom audio dataset. 5 | 6 | Before that, you need to know how to launch a custom eval task: [how launch a custom eval task.md](how%20launch%20a%20custom%20eval%20task.md) 7 | 8 | Here are the steps: 9 | 10 | 11 | ## JSONL file: 12 | 13 | ### register the dataset 14 | 1. Make sure your dataset file is in `jsonl` format and has a `WavPath` column that specifies the audio file path. 15 | 2. Create a new `*.yaml` file in `registry/dataset/` 16 | with content like: 17 | ```yaml 18 | $name: # name after cli: --dataset $name 19 | class: audio_evals.dataset.dataset.JsonlFile 20 | args: 21 | default_task: alei_asr # specify a default eval task; valid tasks are listed in `registry/eval_task` 22 | f_name: # the file name 23 | ref_col: # the reference answer column name in file 24 | ``` 25 | After registering the dataset, you can evaluate it with `--dataset $name`, enjoy 😘 26 | 27 | Example: 28 | 29 | 1. Create a file `my_dataset.jsonl` with `WavPath` and `Transcript` columns; the content looks like this: 30 | ```json lines 31 | {"WavPath": "path/to/audio1.wav", "Transcript": "this is the first audio"} 32 | {"WavPath": "path/to/audio2.wav", "Transcript": "this is the second audio"} 33 | ``` 34 | 35 | 2. Create a file `my_dataset.yaml` in `registry/dataset/` with content: 36 | ```yaml 37 | my_dataset: 38 | class: audio_evals.dataset.dataset.JsonlFile 39 | args: 40 | default_task: asr 41 | f_name: my_dataset.jsonl # the file name 42 | ref_col: Transcript # the reference answer column name in file 43 | ``` 44 | 45 | 3. Evaluate your dataset with `--dataset my_dataset`: 46 | 47 | ```sh 48 | export PYTHONPATH=$PWD:$PYTHONPATH 49 | export OPENAI_API_KEY=$your-key 50 | python audio_evals/main.py --dataset my_dataset --model gpt4o_audio 51 | ``` 52 | -------------------------------------------------------------------------------- /docs/how eval your model.md: -------------------------------------------------------------------------------- 1 | 2 | In the QuickStart, it's easy to launch an eval task, but what if your model is not yet integrated into AudioEvals? How can you evaluate it? 3 | 4 | Here are the steps: 5 | 6 | # model api 7 | > Your model is deployed as a service. 8 | 9 | ## 1. add model inference code 10 | 11 | Add a Python file under `audio_evals/models/` with content like: 12 | 13 | ```python 14 | from audio_evals.models.model import APIModel 15 | from audio_evals.base import PromptStruct 16 | 17 | class MyAudioModel(APIModel): 18 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 19 | # TODO 20 | # my request code 21 | ``` 22 | 23 | Reference: `audio_evals/models/google.py` 24 | 25 | 26 | ## 2. 
register the model 27 | 28 | in the `registry/model/` path, new a yaml file, with content: 29 | 30 | ```yaml 31 | $name: # the name after command: --model $name 32 | class: audio_evals.models.$new_file.$MyAudioModel 33 | args: 34 | ... # your specific args. If not need args, just fill args: {} 35 | 36 | 37 | ``` 38 | 39 | 40 | # offline model 41 | 42 | 43 | ## 1. add model inference code (optional) 44 | > if your model is supported with huggingface AutoModelForCausalLM, you can skip this step. 45 | 46 | add a py-file in `audio_evals/models/` path, content like: 47 | ```PYTHON 48 | from audio_evals.models.offline_model import OfflineModel 49 | from audio_evals.base import PromptStruct 50 | from typing import Dict 51 | 52 | class MyAudioModel(OfflineModel): 53 | def __init__(self, is_chat: bool, sample_params: Dict[str, any] = None): 54 | super().__init__(is_chat, sample_params) 55 | # TODO 56 | # init code 57 | 58 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 59 | # TODO 60 | # inference code 61 | ``` 62 | 63 | ## 2. register the model 64 | 65 | the `registry/model/` path, new a yaml file, with content: 66 | 67 | ```yaml 68 | $name: # the name after command: --model $name 69 | class: audio_evals.models.offline_model.OfflineModel 70 | args: 71 | path: # the name of model from huggingface model or the download model path download from huggingface 72 | 73 | 74 | ``` 75 | 76 | 77 | after registry model, you can eval your model with `--model $name`, enjoy 😘 78 | -------------------------------------------------------------------------------- /registry/agg/air-bench.yaml: -------------------------------------------------------------------------------- 1 | airbench-chat: 2 | class: audio_evals.agg.air_chat.AirChat 3 | args: {} 4 | -------------------------------------------------------------------------------- /registry/agg/naive.yaml: -------------------------------------------------------------------------------- 1 | dump: 2 | class: audio_evals.agg.base.Dump 3 | args: {} 4 | 5 | acc: 6 | class: audio_evals.agg.base.ACC 7 | args: {} 8 | 9 | mean: 10 | class: audio_evals.agg.base.NaiveMean 11 | args: {} 12 | 13 | wer-zh: 14 | class: audio_evals.agg.base.PracticeWER 15 | args: 16 | lang: zh 17 | 18 | wer-yue: 19 | class: audio_evals.agg.base.PracticeWER 20 | args: 21 | lang: yue 22 | 23 | wer-jp: 24 | class: audio_evals.agg.base.PracticeWER 25 | args: 26 | lang: jp 27 | 28 | wer-kr: 29 | class: audio_evals.agg.base.PracticeWER 30 | args: 31 | lang: kr 32 | 33 | wer: 34 | class: audio_evals.agg.base.PracticeWER 35 | args: {} 36 | 37 | cer: 38 | class: audio_evals.agg.base.CER 39 | args: {} 40 | 41 | bleu: 42 | class: audio_evals.agg.base.BLEU 43 | args: {} 44 | 45 | bleu-zh: 46 | class: audio_evals.agg.base.BLEU 47 | args: 48 | lang: zh 49 | 50 | bleu-char: 51 | class: audio_evals.agg.base.BLEU 52 | args: 53 | lang: char 54 | 55 | bleu-jp: 56 | class: audio_evals.agg.base.BLEU 57 | args: 58 | lang: jp 59 | 60 | coco: 61 | class: audio_evals.agg.base.Coco 62 | args: {} 63 | 64 | naive-acc: 65 | class: audio_evals.agg.base.NaiveMean 66 | args: 67 | need_score_col: 68 | - acc 69 | 70 | geval: 71 | class: audio_evals.agg.base.NaiveMean 72 | args: 73 | need_score_col: 74 | - geval 75 | -------------------------------------------------------------------------------- /registry/dataset/AudioCaps.yaml: -------------------------------------------------------------------------------- 1 | audiocaps: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: caption 5 | 
name: TwinkStart/AudioCaps 6 | split: test 7 | ref_col: caption 8 | -------------------------------------------------------------------------------- /registry/dataset/COVID-recognizer.yaml: -------------------------------------------------------------------------------- 1 | COVID-recognizer: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: COVID-recognizer 5 | name: TwinkStart/COVID-recognizer 6 | split: test 7 | ref_col: status 8 | -------------------------------------------------------------------------------- /registry/dataset/CatDog.yaml: -------------------------------------------------------------------------------- 1 | catdog: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: catdog_identify 5 | name: TwinkStart/CatDog 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/ClothoAQA.yaml: -------------------------------------------------------------------------------- 1 | clotho-aqa-sample: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | name: TwinkStart/ClothoAQA 5 | split: sample 6 | default_task: aqa 7 | ref_col: answer 8 | 9 | clotho-aqa: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | name: TwinkStart/ClothoAQA 13 | split: test 14 | default_task: aqa 15 | ref_col: answer 16 | -------------------------------------------------------------------------------- /registry/dataset/CommonVoice.yaml: -------------------------------------------------------------------------------- 1 | cv-15-en: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr 5 | name: TwinkStart/CommonVoice_15 6 | split: en 7 | ref_col: sentence 8 | cv-15-zh: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: asr-zh 12 | name: TwinkStart/CommonVoice_15 13 | split: zh 14 | ref_col: sentence 15 | cv-15-fr: 16 | class: audio_evals.dataset.huggingface.Huggingface 17 | args: 18 | default_task: asr 19 | name: TwinkStart/CommonVoice_15 20 | split: fr 21 | ref_col: sentence 22 | cv-15-yue: 23 | class: audio_evals.dataset.huggingface.Huggingface 24 | args: 25 | default_task: asr-yue 26 | name: TwinkStart/CommonVoice_15 27 | split: yue 28 | ref_col: sentence 29 | -------------------------------------------------------------------------------- /registry/dataset/DESEDpublic_eval.yaml: -------------------------------------------------------------------------------- 1 | desed: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: desed_sound_analysis 5 | name: TwinkStart/DESEDpublic_eval 6 | split: test 7 | ref_col: event_label 8 | -------------------------------------------------------------------------------- /registry/dataset/GTZAN.yaml: -------------------------------------------------------------------------------- 1 | GTZAN: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: music_genre 5 | name: TwinkStart/GTZAN 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/GigaSpeech.yaml: -------------------------------------------------------------------------------- 1 | gigaspeech: 2 | class: audio_evals.dataset.giga.GigaSpeechDataset 3 | args: 4 | default_task: asr 5 | name: speechcolab/gigaspeech 6 | subset: test 7 | ref_col: text 8 | -------------------------------------------------------------------------------- 
/registry/dataset/KeSpeech.yaml: -------------------------------------------------------------------------------- 1 | KeSpeech: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr-zh 5 | name: TwinkStart/kespeech 6 | split: test 7 | ref_col: Text 8 | -------------------------------------------------------------------------------- /registry/dataset/MELD.yaml: -------------------------------------------------------------------------------- 1 | meld-emo: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | name: TwinkStart/MELD 5 | split: test 6 | default_task: emotion_analysis 7 | ref_col: Emotion 8 | 9 | meld-sentiment: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | name: TwinkStart/MELD 13 | split: test 14 | default_task: sentiment_analysis 15 | ref_col: Sentiment 16 | -------------------------------------------------------------------------------- /registry/dataset/MMAU.yaml: -------------------------------------------------------------------------------- 1 | mmau-test-mini: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: single_choice_with_answer 5 | name: TwinkStart/MMAU 6 | split: test_mini 7 | ref_col: answer 8 | -------------------------------------------------------------------------------- /registry/dataset/Nsynth.yaml: -------------------------------------------------------------------------------- 1 | nsynth: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: instrument_recognition 5 | name: TwinkStart/Nsynth 6 | split: test 7 | ref_col: instrument_family_str 8 | -------------------------------------------------------------------------------- /registry/dataset/RAVDESS.yaml: -------------------------------------------------------------------------------- 1 | ravdess-emo: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: ravdess_emotion_analysis 5 | name: TwinkStart/RAVDESS 6 | split: ravdess_emo 7 | ref_col: emotion 8 | ravdess-gender: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: gender_analysis 12 | name: TwinkStart/RAVDESS 13 | split: ravdess_gender 14 | ref_col: Gender 15 | -------------------------------------------------------------------------------- /registry/dataset/RespiratorySound.yaml: -------------------------------------------------------------------------------- 1 | respiratory-crackles: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: Respiratory-crackles-recognizer 5 | name: TwinkStart/RespiratorySound 6 | split: respiratory_crackles 7 | ref_col: Crackles 8 | respiratory-wheezes: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: Respiratory-wheezes-recognizer 12 | name: TwinkStart/RespiratorySound 13 | split: respiratory_wheezes 14 | ref_col: Wheezes 15 | -------------------------------------------------------------------------------- /registry/dataset/TESS.yaml: -------------------------------------------------------------------------------- 1 | TESS: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: emotion_analysis 5 | name: TwinkStart/TESS 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/VSC.yaml: -------------------------------------------------------------------------------- 1 | vocalsound: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | 
name: TwinkStart/vocalsound 5 | split: test 6 | default_task: vocalsound_analysis 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/VoxCeleb.yaml: -------------------------------------------------------------------------------- 1 | voxceleb1: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: gender_analysis 5 | name: TwinkStart/VoxCeleb 6 | split: voxceleb1 7 | ref_col: Gender 8 | voxceleb2: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: gender_analysis 12 | name: TwinkStart/VoxCeleb 13 | split: voxceleb2 14 | ref_col: Gender 15 | -------------------------------------------------------------------------------- /registry/dataset/WavCaps.yaml: -------------------------------------------------------------------------------- 1 | wavcaps-audioset: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: caption 5 | name: TwinkStart/wavcaps-audioset 6 | split: test 7 | ref_col: caption 8 | wavcaps-freesound: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: caption 12 | name: TwinkStart/wavcaps-freesound 13 | split: test 14 | ref_col: caption 15 | wavcaps-soundbible: 16 | class: audio_evals.dataset.huggingface.Huggingface 17 | args: 18 | default_task: caption 19 | name: TwinkStart/wavcaps-soundbible 20 | split: test 21 | ref_col: caption 22 | -------------------------------------------------------------------------------- /registry/dataset/WenetSpeech.yaml: -------------------------------------------------------------------------------- 1 | WenetSpeech-test-meeting: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr-zh 5 | name: TwinkStart/WenetSpeech 6 | split: test_meeting 7 | ref_col: text 8 | 9 | WenetSpeech-test-net: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | default_task: asr-zh 13 | name: TwinkStart/WenetSpeech 14 | split: test_net 15 | ref_col: text 16 | -------------------------------------------------------------------------------- /registry/dataset/air.yaml: -------------------------------------------------------------------------------- 1 | air-foundation: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: single_choice 5 | name: TwinkStart/air-foundation 6 | split: test 7 | ref_col: answer 8 | air-chat: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: air_chat 12 | name: TwinkStart/air_chat 13 | split: test 14 | ref_col: answer_gt 15 | -------------------------------------------------------------------------------- /registry/dataset/aishell.yaml: -------------------------------------------------------------------------------- 1 | aishell-1: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr-zh 5 | name: TwinkStart/AISHELL-1 6 | split: test 7 | ref_col: text 8 | -------------------------------------------------------------------------------- /registry/dataset/alpaca_eval.yaml: -------------------------------------------------------------------------------- 1 | speech-chatbot-alpaca-eval: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: glm-alpaca-eval 5 | name: TwinkStart/speech-chatbot-alpaca-eval 6 | split: test 7 | ref_col: output -------------------------------------------------------------------------------- /registry/dataset/audio-MNIST.yaml: 
-------------------------------------------------------------------------------- 1 | audio-MNIST: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: digit 5 | name: TwinkStart/audio-MNIST 6 | split: test 7 | ref_col: Digit 8 | -------------------------------------------------------------------------------- /registry/dataset/chord_recoganition.yaml: -------------------------------------------------------------------------------- 1 | chord-recognition: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: chord_recognition 5 | name: TwinkStart/chord_recoganition 6 | split: test 7 | ref_col: Label 8 | -------------------------------------------------------------------------------- /registry/dataset/fleurs.yaml: -------------------------------------------------------------------------------- 1 | fleurs-zh: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | subset: cmn_hans_cn 5 | default_task: asr-zh 6 | name: google/fleurs 7 | ref_col: raw_transcription 8 | split: test 9 | fleurs-hi_in: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | subset: hi_in 13 | default_task: asr 14 | name: google/fleurs 15 | ref_col: raw_transcription 16 | split: test 17 | fleurs-de_de: 18 | class: audio_evals.dataset.huggingface.Huggingface 19 | args: 20 | subset: de_de 21 | default_task: asr 22 | name: google/fleurs 23 | ref_col: raw_transcription 24 | split: test 25 | fleurs-ja_jp: 26 | class: audio_evals.dataset.huggingface.Huggingface 27 | args: 28 | subset: ja_jp 29 | default_task: asr-jp 30 | name: google/fleurs 31 | ref_col: raw_transcription 32 | split: test 33 | fleurs-ru_ru: 34 | class: audio_evals.dataset.huggingface.Huggingface 35 | args: 36 | subset: ru_ru 37 | default_task: asr 38 | name: google/fleurs 39 | ref_col: raw_transcription 40 | split: test 41 | fleurs-en_us: 42 | class: audio_evals.dataset.huggingface.Huggingface 43 | args: 44 | subset: en_us 45 | default_task: asr 46 | name: google/fleurs 47 | ref_col: raw_transcription 48 | split: test 49 | fleurs-fa_ir: 50 | class: audio_evals.dataset.huggingface.Huggingface 51 | args: 52 | subset: fa_ir 53 | default_task: asr 54 | name: google/fleurs 55 | ref_col: raw_transcription 56 | split: test 57 | fleurs-ar_eg: 58 | class: audio_evals.dataset.huggingface.Huggingface 59 | args: 60 | subset: ar_eg 61 | default_task: asr 62 | name: google/fleurs 63 | ref_col: raw_transcription 64 | split: test 65 | fleurs-fr_fr: 66 | class: audio_evals.dataset.huggingface.Huggingface 67 | args: 68 | subset: fr_fr 69 | default_task: asr 70 | name: google/fleurs 71 | ref_col: raw_transcription 72 | split: test 73 | fleurs-ko_kr: 74 | class: audio_evals.dataset.huggingface.Huggingface 75 | args: 76 | subset: ko_kr 77 | default_task: asr 78 | name: google/fleurs 79 | ref_col: raw_transcription 80 | split: test 81 | -------------------------------------------------------------------------------- /registry/dataset/heart_beat.yaml: -------------------------------------------------------------------------------- 1 | heartbeat_sound: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: Heartbeat-recognizer 5 | name: TwinkStart/heart_beat 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/librispeech.yaml: -------------------------------------------------------------------------------- 1 | librispeech-test-clean: 2 | class: 
audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr 5 | name: TwinkStart/librispeech 6 | split: test_clean 7 | ref_col: text 8 | 9 | librispeech-dev-clean: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | default_task: asr 13 | name: TwinkStart/librispeech 14 | split: dev_clean 15 | ref_col: text 16 | 17 | librispeech-test-other: 18 | class: audio_evals.dataset.huggingface.Huggingface 19 | args: 20 | default_task: asr 21 | name: TwinkStart/librispeech 22 | split: test_other 23 | ref_col: text 24 | 25 | librispeech-dev-other: 26 | class: audio_evals.dataset.huggingface.Huggingface 27 | args: 28 | default_task: asr 29 | name: TwinkStart/librispeech 30 | split: dev_other 31 | ref_col: text 32 | -------------------------------------------------------------------------------- /registry/dataset/llama_questions.yaml: -------------------------------------------------------------------------------- 1 | llama-questions: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: s2s-aqa 5 | name: TwinkStart/llama-questions 6 | split: test 7 | ref_col: Answer 8 | 9 | llama-questions-s2t: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | default_task: loose-aqa 13 | name: TwinkStart/llama-questions 14 | split: test 15 | ref_col: Answer -------------------------------------------------------------------------------- /registry/dataset/multilingual_librispeech.yaml: -------------------------------------------------------------------------------- 1 | mls_dutch: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr 5 | name: TwinkStart/facebook_multilingual_librispeech 6 | split: mls_dutch 7 | ref_col: Text 8 | 9 | mls_french: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | default_task: asr 13 | name: TwinkStart/facebook_multilingual_librispeech 14 | split: mls_french 15 | ref_col: Text 16 | 17 | mls_german: 18 | class: audio_evals.dataset.huggingface.Huggingface 19 | args: 20 | default_task: asr 21 | name: TwinkStart/facebook_multilingual_librispeech 22 | split: mls_german 23 | ref_col: Text 24 | 25 | mls_italian: 26 | class: audio_evals.dataset.huggingface.Huggingface 27 | args: 28 | default_task: asr 29 | name: TwinkStart/facebook_multilingual_librispeech 30 | split: mls_italian 31 | ref_col: Text 32 | 33 | mls_polish: 34 | class: audio_evals.dataset.huggingface.Huggingface 35 | args: 36 | default_task: asr 37 | name: TwinkStart/facebook_multilingual_librispeech 38 | split: mls_polish 39 | ref_col: Text 40 | 41 | mls_portuguese: 42 | class: audio_evals.dataset.huggingface.Huggingface 43 | args: 44 | default_task: asr 45 | name: TwinkStart/facebook_multilingual_librispeech 46 | split: mls_portuguese 47 | ref_col: Text 48 | 49 | mls_spanish: 50 | class: audio_evals.dataset.huggingface.Huggingface 51 | args: 52 | default_task: asr 53 | name: TwinkStart/facebook_multilingual_librispeech 54 | split: mls_spanish 55 | ref_col: Text 56 | -------------------------------------------------------------------------------- /registry/dataset/peoples_speech.yaml: -------------------------------------------------------------------------------- 1 | peoples-speech: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr 5 | name: TwinkStart/peoples_speech 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/sample.yaml: 
--------------------------------------------------------------------------------
1 | sample:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     default_task: asr-zh
5 |     name: TwinkStart/sample
6 |     split: sample
7 |     ref_col: Text
8 |
9 | sample-en:
10 |   class: audio_evals.dataset.huggingface.Huggingface
11 |   args:
12 |     default_task: asr
13 |     name: TwinkStart/sample
14 |     split: sample-en
15 |     ref_col: Text
16 |
--------------------------------------------------------------------------------
/registry/dataset/tedlium.yaml:
--------------------------------------------------------------------------------
1 | tedlium-release1:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     default_task: asr
5 |     name: TwinkStart/tedlium
6 |     subset: release1
7 |     ref_col: text
8 | tedlium-release2:
9 |   class: audio_evals.dataset.huggingface.Huggingface
10 |   args:
11 |     default_task: asr
12 |     name: TwinkStart/tedlium
13 |     subset: release2
14 |     ref_col: text
15 | tedlium-release3:
16 |   class: audio_evals.dataset.huggingface.Huggingface
17 |   args:
18 |     default_task: asr
19 |     name: TwinkStart/tedlium
20 |     subset: release3
21 |     ref_col: text
22 |
--------------------------------------------------------------------------------
/registry/dataset/triviaqa.yaml:
--------------------------------------------------------------------------------
1 | speech-triviaqa:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     default_task: s2s-aqa
5 |     name: TwinkStart/speech-triavia-qa
6 |     ref_col: answer
7 |     split: test
8 |
9 | speech-triviaqa-s2t:
10 |   class: audio_evals.dataset.huggingface.Huggingface
11 |   args:
12 |     default_task: loose-aqa
13 |     name: TwinkStart/speech-triavia-qa
14 |     ref_col: answer
15 |     split: test
16 |
--------------------------------------------------------------------------------
/registry/dataset/voxpopuli.yaml:
--------------------------------------------------------------------------------
1 | voxpopuli-en:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     subset: en
5 |     default_task: asr
6 |     name: facebook/voxpopuli
7 |     ref_col: normalized_text
8 |     split: test
9 |
--------------------------------------------------------------------------------
/registry/dataset/webQ.yaml:
--------------------------------------------------------------------------------
1 | speech-web-questions:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     default_task: s2s-aqa
5 |     name: TwinkStart/speech-web-questions
6 |     ref_col: answers
7 |     split: test
8 |
9 | speech-web-questions-s2t:
10 |   class: audio_evals.dataset.huggingface.Huggingface
11 |   args:
12 |     default_task: loose-aqa
13 |     name: TwinkStart/speech-web-questions
14 |     ref_col: answers
15 |     split: test
--------------------------------------------------------------------------------
/registry/eval_task/acoustics.yaml:
--------------------------------------------------------------------------------
1 | speech-quality:
2 |   class: audio_evals.base.EvalTaskCfg
3 |   args:
4 |     dataset: clotho-aqa
5 |     prompt: direct-aqa
6 |     model: qwen-audio-chat
7 |     post_process: ['extract_audio']
8 |     evaluator: speech_quality
9 |     agg: mean
10 |
--------------------------------------------------------------------------------
/registry/eval_task/air.yaml:
--------------------------------------------------------------------------------
1 | single_choice:
2 |   class: audio_evals.base.EvalTaskCfg
3 |   args:
4 |     dataset: air-foundation
5 |     prompt: single_choice
6 |     model: qwen-audio-chat
7 |     evaluator: prefix-match
8 | agg: acc 9 | 10 | air_chat: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: air-chat 14 | prompt: qa 15 | model: qwen-audio-chat 16 | evaluator: air-bench-geval 17 | agg: airbench-chat 18 | 19 | single_choice_with_answer: 20 | class: audio_evals.base.EvalTaskCfg 21 | args: 22 | dataset: mmau 23 | prompt: single_choice_with_answer 24 | model: qwen-audio-chat 25 | evaluator: choice-strings-match 26 | agg: acc 27 | -------------------------------------------------------------------------------- /registry/eval_task/alpaca.yaml: -------------------------------------------------------------------------------- 1 | alpaca-eval: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: alpaca-eval-audio 5 | prompt: direct-aqa 6 | model: qwen-audio-chat 7 | evaluator: alpaca_eval_gpt4 8 | agg: naive-acc 9 | 10 | glm-alpaca-eval: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: speech-chatbot-alpaca-eval 14 | prompt: direct-aqa 15 | model: qwen-audio-chat 16 | post_process: ['extract_audio', 'speech2text'] 17 | evaluator: chatbot_eval 18 | agg: geval 19 | 20 | glm-alpaca-eval-s2t: 21 | class: audio_evals.base.EvalTaskCfg 22 | args: 23 | dataset: speech-chatbot-alpaca-eval 24 | prompt: direct-aqa 25 | model: qwen-audio-chat 26 | post_process: [] 27 | evaluator: chatbot_eval 28 | agg: geval 29 | -------------------------------------------------------------------------------- /registry/eval_task/aqa.yaml: -------------------------------------------------------------------------------- 1 | aqa: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: clotho-aqa 5 | prompt: aqa 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | 10 | loose-aqa: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: clotho-aqa 14 | prompt: direct-aqa 15 | model: qwen-audio-chat 16 | post_process: ['extract_text'] 17 | evaluator: qa-exist-match 18 | agg: acc 19 | 20 | s2s-aqa: 21 | class: audio_evals.base.EvalTaskCfg 22 | args: 23 | dataset: clotho-aqa 24 | prompt: direct-aqa 25 | model: qwen-audio-chat 26 | post_process: ['extract_audio', 'speech2text'] 27 | evaluator: qa-exist-match 28 | agg: acc 29 | 30 | choice-aqa: 31 | class: audio_evals.base.EvalTaskCfg 32 | args: 33 | dataset: clotho-aqa 34 | prompt: direct-aqa 35 | model: qwen-audio-chat 36 | post_process: ['extract_audio', 'speech2text', 'first_option'] 37 | evaluator: em 38 | agg: acc 39 | 40 | s2t-choice-aqa: 41 | class: audio_evals.base.EvalTaskCfg 42 | args: 43 | dataset: clotho-aqa 44 | prompt: direct-aqa 45 | model: qwen-audio-chat 46 | post_process: ['first_option'] 47 | evaluator: em 48 | agg: acc 49 | -------------------------------------------------------------------------------- /registry/eval_task/asr.yaml: -------------------------------------------------------------------------------- 1 | asr: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: KeSpeech 5 | prompt: asr 6 | model: qwen-audio 7 | post_process: ['json_content'] 8 | evaluator: wer 9 | agg: wer 10 | 11 | asr-3o: 12 | class: audio_evals.base.EvalTaskCfg 13 | args: 14 | dataset: KeSpeech 15 | prompt: asr-en 16 | model: qwen-audio 17 | evaluator: wer 18 | agg: wer 19 | 20 | asr-zh-3o: 21 | class: audio_evals.base.EvalTaskCfg 22 | args: 23 | dataset: KeSpeech 24 | prompt: asr-zh 25 | model: qwen-audio 26 | evaluator: cer 27 | agg: wer-zh 28 | 29 | asr-zh: 30 | class: audio_evals.base.EvalTaskCfg 31 | args: 32 | dataset: KeSpeech 33 | prompt: asr 34 | model: qwen-audio 35 | post_process: 
['json_content'] 36 | evaluator: cer 37 | agg: wer-zh 38 | 39 | asr-jp: 40 | class: audio_evals.base.EvalTaskCfg 41 | args: 42 | dataset: fleurs-ja_jp 43 | prompt: asr 44 | model: qwen-audio 45 | post_process: ['json_content'] 46 | evaluator: wer-jp 47 | agg: wer-jp 48 | 49 | asr-yue: 50 | class: audio_evals.base.EvalTaskCfg 51 | args: 52 | dataset: KeSpeech 53 | prompt: asr 54 | model: qwen-audio 55 | post_process: ['json_content'] 56 | evaluator: wer-yue 57 | agg: wer-yue 58 | 59 | asr-kr: 60 | class: audio_evals.base.EvalTaskCfg 61 | args: 62 | dataset: KeSpeech 63 | prompt: asr 64 | model: qwen-audio 65 | post_process: ['json_content'] 66 | evaluator: wer-kr 67 | agg: wer-kr 68 | -------------------------------------------------------------------------------- /registry/eval_task/caption.yaml: -------------------------------------------------------------------------------- 1 | caption: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: air-foundation 5 | prompt: caption 6 | model: qwen-audio-chat 7 | evaluator: dump 8 | agg: coco 9 | -------------------------------------------------------------------------------- /registry/eval_task/digit.yaml: -------------------------------------------------------------------------------- 1 | digit: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: clotho-aqa 5 | prompt: digit 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | -------------------------------------------------------------------------------- /registry/eval_task/emo.yaml: -------------------------------------------------------------------------------- 1 | emotion_analysis: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: meld 5 | prompt: emo_analysis 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | 10 | ravdess_emotion_analysis: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: meld 14 | prompt: ravdess_emo_analysis 15 | model: qwen-audio-chat 16 | evaluator: prefix-match 17 | agg: acc 18 | 19 | sentiment_analysis: 20 | class: audio_evals.base.EvalTaskCfg 21 | args: 22 | dataset: meld 23 | prompt: sentiment_analysis 24 | model: qwen-audio-chat 25 | evaluator: prefix-match 26 | agg: acc 27 | 28 | desed_sound_analysis: 29 | class: audio_evals.base.EvalTaskCfg 30 | args: 31 | dataset: desed 32 | prompt: sound_analysis 33 | model: qwen-audio-chat 34 | evaluator: prefix-match 35 | agg: acc 36 | -------------------------------------------------------------------------------- /registry/eval_task/gender.yaml: -------------------------------------------------------------------------------- 1 | gender_analysis: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: meld 5 | prompt: gender_analysis 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | -------------------------------------------------------------------------------- /registry/eval_task/inference.yaml: -------------------------------------------------------------------------------- 1 | inference: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: covost2-local 5 | prompt: covost2-en-zh 6 | model: qwen-audio 7 | evaluator: dump 8 | agg: dump 9 | -------------------------------------------------------------------------------- /registry/eval_task/medicine.yaml: -------------------------------------------------------------------------------- 1 | COVID-recognizer: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: clotho-aqa 5 | prompt: COVID-recognizer 6 | model: qwen-audio-chat 7 | 
evaluator: prefix-match 8 | agg: acc 9 | 10 | Heartbeat-recognizer: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: clotho-aqa 14 | prompt: Heartbeat-recognizer 15 | model: qwen-audio-chat 16 | evaluator: prefix-match 17 | agg: acc 18 | 19 | Respiratory-crackles-recognizer: 20 | class: audio_evals.base.EvalTaskCfg 21 | args: 22 | dataset: clotho-aqa 23 | prompt: Respiratory-crackles-recognizer 24 | model: qwen-audio-chat 25 | evaluator: prefix-match 26 | agg: acc 27 | 28 | Respiratory-wheezes-recognizer: 29 | class: audio_evals.base.EvalTaskCfg 30 | args: 31 | dataset: clotho-aqa 32 | prompt: Respiratory-wheezes-recognizer 33 | model: qwen-audio-chat 34 | evaluator: prefix-match 35 | agg: acc 36 | -------------------------------------------------------------------------------- /registry/eval_task/music.yaml: -------------------------------------------------------------------------------- 1 | instrument_recognition: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: nsyth 5 | prompt: instrument_recognition 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | 10 | chord_recognition: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: nsyth 14 | prompt: chord_recognition 15 | model: qwen-audio-chat 16 | evaluator: prefix-match 17 | agg: acc 18 | 19 | music_genre: 20 | class: audio_evals.base.EvalTaskCfg 21 | args: 22 | dataset: nsyth 23 | prompt: music_genre 24 | model: qwen-audio-chat 25 | evaluator: prefix-match 26 | agg: acc 27 | 28 | 29 | music_tempo: 30 | class: audio_evals.base.EvalTaskCfg 31 | args: 32 | dataset: nsyth 33 | prompt: music_tempo 34 | model: qwen-audio-chat 35 | evaluator: em 36 | agg: acc 37 | -------------------------------------------------------------------------------- /registry/eval_task/sound_identify.yaml: -------------------------------------------------------------------------------- 1 | catdog_identify: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: vocalsound 5 | prompt: catdog_identify 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | -------------------------------------------------------------------------------- /registry/eval_task/vsc.yaml: -------------------------------------------------------------------------------- 1 | vocalsound_analysis: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: vocalsound 5 | prompt: vocal_sound_analysis 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | -------------------------------------------------------------------------------- /registry/evaluator/air-bench.yaml: -------------------------------------------------------------------------------- 1 | air-bench-geval: 2 | class: audio_evals.evaluator.air_chat.AIRChatEvaluator 3 | args: 4 | model_name: gpt4o-mini 5 | -------------------------------------------------------------------------------- /registry/evaluator/alpaca.yaml: -------------------------------------------------------------------------------- 1 | alpaca_eval_gpt4: 2 | class: audio_evals.evaluator.alpaca_eval.AlpacaEvaluator 3 | args: 4 | model_name: gpt4o-mini 5 | 6 | chatbot_eval: 7 | class: audio_evals.evaluator.alpaca_eval.ChatbotEvaluator 8 | args: 9 | model_name: gpt4o-mini 10 | 11 | ref_qa_geval: 12 | class: audio_evals.evaluator.ref_qa_geval.RefQAGEval 13 | args: 14 | model_name: mb-gpt4o-mini 15 | -------------------------------------------------------------------------------- /registry/evaluator/choice-with-ans.yaml: 
-------------------------------------------------------------------------------- 1 | choice-strings-match: 2 | class: audio_evals.evaluator.string_match.ChoiceStringMatch 3 | args: {} 4 | -------------------------------------------------------------------------------- /registry/evaluator/common.yaml: -------------------------------------------------------------------------------- 1 | dump: 2 | class: audio_evals.evaluator.base.Dump 3 | args: {} 4 | 5 | em: 6 | class: audio_evals.evaluator.base.EM 7 | args: {} 8 | 9 | exist-match: 10 | class: audio_evals.evaluator.base.ExistMatch 11 | args: {} 12 | 13 | prefix-match: 14 | class: audio_evals.evaluator.base.PrefixMatch 15 | args: {} 16 | 17 | wer: 18 | class: audio_evals.evaluator.wer.WER 19 | args: 20 | ignore_case: true 21 | 22 | wer-jp: 23 | class: audio_evals.evaluator.wer.WER 24 | args: 25 | ignore_case: true 26 | lang: jp 27 | 28 | wer-kr: 29 | class: audio_evals.evaluator.wer.WER 30 | args: 31 | ignore_case: true 32 | lang: kr 33 | 34 | wer-yue: 35 | class: audio_evals.evaluator.wer.WER 36 | args: 37 | ignore_case: true 38 | lang: yue 39 | 40 | wer-sensitive-case: 41 | class: audio_evals.evaluator.wer.WER 42 | args: {} 43 | 44 | cer: 45 | class: audio_evals.evaluator.wer.CER 46 | args: {} 47 | 48 | bleu: 49 | class: audio_evals.evaluator.bleu.BLEU 50 | args: {} 51 | 52 | bleu-zh: 53 | class: audio_evals.evaluator.bleu.BLEU 54 | args: 55 | lang: zh 56 | 57 | bleu-jp: 58 | class: audio_evals.evaluator.bleu.BLEU 59 | args: 60 | lang: jp 61 | 62 | bleu-char: 63 | class: audio_evals.evaluator.bleu.BLEU 64 | args: 65 | lang: char 66 | 67 | coco: 68 | class: audio_evals.evaluator.coco.Coco 69 | args: {} 70 | -------------------------------------------------------------------------------- /registry/evaluator/dnsmos.yaml: -------------------------------------------------------------------------------- 1 | dnsmos: 2 | class: audio_evals.evaluator.dnsmos.DNSMOS 3 | args: 4 | model_name: dnsmos 5 | -------------------------------------------------------------------------------- /registry/evaluator/llama-speech.yaml: -------------------------------------------------------------------------------- 1 | llama_speech_eval_gpt4: 2 | class: audio_evals.evaluator.alpaca_eval.AlpacaEvaluator 3 | args: 4 | model_name: gpt4o-mini 5 | -------------------------------------------------------------------------------- /registry/evaluator/qa.yaml: -------------------------------------------------------------------------------- 1 | qa-exist-match: 2 | class: audio_evals.evaluator.qa_exact_match.QAExistMatchEvaluator 3 | args: {} 4 | 5 | -------------------------------------------------------------------------------- /registry/evaluator/simo.yaml: -------------------------------------------------------------------------------- 1 | simo: 2 | class: audio_evals.evaluator.simo.Simo 3 | args: 4 | model_name: wavlm_large 5 | -------------------------------------------------------------------------------- /registry/evaluator/speech_qulity.yaml: -------------------------------------------------------------------------------- 1 | speech_quality: 2 | class: audio_evals.evaluator.ensemble.Ensemble 3 | args: 4 | components: 5 | - dnsmos 6 | - utmos 7 | 8 | 9 | vc_quality: 10 | class: audio_evals.evaluator.ensemble.Ensemble 11 | args: 12 | components: 13 | - dnsmos 14 | - utmos 15 | - simo 16 | -------------------------------------------------------------------------------- /registry/evaluator/utmos.yaml: 
-------------------------------------------------------------------------------- 1 | utmos: 2 | class: audio_evals.evaluator.utmos.UTMOS 3 | args: {} 4 | -------------------------------------------------------------------------------- /registry/model/ali.yaml: -------------------------------------------------------------------------------- 1 | qwen-audio: 2 | class: audio_evals.models.ali.AliApi 3 | args: 4 | model_name: 'qwen-audio-chat' 5 | -------------------------------------------------------------------------------- /registry/model/dnsmos.yaml: -------------------------------------------------------------------------------- 1 | dnsmos: 2 | class: audio_evals.models.dnsmos.DNSMOS 3 | args: 4 | model_path: 5 | p_model_path: 6 | p808_model_path: 7 | env_path: envs/dnsmos 8 | requirements_path: audio_evals/lib/DNSMOS/requirements.txt 9 | -------------------------------------------------------------------------------- /registry/model/gemini.yaml: -------------------------------------------------------------------------------- 1 | gemini-pro: 2 | class: audio_evals.models.google.Gemini 3 | args: 4 | model_name: 'gemini-pro' 5 | 6 | gemini-1.5-pro: 7 | class: audio_evals.models.google.Gemini 8 | args: 9 | model_name: 'gemini-1.5-pro' 10 | 11 | gemini-1.5-flash: 12 | class: audio_evals.models.google.Gemini 13 | args: 14 | model_name: 'gemini-1.5-flash' 15 | 16 | gemini-2.0-flash-exp: 17 | class: audio_evals.models.google.Gemini 18 | args: 19 | model_name: 'gemini-2.0-flash-exp' 20 | 21 | gemini-2.5-flash: 22 | class: audio_evals.models.google.Gemini 23 | args: 24 | model_name: 'gemini-2.5-flash-preview-04-17' 25 | 26 | gemini-2.5-pro: 27 | class: audio_evals.models.google.Gemini 28 | args: 29 | model_name: 'gemini-2.5-pro-preview-05-06' 30 | -------------------------------------------------------------------------------- /registry/model/minicpmo.yaml: -------------------------------------------------------------------------------- 1 | MiniCPMo2_6-audio: 2 | class: audio_evals.models.mini_cpm.MiniCPMo 3 | args: 4 | path: openbmb/MiniCPM-o-2_6 5 | speech: false 6 | env_path: envs/minicpmo2_6 7 | requirements_path: audio_evals/lib/minicpm/requirements.txt 8 | sample_params: 9 | sampling: false 10 | num_beams: 5 11 | max_new_tokens: 128 12 | 13 | MiniCPMo2_6-speech: 14 | class: audio_evals.models.mini_cpm.MiniCPMo 15 | args: 16 | path: openbmb/MiniCPM-o-2_6 17 | speech: true 18 | env_path: envs/minicpmo2_6 19 | requirements_path: audio_evals/lib/minicpm/requirements.txt 20 | sample_params: 21 | sampling: false 22 | num_beams: 5 23 | max_new_tokens: 128 24 | -------------------------------------------------------------------------------- /registry/model/moonshot.yaml: -------------------------------------------------------------------------------- 1 | kimiaudio: 2 | class: audio_evals.models.moonshot.KimiAudioModel 3 | args: 4 | model_path: moonshotai/Kimi-Audio-7B-Instruct 5 | env_path: envs/kimiaudio 6 | requirements_path: audio_evals/lib/Kimi-Audio/requirements.txt 7 | 8 | kimiaudio-speech: 9 | class: audio_evals.models.moonshot.KimiAudioModel 10 | args: 11 | model_path: /data/shiqundong/model/Kimi-Audio-7B-Instruct 12 | speech: True 13 | env_path: envs/kimiaudio 14 | requirements_path: audio_evals/lib/Kimi-Audio/requirements.txt 15 | -------------------------------------------------------------------------------- /registry/model/offline.yaml: -------------------------------------------------------------------------------- 1 | qwen2-audio-offline: 2 | class: 
audio_evals.models.qwen.Qwen2audioPretrain 3 | args: 4 | path: Qwen/Qwen2-Audio-7B 5 | sample_params: 6 | do_sample: false 7 | max_new_tokens: 256 8 | min_new_tokens: 1 9 | length_penalty: 1.0 10 | num_return_sequences: 1 11 | repetition_penalty: 1.0 12 | use_cache: True 13 | 14 | qwen2-audio-chat: 15 | class: audio_evals.models.qwen.Qwen2audio 16 | args: 17 | path: Qwen/Qwen2-Audio-7B-Instruct 18 | sample_params: 19 | do_sample: false 20 | max_new_tokens: 256 21 | min_new_tokens: 1 22 | length_penalty: 1.0 23 | num_return_sequences: 1 24 | repetition_penalty: 1.0 25 | use_cache: True 26 | 27 | qwen-audio-chat-offline: 28 | class: audio_evals.models.offline_model.OfflineModel 29 | args: 30 | is_chat: True 31 | path: Qwen/Qwen-Audio-Chat 32 | sample_params: 33 | do_sample: false 34 | max_new_tokens: 256 35 | min_new_tokens: 1 36 | length_penalty: 1.0 37 | num_return_sequences: 1 38 | repetition_penalty: 1.0 39 | use_cache: True 40 | 41 | qwen-audio-pretrain-offline: 42 | class: audio_evals.models.offline_model.OfflinePretrainModel 43 | args: 44 | is_chat: False 45 | path: Qwen/Qwen-Audio 46 | padding_side: left 47 | sample_params: 48 | do_sample: false 49 | max_new_tokens: 256 50 | min_new_tokens: 1 51 | length_penalty: 1.0 52 | num_return_sequences: 1 53 | repetition_penalty: 1.0 54 | use_cache: True 55 | -------------------------------------------------------------------------------- /registry/model/ola.yaml: -------------------------------------------------------------------------------- 1 | ola-7b: 2 | class: audio_evals.models.ola.OlaModel 3 | args: 4 | path: THUdyh/Ola-7b 5 | env_path: envs/ola 6 | requirements_path: audio_evals/lib/Ola/requirements.txt 7 | -------------------------------------------------------------------------------- /registry/model/paraformer.yaml: -------------------------------------------------------------------------------- 1 | paraformer-zh: 2 | class: audio_evals.models.asr.paraformer.Paraformer 3 | args: 4 | path: funasr/paraformer-zh 5 | env_path: envs/paraformer 6 | requirements_path: audio_evals/lib/paraformer/requirements.txt 7 | 8 | paraformer-large: 9 | class: audio_evals.models.asr.paraformer.Paraformer 10 | args: 11 | path: funasr/Paraformer-large 12 | env_path: envs/paraformer 13 | requirements_path: audio_evals/lib/paraformer/requirements.txt 14 | 15 | paraformer-zh-streaming: 16 | class: audio_evals.models.asr.paraformer.Paraformer 17 | args: 18 | path: funasr/paraformer-zh-streaming 19 | env_path: envs/paraformer 20 | requirements_path: audio_evals/lib/paraformer/requirements.txt 21 | 22 | paraformer-en: 23 | class: audio_evals.models.asr.paraformer.Paraformer 24 | args: 25 | path: funasr/paraformer-en 26 | env_path: envs/paraformer 27 | requirements_path: audio_evals/lib/paraformer/requirements.txt 28 | 29 | conformer-en: 30 | class: audio_evals.models.asr.paraformer.Paraformer 31 | args: 32 | path: funasr/conformer-en 33 | env_path: envs/paraformer 34 | requirements_path: audio_evals/lib/paraformer/requirements.txt 35 | -------------------------------------------------------------------------------- /registry/model/qwen2.5.yaml: -------------------------------------------------------------------------------- 1 | qwen2.5-omni-audio: 2 | class: audio_evals.models.qwen2_5.QwenOmni 3 | args: 4 | path: Qwen/Qwen2.5-Omni-7B 5 | env_path: envs/qwen2.5-omni 6 | requirements_path: audio_evals/lib/qwen2-5omni/requirements.txt 7 | 8 | qwen2.5-omni-speech: 9 | class: audio_evals.models.qwen2_5.QwenOmni 10 | args: 11 | path: Qwen/Qwen2.5-Omni-7B 12 | 
speech: true 13 | env_path: envs/qwen2.5-omni 14 | requirements_path: audio_evals/lib/qwen2-5omni/requirements.txt 15 | -------------------------------------------------------------------------------- /registry/model/speechLLM.yaml: -------------------------------------------------------------------------------- 1 | glm-4-voice: 2 | class: audio_evals.models.glm4voice.GLM4Voice 3 | args: 4 | url: http://127.0.0.1:10000/generate_stream 5 | sr: 22500 6 | volume: 32767 7 | 8 | speech-gpt: 9 | class: audio_evals.models.glm4voice.GLM4Voice 10 | args: 11 | url: http://127.0.0.1:31505/chat 12 | sr: 16000 13 | volume: 32767 14 | 15 | moshi: 16 | class: audio_evals.models.glm4voice.GLM4Voice 17 | args: 18 | url: http://127.0.0.1:31610/chat 19 | sr: 22500 20 | cut_greeting: True 21 | 22 | 23 | llama-omni: 24 | class: audio_evals.models.llama_omni.LlamaOmni 25 | args: 26 | url: http://127.0.0.1:32039/worker_generate_stream 27 | 28 | mini-omni: 29 | class: audio_evals.models.mini_omni.MiniOmni 30 | args: 31 | url: http://127.0.0.1:32213/chat 32 | -------------------------------------------------------------------------------- /registry/model/step.yaml: -------------------------------------------------------------------------------- 1 | step-audio: 2 | class: audio_evals.models.step_audio.StepAudioChat 3 | args: 4 | url: http://127.0.0.1:5000/inference 5 | s2t: true 6 | 7 | step-speech: 8 | class: audio_evals.models.step_audio.StepAudioChat 9 | args: 10 | url: http://127.0.0.1:5000/inference 11 | s2t: false 12 | -------------------------------------------------------------------------------- /registry/model/tencent.yaml: -------------------------------------------------------------------------------- 1 | tencent-zh: 2 | class: audio_evals.models.asr.tencent.TencentASRModel 3 | args: 4 | secret_id: 5 | secret_key: 6 | sample_params: 7 | EngSerViceType: 16k_zh 8 | 9 | tencent-en: 10 | class: audio_evals.models.asr.tencent.TencentASRModel 11 | args: 12 | secret_id: 13 | secret_key: 14 | sample_params: 15 | EngSerViceType: 16k_en 16 | -------------------------------------------------------------------------------- /registry/model/ultravox.yaml: -------------------------------------------------------------------------------- 1 | ultravox: 2 | class: audio_evals.models.UltraVOX.UltraVOX 3 | args: 4 | path: fixie-ai/ultravox-v0_4 5 | sample_params: 6 | max_new_tokens: 256 -------------------------------------------------------------------------------- /registry/model/utmos.yaml: -------------------------------------------------------------------------------- 1 | utmos-en: 2 | class: audio_evals.models.utmos.UTMOS 3 | args: 4 | path: sarulab-speech/UTMOS-demo 5 | env_path: envs/utmos 6 | requirements_path: audio_evals/lib/utmos/requirements.txt 7 | -------------------------------------------------------------------------------- /registry/model/wavlm.yaml: -------------------------------------------------------------------------------- 1 | wavlm_large: 2 | class: audio_evals.models.wavlm.WavLM 3 | args: 4 | path: https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view 5 | env_path: envs/simo 6 | requirements_path: audio_evals/lib/simo/requirements.txt 7 | -------------------------------------------------------------------------------- /registry/process/base.yaml: -------------------------------------------------------------------------------- 1 | json_content: 2 | class: audio_evals.process.base.ContentExtract 3 | args: {} 4 | 5 | 6 | qwen_pretrain_asr_tractor_zh: 7 | class: 
audio_evals.process.qwen.QwenAudioASRExtract 8 | args: 9 | lang: zh 10 | 11 | qwen_pretrain_asr_tractor: 12 | class: audio_evals.process.qwen.QwenAudioASRExtract 13 | args: 14 | lang: en 15 | 16 | zh_text_normalizer: 17 | class: audio_evals.process.normalization.TextNormalization 18 | args: 19 | lang: zh 20 | 21 | en_text_normalizer: 22 | class: audio_evals.process.normalization.TextNormalization 23 | args: 24 | lang: en 25 | 26 | text_normalizer: 27 | class: audio_evals.process.normalization.TextNormalization 28 | args: {} 29 | 30 | trivia_qa_normalizer: 31 | class: audio_evals.process.triviaqa.TriviaQaNormalizer 32 | args: {} -------------------------------------------------------------------------------- /registry/process/choice.yaml: -------------------------------------------------------------------------------- 1 | first_option: 2 | class: audio_evals.process.firstoption.FirstOption 3 | args: 4 | options: ABCD 5 | -------------------------------------------------------------------------------- /registry/process/speech_model_output.yaml: -------------------------------------------------------------------------------- 1 | extract_audio: 2 | class: audio_evals.process.base.JsonExtract 3 | args: 4 | extract_key: audio 5 | 6 | extract_text: 7 | class: audio_evals.process.base.JsonExtract 8 | args: 9 | extract_key: text 10 | 11 | speech2text: 12 | class: audio_evals.process.speech.Speech2text 13 | args: 14 | model_name: whisper 15 | 16 | speech2text-zh: 17 | class: audio_evals.process.speech.Speech2text 18 | args: 19 | model_name: paraformer-zh 20 | prompt_name: simple-asr 21 | -------------------------------------------------------------------------------- /registry/prompt/aqa.yaml: -------------------------------------------------------------------------------- 1 | aqa: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "answer the question without explain\n# Question: {{QuestionText}}" 11 | 12 | direct-aqa: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: audio 19 | value: "{{WavPath}}" 20 | 21 | text-aqa: 22 | class: audio_evals.prompt.base.Prompt 23 | args: 24 | template: 25 | - role: user 26 | contents: 27 | - type: text 28 | value: "{{question_text}}" 29 | -------------------------------------------------------------------------------- /registry/prompt/asr.yaml: -------------------------------------------------------------------------------- 1 | asr: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "listen the audio, output the audio content with format {\"content\": \"\"}" 11 | 12 | simple-asr: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | audio: "{{WavPath}}" -------------------------------------------------------------------------------- /registry/prompt/caption.yaml: -------------------------------------------------------------------------------- 1 | caption: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "describe the audio:" 11 | -------------------------------------------------------------------------------- /registry/prompt/chatbot.yaml: -------------------------------------------------------------------------------- 1 | 
chatbot-eval: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: '[Instruction] 5 | Please act as an impartial judge and evaluate the quality of the response provided 6 | by an AI assistant to the user question displayed below. Your evaluation should 7 | consider factors such as the helpfulness, relevance, accuracy, depth, creativity, 8 | and level of detail of the response. Begin your evaluation by providing a short 9 | explanation. Be as objective as possible. After providing your explanation, you 10 | must rate the response on a scale of 1 to 10 by strictly following this format: 11 | "[[rating]]", for example: "Rating: [[5]]". 12 | [Question] 13 | {{instruction}} 14 | [The Start of Assistant’s Answer] 15 | {{response}} 16 | [The End of Assistant’s Answer]' 17 | 18 | 19 | -------------------------------------------------------------------------------- /registry/prompt/choice.yaml: -------------------------------------------------------------------------------- 1 | single_choice: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "Choose the most suitable answer from options A, B, C, and D to 11 | respond the question in next line, you may only choose A or B or C or D 12 | .\n{{question}}\nA. {{choice_a}}\nB. {{choice_b}}\nC. {{choice_c}}\nD. {{choice_d}}" 13 | 14 | single_choice_with_answer: 15 | class: audio_evals.prompt.base.Prompt 16 | args: 17 | template: 18 | - role: user 19 | contents: 20 | - type: audio 21 | value: "{{WavPath}}" 22 | - type: text 23 | value: "{{question}} Select one option from the provided choices.\n{{choices}}" 24 | -------------------------------------------------------------------------------- /registry/prompt/digit.yaml: -------------------------------------------------------------------------------- 1 | digit: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "Identify which digit [0-9] is spoken in the provided audio clip, answer one of [0,1,2,3,4,5,6,7,8,9] without explain" 11 | -------------------------------------------------------------------------------- /registry/prompt/emotion_anlysis.yaml: -------------------------------------------------------------------------------- 1 | emo_analysis: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "listen the audio and judge the emotion of the speaker, the answer must be one of [surprise,anger,neutral,joy,sadness,fear,disgust], answer without explain" 11 | 12 | ravdess_emo_analysis: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: audio 19 | value: "{{WavPath}}" 20 | - type: text 21 | value: "listen the audio and judge the emotion of the speaker, the answer must be one of [neutral,calm,happy,sad,angry,fearful,disgust,surprised], answer without explain" 22 | 23 | sentiment_analysis: 24 | class: audio_evals.prompt.base.Prompt 25 | args: 26 | template: 27 | - role: user 28 | contents: 29 | - type: audio 30 | value: "{{WavPath}}" 31 | - type: text 32 | value: "listen the audio and judge the sentiment of the speaker, the answer must be one of [positive,negative,neutral], answer without explain" 33 | 34 | vocal_sound_analysis: 35 | class: audio_evals.prompt.base.Prompt 36 | args: 37 | 
template: 38 | - role: user 39 | contents: 40 | - type: audio 41 | value: "{{WavPath}}" 42 | - type: text 43 | value: "listen the audio and judge the vocal sound, the answer must be one of [Cough,Sigh,Throat clearing,Sneeze,Laughter,Sniff], answer without explain" 44 | 45 | sound_analysis: 46 | class: audio_evals.prompt.base.Prompt 47 | args: 48 | template: 49 | - role: user 50 | contents: 51 | - type: audio 52 | value: "{{WavPath}}" 53 | - type: text 54 | value: "listen the audio and judge the sound, the answer must be one of ['Speech', 'Frying', 'Dishes', 55 | 'Running_water', 'Blender', 'Electric_shaver_toothbrush', 'Cat', 56 | 'Alarm_bell_ringing', 'Dog', 'Vacuum_cleaner'], answer without explain" 57 | -------------------------------------------------------------------------------- /registry/prompt/gender_anlysis.yaml: -------------------------------------------------------------------------------- 1 | gender_analysis: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "listen the audio and judge the gender of the speaker, the answer must be one of [female, male], answer without explain" 11 | -------------------------------------------------------------------------------- /registry/prompt/geval.yaml: -------------------------------------------------------------------------------- 1 | yes_no_judge: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: system 6 | contents: 7 | - type: text 8 | value: "You are a helpful assistant who tries to help answer the user's question." 9 | - role: user 10 | contents: 11 | - type: text 12 | value: "{{real_prompt}}" 13 | -------------------------------------------------------------------------------- /registry/prompt/kimi-audio.yaml: -------------------------------------------------------------------------------- 1 | kimi-audio-asr-en: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | message_type: text 7 | content: Please transcribe the spoken content into written text. 
8 | - role: user 9 | message_type: audio 10 | content: '{{WavPath}}' 11 | 12 | kimi-audio-asr-zh: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | message_type: text 18 | content: 请把这段语音转录成文本。 19 | - role: user 20 | message_type: audio 21 | content: '{{WavPath}}' 22 | -------------------------------------------------------------------------------- /registry/prompt/medicine.yaml: -------------------------------------------------------------------------------- 1 | COVID-recognizer: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "Listen to the provided audio and determine the health status, answer one of ['healthy', 'symptomatic', 'COVID-19'] without explain" 11 | 12 | Heartbeat-recognizer: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: audio 19 | value: "{{WavPath}}" 20 | - type: text 21 | value: "Listen to the heartbeat sound and determine the type of heart sound present, answer one of ['normal', 'murmur', 'extrastole'] without explain" 22 | 23 | 24 | Respiratory-crackles-recognizer: 25 | class: audio_evals.prompt.base.Prompt 26 | args: 27 | template: 28 | - role: user 29 | contents: 30 | - type: audio 31 | value: "{{WavPath}}" 32 | - type: text 33 | value: "Listen to the respiratory sound and determine if crackles are present. Answer with either 'present' or 'absent' without explanation." 34 | 35 | Respiratory-wheezes-recognizer: 36 | class: audio_evals.prompt.base.Prompt 37 | args: 38 | template: 39 | - role: user 40 | contents: 41 | - type: audio 42 | value: "{{WavPath}}" 43 | - type: text 44 | value: "Listen to the respiratory sound and determine if wheezes are present. Answer with either 'present' or 'absent' without explanation." 45 | -------------------------------------------------------------------------------- /registry/prompt/music.yaml: -------------------------------------------------------------------------------- 1 | instrument_recognition: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "listen the music and judge instrument of the music, the answer must be one of [Bass,Brass,Flute,Guitar,Keyboard,Mallet,Organ,Reed,String,Synth Lead,Vocal], answer without explain" 11 | 12 | chord_recognition: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: audio 19 | value: "{{WavPath}}" 20 | - type: text 21 | value: "Listen to the music and determine the chord quality. The answer should be either 'Major' or 'Minor', answer without explain" 22 | 23 | music_genre: 24 | class: audio_evals.prompt.base.Prompt 25 | args: 26 | template: 27 | - role: user 28 | contents: 29 | - type: audio 30 | value: "{{WavPath}}" 31 | - type: text 32 | value: "Listen to the provided music clip and identify the genre. the answer must be one of [blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock], answer without explain" 33 | 34 | 35 | music_tempo: 36 | class: audio_evals.prompt.base.Prompt 37 | args: 38 | template: 39 | - role: user 40 | contents: 41 | - type: audio 42 | value: "{{WavPath}}" 43 | - type: text 44 | value: "Listen to the audio clip and determine the exact tempo (BPM). 
Respond only with a numerical value without explain" 45 | -------------------------------------------------------------------------------- /registry/prompt/ola.yaml: -------------------------------------------------------------------------------- 1 | ola-asr: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: text 8 | value: 'Please give the ASR results of the given speech.' 9 | - type: audio 10 | value: '{{WavPath}}' 11 | 12 | ola-aqa: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: text 19 | value: "Please directly answer the questions in the user's speech." 20 | - type: audio 21 | value: '{{WavPath}}' 22 | -------------------------------------------------------------------------------- /registry/prompt/qa.yaml: -------------------------------------------------------------------------------- 1 | qa: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "{{question}}" 11 | -------------------------------------------------------------------------------- /registry/prompt/sound_identify.yaml: -------------------------------------------------------------------------------- 1 | catdog_identify: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "Listen to the audio and determine if it's a dog or a cat. The answer must be one of the following: [dog, cat]. Provide the answer without explanation." 11 | -------------------------------------------------------------------------------- /registry/prompt/whisper-pretrain.yaml: -------------------------------------------------------------------------------- 1 | whisper-asr: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | audio: '{{WavPath}}' 6 | generate_kwargs: {} 7 | 8 | whisper-asr-zh: 9 | class: audio_evals.prompt.base.Prompt 10 | args: 11 | template: 12 | audio: '{{WavPath}}' 13 | generate_kwargs: 14 | language: chinese 15 | 16 | whisper-asr-en: 17 | class: audio_evals.prompt.base.Prompt 18 | args: 19 | template: 20 | audio: '{{WavPath}}' 21 | generate_kwargs: 22 | language: english 23 | 24 | whisper-asr-fr: 25 | class: audio_evals.prompt.base.Prompt 26 | args: 27 | template: 28 | audio: '{{WavPath}}' 29 | generate_kwargs: 30 | language: french 31 | 32 | whisper-asr-yue: 33 | class: audio_evals.prompt.base.Prompt 34 | args: 35 | template: 36 | audio: '{{WavPath}}' 37 | generate_kwargs: 38 | language: yue 39 | 40 | whisper-sst-zh2en: 41 | class: audio_evals.prompt.base.Prompt 42 | args: 43 | template: 44 | audio: '{{WavPath}}' 45 | generate_kwargs: 46 | language: chinese 47 | task: translate 48 | whisper-sst-de2en: 49 | class: audio_evals.prompt.base.Prompt 50 | args: 51 | template: 52 | audio: '{{WavPath}}' 53 | generate_kwargs: 54 | language: german 55 | task: translate 56 | whisper-sst-es2en: 57 | class: audio_evals.prompt.base.Prompt 58 | args: 59 | template: 60 | audio: '{{WavPath}}' 61 | generate_kwargs: 62 | language: spanish 63 | task: translate 64 | 65 | whisper-sst-fr2en: 66 | class: audio_evals.prompt.base.Prompt 67 | args: 68 | template: 69 | audio: '{{WavPath}}' 70 | generate_kwargs: 71 | language: french 72 | task: translate 73 | whisper-sst-it2en: 74 | class: audio_evals.prompt.base.Prompt 75 | args: 76 | template: 77 | audio: '{{WavPath}}' 78 | generate_kwargs: 
79 |         language: italian
80 |         task: translate
81 |
--------------------------------------------------------------------------------
/registry/recorder/local.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/registry/recorder/local.yaml
--------------------------------------------------------------------------------
/requirments-offline-model.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | accelerate>=0.20.3
3 | einops
4 | matplotlib
5 | pillow
6 | scipy
7 | tensorboard
8 | tiktoken
9 | openai
10 | transformers_stream_generator==0.0.4
11 |
--------------------------------------------------------------------------------
/requirments.txt:
--------------------------------------------------------------------------------
1 | jinja2
2 | tqdm
3 | requests
4 | aiohttp
5 | pyyaml
6 | pytest
7 | jiwer
8 | sacrebleu==1.5.1
9 | editdistance
10 | scikit-learn
11 | librosa
12 | soundfile
13 | dashscope
14 | datasets
15 | pre-commit
16 | more_itertools
17 | pandas
18 | zhconv
19 | pycocoevalcap
20 | regex
21 | openai>=1.0.0
22 | websockets==12.0
23 | pydub
24 | openpyxl
25 | gdown
26 |
--------------------------------------------------------------------------------
/requirments/minicpm_o2_6.txt:
--------------------------------------------------------------------------------
1 | Pillow==10.1.0
2 | torch==2.2.0
3 | torchaudio==2.2.0
4 | torchvision==0.17.0
5 | transformers==4.44.2
6 | librosa==0.9.0
7 | soundfile==0.12.1
8 | vector-quantize-pytorch==1.18.5
9 | vocos==0.1.0
10 | decord
11 | moviepy
12 | numpy==1.26
13 |
--------------------------------------------------------------------------------
/tests/test_audio_evals_registry.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import pytest
4 |
5 | from audio_evals.eval_task import EvalTask
6 | from audio_evals.recorder import Recorder
7 | from audio_evals.registry import registry
8 |
9 | # Configure the root logger
10 | logging.basicConfig(
11 |     level=logging.DEBUG,
12 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
13 |     handlers=[logging.StreamHandler()],
14 | )
15 |
16 |
17 | def test_registry_model():
18 |     model = registry.get_model("gpt4o")
19 |     print(model.inference("how are you"))
20 |
21 |
22 | def test_prompt():
23 |     prompt = registry.get_prompt("asr")
24 |     model = registry.get_model("qwen-audio-offline")
25 |     real_prompt = prompt.load(
26 |         a="/Users/a1/Downloads/语音转文字/嘉德罗斯/嘉德罗斯_12.wav"
27 |     )
28 |     print(model.inference(real_prompt))
29 |
30 |
31 | def test_evaluator():
32 |     e = registry.get_evaluator("em")
33 |     assert e("0", 0)["match"]
34 |     assert e(0, "0")["match"]
35 |     assert e(1, "0")["match"] == 0
36 |
37 |     e = registry.get_evaluator("cer")
38 |     print(
39 |         e(
40 |             "买一张万能卡也有不少好处带着这张卡你可以进入南非的一些公园或全部的国家公园",
41 |             "买一张万能卡(Wild Card)也有不少好处。带着这张卡,你可以进入南非的一些公园或全部的国家公园。",
42 |         )
43 |     )
44 |
45 |     e = registry.get_evaluator("wer")
46 |     print(e("It is good", "it is good"))
47 |
48 |
49 | def test_agg():
50 |     a = registry.get_agg("acc")
51 |     assert a([{"match": 0}])["acc"] == 0
52 |     assert a([{"match": 1}])["acc"] == 1
53 |     assert a([])["acc"] == 0
54 |     with pytest.raises(Exception):
55 |         a([{"count": 1}])
56 |
57 |
58 | def test_task():
59 |     task_cfg = registry.get_eval_task("alei_asr")
60 |
61 |     t = EvalTask(
62 |         dataset=registry.get_dataset("KeSpeech"),
63 |         prompt=registry.get_prompt("KeSpeech"),
64 |         predictor=registry.get_model(task_cfg.model),
65 |         evaluator=registry.get_evaluator(task_cfg.evaluator),
66 |         post_process=[registry.get_process(item) for item in task_cfg.post_process],
67 |         agg=registry.get_agg(task_cfg.agg),
68 |         recorder=Recorder("log/KeSpeech.jsonl"),
69 |     )
70 |     res = t.run()
71 |     print(res)
72 |
--------------------------------------------------------------------------------
/tests/test_dataset.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from audio_evals.registry import registry
4 |
5 | # Configure the root logger
6 | logging.basicConfig(
7 |     level=logging.DEBUG,
8 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
9 |     handlers=[logging.StreamHandler()],
10 | )
11 |
12 |
13 | def test_huggingface_dataset():
14 |     a = registry.get_dataset("KeSpeech-hf")
15 |     b = a.load()
16 |     b = list(b)
17 |     with open(b[0]["audio"]["path"], "rb") as f:
18 |         content = f.read()
19 |     print(content)
20 |     print(a)
21 |
--------------------------------------------------------------------------------
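For readers wiring these registry entries together by hand, the following is a minimal sketch (not a file from this repository) of how the YAML names above resolve at runtime, mirroring test_task() in tests/test_audio_evals_registry.py. All registry names are taken from the files shown above (dataset librispeech-test-clean, prompt asr, model qwen2-audio-chat, evaluator wer, post-processor json_content, and the aggregator name wer referenced by registry/eval_task/asr.yaml); the log path is an arbitrary example.

from audio_evals.eval_task import EvalTask
from audio_evals.recorder import Recorder
from audio_evals.registry import registry

# Resolve each component by the name it is registered under in registry/**/*.yaml.
dataset = registry.get_dataset("librispeech-test-clean")   # registry/dataset/librispeech.yaml
prompt = registry.get_prompt("asr")                        # registry/prompt/asr.yaml
model = registry.get_model("qwen2-audio-chat")             # registry/model/offline.yaml
evaluator = registry.get_evaluator("wer")                  # registry/evaluator/common.yaml
post_process = [registry.get_process("json_content")]      # registry/process/base.yaml
agg = registry.get_agg("wer")                              # aggregator named in registry/eval_task/asr.yaml

# Assemble and run the task; results are recorded to a JSONL log (path chosen here for illustration).
task = EvalTask(
    dataset=dataset,
    prompt=prompt,
    predictor=model,
    evaluator=evaluator,
    post_process=post_process,
    agg=agg,
    recorder=Recorder("log/librispeech-test-clean.jsonl"),
)
print(task.run())

This hand-wired form is exactly what an entry such as asr in registry/eval_task/asr.yaml declares; the task configs simply name the same components so they can be resolved through the registry instead of being constructed manually.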