├── .gitignore ├── .pre-commit-config.yaml ├── FAQ.md ├── LICENSE ├── README.md ├── README_en.md ├── assets ├── audio_understanding_leaderboard.png ├── dataset_distribute.png ├── default.wav ├── img_1.png ├── leaderboard.md ├── logo.png ├── performance.png ├── s2s_leaderboard.png ├── s2s_semantic_leaderboard.png └── utmos.png ├── audio_evals ├── __init__.py ├── agg │ ├── __init__.py │ ├── air_chat.py │ └── base.py ├── base.py ├── constants.py ├── dataset │ ├── __init__.py │ ├── dataset.py │ ├── giga.py │ ├── huggingface.py │ └── resume.py ├── eval_task.py ├── evaluator │ ├── __init__.py │ ├── air_chat.py │ ├── alpaca_eval.py │ ├── alpaca_eval.txt │ ├── base.py │ ├── bbh.py │ ├── bleu.py │ ├── coco.py │ ├── dict_match.py │ ├── dnsmos.py │ ├── ensemble.py │ ├── harm.py │ ├── ifeval.py │ ├── mcq.py │ ├── qa_eval.py │ ├── qa_exact_match.py │ ├── ref_qa_geval.py │ ├── ref_qa_geval.txt │ ├── simo.py │ ├── string_match.py │ ├── utmos.py │ ├── voice_bench.py │ └── wer.py ├── isolate.py ├── lib │ ├── DNSMOS │ │ ├── README.md │ │ ├── dnsmos_single.py │ │ ├── main.py │ │ └── requirements.txt │ ├── SenseVoice │ │ ├── main.py │ │ └── requirements.txt │ ├── Spark-TTS │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── cli │ │ │ ├── SparkTTS.py │ │ │ └── inference.py │ │ ├── encodec.py │ │ ├── example │ │ │ └── infer.sh │ │ ├── main.py │ │ ├── requirements.txt │ │ ├── sparktts │ │ │ ├── models │ │ │ │ ├── audio_tokenizer.py │ │ │ │ └── bicodec.py │ │ │ ├── modules │ │ │ │ ├── blocks │ │ │ │ │ ├── layers.py │ │ │ │ │ ├── samper.py │ │ │ │ │ └── vocos.py │ │ │ │ ├── encoder_decoder │ │ │ │ │ ├── feat_decoder.py │ │ │ │ │ ├── feat_encoder.py │ │ │ │ │ └── wave_generator.py │ │ │ │ ├── fsq │ │ │ │ │ ├── finite_scalar_quantization.py │ │ │ │ │ └── residual_fsq.py │ │ │ │ ├── speaker │ │ │ │ │ ├── ecapa_tdnn.py │ │ │ │ │ ├── perceiver_encoder.py │ │ │ │ │ ├── pooling_layers.py │ │ │ │ │ └── speaker_encoder.py │ │ │ │ └── vq │ │ │ │ │ └── factorized_vector_quantize.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── audio.py │ │ │ │ ├── file.py │ │ │ │ ├── parse_options.sh │ │ │ │ └── token_parser.py │ │ ├── src │ │ │ ├── figures │ │ │ │ ├── gradio_TTS.png │ │ │ │ ├── gradio_control.png │ │ │ │ ├── infer_control.png │ │ │ │ └── infer_voice_cloning.png │ │ │ └── logo │ │ │ │ ├── HKUST.jpg │ │ │ │ ├── NPU.jpg │ │ │ │ ├── NTU.jpg │ │ │ │ ├── SJU.jpg │ │ │ │ ├── SparkAudio.jpg │ │ │ │ ├── SparkAudio2.jpg │ │ │ │ ├── SparkTTS.jpg │ │ │ │ ├── SparkTTS.png │ │ │ │ ├── mobvoi.jpg │ │ │ │ └── mobvoi.png │ │ └── webui.py │ ├── WavTokenizer │ │ ├── LICENSE │ │ ├── README.md │ │ ├── configs │ │ │ ├── wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml │ │ │ └── wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml │ │ ├── data │ │ │ └── demo.txt │ │ ├── decoder │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ ├── discriminator_dac.py │ │ │ ├── discriminators.py │ │ │ ├── experiment.py │ │ │ ├── feature_extractors.py │ │ │ ├── heads.py │ │ │ ├── helpers.py │ │ │ ├── loss.py │ │ │ ├── models.py │ │ │ ├── modules.py │ │ │ ├── pretrained.py │ │ │ ├── pretrained_model.py │ │ │ └── spectral_ops.py │ │ ├── encoder │ │ │ ├── __init__.py │ │ │ ├── distrib.py │ │ │ ├── model.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── conv.py │ │ │ │ ├── lstm.py │ │ │ │ ├── norm.py │ │ │ │ ├── seanet.py │ │ │ │ └── transformer.py │ │ │ ├── msstftd.py │ │ │ ├── quantization │ │ │ │ ├── __init__.py │ │ │ │ ├── ac.py │ │ │ │ ├── core_vq.py │ │ │ │ └── vq.py │ │ │ └── utils.py │ │ ├── 
infer.py │ │ ├── metrics │ │ │ ├── UTMOS.py │ │ │ ├── infer.py │ │ │ └── periodicity.py │ │ ├── requirements.txt │ │ ├── result.png │ │ └── train.py │ ├── __init__.py │ ├── chattts.py │ ├── coco.py │ ├── cpm_tts │ │ ├── __init__.py │ │ ├── chattts.py │ │ ├── config.py │ │ ├── dvae.py │ │ ├── gpt.py │ │ ├── minicpmv26_resampler.py │ │ └── processor.py │ ├── doubao │ │ ├── simplex_websocket_demo.py │ │ └── stream_asr.py │ ├── encodec │ │ ├── main.py │ │ └── requirements.txt │ ├── evaluate_tokenizer.py │ ├── mimi │ │ ├── main.py │ │ ├── requirements.txt │ │ └── stream.py │ ├── minicpm │ │ ├── main.py │ │ └── requirements.txt │ ├── minicpm_0_5B │ │ ├── main.py │ │ └── requirements.txt │ ├── paraformer │ │ ├── main.py │ │ └── requirements.txt │ ├── qwen2-5omni │ │ ├── main.py │ │ └── requirements.txt │ ├── sensevoicelib.py │ ├── simo │ │ ├── models_ecapa_tdnn.py │ │ ├── requirements.txt │ │ └── simo.py │ ├── ssnact │ │ └── ssnact.py │ ├── streaming_asr_demo.py │ ├── text_normalization │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── cn_tn.py │ │ ├── en.py │ │ └── english.json │ ├── utmos │ │ ├── lightning_module.py │ │ ├── main.py │ │ ├── model.py │ │ └── requirements.txt │ ├── wer.py │ └── whisper │ │ ├── main.py │ │ └── requirements.txt ├── main.py ├── models │ ├── AudioEncoder │ │ ├── __init__.py │ │ ├── chattts.py │ │ ├── cosyvoice.py │ │ ├── cosyvoice_adv.py │ │ ├── encodec.py │ │ ├── mimi.py │ │ ├── spark.py │ │ ├── vocos_encode.py │ │ └── wav_tokenizer.py │ ├── TTS │ │ ├── __init__.py │ │ ├── amphion.py │ │ ├── indextts.py │ │ ├── megatts.py │ │ ├── melotts.py │ │ ├── spark.py │ │ └── stabletts.py │ ├── UltraVOX.py │ ├── __init__.py │ ├── ali.py │ ├── asr │ │ ├── __init__.py │ │ ├── ali.py │ │ ├── baidu.py │ │ ├── fireredasr.py │ │ ├── huawei.py │ │ ├── huoshan.py │ │ ├── paraformer.py │ │ ├── sensevoice.py │ │ ├── sherpa.py │ │ ├── tencent.py │ │ └── xfyun.py │ ├── bytedance │ │ ├── __init__.py │ │ └── doubao.py │ ├── dnsmos.py │ ├── glm4audio.py │ ├── glm4voice.py │ ├── google.py │ ├── llama_omni.py │ ├── llmcenter.py │ ├── mini_cpm.py │ ├── mini_omni.py │ ├── model.py │ ├── moonshot.py │ ├── offline_model.py │ ├── ola.py │ ├── openai.py │ ├── openai_realtime.py │ ├── qwen.py │ ├── qwen2_5.py │ ├── sp_gemini.py │ ├── step_audio.py │ ├── utmos.py │ ├── wavlm.py │ └── whisper.py ├── process │ ├── __init__.py │ ├── base.py │ ├── eliminate.py │ ├── firstoption.py │ ├── normalization.py │ ├── qwen.py │ └── speech.py ├── prompt │ ├── __init__.py │ └── base.py ├── recorder.py ├── registry.py └── utils.py ├── docs ├── Procedures for Restarting an Incomplete Evaluation.md ├── how add a dataset.md ├── how eval your model.md ├── how launch a custom eval task.md └── how use UTMOS, DNSMOS eval speech quality.md ├── registry ├── agg │ ├── air-bench.yaml │ └── naive.yaml ├── dataset │ ├── AudioCaps.yaml │ ├── COVID-recognizer.yaml │ ├── CatDog.yaml │ ├── ClothoAQA.yaml │ ├── CommonVoice.yaml │ ├── DESEDpublic_eval.yaml │ ├── GTZAN.yaml │ ├── GigaSpeech.yaml │ ├── KeSpeech.yaml │ ├── MELD.yaml │ ├── MMAU.yaml │ ├── Nsynth.yaml │ ├── RAVDESS.yaml │ ├── RespiratorySound.yaml │ ├── TESS.yaml │ ├── VSC.yaml │ ├── VoxCeleb.yaml │ ├── WavCaps.yaml │ ├── WenetSpeech.yaml │ ├── air.yaml │ ├── aishell.yaml │ ├── alpaca_eval.yaml │ ├── audio-MNIST.yaml │ ├── chord_recoganition.yaml │ ├── covost2.yaml │ ├── fleurs.yaml │ ├── heart_beat.yaml │ ├── librispeech.yaml │ ├── llama_questions.yaml │ ├── multilingual_librispeech.yaml │ ├── peoples_speech.yaml │ ├── sample.yaml │ ├── tedlium.yaml │ ├── triviaqa.yaml │ ├── 
voxpopuli.yaml │ └── webQ.yaml ├── eval_task │ ├── acoustics.yaml │ ├── air.yaml │ ├── alpaca.yaml │ ├── aqa.yaml │ ├── asr.yaml │ ├── caption.yaml │ ├── digit.yaml │ ├── emo.yaml │ ├── gender.yaml │ ├── inference.yaml │ ├── medicine.yaml │ ├── music.yaml │ ├── sound_identify.yaml │ ├── stt.yaml │ └── vsc.yaml ├── evaluator │ ├── air-bench.yaml │ ├── alpaca.yaml │ ├── choice-with-ans.yaml │ ├── common.yaml │ ├── dnsmos.yaml │ ├── llama-speech.yaml │ ├── qa.yaml │ ├── simo.yaml │ ├── speech_qulity.yaml │ └── utmos.yaml ├── model │ ├── ali.yaml │ ├── dnsmos.yaml │ ├── gemini.yaml │ ├── minicpmo.yaml │ ├── moonshot.yaml │ ├── offline.yaml │ ├── ola.yaml │ ├── paraformer.yaml │ ├── qwen2.5.yaml │ ├── speechLLM.yaml │ ├── step.yaml │ ├── tencent.yaml │ ├── ultravox.yaml │ ├── utmos.yaml │ └── wavlm.yaml ├── process │ ├── base.yaml │ ├── choice.yaml │ └── speech_model_output.yaml ├── prompt │ ├── 3o.yaml │ ├── aqa.yaml │ ├── asr.yaml │ ├── caption.yaml │ ├── chatbot.yaml │ ├── choice.yaml │ ├── digit.yaml │ ├── emotion_anlysis.yaml │ ├── gender_anlysis.yaml │ ├── geval.yaml │ ├── kimi-audio.yaml │ ├── medicine.yaml │ ├── mini-cpm-omni.yaml │ ├── music.yaml │ ├── ola.yaml │ ├── qa.yaml │ ├── qwen-audio-pretrain.yaml │ ├── qwen-omni.yaml │ ├── qwen2-audio-pretrain.yaml │ ├── sound_identify.yaml │ ├── stt.yaml │ └── whisper-pretrain.yaml └── recorder │ └── local.yaml ├── requirments-offline-model.txt ├── requirments.txt ├── requirments └── minicpm_o2_6.txt └── tests ├── test_audio_evals_registry.py └── test_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | cremote_registry/ 2 | cyb_registry/ 3 | temp* 4 | # 忽略操作系统生成的文件 5 | init_model/ 6 | anna/ 7 | .vscode/ 8 | envs/ 9 | env/ 10 | tests/* 11 | *.DS_Store 12 | .DS_Store 13 | ._* 14 | *.wav 15 | 16 | local_registry/ 17 | cyb_dev_registry/ 18 | res/ 19 | log/ 20 | script/ 21 | raw_data/ 22 | tmp/ 23 | synthetic_data/ 24 | .run/ 25 | 26 | *.xlsx 27 | 28 | 29 | # 忽略编辑器和IDE生成的文件 30 | .idea/ 31 | *.sublime-workspace 32 | *.sublime-project 33 | *.swp 34 | *.swo 35 | 36 | # 忽略构建和编译生成的文件 37 | __pycache__/ 38 | *.pyc 39 | *.pyo 40 | *.pyd 41 | dist/ 42 | build/ 43 | *.egg-info/ 44 | node_modules/ 45 | *.log 46 | *.tmp 47 | *.bak 48 | *.swp 49 | 50 | # 忽略配置文件 51 | *.env 52 | 53 | # 忽略数据库文件 54 | *.sqlite 55 | *.sqlite3 56 | *.db 57 | 58 | # 忽略压缩文件 59 | *.zip 60 | *.tar 61 | *.gz 62 | *.bz2 63 | *.7z 64 | 65 | # 忽略临时文件 66 | *.tmp 67 | *.temp 68 | 69 | 70 | # 忽略虚拟环境 71 | venv/ 72 | env/ 73 | 74 | # 忽略测试生成的文件 75 | coverage/ 76 | .coverage 77 | 78 | # 忽略其他可能的敏感信息 79 | *.pem 80 | *.key 81 | *.crt 82 | *.p12 83 | *.pfx 84 | *.der 85 | 86 | # 忽略其他可能的临时文件 87 | *.log 88 | *.out 89 | *.pid 90 | *.pid.lock 91 | 92 | # 忽略其他可能的缓存文件 93 | *.cache 94 | *.tmp 95 | 96 | # 忽略其他可能的备份文件 97 | *.bak 98 | *.old 99 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-yaml 7 | - id: end-of-file-fixer 8 | - id: requirements-txt-fixer 9 | - id: check-merge-conflict 10 | - id: fix-encoding-pragma 11 | args: ["--remove"] 12 | - id: mixed-line-ending 13 | args: ["--fix=lf"] 14 | - repo: https://github.com/psf/black 15 | rev: 24.4.2 16 | hooks: 17 | - id: black 18 | -------------------------------------------------------------------------------- /FAQ.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | ## 1. ./nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkAddData_12_1, version libnvJitLink.so.12 4 | 5 | ref: 6 | https://github.com/pytorch/pytorch/issues/111469 7 | 8 | Two solutions: 9 | - update your NVIDIA libraries to a version that matches your torch build 10 | - use the nvidia libraries bundled in your Python environment instead of the system ones, e.g.: `export LD_LIBRARY_PATH=$HOME/path/to/my/venv3115/lib64/ 11 | python3.11/site-packages/nvidia/nvjitlink/lib` or `export LD_LIBRARY_PATH=env/lib/python3.10/site-packages/nvidia/nvjitlink/lib` 12 | 13 | ## 2. ConnectionError: Couldn't reach 'TwinkStart/xx' on the Hub (LocalEntryNotFoundError) 14 | 15 | Make sure you can access the Hugging Face Hub; you may need to use a mirror or proxy: 16 | 17 | > export HF_ENDPOINT=https://hf-mirror.com 18 | 19 | 20 | ## 3. gigaspeech: 'NoneType' object is not callable 21 | 22 | GigaSpeech is not a directly accessible dataset; you need to request permission from the authors. 23 | https://huggingface.co/datasets/speechcolab/gigaspeech 24 | 25 | When you attempt to download it, you will encounter a login page. If you do not have permission, there will be an HF link prompting you to apply for access. 26 | 27 | If the above does not appear, enter the following code in the Python interactive shell: 28 | 29 | ```python 30 | from datasets import load_dataset 31 | gs_test = load_dataset("speechcolab/gigaspeech", "test") 32 | ``` 33 | 34 | If this code runs successfully, you can proceed with the evaluation. 35 | 36 | ## 4. The official evaluation prompts for MiniCPM-O 2.6 37 | (An example evaluation command using these prompts is sketched further below.) 38 | 1. ASR zh: --prompt mini-cpm-omni-asr-zh 39 | 2. ASR en: --prompt mini-cpm-omni-asr-en 40 | 3. AST 2zh: --prompt mini-cpm-omni-asr-zh 41 | 4. AST 2en: --prompt mini-cpm-omni-ast-en 42 | 5.
emotion analysis: --prompt mini-cpm-omni-emotion_analysis 43 | -------------------------------------------------------------------------------- /assets/audio_understanding_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/audio_understanding_leaderboard.png -------------------------------------------------------------------------------- /assets/dataset_distribute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/dataset_distribute.png -------------------------------------------------------------------------------- /assets/default.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/default.wav -------------------------------------------------------------------------------- /assets/img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/img_1.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/logo.png -------------------------------------------------------------------------------- /assets/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/performance.png -------------------------------------------------------------------------------- /assets/s2s_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/s2s_leaderboard.png -------------------------------------------------------------------------------- /assets/s2s_semantic_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/s2s_semantic_leaderboard.png -------------------------------------------------------------------------------- /assets/utmos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/assets/utmos.png -------------------------------------------------------------------------------- /audio_evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/__init__.py -------------------------------------------------------------------------------- /audio_evals/agg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/agg/__init__.py 
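The prompt names listed in FAQ §4 above are meant to be passed to the evaluation entry point together with a registered dataset and model. Below is a minimal sketch of such a run; it assumes the `audio_evals/main.py` entry point shown in the tree above, and the dataset and model identifiers are placeholders — check the files under `registry/dataset/` and `registry/model/minicpmo.yaml` for the names that are actually registered.

```bash
# Hypothetical example: Chinese ASR with the official MiniCPM-O 2.6 prompt.
# "aishell" and "MiniCPM-o-2.6" are placeholder identifiers; substitute the
# names defined under registry/dataset/ and registry/model/minicpmo.yaml.
python audio_evals/main.py \
    --dataset aishell \
    --model MiniCPM-o-2.6 \
    --prompt mini-cpm-omni-asr-zh
```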
-------------------------------------------------------------------------------- /audio_evals/agg/air_chat.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | from audio_evals.agg.base import AggPolicy 3 | 4 | 5 | class AirChat(AggPolicy): 6 | 7 | def _agg(self, score_detail: List[Dict[str, any]]) -> Dict[str, float]: 8 | predl, refl = [item["pred_score"] for item in score_detail], [ 9 | item["ref_score"] for item in score_detail 10 | ] 11 | win_count = sum([1 for i in range(len(predl)) if predl[i] > refl[i]]) 12 | return { 13 | "win(%)": win_count / len(predl) * 100, 14 | "ref_score": sum(refl) / len(refl), 15 | "pred_score": sum(predl) / len(predl), 16 | } 17 | -------------------------------------------------------------------------------- /audio_evals/base.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Dict, List, Union 3 | 4 | 5 | class EarlyStop(Exception): 6 | pass 7 | 8 | 9 | """ 10 | request format 11 | eg1: how are you 12 | eg2: [{'role': 'user', 'content': 'how are you'}] 13 | eg3: [{'role': 'user', 'contents': [{'type':'text', 'content': 'how are you'}, {'type':'image', 'content': '/mnt/a.git'}]] 14 | """ 15 | PromptStruct = Union[str, Dict[str, any], List[Dict[str, Union[str, List[Dict[str, str]]]]]] 16 | 17 | ScoreUnit = Dict[str, Union[int, float]] 18 | 19 | 20 | @dataclass 21 | class EvalTaskCfg: 22 | dataset: str 23 | prompt: str 24 | model: str 25 | agg: str = "dump" 26 | evaluator: str = "dump" 27 | post_process: List[str] = field(default_factory=list) 28 | -------------------------------------------------------------------------------- /audio_evals/constants.py: -------------------------------------------------------------------------------- 1 | DEFAULT_MODEL_PATH = "init_model/" 2 | -------------------------------------------------------------------------------- /audio_evals/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/dataset/__init__.py -------------------------------------------------------------------------------- /audio_evals/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os.path 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Dict, Generator, List 5 | import pandas as pd 6 | from tqdm import tqdm 7 | 8 | tqdm.pandas() 9 | 10 | 11 | class Dataset(ABC): 12 | def __init__(self, default_task: str, ref_col: str, col_aliases=None): 13 | if col_aliases is None: 14 | col_aliases = {} 15 | self.col_aliases = col_aliases 16 | self.task_name = default_task 17 | self.ref_col = ref_col 18 | 19 | def reset_ref_col(self, ref_col: str): 20 | self.ref_col = ref_col 21 | 22 | @abstractmethod 23 | def load(self, limit=0) -> List[Dict[str, any]]: 24 | raise NotImplementedError() 25 | 26 | def resume_from(self, f_name: str): 27 | from audio_evals.dataset.resume import ResumeDataset 28 | 29 | return ResumeDataset(self, f_name) 30 | 31 | def load_inf_file(self, f_name: str): 32 | from audio_evals.dataset.resume import ResumeDataset 33 | 34 | return ResumeDataset(self, f_name, save_type=["prompt", "inference"]) 35 | 36 | 37 | class JsonlFile(Dataset): 38 | def __init__(self, f_name: str, default_task: str, ref_col: str, col_aliases=None): 39 | 
super().__init__(default_task, ref_col, col_aliases) 40 | self.f_name = f_name 41 | 42 | def add_col_alias(self, df): 43 | for k, v in self.col_aliases.items(): 44 | if v in df.columns: 45 | raise ValueError(f"Column alias {v} already exists in the dataframe") 46 | df[v] = df[k] 47 | return df 48 | 49 | def load(self, limit=0) -> List[Dict[str, any]]: 50 | df = pd.read_json(self.f_name, lines=True) 51 | if limit > 0: 52 | df = df[:limit] 53 | df = self.add_col_alias(df) 54 | return df.to_dict(orient="records") 55 | 56 | 57 | class RelativePath(JsonlFile): 58 | def __init__( 59 | self, 60 | f_name: str, 61 | default_task: str, 62 | ref_col: str, 63 | file_path_prefix: str, 64 | col_aliases=None, 65 | ): 66 | super().__init__(f_name, default_task, ref_col, col_aliases) 67 | if not file_path_prefix.endswith("/"): 68 | file_path_prefix += "/" 69 | self.file_path = file_path_prefix 70 | 71 | def load(self, limit=0) -> List[Dict[str, any]]: 72 | df = pd.read_json(self.f_name, lines=True) 73 | if limit > 0: 74 | df = df[:limit] 75 | 76 | def abs_path(x): 77 | temp = os.path.join(self.file_path, str(x)) 78 | if os.path.exists(temp) and os.path.isfile(temp): 79 | return temp 80 | return x 81 | 82 | for item in df.columns: 83 | df[item] = df[item].progress_apply(abs_path) 84 | df = self.add_col_alias(df) 85 | return df.to_dict(orient="records") 86 | -------------------------------------------------------------------------------- /audio_evals/dataset/giga.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Dict 3 | from audio_evals.dataset.huggingface import Huggingface, load_audio_hf_dataset 4 | from huggingface_hub import login 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | conversational_filler = [ 9 | "UH", 10 | "UHH", 11 | "UM", 12 | "EH", 13 | "MM", 14 | "HM", 15 | "AH", 16 | "HUH", 17 | "HA", 18 | "ER", 19 | "OOF", 20 | "HEE", 21 | "ACH", 22 | "EEE", 23 | "EW", 24 | ] 25 | unk_tags = ["", ""] 26 | gigaspeech_punctuations = [ 27 | "", 28 | "", 29 | "", 30 | "", 31 | ] 32 | gigaspeech_garbage_utterance_tags = ["", "", "", ""] 33 | non_scoring_words = ( 34 | conversational_filler 35 | + unk_tags 36 | + gigaspeech_punctuations 37 | + gigaspeech_garbage_utterance_tags 38 | ) 39 | 40 | 41 | def asr_text_post_processing(text): 42 | # 1. convert to uppercase 43 | text = text.upper() 44 | 45 | # 2. remove hyphen 46 | # "E-COMMERCE" -> "E COMMERCE", "STATE-OF-THE-ART" -> "STATE OF THE ART" 47 | text = text.replace("-", " ") 48 | 49 | # 3. remove non-scoring words from evaluation 50 | remaining_words = [] 51 | for word in text.split(): 52 | if word in non_scoring_words: 53 | continue 54 | remaining_words.append(word) 55 | 56 | return " ".join(remaining_words) 57 | 58 | 59 | class GigaSpeechDataset(Huggingface): 60 | def __init__(self, **kwargs): 61 | super().__init__(**kwargs) 62 | logger.info(f"very import!!! 
GigaSpeech need to login to huggingface hub") 63 | login() 64 | 65 | def load(self, limit=0) -> List[Dict[str, any]]: 66 | logger.info( 67 | "start load data, it will take a while for download dataset when first load dataset" 68 | ) 69 | raw = load_audio_hf_dataset( 70 | self.name, self.subset, self.split, self.local_path, self.col_aliases 71 | ) 72 | res = [] 73 | for item in raw: 74 | item["text"] = asr_text_post_processing(item["text"]) 75 | if item["text"]: 76 | res.append(item) 77 | if limit > 0: 78 | res = res[:limit] 79 | return res 80 | -------------------------------------------------------------------------------- /audio_evals/dataset/resume.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | from typing import List, Dict, Union 5 | 6 | from audio_evals.dataset.dataset import Dataset 7 | from tqdm import tqdm 8 | 9 | 10 | class ResumeDataset(Dataset): 11 | def __init__( 12 | self, 13 | raw_dataset: Union[str, Dataset], 14 | resume_file: str, 15 | save_type: List[str] = None, 16 | ): 17 | if isinstance(raw_dataset, str): 18 | from audio_evals.registry import registry 19 | 20 | raw_dataset = registry.get_dataset(raw_dataset) 21 | super().__init__( 22 | raw_dataset.task_name, raw_dataset.ref_col, raw_dataset.col_aliases 23 | ) 24 | self.raw_dataset = raw_dataset 25 | path, base_name = os.path.split(resume_file) 26 | base_name = "temp_{}".format(base_name) 27 | # in case resume file be delete before read 28 | temp_file = os.path.join(path, base_name) 29 | shutil.copy2(resume_file, temp_file) 30 | self.resume_file = temp_file 31 | self.save_type = save_type 32 | 33 | def load(self, limit=0) -> List[Dict[str, any]]: 34 | data = self.raw_dataset.load(limit) 35 | with open(self.resume_file, "r") as f: 36 | for line in tqdm(f): 37 | doc = json.loads(line) 38 | if doc["type"] == "error": 39 | continue 40 | if self.save_type is not None and doc["type"] not in self.save_type: 41 | continue 42 | idx = int(doc["id"]) 43 | if "eval_info" not in data[idx]: 44 | data[idx]["eval_info"] = {} 45 | data[idx]["eval_info"].update({doc["type"]: doc["data"]}) 46 | os.remove(self.resume_file) 47 | return data 48 | -------------------------------------------------------------------------------- /audio_evals/evaluator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/evaluator/__init__.py -------------------------------------------------------------------------------- /audio_evals/evaluator/air_chat.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | prompt = ( 7 | "You are a helpful and precise assistant for checking the quality of the answer.\n" 8 | "[Detailed Audio Description]\n{meta_info}\n[Question]\n{question}\n" 9 | "[The Start of Assistant 1s Answer]\n{label}\n[The End of Assistant 1s Answer]\n" 10 | "[The Start of Assistant 2s Answer]\n{pred}\n[The End of Assistant 2s Answer]\n[System]\n" 11 | "We would like to request your feedback on the performance of two AI assistants in response to the user question " 12 | "and audio description displayed above. AI assistants are provided with detailed audio descriptions and questions.\n" 13 | "Please rate the helpfulness, relevance, accuracy, and comprehensiveness of their responses. 
" 14 | "Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance. " 15 | "Please output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. " 16 | "The two scores are separated by a space." 17 | ) 18 | 19 | 20 | class AIRChatEvaluator(Evaluator): 21 | def __init__(self, model_name: str): 22 | self.model_name = model_name 23 | 24 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 25 | from audio_evals.registry import registry 26 | 27 | model = registry.get_model(self.model_name) 28 | p = prompt.format( 29 | meta_info=kwargs["meta_info"], 30 | question=kwargs["question"], 31 | label=label, 32 | pred=pred, 33 | ) 34 | res = model.inference(p) 35 | ref_score, pred_score = res.split(" ")[0], res.split(" ")[1] 36 | return { 37 | "pred_score": float(pred_score), 38 | "ref_score": float(ref_score), 39 | "pred": pred, 40 | "ref": label, 41 | } 42 | -------------------------------------------------------------------------------- /audio_evals/evaluator/alpaca_eval.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os.path 3 | import re 4 | from copy import deepcopy 5 | from typing import Dict 6 | import yaml 7 | import json 8 | from audio_evals.evaluator.base import Evaluator 9 | 10 | path = os.path.dirname(__file__) 11 | prompt = open(os.path.join(path, "alpaca_eval.txt"), "r").read() 12 | 13 | 14 | class AlpacaEvaluator(Evaluator): 15 | def __init__(self, model_name: str): 16 | self.model_name = model_name 17 | 18 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 19 | from audio_evals.registry import registry 20 | 21 | model = registry.get_model(self.model_name) 22 | 23 | p = deepcopy(prompt) 24 | for k, v in {"instruction": kwargs["instruction"], 25 | "output_1": pred, 26 | "output_2": label}.items(): 27 | p = p.replace(f"{{{k}}}", v) 28 | 29 | # with open("/Users/a1/project/alpaca_eval-main/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml", "r", encoding="utf-8") as f: 30 | # d = yaml.safe_load(f.read()) 31 | res = model.inference(p, temperature=0, maxTokens=100) 32 | 33 | # res_d = re.search(r"```json(.*?)```", res, re.DOTALL) 34 | # if res_d: 35 | # d = json.loads(res_d.group(1)) 36 | if res.startswith("```python"): 37 | res = res[9:-3].strip() 38 | elif res.startswith("```"): 39 | res = res[3:-3].strip() 40 | try: 41 | res = ast.literal_eval(res) 42 | if isinstance(res, dict): 43 | for k in res: 44 | res = res[k] 45 | break 46 | return { 47 | "acc": 1 if res[0]["model"] == "model_1" else 0, 48 | "pred": pred, 49 | "ref": label, 50 | } 51 | except Exception as e: 52 | print(f"output is {res}\nError: {e}") 53 | raise e 54 | 55 | 56 | class ChatbotEvaluator(Evaluator): 57 | def __init__(self, model_name: str): 58 | self.model_name = model_name 59 | 60 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 61 | from audio_evals.registry import registry 62 | 63 | model = registry.get_model(self.model_name) 64 | prompt = registry.get_prompt("chatbot-eval") 65 | 66 | p = prompt.load(instruction=kwargs["instruction"], response=pred) 67 | res = model.inference(p, temperature=0, maxTokens=2048) 68 | 69 | # res_d = re.search(r"```json(.*?)```", res, re.DOTALL) 70 | d = re.search(r'\[\[(\d+)\]\]', res) 71 | return { 72 | "geval": int(d.group(1)), 73 | "pred": pred, 74 | "ref": label, 75 | } -------------------------------------------------------------------------------- 
/audio_evals/evaluator/alpaca_eval.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>system 2 | You are a helpful assistant, that ranks models by the quality of their answers. 3 | <|im_end|> 4 | <|im_start|>user 5 | I want you to create a leaderboard of different of large-language models. To do so, I will give you the instructions (prompts) given to the models, and the responses of two models. Please rank the models based on which responses would be preferred by humans. All inputs and outputs should be python dictionaries. 6 | 7 | Here is the prompt: 8 | { 9 | "instruction": """{instruction}""", 10 | } 11 | 12 | Here are the outputs of the models: 13 | [ 14 | { 15 | "model": "model_1", 16 | "answer": """{output_1}""" 17 | }, 18 | { 19 | "model": "model_2", 20 | "answer": """{output_2}""" 21 | } 22 | ] 23 | 24 | Now please rank the models by the quality of their answers, so that the model with rank 1 has the best output. Then return a list of the model names and ranks, i.e., produce the following output: 25 | [ 26 | {'model': , 'rank': }, 27 | {'model': , 'rank': } 28 | ] 29 | 30 | Your response must be a valid Python dictionary and should contain nothing else because we will directly execute it in Python. Please provide the ranking that the majority of humans would give. 31 | <|im_end|> -------------------------------------------------------------------------------- /audio_evals/evaluator/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict 3 | 4 | 5 | class Evaluator(ABC): 6 | 7 | @abstractmethod 8 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 9 | raise NotImplementedError() 10 | 11 | def __call__(self, pred, ref, **kwargs) -> Dict[str, any]: 12 | res = {"pred": pred, "ref": ref} 13 | eval_kwargs = {k: v for k, v in kwargs.items() if k not in ["pred", "label"]} 14 | res.update(self._eval(pred, ref, **eval_kwargs)) 15 | return res 16 | 17 | 18 | class Dump(Evaluator): 19 | 20 | def _eval(self, pred, label, **kwargs): 21 | return {} 22 | 23 | 24 | class EM(Evaluator): 25 | 26 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 27 | if type(label) in [int, float]: 28 | try: 29 | pred, label = float(pred), float(label) 30 | except: 31 | return {"match": 0, "pred": pred, "ref": label} 32 | elif isinstance(label, str): 33 | pred, label = str(pred).strip(), label.strip() 34 | 35 | return {"match": 1 if pred == label else 0, "pred": pred, "ref": label} 36 | 37 | 38 | class ExistMatch(Evaluator): 39 | 40 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 41 | if isinstance(label, list): 42 | for item in label: 43 | ans = self._eval(pred, item, **kwargs) 44 | if ans["match"] == 1: 45 | return ans 46 | return {"match": 0, "pred": pred, "ref": label} 47 | 48 | if type(label) in [int, float]: 49 | pred, label = float(pred), float(label) 50 | elif isinstance(label, str): 51 | pred, label = str(pred).strip().lower(), label.strip().lower() 52 | 53 | match = 0 54 | if label in pred: 55 | match = 1 56 | 57 | return {"match": match, "pred": label if match else pred, "ref": label} 58 | 59 | 60 | class PrefixMatch(Evaluator): 61 | 62 | def __init__(self, ignore_case: bool = True): 63 | self.ignore_case = ignore_case 64 | 65 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 66 | if self.ignore_case: 67 | pred = pred.lower().strip() 68 | label = str(label).lower().strip() 69 | n = len(label) 70 | return { 71 | 
"match": 1 if pred[:n] == label else 0, 72 | "pred": pred[:n], 73 | "ref": label, 74 | } 75 | -------------------------------------------------------------------------------- /audio_evals/evaluator/bbh.py: -------------------------------------------------------------------------------- 1 | from .base import Evaluator 2 | from typing import Dict, List 3 | import json 4 | 5 | 6 | class BBH(Evaluator): 7 | def __init__(self, ignore_case: bool = True): 8 | self.ignore_case = ignore_case 9 | 10 | def _extract_answer(self, response: str) -> str: 11 | response = response.lower() if self.ignore_case else response 12 | 13 | # 尝试从 JSON 格式中提取答案 14 | try: 15 | data = json.loads(response) 16 | if isinstance(data, dict) and "answer" in data: 17 | return data["answer"] 18 | except: 19 | pass 20 | 21 | # 尝试从文本中提取答案 22 | for line in response.split("\n"): 23 | line = line.strip() 24 | if line.startswith("answer:") or line.startswith("Answer:"): 25 | return line.split(":", 1)[1].strip() 26 | if line.startswith("the answer is") or line.startswith("The answer is"): 27 | return line.split("is", 1)[1].strip() 28 | 29 | return None 30 | 31 | def _eval(self, pred: str, label: str, **kwargs) -> Dict[str, any]: 32 | pred = str(pred) 33 | label = str(label) 34 | 35 | if self.ignore_case: 36 | pred = pred.lower() 37 | label = label.lower() 38 | 39 | extracted_answer = self._extract_answer(pred) 40 | if extracted_answer is None: 41 | return {"match": 0, "pred": pred, "ref": label, "fail": 1} 42 | 43 | return { 44 | "match": 1 if extracted_answer == label else 0, 45 | "pred": extracted_answer, 46 | "ref": label, 47 | "fail": 0, 48 | } 49 | -------------------------------------------------------------------------------- /audio_evals/evaluator/bleu.py: -------------------------------------------------------------------------------- 1 | import sacrebleu 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | class BLEU(Evaluator): 7 | def __init__(self, lang: str = "13a"): 8 | self.lang = "13a" 9 | if lang == "zh": 10 | self.lang = "zh" 11 | elif lang == "ja": 12 | self.lang = "ja-mecab" 13 | 14 | def _eval(self, pred: str, label: str, **kwargs): 15 | res = sacrebleu.corpus_bleu([pred], [[label]], tokenize=self.lang) 16 | return {"bleu": res.score} 17 | -------------------------------------------------------------------------------- /audio_evals/evaluator/coco.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | from audio_evals.lib.coco import compute_caption 5 | 6 | 7 | class Coco(Evaluator): 8 | 9 | def _eval(self, pred: str, label: Union[str, List[str]], **kwargs): 10 | pred = str(pred) 11 | if isinstance(label, str): 12 | label = [label] 13 | return compute_caption([label], [pred]) 14 | -------------------------------------------------------------------------------- /audio_evals/evaluator/dict_match.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | class DictEM(Evaluator): 7 | 8 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 9 | assert isinstance(label, dict), "label must be dictionaries, but {}".format( 10 | type(label) 11 | ) 12 | return {"match": 1 if pred == label else 0, "pred": pred, "ref": label} 13 | -------------------------------------------------------------------------------- /audio_evals/evaluator/dnsmos.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict 3 | 4 | from audio_evals.evaluator.base import Evaluator 5 | 6 | 7 | class DNSMOS(Evaluator): 8 | def __init__(self, model_name: str = "DNSMOS"): 9 | from audio_evals.registry import registry 10 | 11 | self.model = registry.get_model(model_name) 12 | 13 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 14 | pred = {"audio": str(pred)} 15 | res = self.model.inference(pred) 16 | res = json.loads(res) 17 | res["pred"] = pred 18 | res["ref"] = label 19 | return res 20 | -------------------------------------------------------------------------------- /audio_evals/evaluator/ensemble.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | class Ensemble(Evaluator): 7 | def __init__(self, components: List[str]): 8 | from audio_evals.registry import registry 9 | 10 | self.es = [] 11 | for item in components: 12 | e = registry.get_evaluator(item) 13 | if e is None: 14 | raise ValueError(f"Invalid component: {item}") 15 | self.es.append(e) 16 | 17 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 18 | res = {} 19 | for e in self.es: 20 | res.update(e(pred, label, **kwargs)) 21 | return res 22 | -------------------------------------------------------------------------------- /audio_evals/evaluator/qa_eval.py: -------------------------------------------------------------------------------- 1 | from .base import Evaluator 2 | import numpy as np 3 | from typing import Dict, List 4 | from qa_metrics.pedant import PEDANT 5 | 6 | 7 | def majority_vote(scores: List[str]) -> bool: 8 | scores = [item.lower() for item in scores] 9 | final_answer = max(set(scores), key=scores.count) 10 | return True if final_answer == "yes" else False 11 | 12 | 13 | class QAEval(Evaluator): 14 | def __init__(self): 15 | self.pedant = PEDANT() 16 | 17 | def _eval(self, pred: str, label: str, **kwargs) -> Dict[str, any]: 18 | pred = str(pred) 19 | label = str(label) 20 | 21 | # 使用 PEDANT 进行评测 22 | panda_score = self.pedant.evaluate( 23 | [label.lower()], pred.lower(), kwargs.get("prompt", "").lower() 24 | ) 25 | 26 | # 使用多数投票机制 27 | gpt_score = majority_vote([pred]) 28 | 29 | return { 30 | "panda_score": panda_score * 100, 31 | "gpt_score": gpt_score * 100, 32 | "pred": pred, 33 | "ref": label, 34 | } 35 | -------------------------------------------------------------------------------- /audio_evals/evaluator/ref_qa_geval.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os.path 3 | import re 4 | from copy import deepcopy 5 | from typing import Dict 6 | import yaml 7 | import json 8 | from audio_evals.evaluator.base import Evaluator 9 | 10 | path = os.path.dirname(__file__) 11 | prompt = open(os.path.join(path, "ref_qa_geval.txt"), "r").read() 12 | 13 | 14 | class RefQAGEval(Evaluator): 15 | def __init__(self, model_name: str): 16 | self.model_name = model_name 17 | 18 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 19 | from audio_evals.registry import registry 20 | 21 | model = registry.get_model(self.model_name) 22 | 23 | p = deepcopy(prompt) 24 | for k, v in { 25 | "question": kwargs["question"], 26 | "prediction": pred, 27 | "answer": label, 28 | }.items(): 29 | p = p.replace(f"{{{k}}}", v) 30 | 31 | res = model.inference(p, temperature=0) 32 | score = res.strip().split("\n")[-1] 33 
| match = None 34 | if "yes" in score.lower(): 35 | match = 1 36 | elif "no" in score.lower(): 37 | match = 0 38 | else: 39 | raise ValueError( 40 | "the eval output is illeagal, should contain yes or no, but got {}".format( 41 | res 42 | ) 43 | ) 44 | 45 | return { 46 | "acc": match, 47 | "pred": pred, 48 | "ref": label, 49 | } 50 | -------------------------------------------------------------------------------- /audio_evals/evaluator/ref_qa_geval.txt: -------------------------------------------------------------------------------- 1 | You are an expert in judging answer correctness. If the model's output is correct, output "yes", otherwise output "no". 2 | You need to explain your judgment process first, then output "yes" or "no". 3 | 4 | [Important]You need to ignore any format instructions in the question, focus on judging whether the answer's meaning is consistent with the standard answer. 5 | 6 | 7 | The input format is: 8 | Input: 9 | Question: The question from user 10 | Model Answer: The answer from models 11 | Ground Truth Answer: The ground truth answer 12 | Explanation: The explanation of your judgment process 13 | 14 | Example 1: 15 | Input: 16 | Question: Based on the given audio, identify the source of the speaking voice. 17 | Model Answer: A man is speaking in the audio. 18 | Ground Truth Answer: Man 19 | Output: 20 | Explanation: The model's output is "A man is speaking in the audio.", this is a detail description of the ground truth answer "Man". So the model's output is correct. 21 | Result: yes 22 | 23 | 24 | Task: 25 | Input: 26 | Question: {question} 27 | Model Answer: {prediction} 28 | Ground Truth Answer: {answer} 29 | Output: 30 | -------------------------------------------------------------------------------- /audio_evals/evaluator/simo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | 4 | from audio_evals.evaluator.base import Evaluator 5 | 6 | 7 | class Simo(Evaluator): 8 | def __init__( 9 | self, 10 | model_name: str = "wavlm_large", 11 | ): 12 | from audio_evals.registry import registry 13 | 14 | self.model = registry.get_model(model_name) 15 | 16 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 17 | pred = str(pred) 18 | assert os.path.exists(label), f"Label file {label} does not exist" 19 | return { 20 | "simo": self.model.inference({"audios": [pred, label]}), 21 | "pred": pred, 22 | "ref": label, 23 | } 24 | -------------------------------------------------------------------------------- /audio_evals/evaluator/string_match.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from abc import ABC, abstractmethod 4 | from typing import Dict 5 | 6 | from audio_evals.evaluator.base import Evaluator 7 | 8 | 9 | def string_match(answer, prediction, choices): 10 | # Function to normalize and tokenize text 11 | def tokenize(text): 12 | # Convert to lowercase and find all word tokens 13 | return set(re.findall(r"\b\w+\b", text.lower())) 14 | 15 | # Tokenize prediction and answer 16 | prediction_tokens = tokenize(prediction) 17 | answer_tokens = tokenize(answer) 18 | 19 | if not prediction_tokens: 20 | return False 21 | 22 | # Tokenize incorrect choices and exclude tokens present in the answer 23 | incorrect_tokens = set() 24 | for choice in choices: 25 | choice_tokens = tokenize(choice) 26 | if choice_tokens != answer_tokens: 27 | incorrect_tokens.update(choice_tokens - answer_tokens) 28 | 29 | # Condition 1: 
All tokens of the answer are in the prediction 30 | cond1 = answer_tokens.issubset(prediction_tokens) 31 | 32 | # Condition 2: Prediction does not contain any tokens from incorrect choices (excluding shared words) 33 | cond2 = prediction_tokens.isdisjoint(incorrect_tokens) 34 | 35 | return cond1 and cond2 36 | 37 | 38 | class ChoiceStringMatch(Evaluator): 39 | 40 | def _eval(self, pred, label, choices, **kwargs) -> Dict[str, any]: 41 | pred = str(pred) 42 | match = string_match(label, pred, choices) 43 | return { 44 | "match": 1 if match else 0, 45 | "pred": label if match else pred, 46 | "ref": label, 47 | } 48 | -------------------------------------------------------------------------------- /audio_evals/evaluator/utmos.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from audio_evals.evaluator.base import Evaluator 4 | 5 | 6 | class UTMOS(Evaluator): 7 | def __init__(self, model_name: str = "utmos-en"): 8 | from audio_evals.registry import registry 9 | 10 | self.model = registry.get_model(model_name) 11 | 12 | def _eval(self, pred, label, **kwargs) -> Dict[str, any]: 13 | pred = str(pred) 14 | return { 15 | "utmos": self.model.inference(pred), 16 | "pred": pred, 17 | "ref": label, 18 | } 19 | -------------------------------------------------------------------------------- /audio_evals/evaluator/wer.py: -------------------------------------------------------------------------------- 1 | from audio_evals.evaluator.base import Evaluator 2 | from audio_evals.lib.wer import compute_wer 3 | 4 | 5 | class WER(Evaluator): 6 | def __init__(self, ignore_case: bool = False, lang="en"): 7 | self.ignore_case = ignore_case 8 | self.lang = lang 9 | 10 | def _eval(self, pred: str, label: str, **kwargs): 11 | pred, label = str(pred), str(label) 12 | if self.ignore_case: 13 | pred, label = pred.lower(), label.lower() 14 | return { 15 | "wer%": compute_wer([label], [pred], language=self.lang) * 100, 16 | } 17 | 18 | 19 | class CER(Evaluator): 20 | def __init__(self, ignore_case: bool = False): 21 | self.ignore_case = ignore_case 22 | 23 | def _eval(self, pred: str, label: str, **kwargs): 24 | pred, label = str(pred), str(label) 25 | if self.ignore_case: 26 | pred, label = pred.lower(), label.lower() 27 | return {"cer%": compute_wer([label], [pred], language="zh") * 100} 28 | -------------------------------------------------------------------------------- /audio_evals/lib/DNSMOS/README.md: -------------------------------------------------------------------------------- 1 | # DNSMOS: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors 2 | 3 | Human subjective evaluation is the ”gold standard” to evaluate speech quality optimized for human perception. Perceptual objective metrics serve as a proxy for subjective scores. The conventional and widely used metrics require a reference clean speech signal, which is unavailable in real recordings. The no-reference approaches correlate poorly with human ratings and are not widely adopted in the research community. One of the biggest use cases of these perceptual objective metrics is to evaluate noise suppression algorithms. DNSMOS generalizes well in challenging test conditions with a high correlation to human ratings in stack ranking noise suppression methods. More details can be found in [DNSMOS paper](https://arxiv.org/pdf/2010.15258.pdf). 4 | 5 | ## Evaluation methodology: 6 | Use the **dnsmos_local.py** script. 7 | 1. 
To compute a personalized MOS score (where interfering speaker is penalized) provide the '-p' argument 8 | Ex: python dnsmos_local.py -t C:\temp\SampleClips -o sample.csv -p 9 | 2. To compute a regular MOS score omit the '-p' argument. 10 | Ex: python dnsmos_local.py -t C:\temp\SampleClips -o sample.csv 11 | 12 | ## Citation: 13 | If you have used the API for your research and development purpose, please cite the [DNSMOS paper](https://arxiv.org/pdf/2010.15258.pdf): 14 | ```BibTex 15 | @inproceedings{reddy2021dnsmos, 16 | title={Dnsmos: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors}, 17 | author={Reddy, Chandan KA and Gopal, Vishak and Cutler, Ross}, 18 | booktitle={ICASSP 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 19 | pages={6493--6497}, 20 | year={2021}, 21 | organization={IEEE} 22 | } 23 | ``` 24 | 25 | If you used DNSMOS P.835 please cite the [DNSMOS P.835](https://arxiv.org/pdf/2110.01763.pdf) paper: 26 | 27 | ```BibTex 28 | @inproceedings{reddy2022dnsmos, 29 | title={DNSMOS P.835: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors}, 30 | author={Reddy, Chandan KA and Gopal, Vishak and Cutler, Ross}, 31 | booktitle={ICASSP 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 32 | year={2022}, 33 | organization={IEEE} 34 | } 35 | ``` 36 | -------------------------------------------------------------------------------- /audio_evals/lib/DNSMOS/requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.1 2 | configparser==5.3.0 3 | librosa==0.8.1 4 | numpy==1.22.4 5 | onnxruntime==1.13.1 6 | -------------------------------------------------------------------------------- /audio_evals/lib/SenseVoice/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import select 5 | import sys 6 | from funasr import AutoModel 7 | from funasr.utils.postprocess_utils import rich_transcription_postprocess 8 | import torch 9 | 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument( 17 | "--path", type=str, required=True, help="Path to SenseVoice model" 18 | ) 19 | config = parser.parse_args() 20 | 21 | # Initialize model 22 | model = AutoModel( 23 | model=config.path, 24 | vad_model="fsmn-vad", 25 | vad_kwargs={"max_single_segment_time": 30000}, 26 | device="cuda" if torch.cuda.is_available() else "cpu", 27 | ) 28 | logger.info(f"Using SenseVoice model from: {config.path}") 29 | 30 | while True: 31 | try: 32 | prompt = input() 33 | anchor = prompt.find("->") 34 | if anchor == -1: 35 | print( 36 | "Error: Invalid conversation format, must contains ->, but {}".format( 37 | prompt 38 | ), 39 | flush=True, 40 | ) 41 | continue 42 | prefix = prompt[:anchor].strip() + "->" 43 | x = json.loads(prompt[anchor + 2 :]) 44 | 45 | # Process input 46 | res = model.generate( 47 | input=x["audio"], 48 | cache={}, 49 | language=x.get("language", "auto"), 50 | use_itn=True, 51 | batch_size_s=30000, 52 | merge_vad=True, 53 | merge_length_s=15, 54 | ) 55 | text = rich_transcription_postprocess(res[0]["text"]) 56 | while True: 57 | print(f"{prefix}{text}", flush=True) 58 | rlist, _, _ = select.select([sys.stdin], [], [], 1) 59 | if rlist: 60 | finish = sys.stdin.readline().strip() 
61 | if finish == "{}close".format(prefix): 62 | break 63 | print("not found close signal, will emit again", flush=True) 64 | 65 | except Exception as e: 66 | print(f"Error: {str(e)}", flush=True) 67 | -------------------------------------------------------------------------------- /audio_evals/lib/SenseVoice/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.111.1 2 | funasr 3 | funasr>=1.1.3 4 | gradio 5 | huggingface 6 | huggingface_hub 7 | modelscope 8 | numpy<=1.26.4 9 | torch<=2.3 10 | torchaudio 11 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/encodec.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tempfile 3 | 4 | import soundfile as sf 5 | from sparktts.models.audio_tokenizer import BiCodecTokenizer 6 | import torch 7 | import logging 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | device = "cuda" if torch.cuda.is_available() else "cpu" 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--path", type=str, required=True, help="Path to checkpoint file" 19 | ) 20 | config = parser.parse_args() 21 | 22 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 23 | logger.info(f"Using device: {device}") 24 | logger.info(f"Loading tokenizer from {config.path}") 25 | tokenizer = BiCodecTokenizer( 26 | model_dir=config.path, 27 | device=device, 28 | ) 29 | logger.info(f"successfully loaded tokenizer") 30 | 31 | while True: 32 | try: 33 | prompt = input() 34 | global_tokens, semantic_tokens = tokenizer.tokenize(prompt) 35 | wav_rec = tokenizer.detokenize(global_tokens.squeeze(0), semantic_tokens) 36 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 37 | sf.write(f.name, wav_rec, 16000) 38 | print("Result:" + f.name) 39 | except Exception as e: 40 | print("Error:{}".format(e)) 41 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/example/infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 SparkAudio 4 | # 2025 Xinsheng Wang (w.xinshawn@gmail.com) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | 19 | # Get the absolute path of the script's directory 20 | script_dir=$(dirname "$(realpath "$0")") 21 | 22 | # Get the root directory 23 | root_dir=$(dirname "$script_dir") 24 | 25 | # Set default parameters 26 | device=0 27 | save_dir='example/results' 28 | model_dir="pretrained_models/Spark-TTS-0.5B" 29 | text="身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。" 30 | prompt_text="吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。" 31 | prompt_speech_path="example/prompt_audio.wav" 32 | 33 | # Change directory to the root directory 34 | cd "$root_dir" || exit 35 | 36 | source sparktts/utils/parse_options.sh 37 | 38 | # Run inference 39 | python -m cli.inference \ 40 | --text "${text}" \ 41 | --device "${device}" \ 42 | --save_dir "${save_dir}" \ 43 | --model_dir "${model_dir}" \ 44 | --prompt_text "${prompt_text}" \ 45 | --prompt_speech_path "${prompt_speech_path}" 46 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | einops==0.8.1 2 | einx==0.3.0 3 | gradio==5.18.0 4 | numpy==2.2.3 5 | omegaconf==2.3.0 6 | packaging==24.2 7 | safetensors==0.5.2 8 | soundfile==0.12.1 9 | soxr==0.5.0.post1 10 | torch==2.5.1 11 | torchaudio==2.5.1 12 | tqdm==4.66.5 13 | transformers==4.46.2 14 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/sparktts/modules/blocks/layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 SparkAudio 2 | # 2025 Xinsheng Wang (w.xinshawn@gmail.com) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # Adapted from https://github.com/descriptinc/descript-audio-codec under the Apache License 2.0 17 | 18 | 19 | import torch 20 | import torch.nn as nn 21 | from torch.nn.utils import weight_norm 22 | 23 | 24 | def WNConv1d(*args, **kwargs): 25 | return weight_norm(nn.Conv1d(*args, **kwargs)) 26 | 27 | 28 | def WNConvTranspose1d(*args, **kwargs): 29 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 30 | 31 | 32 | # Scripting this brings model speed up 1.4x 33 | @torch.jit.script 34 | def snake(x, alpha): 35 | shape = x.shape 36 | x = x.reshape(shape[0], shape[1], -1) 37 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 38 | x = x.reshape(shape) 39 | return x 40 | 41 | 42 | class Snake1d(nn.Module): 43 | def __init__(self, channels): 44 | super().__init__() 45 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 46 | 47 | def forward(self, x): 48 | return snake(x, self.alpha) 49 | 50 | 51 | class ResidualUnit(nn.Module): 52 | def __init__(self, dim: int = 16, dilation: int = 1): 53 | super().__init__() 54 | pad = ((7 - 1) * dilation) // 2 55 | self.block = nn.Sequential( 56 | Snake1d(dim), 57 | WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad), 58 | Snake1d(dim), 59 | WNConv1d(dim, dim, kernel_size=1), 60 | ) 61 | 62 | def forward(self, x): 63 | y = self.block(x) 64 | pad = (x.shape[-1] - y.shape[-1]) // 2 65 | if pad > 0: 66 | x = x[..., pad:-pad] 67 | return x + y 68 | 69 | 70 | def init_weights(m): 71 | if isinstance(m, nn.Conv1d): 72 | nn.init.trunc_normal_(m.weight, std=0.02) 73 | nn.init.constant_(m.bias, 0) 74 | -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/sparktts/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/sparktts/utils/__init__.py -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/figures/gradio_TTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/figures/gradio_TTS.png -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/figures/gradio_control.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/figures/gradio_control.png -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/figures/infer_control.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/figures/infer_control.png -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/figures/infer_voice_cloning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/figures/infer_voice_cloning.png 
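A note on the Spark-TTS `encodec.py` script shown earlier under `audio_evals/lib/Spark-TTS/`: it is written as a long-lived worker that reads one audio path per line from stdin and answers on stdout with either `Result:<path to the reconstructed wav>` or `Error:<message>`; in this repo such workers are presumably driven by `audio_evals/isolate.py` (see the tree above). The sketch below is only an interactive smoke test under assumed paths — the checkpoint directory and the test wav are placeholders.

```bash
# Start the codec worker (placeholder checkpoint path). It keeps reading audio
# paths from stdin, one per line, and prints "Result:<reconstructed wav>" or
# "Error:<message>" for each; stop it with Ctrl-C.
python audio_evals/lib/Spark-TTS/encodec.py --path pretrained_models/Spark-TTS-0.5B
# then type a wav path at the prompt, e.g.:
#   assets/default.wav
```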
-------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/HKUST.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/HKUST.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/NPU.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/NPU.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/NTU.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/NTU.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SJU.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SJU.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SparkAudio.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SparkAudio.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SparkAudio2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SparkAudio2.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SparkTTS.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SparkTTS.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/SparkTTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/SparkTTS.png -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/mobvoi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/mobvoi.jpg -------------------------------------------------------------------------------- /audio_evals/lib/Spark-TTS/src/logo/mobvoi.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/Spark-TTS/src/logo/mobvoi.png -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 jishengpeng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/data/demo.txt: -------------------------------------------------------------------------------- 1 | ./example1.wav 2 | ./example2.wav 3 | ./example3.mp3 4 | ./example4.flac 5 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/decoder/__init__.py: -------------------------------------------------------------------------------- 1 | from decoder.pretrained import WavTokenizer 2 | 3 | 4 | __version__ = "0.0.3" 5 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/decoder/helpers.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import torch 4 | from matplotlib import pyplot as plt 5 | from pytorch_lightning import Callback 6 | 7 | matplotlib.use("Agg") 8 | 9 | 10 | def save_figure_to_numpy(fig: plt.Figure) -> np.ndarray: 11 | """ 12 | Save a matplotlib figure to a numpy array. 13 | 14 | Args: 15 | fig (Figure): Matplotlib figure object. 16 | 17 | Returns: 18 | ndarray: Numpy array representing the figure. 19 | """ 20 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") 21 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 22 | return data 23 | 24 | 25 | def plot_spectrogram_to_numpy(spectrogram: np.ndarray) -> np.ndarray: 26 | """ 27 | Plot a spectrogram and convert it to a numpy array. 28 | 29 | Args: 30 | spectrogram (ndarray): Spectrogram data. 31 | 32 | Returns: 33 | ndarray: Numpy array representing the plotted spectrogram. 
34 | """ 35 | spectrogram = spectrogram.astype(np.float32) 36 | fig, ax = plt.subplots(figsize=(12, 3)) 37 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 38 | plt.colorbar(im, ax=ax) 39 | plt.xlabel("Frames") 40 | plt.ylabel("Channels") 41 | plt.tight_layout() 42 | 43 | fig.canvas.draw() 44 | data = save_figure_to_numpy(fig) 45 | plt.close() 46 | return data 47 | 48 | 49 | class GradNormCallback(Callback): 50 | """ 51 | Callback to log the gradient norm. 52 | """ 53 | 54 | def on_after_backward(self, trainer, model): 55 | model.log("grad_norm", gradient_norm(model)) 56 | 57 | 58 | def gradient_norm(model: torch.nn.Module, norm_type: float = 2.0) -> torch.Tensor: 59 | """ 60 | Compute the gradient norm. 61 | 62 | Args: 63 | model (Module): PyTorch model. 64 | norm_type (float, optional): Type of the norm. Defaults to 2.0. 65 | 66 | Returns: 67 | Tensor: Gradient norm. 68 | """ 69 | grads = [p.grad for p in model.parameters() if p.grad is not None] 70 | total_norm = torch.norm( 71 | torch.stack([torch.norm(g.detach(), norm_type) for g in grads]), norm_type 72 | ) 73 | return total_norm 74 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # flake8: noqa 7 | 8 | """EnCodec neural audio codec.""" 9 | 10 | __version__ = "0.1.2a3" 11 | 12 | from .model import EncodecModel 13 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Torch modules.""" 8 | 9 | # flake8: noqa 10 | from .conv import ( 11 | pad1d, 12 | unpad1d, 13 | NormConv1d, 14 | NormConvTranspose1d, 15 | NormConv2d, 16 | NormConvTranspose2d, 17 | SConv1d, 18 | SConvTranspose1d, 19 | ) 20 | from .lstm import SLSTM 21 | from .seanet import SEANetEncoder, SEANetDecoder 22 | from .transformer import StreamingTransformerEncoder 23 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/modules/lstm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """LSTM layers module.""" 8 | 9 | from torch import nn 10 | 11 | 12 | class SLSTM(nn.Module): 13 | """ 14 | LSTM without worrying about the hidden state, nor the layout of the data. 15 | Expects input as convolutional layout. 
16 | """ 17 | 18 | def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True): 19 | super().__init__() 20 | self.skip = skip 21 | self.lstm = nn.LSTM(dimension, dimension, num_layers) 22 | 23 | # def forward(self, x): 24 | # x = x.permute(2, 0, 1) 25 | # y, _ = self.lstm(x) 26 | # if self.skip: 27 | # y = y + x 28 | # y = y.permute(1, 2, 0) 29 | # return y 30 | 31 | # 修改transpose顺序 32 | def forward(self, x): 33 | # # 插入reshape 34 | # x = x.reshape(x.shape) 35 | x1 = x.permute(2, 0, 1) 36 | y, _ = self.lstm(x1) 37 | y = y.permute(1, 2, 0) 38 | if self.skip: 39 | y = y + x 40 | return y 41 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/modules/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Normalization modules.""" 8 | 9 | import typing as tp 10 | 11 | import einops 12 | import torch 13 | from torch import nn 14 | 15 | 16 | class ConvLayerNorm(nn.LayerNorm): 17 | """ 18 | Convolution-friendly LayerNorm that moves channels to last dimensions 19 | before running the normalization and moves them back to original position right after. 20 | """ 21 | 22 | def __init__( 23 | self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs 24 | ): 25 | super().__init__(normalized_shape, **kwargs) 26 | 27 | def forward(self, x): 28 | x = einops.rearrange(x, "b ... t -> b t ...") 29 | x = super().forward(x) 30 | x = einops.rearrange(x, "b t ... -> b ... t") 31 | return 32 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/encoder/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # flake8: noqa 8 | from .vq import QuantizedResult, ResidualVectorQuantizer 9 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/infer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from encoder.utils import convert_audio 4 | import torchaudio 5 | import torch 6 | from decoder.pretrained import WavTokenizer 7 | 8 | import time 9 | 10 | import logging 11 | 12 | device1 = torch.device("cuda:0") 13 | # device2=torch.device('cpu') 14 | 15 | input_path = "./WavTokenizer/data/infer/lirbitts_testclean" 16 | out_folder = "./WavTokenizer/result/infer" 17 | # os.system("rm -r %s"%(out_folder)) 18 | # os.system("mkdir -p %s"%(out_folder)) 19 | # ll="libritts_testclean500_large" 20 | ll = "wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn_testclean_epoch34" 21 | 22 | tmptmp = out_folder + "/" + ll 23 | 24 | os.system("rm -r %s" % (tmptmp)) 25 | os.system("mkdir -p %s" % (tmptmp)) 26 | 27 | # 自己数据模型加载 28 | config_path = "./WavTokenizer/configs/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml" 29 | model_path = "./WavTokenizer/result/train/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn/lightning_logs/version_3/checkpoints/wavtokenizer_checkpoint_epoch=24_step=137150_val_loss=5.6731.ckpt" 30 | wavtokenizer = WavTokenizer.from_pretrained0802(config_path, model_path) 31 | wavtokenizer = wavtokenizer.to(device1) 32 | # wavtokenizer = wavtokenizer.to(device2) 33 | 34 | with open(input_path, "r") as fin: 35 | x = fin.readlines() 36 | 37 | x = [i.strip() for i in x] 38 | 39 | # 完成一些加速处理 40 | 41 | features_all = [] 42 | 43 | for i in range(len(x)): 44 | 45 | wav, sr = torchaudio.load(x[i]) 46 | # print("***:",x[i]) 47 | # wav = convert_audio(wav, sr, 24000, 1) # (1,131040) 48 | bandwidth_id = torch.tensor([0]) 49 | wav = wav.to(device1) 50 | print(i) 51 | 52 | features, discrete_code = wavtokenizer.encode_infer(wav, bandwidth_id=bandwidth_id) 53 | features_all.append(features) 54 | 55 | # wavtokenizer = wavtokenizer.to(device2) 56 | 57 | for i in range(len(x)): 58 | 59 | bandwidth_id = torch.tensor([0]) 60 | 61 | bandwidth_id = bandwidth_id.to(device1) 62 | 63 | print(i) 64 | audio_out = wavtokenizer.decode(features_all[i], bandwidth_id=bandwidth_id) 65 | # print(i,time.time()) 66 | # breakpoint() # (1, 131200) 67 | audio_path = out_folder + "/" + ll + "/" + x[i].split("/")[-1] 68 | # os.makedirs(out_folder + '/' + ll, exist_ok=True) 69 | torchaudio.save( 70 | audio_path, 71 | audio_out.cpu(), 72 | sample_rate=24000, 73 | encoding="PCM_S", 74 | bits_per_sample=16, 75 | ) 76 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/requirements.txt: -------------------------------------------------------------------------------- 1 | einops==0.6.1 2 | encodec==0.1.1 3 | fairseq 4 | huggingface_hub==0.23.0 5 | jsonargparse[signatures]>=4.15.2 6 | librosa 7 | matplotlib==3.7.1 8 | numpy==1.23.5 9 | pesq 10 | pytorch-lightning==1.8.6 11 | pyyaml==6.0 12 | scipy==1.10.1 13 | soundfile==0.12.1 14 | tensorboardX==2.6 15 | torch==2.0.0 16 | torchaudio==2.0.1 17 | torchcrepe 18 | transformers==4.28.1 19 | -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/result.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/WavTokenizer/result.png -------------------------------------------------------------------------------- /audio_evals/lib/WavTokenizer/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 4 | 5 | from pytorch_lightning.cli import LightningCLI, ArgsType 6 | 7 | 8 | def cli_main(args: ArgsType = None): 9 | # breakpoint() 10 | cli = LightningCLI(args=args) 11 | # breakpoint() 12 | cli.trainer.fit(model=cli.model, datamodule=cli.datamodule) 13 | 14 | 15 | if __name__ == "__main__": 16 | cli_main() 17 | -------------------------------------------------------------------------------- /audio_evals/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/__init__.py -------------------------------------------------------------------------------- /audio_evals/lib/cpm_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/cpm_tts/__init__.py -------------------------------------------------------------------------------- /audio_evals/lib/encodec/requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | numpy 3 | soundfile 4 | torch 5 | transformers 6 | -------------------------------------------------------------------------------- /audio_evals/lib/evaluate_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The OFA-Sys Team. All rights reserved. 2 | # This source code is licensed under the Apache 2.0 license 3 | # found in the LICENSE file in the root directory. 4 | 5 | import unicodedata 6 | 7 | from sacrebleu.tokenizers import TOKENIZERS 8 | 9 | 10 | class EvaluationTokenizer(object): 11 | """A generic evaluation-time tokenizer, which leverages built-in tokenizers 12 | in sacreBLEU (https://github.com/mjpost/sacrebleu). It additionally provides 13 | lowercasing, punctuation removal and character tokenization, which are 14 | applied after sacreBLEU tokenization. 15 | 16 | Args: 17 | tokenizer_type (str): the type of sacreBLEU tokenizer to apply. 18 | lowercase (bool): lowercase the text. 19 | punctuation_removal (bool): remove punctuation (based on unicode 20 | category) from text. 21 | character_tokenization (bool): tokenize the text to characters. 
22 | """ 23 | 24 | SPACE = chr(32) 25 | SPACE_ESCAPE = chr(9601) 26 | # ALL_TOKENIZER_TYPES = ChoiceEnum(["none", "13a", "intl", "zh", "ja-mecab"]) 27 | 28 | def __init__( 29 | self, 30 | tokenizer_type: str = "13a", 31 | lowercase: bool = False, 32 | punctuation_removal: bool = False, 33 | character_tokenization: bool = False, 34 | ): 35 | 36 | assert tokenizer_type in TOKENIZERS, f"{tokenizer_type}, {TOKENIZERS}" 37 | self.lowercase = lowercase 38 | self.punctuation_removal = punctuation_removal 39 | self.character_tokenization = character_tokenization 40 | self.tokenizer = TOKENIZERS[tokenizer_type] 41 | 42 | @classmethod 43 | def remove_punctuation(cls, sent: str): 44 | """Remove punctuation based on Unicode category.""" 45 | return cls.SPACE.join( 46 | t 47 | for t in sent.split(cls.SPACE) 48 | if not all(unicodedata.category(c)[0] == "P" for c in t) 49 | ) 50 | 51 | def tokenize(self, sent: str): 52 | tokenized = self.tokenizer()(sent) 53 | 54 | if self.punctuation_removal: 55 | tokenized = self.remove_punctuation(tokenized) 56 | 57 | if self.character_tokenization: 58 | tokenized = self.SPACE.join( 59 | list(tokenized.replace(self.SPACE, self.SPACE_ESCAPE)) 60 | ) 61 | 62 | if self.lowercase: 63 | tokenized = tokenized.lower() 64 | 65 | return tokenized 66 | -------------------------------------------------------------------------------- /audio_evals/lib/mimi/requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | numpy 3 | soundfile 4 | torch 5 | torchaudio 6 | transformers==4.49.0 7 | -------------------------------------------------------------------------------- /audio_evals/lib/minicpm/requirements.txt: -------------------------------------------------------------------------------- 1 | decord 2 | librosa==0.9.0 3 | moviepy 4 | numpy==1.26 5 | Pillow==10.1.0 6 | soundfile==0.12.1 7 | torch==2.2.0 8 | torchaudio==2.2.0 9 | torchvision==0.17.0 10 | transformers==4.44.2 11 | vector-quantize-pytorch==1.18.5 12 | vocos==0.1.0 13 | -------------------------------------------------------------------------------- /audio_evals/lib/minicpm_0_5B/requirements.txt: -------------------------------------------------------------------------------- 1 | decord 2 | librosa==0.9.0 3 | moviepy 4 | numpy==1.26 5 | Pillow==10.1.0 6 | soundfile==0.12.1 7 | torch==2.2.0 8 | torchaudio==2.2.0 9 | torchvision==0.17.0 10 | transformers==4.44.2 11 | vector-quantize-pytorch==1.18.5 12 | vocos==0.1.0 13 | -------------------------------------------------------------------------------- /audio_evals/lib/paraformer/requirements.txt: -------------------------------------------------------------------------------- 1 | funasr 2 | soundfile 3 | torch 4 | torchaudio 5 | -------------------------------------------------------------------------------- /audio_evals/lib/qwen2-5omni/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | flash-attn 3 | git+https://github.com/huggingface/transformers@3a1ead0aabed473eafe527915eea8c197d424356 4 | qwen-omni-utils[decord] 5 | torchvision 6 | -------------------------------------------------------------------------------- /audio_evals/lib/simo/requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | omegaconf 3 | pyyaml 4 | s3prl 5 | torch==2.2.0 6 | torchaudio 7 | tqdm 8 | transformers 9 | -------------------------------------------------------------------------------- 
/audio_evals/lib/text_normalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/lib/text_normalization/__init__.py -------------------------------------------------------------------------------- /audio_evals/lib/text_normalization/basic.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | import regex 5 | 6 | # non-ASCII letters that are not separated by "NFKD" normalization 7 | ADDITIONAL_DIACRITICS = { 8 | "œ": "oe", 9 | "Œ": "OE", 10 | "ø": "o", 11 | "Ø": "O", 12 | "æ": "ae", 13 | "Æ": "AE", 14 | "ß": "ss", 15 | "ẞ": "SS", 16 | "đ": "d", 17 | "Đ": "D", 18 | "ð": "d", 19 | "Ð": "D", 20 | "þ": "th", 21 | "Þ": "th", 22 | "ł": "l", 23 | "Ł": "L", 24 | } 25 | 26 | 27 | def remove_symbols_and_diacritics(s: str, keep=""): 28 | """ 29 | Replace any other markers, symbols, and punctuations with a space, 30 | and drop any diacritics (category 'Mn' and some manual mappings) 31 | """ 32 | return "".join( 33 | ( 34 | c 35 | if c in keep 36 | else ( 37 | ADDITIONAL_DIACRITICS[c] 38 | if c in ADDITIONAL_DIACRITICS 39 | else ( 40 | "" 41 | if unicodedata.category(c) == "Mn" 42 | else " " if unicodedata.category(c)[0] in "MSP" else c 43 | ) 44 | ) 45 | ) 46 | for c in unicodedata.normalize("NFKD", s) 47 | ) 48 | 49 | 50 | def remove_symbols(s: str): 51 | """ 52 | Replace any other markers, symbols, punctuations with a space, keeping diacritics 53 | """ 54 | return "".join( 55 | " " if unicodedata.category(c)[0] in "MSP" else c 56 | for c in unicodedata.normalize("NFKC", s) 57 | ) 58 | 59 | 60 | class BasicTextNormalizer: 61 | def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): 62 | self.clean = ( 63 | remove_symbols_and_diacritics if remove_diacritics else remove_symbols 64 | ) 65 | self.split_letters = split_letters 66 | 67 | def __call__(self, s: str): 68 | s = s.lower() 69 | s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets 70 | s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis 71 | s = self.clean(s).lower() 72 | 73 | if self.split_letters: 74 | s = " ".join(regex.findall(r"\X", s, regex.U)) 75 | 76 | s = re.sub( 77 | r"\s+", " ", s 78 | ) # replace any successive whitespace characters with a space 79 | 80 | return s 81 | -------------------------------------------------------------------------------- /audio_evals/lib/utmos/lightning_module.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import torch 3 | import torch.nn as nn 4 | from model import load_ssl_model, DomainEmbedding, LDConditioner, Projection 5 | import os 6 | 7 | 8 | class BaselineLightningModule(pl.LightningModule): 9 | def __init__(self, cfg): 10 | super().__init__() 11 | self.cfg = cfg 12 | self.construct_model() 13 | self.save_hyperparameters() 14 | 15 | def construct_model(self, path=None): 16 | if path is None: 17 | assert ( 18 | os.environ.get("SSL_MODEL_PATH") is not None 19 | ), "SSL_MODEL_PATH is not set" 20 | path = os.environ.get("SSL_MODEL_PATH") 21 | 22 | self.feature_extractors = nn.ModuleList( 23 | [ 24 | load_ssl_model(cp_path=path), 25 | DomainEmbedding(3, 128), 26 | ] 27 | ) 28 | output_dim = sum( 29 | [ 30 | feature_extractor.get_output_dim() 31 | for feature_extractor in self.feature_extractors 32 | ] 33 | ) 34 | output_layers = [ 35 | 
LDConditioner(judge_dim=128, num_judges=3000, input_dim=output_dim) 36 | ] 37 | output_dim = output_layers[-1].get_output_dim() 38 | output_layers.append( 39 | Projection( 40 | hidden_dim=2048, 41 | activation=torch.nn.ReLU(), 42 | range_clipping=False, 43 | input_dim=output_dim, 44 | ) 45 | ) 46 | 47 | self.output_layers = nn.ModuleList(output_layers) 48 | 49 | def forward(self, inputs): 50 | outputs = {} 51 | for feature_extractor in self.feature_extractors: 52 | outputs.update(feature_extractor(inputs)) 53 | x = outputs 54 | for output_layer in self.output_layers: 55 | x = output_layer(x, inputs) 56 | return x 57 | -------------------------------------------------------------------------------- /audio_evals/lib/utmos/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.0.0 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | analytics-python==1.4.0 5 | antlr4-python3-runtime==4.8 6 | anyio==3.5.0 7 | asgiref==3.5.0 8 | async-timeout==4.0.2 9 | attrs==21.4.0 10 | backoff==1.10.0 11 | bcrypt==3.2.0 12 | bitarray==2.4.0 13 | cachetools==5.0.0 14 | certifi==2021.10.8 15 | cffi==1.15.0 16 | charset-normalizer==2.0.12 17 | click==8.0.4 18 | colorama==0.4.4 19 | cryptography==36.0.1 20 | cycler==0.11.0 21 | Cython==0.29.28 22 | fairseq @ git+https://github.com/pytorch/fairseq.git@d03f4e771484a433f025f47744017c2eb6e9c6bc 23 | fastapi==0.75.0 24 | ffmpy==0.3.0 25 | fonttools==4.30.0 26 | frozenlist==1.3.0 27 | fsspec==2022.2.0 28 | future==0.18.2 29 | google-auth==2.6.0 30 | google-auth-oauthlib==0.4.6 31 | gradio==2.8.10 32 | grpcio==1.44.0 33 | h11==0.13.0 34 | hydra-core==1.0.7 35 | idna==3.3 36 | importlib-metadata==4.11.3 37 | Jinja2==3.0.3 38 | kiwisolver==1.3.2 39 | linkify-it-py==1.0.3 40 | Markdown==3.3.6 41 | markdown-it-py==2.0.1 42 | MarkupSafe==2.1.0 43 | matplotlib==3.5.1 44 | mdit-py-plugins==0.3.0 45 | mdurl==0.1.0 46 | monotonic==1.6 47 | multidict==6.0.2 48 | numpy==1.22.3 49 | oauthlib==3.2.0 50 | omegaconf==2.0.6 51 | orjson==3.6.7 52 | packaging==21.3 53 | pandas==1.4.1 54 | paramiko==2.10.1 55 | Pillow==9.0.1 56 | protobuf==3.19.4 57 | pyasn1==0.4.8 58 | pyasn1-modules==0.2.8 59 | pycparser==2.21 60 | pycryptodome==3.14.1 61 | pydantic==1.9.0 62 | pyDeprecate==0.3.1 63 | pydub==0.25.1 64 | PyNaCl==1.5.0 65 | pyparsing==3.0.7 66 | python-dateutil==2.8.2 67 | python-multipart==0.0.5 68 | pytorch-lightning==1.5.10 69 | pytz==2021.3 70 | PyYAML==6.0 71 | regex==2022.3.2 72 | requests==2.27.1 73 | requests-oauthlib==1.3.1 74 | rsa==4.8 75 | sacrebleu==2.0.0 76 | six==1.16.0 77 | sniffio==1.2.0 78 | starlette==0.17.1 79 | tabulate==0.8.9 80 | tensorboard==2.8.0 81 | tensorboard-data-server==0.6.1 82 | tensorboard-plugin-wit==1.8.1 83 | torch==1.11.0 84 | torchaudio==0.11.0 85 | torchmetrics==0.7.2 86 | tqdm==4.63.0 87 | typing-extensions==4.1.1 88 | uc-micro-py==1.0.1 89 | urllib3==1.26.8 90 | uvicorn==0.17.6 91 | Werkzeug==2.0.3 92 | yarl==1.7.2 93 | zipp==3.7.0 94 | -------------------------------------------------------------------------------- /audio_evals/lib/wer.py: -------------------------------------------------------------------------------- 1 | import editdistance as ed 2 | import zhconv 3 | 4 | from audio_evals.lib.evaluate_tokenizer import EvaluationTokenizer 5 | from audio_evals.lib.text_normalization.basic import BasicTextNormalizer 6 | from audio_evals.lib.text_normalization.cn_tn import TextNorm 7 | from audio_evals.lib.text_normalization.en import EnglishTextNormalizer 8 | 9 | english_normalizer = 
EnglishTextNormalizer() 10 | chinese_normalizer = TextNorm( 11 | to_banjiao=False, 12 | to_upper=False, 13 | to_lower=False, 14 | remove_fillers=False, 15 | remove_erhua=False, 16 | check_chars=False, 17 | remove_space=False, 18 | cc_mode="", 19 | ) 20 | basic_normalizer = BasicTextNormalizer() 21 | 22 | 23 | def compute_wer(refs, hyps, language="en"): 24 | distance = 0 25 | ref_length = 0 26 | tokenizer = EvaluationTokenizer( 27 | tokenizer_type="none", 28 | lowercase=True, 29 | punctuation_removal=False, 30 | character_tokenization=False, 31 | ) 32 | for i in range(len(refs)): 33 | ref = refs[i] 34 | pred = hyps[i] 35 | 36 | ref = english_normalizer(ref) 37 | pred = english_normalizer(pred) 38 | if language in ["zh"]: 39 | ref = chinese_normalizer(ref) 40 | pred = chinese_normalizer(pred) 41 | if language in ["yue"]: 42 | ref = zhconv.convert(ref, "zh-cn") 43 | pred = zhconv.convert(pred, "zh-cn") 44 | 45 | ref_items = tokenizer.tokenize(ref).split() 46 | pred_items = tokenizer.tokenize(pred).split() 47 | 48 | if language in ["zh", "yue"]: 49 | ref_items = [x for x in "".join(ref_items)] 50 | pred_items = [x for x in "".join(pred_items)] 51 | if len(refs) > 1 and i == 0: 52 | print(f"ref: {ref}") 53 | print(f"pred: {pred}") 54 | print(f"ref_items:\n{ref_items}\n{len(ref_items)}\n{ref_items[0]}") 55 | print(f"pred_items:\n{pred_items}\n{len(pred_items)}\n{pred_items[0]}") 56 | distance += ed.eval(ref_items, pred_items) 57 | ref_length += len(ref_items) 58 | return distance / ref_length 59 | -------------------------------------------------------------------------------- /audio_evals/lib/whisper/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import select 5 | import sys 6 | import torch 7 | from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--path", type=str, required=True, help="Path to Whisper model") 15 | config = parser.parse_args() 16 | 17 | # Initialize model 18 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 19 | torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 20 | 21 | model = AutoModelForSpeechSeq2Seq.from_pretrained( 22 | config.path, 23 | torch_dtype=torch_dtype, 24 | low_cpu_mem_usage=True, 25 | use_safetensors=True, 26 | ).eval() 27 | model.to(device) 28 | 29 | processor = AutoProcessor.from_pretrained(config.path) 30 | logger.info(f"Using Whisper model from: {config.path} on device: {device}") 31 | pipe = pipeline( 32 | "automatic-speech-recognition", 33 | model=model, 34 | tokenizer=processor.tokenizer, 35 | feature_extractor=processor.feature_extractor, 36 | torch_dtype=torch_dtype, 37 | device=device, 38 | ) 39 | while True: 40 | try: 41 | prompt = input() 42 | anchor = prompt.find("->") 43 | if anchor == -1: 44 | print( 45 | "Error: Invalid conversation format, must contain '->', but got {}".format( 46 | prompt 47 | ), 48 | flush=True, 49 | ) 50 | continue 51 | prefix = prompt[:anchor].strip() + "->" 52 | x = json.loads(prompt[anchor + 2 :]) 53 | 54 | # Process input 55 | 56 | kwargs = x.pop("kwargs", {}) 57 | x.update(kwargs) 58 | if "return_timestamps" not in x: 59 | x["return_timestamps"] = True 60 | 61 | logger.info(f"Received input: {x}") 62 | 63 | result = pipe(x.pop("audio"), **x) 64 | retry = 3 65 | while retry: 66 |
print(f"{prefix}{result['text']}", flush=True) 67 | rlist, _, _ = select.select([sys.stdin], [], [], 1) 68 | if rlist: 69 | finish = sys.stdin.readline().strip() 70 | if finish == "{}close".format(prefix): 71 | break 72 | print("not found close signal, will emit again", flush=True) 73 | retry -= 1 74 | except Exception as e: 75 | print(f"Error: {str(e)}", flush=True) 76 | -------------------------------------------------------------------------------- /audio_evals/lib/whisper/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.26.0 2 | torchaudio==2.5.1 3 | transformers==4.49.0 4 | -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/models/AudioEncoder/__init__.py -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/chattts.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import tempfile 4 | from dataclasses import asdict 5 | import torch 6 | import soundfile as sf 7 | import librosa 8 | from typing import Dict 9 | from vocos import Vocos 10 | from vocos.pretrained import instantiate_class 11 | 12 | from audio_evals.base import PromptStruct 13 | from audio_evals.lib.chattts import VocosConfig, DVAEConfig, DVAE 14 | from audio_evals.models.model import Model 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class ChatTTSModel(Model): 21 | def __init__(self, model_path: str, sample_params: Dict[str, any] = None): 22 | super().__init__(True, sample_params) 23 | self.device = "cuda:0" if torch.cuda.is_available() else "cpu" 24 | 25 | vocos_ckpt_path = os.path.join(model_path, "Vocos.pt") 26 | dvae_ckpt_path = os.path.join(model_path, "DVAE_full.pt") 27 | 28 | vocos_config = VocosConfig() 29 | feature_extractor = instantiate_class( 30 | args=(), init=asdict(vocos_config.feature_extractor) 31 | ) 32 | backbone = instantiate_class(args=(), init=asdict(vocos_config.backbone)) 33 | head = instantiate_class(args=(), init=asdict(vocos_config.head)) 34 | vocos = ( 35 | Vocos(feature_extractor=feature_extractor, backbone=backbone, head=head) 36 | .to(self.device) 37 | .eval() 38 | ) 39 | vocos.load_state_dict(torch.load(vocos_ckpt_path)) 40 | self.vocos = vocos 41 | 42 | dvae_config = DVAEConfig() 43 | dvae = DVAE( 44 | decoder_config=asdict(dvae_config.decoder), 45 | encoder_config=asdict(dvae_config.encoder), 46 | vq_config=asdict(dvae_config.vq), 47 | dim=dvae_config.decoder.idim, 48 | coef=None, 49 | device=self.device, 50 | ) 51 | dvae.load_pretrained(dvae_ckpt_path, self.device) 52 | 53 | self.dvae = dvae.eval() 54 | logger.info("model loaded successfully") 55 | 56 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 57 | audio_path = prompt["audio"] 58 | logger.debug(f"Processing audio file: {audio_path}") 59 | 60 | y, sr = librosa.load(audio_path, sr=24000, mono=True) 61 | waveform = torch.tensor(y).to(self.device) 62 | x = self.dvae(waveform, "encode") 63 | reconstructed_mel = self.dvae(x, "decode") 64 | reconstructed_waveform = self.vocos.decode(reconstructed_mel).cpu().numpy() 65 | 66 | waveform_mono = reconstructed_waveform.squeeze() 67 | # 保存生成的音频到临时文件 68 | with tempfile.NamedTemporaryFile(suffix=".wav", 
delete=False) as f: 69 | sf.write(f.name, waveform_mono, samplerate=24000, subtype="PCM_16") 70 | return f.name 71 | -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/cosyvoice.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | sys.path.append("/DATA/disk1/home/shiqundong/project/CosyVoice/third_party/Matcha-TTS") 5 | sys.path.append( 6 | "/DATA/disk1/home/shiqundong/project/CosyVoice/env/lib/python3.10/site-packages/" 7 | ) 8 | sys.path.append("/DATA/disk1/home/shiqundong/project/CosyVoice") 9 | 10 | import logging 11 | import os 12 | import tempfile 13 | import torch 14 | import soundfile as sf 15 | import librosa 16 | from typing import Dict 17 | import s3tokenizer 18 | from audio_evals.base import PromptStruct 19 | from audio_evals.models.model import OfflineModel 20 | from cosyvoice.cli.cosyvoice import CosyVoice2 21 | from cosyvoice.utils.file_utils import load_wav 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class CosyVoiceEncoder(OfflineModel): 27 | def __init__(self, model_path: str, sample_params: Dict[str, any] = None): 28 | super().__init__(True, sample_params) 29 | self.device = "cuda:0" if torch.cuda.is_available() else "cpu" 30 | self.tokenizer = s3tokenizer.load_model("speech_tokenizer_v2_25hz") 31 | self.tokenizer.to(self.device) 32 | 33 | self.model = CosyVoice2(model_path, load_jit=False, load_trt=False, fp16=False) 34 | logger.info("model loaded successfully") 35 | 36 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 37 | audio_path = prompt["audio"] 38 | logger.debug(f"Processing audio file: {audio_path}") 39 | 40 | x = load_wav(audio_path, 16_000) 41 | mel = s3tokenizer.log_mel_spectrogram(x.squeeze(0)) 42 | mels, mels_lens = s3tokenizer.padding([mel]) 43 | audio_tokens = self.tokenizer.quantize( 44 | mels.to(self.device), mels_lens.to(self.device) 45 | )[0] 46 | 47 | waveform = x.to(self.device) 48 | sr = torch.tensor(self.model.sample_rate).to(self.device) 49 | model_input = self.model.frontend.frontend_token2wav(waveform, sr) 50 | wav_out = self.model.model.token2wav( 51 | token=audio_tokens, 52 | prompt_token=model_input["flow_prompt_speech_token"], 53 | prompt_feat=model_input["prompt_speech_feat"], 54 | embedding=model_input["flow_embedding"], 55 | uuid=None, 56 | token_offset=0, 57 | speed=1.0, 58 | ) 59 | wav_out = wav_out.squeeze() 60 | # 保存生成的音频到临时文件 61 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 62 | sf.write(f.name, wav_out.cpu().numpy(), samplerate=self.model.sample_rate) 63 | return f.name 64 | -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/vocos_encode.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tempfile 3 | 4 | import torch 5 | import soundfile as sf 6 | import librosa 7 | from typing import Dict 8 | from vocos import Vocos 9 | from vocos.pretrained import instantiate_class 10 | 11 | from audio_evals.base import PromptStruct 12 | from audio_evals.models.model import Model 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class VocosModel(Model): 18 | def __init__( 19 | self, 20 | model_path: str, 21 | feature_extractor: Dict[str, any], 22 | backbone: Dict[str, any], 23 | head: Dict[str, any], 24 | sample_params: Dict[str, any] = None, 25 | ): 26 | super().__init__(True, sample_params) # 作为非聊天模型 27 | 
self.device = "cuda:0" if torch.cuda.is_available() else "cpu" 28 | logger.info(f"Loading Vocos model from {model_path} to device {self.device}") 29 | feature_extractor = instantiate_class(args=(), init=feature_extractor) 30 | backbone = instantiate_class(args=(), init=backbone) 31 | head = instantiate_class(args=(), init=head) 32 | self.model = Vocos( 33 | feature_extractor=feature_extractor, backbone=backbone, head=head 34 | ) 35 | self.model.to(self.device) 36 | self.model.eval() 37 | self.model.load_state_dict(torch.load(model_path, weights_only=True, mmap=True)) 38 | logger.info("Vocos model loaded successfully") 39 | 40 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 41 | audio_path = prompt["audio"] 42 | logger.debug(f"Processing audio file: {audio_path}") 43 | y, sr = librosa.load(audio_path, sr=None) 44 | waveform = torch.tensor(y).unsqueeze(0).to(self.device) 45 | generated_audio = self.model(waveform) 46 | 47 | # 保存生成的音频到临时文件 48 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 49 | sf.write(f.name, generated_audio.squeeze().cpu().numpy(), samplerate=22050) 50 | return f.name 51 | -------------------------------------------------------------------------------- /audio_evals/models/AudioEncoder/wav_tokenizer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("audio_evals/lib/WavTokenizer") 4 | import logging 5 | import os.path 6 | import tempfile 7 | 8 | 9 | from typing import Dict 10 | from audio_evals.lib.WavTokenizer.encoder.utils import convert_audio 11 | import torchaudio 12 | import torch 13 | from audio_evals.lib.WavTokenizer.decoder.pretrained import WavTokenizer 14 | from audio_evals.base import PromptStruct 15 | from audio_evals.models.model import OfflineModel 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class WavTokenizerEncoder(OfflineModel): 21 | 22 | def __init__( 23 | self, 24 | config_name: str, 25 | model_path: str, 26 | sample_params: Dict[str, any] = None, 27 | ): 28 | super().__init__(is_chat=True, sample_params=sample_params) 29 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 30 | self.config_name = os.path.join( 31 | "audio_evals/lib/WavTokenizer/configs", config_name 32 | ) 33 | 34 | logger.info(f"Loading WavTokenizer from {model_path}") 35 | self.model = WavTokenizer.from_pretrained0802(self.config_name, model_path) 36 | self.model = self.model.to(self.device) 37 | logger.info(f"WavTokenizer loaded on {self.device}") 38 | 39 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 40 | audio_path = prompt["audio"] 41 | 42 | wav, sr = torchaudio.load(audio_path) 43 | wav = convert_audio(wav, sr, 24000, 1) 44 | bandwidth_id = torch.tensor([0]) 45 | wav = wav.to(self.device) 46 | features, discrete_code = self.model.encode_infer( 47 | wav, bandwidth_id=bandwidth_id 48 | ) 49 | audio_out = self.model.decode( 50 | features, bandwidth_id=bandwidth_id.to(self.device) 51 | ) 52 | 53 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 54 | torchaudio.save( 55 | f.name, 56 | audio_out.cpu(), 57 | sample_rate=24000, 58 | encoding="PCM_S", 59 | bits_per_sample=16, 60 | ) 61 | return f.name 62 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/models/TTS/__init__.py -------------------------------------------------------------------------------- /audio_evals/models/TTS/amphion.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/Amphion/main.py") 13 | class Amphion(OfflineModel): 14 | def __init__(self, path: str, sample_params: Dict = None, *args, **kwargs): 15 | self.command_args = { 16 | "path": path, 17 | } 18 | super().__init__(is_chat=True, sample_params=sample_params) 19 | 20 | def _inference(self, prompt: PromptStruct, **kwargs): 21 | import uuid 22 | 23 | uid = str(uuid.uuid4()) 24 | prefix = f"{uid}->" 25 | 26 | while True: 27 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 28 | if wlist: 29 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 30 | self.process.stdin.flush() 31 | print("already write in") 32 | break 33 | while True: 34 | rlist, _, _ = select.select( 35 | [self.process.stdout, self.process.stderr], [], [], 60 36 | ) 37 | if not rlist: 38 | err_msg = "Read timeout after 60 seconds" 39 | logger.error(err_msg) 40 | raise RuntimeError(err_msg) 41 | 42 | try: 43 | for stream in rlist: 44 | if stream == self.process.stdout: 45 | result = self.process.stdout.readline().strip() 46 | if not result: 47 | continue 48 | if result.startswith(prefix): 49 | self.process.stdin.write("{}close\n".format(prefix)) 50 | self.process.stdin.flush() 51 | return result[len(prefix) :] 52 | elif result.startswith("Error:"): 53 | raise RuntimeError("Amphion failed: {}".format(result)) 54 | else: 55 | logger.info(result) 56 | elif stream == self.process.stderr: 57 | err = self.process.stderr.readline().strip() 58 | if err: 59 | logger.error(f"Process stderr: {err}") 60 | except BlockingIOError as e: 61 | logger.error(f"BlockingIOError occurred: {str(e)}") 62 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/megatts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/MegaTTS3/main.py") 13 | class MegaTTS(OfflineModel): 14 | def __init__(self, path: str, sample_params: Dict = None, *args, **kwargs): 15 | self.command_args = { 16 | "path": path, 17 | } 18 | super().__init__(is_chat=True, sample_params=sample_params) 19 | 20 | def _inference(self, prompt: PromptStruct, **kwargs): 21 | import uuid 22 | 23 | uid = str(uuid.uuid4()) 24 | prefix = f"{uid}->" 25 | 26 | while True: 27 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 28 | if wlist: 29 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 30 | self.process.stdin.flush() 31 | print("already write in") 32 | break 33 | while True: 34 | rlist, _, _ = select.select( 35 | [self.process.stdout, self.process.stderr], [], [], 60 36 | ) 37 | if not rlist: 38 | err_msg = "Read timeout after 60 
seconds" 39 | logger.error(err_msg) 40 | raise RuntimeError(err_msg) 41 | 42 | try: 43 | for stream in rlist: 44 | if stream == self.process.stdout: 45 | result = self.process.stdout.readline().strip() 46 | if not result: 47 | continue 48 | if result.startswith(prefix): 49 | self.process.stdin.write("{}close\n".format(prefix)) 50 | self.process.stdin.flush() 51 | return result[len(prefix) :] 52 | elif result.startswith("Error:"): 53 | raise RuntimeError("MegaTTS failed: {}".format(result)) 54 | else: 55 | logger.info(result) 56 | elif stream == self.process.stderr: 57 | err = self.process.stderr.readline().strip() 58 | if err: 59 | logger.error(f"Process stderr: {err}") 60 | except BlockingIOError as e: 61 | logger.error(f"BlockingIOError occurred: {str(e)}") 62 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/melotts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/MeloTTS/main.py") 13 | class MeloTTS(OfflineModel): 14 | def __init__( 15 | self, path: str, lang: str, sample_params: Dict = None, *args, **kwargs 16 | ): 17 | self.command_args = { 18 | "path": path, 19 | "lang": lang, 20 | } 21 | super().__init__(is_chat=True, sample_params=sample_params) 22 | 23 | def _inference(self, prompt: PromptStruct, **kwargs): 24 | import uuid 25 | 26 | uid = str(uuid.uuid4()) 27 | prefix = f"{uid}->" 28 | prompt.update(kwargs) 29 | 30 | while True: 31 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 32 | if wlist: 33 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 34 | self.process.stdin.flush() 35 | print("already write in") 36 | break 37 | while True: 38 | rlist, _, _ = select.select( 39 | [self.process.stdout, self.process.stderr], [], [], 60 40 | ) 41 | if not rlist: 42 | err_msg = "Read timeout after 60 seconds" 43 | logger.error(err_msg) 44 | raise RuntimeError(err_msg) 45 | 46 | try: 47 | for stream in rlist: 48 | if stream == self.process.stdout: 49 | result = self.process.stdout.readline().strip() 50 | if not result: 51 | continue 52 | if result.startswith(prefix): 53 | self.process.stdin.write("{}close\n".format(prefix)) 54 | self.process.stdin.flush() 55 | return result[len(prefix) :] 56 | elif result.startswith("Error:"): 57 | raise RuntimeError("MeloTTS failed: {}".format(result)) 58 | else: 59 | logger.info(result) 60 | elif stream == self.process.stderr: 61 | err = self.process.stderr.readline().strip() 62 | if err: 63 | logger.error(f"Process stderr: {err}") 64 | except BlockingIOError as e: 65 | logger.error(f"BlockingIOError occurred: {str(e)}") 66 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/spark.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/Spark-TTS/main.py") 13 | class SparkVoiceClone(OfflineModel): 14 | def __init__( 15 | self, path: str, 
vc_mode: bool, sample_params: Dict = None, *args, **kwargs 16 | ): 17 | self.command_args = { 18 | "path": path, 19 | } 20 | if vc_mode: 21 | self.command_args["vc_mode"] = "" 22 | super().__init__(is_chat=True, sample_params=sample_params) 23 | 24 | def _inference(self, prompt: PromptStruct, **kwargs): 25 | import uuid 26 | 27 | uid = str(uuid.uuid4()) 28 | prefix = f"{uid}->" 29 | 30 | while True: 31 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 32 | if wlist: 33 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 34 | self.process.stdin.flush() 35 | print("already write in") 36 | break 37 | while True: 38 | rlist, _, _ = select.select( 39 | [self.process.stdout, self.process.stderr], [], [], 60 40 | ) 41 | if not rlist: 42 | err_msg = "Read timeout after 60 seconds" 43 | logger.error(err_msg) 44 | raise RuntimeError(err_msg) 45 | 46 | try: 47 | for stream in rlist: 48 | if stream == self.process.stdout: 49 | result = self.process.stdout.readline().strip() 50 | if not result: 51 | continue 52 | if result.startswith(prefix): 53 | self.process.stdin.write("{}close\n".format(prefix)) 54 | self.process.stdin.flush() 55 | return result[len(prefix) :] 56 | elif result.startswith("Error:"): 57 | raise RuntimeError("Spark failed: {}".format(result)) 58 | else: 59 | logger.info(result) 60 | elif stream == self.process.stderr: 61 | err = self.process.stderr.readline().strip() 62 | if err: 63 | logger.error(f"Process stderr: {err}") 64 | except BlockingIOError as e: 65 | logger.error(f"BlockingIOError occurred: {str(e)}") 66 | -------------------------------------------------------------------------------- /audio_evals/models/TTS/stabletts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @isolated("audio_evals/lib/StableTTS/main.py") 13 | class StableTTS(OfflineModel): 14 | def __init__( 15 | self, 16 | tts_path: str, 17 | vocoder_path: str, 18 | vocoder_type: str, 19 | sample_params: Dict = None, 20 | *args, 21 | **kwargs, 22 | ): 23 | self.command_args = { 24 | "tts_path": tts_path, 25 | "vocoder_path": vocoder_path, 26 | "vocoder_type": vocoder_type, 27 | } 28 | super().__init__(is_chat=True, sample_params=sample_params) 29 | 30 | def _inference(self, prompt: PromptStruct, **kwargs): 31 | import uuid 32 | 33 | uid = str(uuid.uuid4()) 34 | prefix = f"{uid}->" 35 | 36 | while True: 37 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 38 | if wlist: 39 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 40 | self.process.stdin.flush() 41 | print("already write in") 42 | break 43 | while True: 44 | rlist, _, _ = select.select( 45 | [self.process.stdout, self.process.stderr], [], [], 60 46 | ) 47 | if not rlist: 48 | err_msg = "Read timeout after 60 seconds" 49 | logger.error(err_msg) 50 | raise RuntimeError(err_msg) 51 | 52 | try: 53 | for stream in rlist: 54 | if stream == self.process.stdout: 55 | result = self.process.stdout.readline().strip() 56 | if not result: 57 | continue 58 | if result.startswith(prefix): 59 | self.process.stdin.write("{}close\n".format(prefix)) 60 | self.process.stdin.flush() 61 | return result[len(prefix) :] 62 | elif result.startswith("Error:"): 63 | raise RuntimeError("StableTTS failed: 
{}".format(result)) 64 | else: 65 | logger.info(result) 66 | elif stream == self.process.stderr: 67 | err = self.process.stderr.readline().strip() 68 | if err: 69 | logger.error(f"Process stderr: {err}") 70 | except BlockingIOError as e: 71 | logger.error(f"BlockingIOError occurred: {str(e)}") 72 | -------------------------------------------------------------------------------- /audio_evals/models/UltraVOX.py: -------------------------------------------------------------------------------- 1 | # pip install transformers peft librosa 2 | import logging 3 | from typing import List, Dict, Tuple 4 | 5 | import transformers 6 | import librosa 7 | 8 | from audio_evals.base import PromptStruct 9 | from audio_evals.models.model import Model 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class UltraVOX(Model): 15 | def __init__(self, path: str, sample_params: Dict[str, any] = None): 16 | super().__init__(True, sample_params) # as a chat model 17 | logger.debug("start load model from {}".format(path)) 18 | self.pipe = transformers.pipeline(model=path, trust_remote_code=True, device=0) 19 | logger.debug("model loaded") 20 | self.max_new_tokens = 30 21 | 22 | @staticmethod 23 | def _conv_prompt(prompt: PromptStruct) -> Tuple[str, str, List[Dict[str, str]]]: 24 | audio, sr = "", "" 25 | turns = [ 26 | { 27 | "role": "system", 28 | "content": "You are a friendly and helpful character. You love to answer questions for people.", 29 | }, 30 | ] 31 | for line in prompt: 32 | role = line["role"] 33 | for c in line["contents"]: 34 | if c["type"] == "audio": 35 | audio, sr = librosa.load(c["value"], sr=16000) 36 | if c["type"] == "text": 37 | turns.append({"role": role, "content": c["value"] + " <|audio|>"}) 38 | return audio, sr, turns 39 | 40 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 41 | audio, sr, turns = self._conv_prompt(prompt) 42 | logger.debug("turns: {}".format(turns)) 43 | return self.pipe( 44 | {"audio": audio, "turns": turns, "sampling_rate": sr}, **kwargs 45 | ) 46 | -------------------------------------------------------------------------------- /audio_evals/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/models/__init__.py -------------------------------------------------------------------------------- /audio_evals/models/ali.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from http import HTTPStatus 4 | from typing import Dict 5 | 6 | from dashscope import MultiModalConversation 7 | 8 | from audio_evals.base import PromptStruct 9 | from audio_evals.models.model import APIModel 10 | 11 | 12 | class AliApi(APIModel): 13 | 14 | def __init__( 15 | self, 16 | model_name: str = "qwen2-audio-instruct", 17 | sample_params: Dict[str, any] = None, 18 | ): 19 | super().__init__(True, sample_params) 20 | self.model = model_name 21 | assert "DASHSCOPE_API_KEY" in os.environ, ValueError( 22 | "not found DASHSCOPE_API_KEY in your ENV" 23 | ) 24 | 25 | def _inference(self, prompt: PromptStruct, **kwargs): 26 | messages = [] 27 | for content in deepcopy(prompt): 28 | for i, line in enumerate(content["contents"]): 29 | if line["type"] == "text": 30 | content["contents"][i] = {"text": line["value"]} 31 | else: 32 | content["contents"][i] = { 33 | line["type"]: "file://{}".format(line["value"]) 34 | } 35 | 36 | content["content"] = 
content["contents"] 37 | del content["contents"] 38 | messages.append(content) 39 | 40 | response = MultiModalConversation.call(model=self.model, messages=messages) 41 | if response.status_code == HTTPStatus.OK: 42 | return response.output.choices[0].message.content[0]["text"] 43 | raise Exception("{}: {}".format(response.code, response.message)) 44 | -------------------------------------------------------------------------------- /audio_evals/models/asr/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict 3 | 4 | import requests 5 | 6 | from audio_evals.base import PromptStruct 7 | from audio_evals.models.model import APIModel 8 | from audio_evals.utils import get_base64_from_file 9 | 10 | 11 | class AsrServer(APIModel): 12 | def __init__(self, url: str, sample_params: Dict[str, any] = None): 13 | super().__init__(True, sample_params) 14 | self.url = url 15 | 16 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 17 | 18 | audio_file = prompt["audio"] 19 | audio_base64 = get_base64_from_file(audio_file) 20 | headers = {"Content-Type": "application/json"} 21 | data = {"audio": audio_base64} 22 | response = requests.post( 23 | self.url, headers=headers, data=json.dumps(data), stream=True 24 | ) 25 | if response.status_code == 200: 26 | return response.text 27 | else: 28 | raise Exception(f"Error: {response.status_code} - {response.text}") 29 | -------------------------------------------------------------------------------- /audio_evals/models/asr/fireredasr.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | from audio_evals.base import PromptStruct 4 | from audio_evals.models.model import OfflineModel 5 | from audio_evals.isolate import isolated 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | @isolated( 12 | "audio_evals/lib/FireRedASR/main.py", 13 | pre_command="export PYTHONPATH=$PWD/:$PYTHONPATH", 14 | ) 15 | class FireRedASR(OfflineModel): 16 | def __init__(self, path: str, sample_params: Dict = None, *args, **kwargs): 17 | self.command_args = { 18 | "path": path, 19 | } 20 | super().__init__(is_chat=False, sample_params=sample_params) 21 | 22 | def _inference(self, prompt: PromptStruct, **kwargs) -> float: 23 | audio = prompt["audio"] 24 | self.process.stdin.write(f"{audio}\n") 25 | self.process.stdin.flush() 26 | import select 27 | 28 | while True: 29 | reads, _, _ = select.select( 30 | [self.process.stdout, self.process.stderr], [], [], 1.0 31 | ) 32 | for read in reads: 33 | if read is self.process.stdout: 34 | result = self.process.stdout.readline().strip() 35 | if result: 36 | if result.startswith("Result:"): 37 | return result[7:] 38 | elif result.startswith("Error:"): 39 | raise RuntimeError("FireRedASR failed: {}".format(result)) 40 | else: 41 | logger.info(result) 42 | if read is self.process.stderr: 43 | error_output = self.process.stderr.readline() 44 | if error_output: 45 | print(f"stderr: {error_output.strip()}") 46 | -------------------------------------------------------------------------------- /audio_evals/models/asr/paraformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | 10 | logger = 
logging.getLogger(__name__) 11 | 12 | 13 | @isolated("audio_evals/lib/paraformer/main.py") 14 | class Paraformer(OfflineModel): 15 | def __init__(self, path: str, sample_params: Dict = None, *args, **kwargs): 16 | if not os.path.exists(path): 17 | path = self._download_model(path) 18 | 19 | self.command_args = { 20 | "path": path, 21 | } 22 | super().__init__(is_chat=False, sample_params=sample_params) 23 | 24 | def _inference(self, prompt: PromptStruct, **kwargs) -> float: 25 | audio = prompt["audio"] 26 | import uuid 27 | 28 | uid = str(uuid.uuid4()) 29 | prefix = f"{uid}->" 30 | while True: 31 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 32 | if wlist: 33 | self.process.stdin.write(f"{prefix}{audio}\n") 34 | self.process.stdin.flush() 35 | print("already write in") 36 | break 37 | print("waiting for write") 38 | while True: 39 | reads, _, _ = select.select( 40 | [self.process.stdout, self.process.stderr], [], [], 1.0 41 | ) 42 | for read in reads: 43 | if read is self.process.stdout: 44 | result = self.process.stdout.readline().strip() 45 | if result: 46 | if result.startswith(prefix): 47 | self.process.stdin.write("{}close\n".format(prefix)) 48 | self.process.stdin.flush() 49 | return result[len(prefix) :] 50 | elif result.startswith("Error:"): 51 | raise RuntimeError("FireRedASR failed: {}".format(result)) 52 | else: 53 | logger.info(result) 54 | if read is self.process.stderr: 55 | error_output = self.process.stderr.readline() 56 | if error_output: 57 | print(f"stderr: {error_output.strip()}") 58 | -------------------------------------------------------------------------------- /audio_evals/models/asr/sherpa.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import select 4 | import uuid 5 | from typing import Dict 6 | 7 | from audio_evals.base import PromptStruct 8 | from audio_evals.models.model import OfflineModel 9 | from audio_evals.isolate import isolated 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @isolated("audio_evals/lib/sherpa-onnx/main.py") 15 | class SherpaOnnx(OfflineModel): 16 | def __init__(self, tokens: str, sample_params: Dict = None, *args, **kwargs): 17 | self.command_args = { 18 | "tokens": tokens, 19 | } 20 | for k, v in kwargs.items(): 21 | if k == "offline": 22 | if v: 23 | v = "" 24 | else: 25 | continue 26 | 27 | self.command_args[k] = v 28 | super().__init__(is_chat=True, sample_params=sample_params) 29 | 30 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 31 | audio = prompt["audio"] 32 | uid = str(uuid.uuid4()) 33 | prefix = f"{uid}->" 34 | 35 | # Send request 36 | while True: 37 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 38 | if wlist: 39 | request = json.dumps({"audio": audio}) 40 | self.process.stdin.write(f"{prefix}{request}\n") 41 | self.process.stdin.flush() 42 | break 43 | 44 | # Receive response 45 | while True: 46 | reads, _, _ = select.select( 47 | [self.process.stdout, self.process.stderr], [], [], 1.0 48 | ) 49 | for read in reads: 50 | if read is self.process.stdout: 51 | result = self.process.stdout.readline().strip() 52 | if result: 53 | if result.startswith(prefix): 54 | # Close the request 55 | self.process.stdin.write(f"{prefix}close\n") 56 | self.process.stdin.flush() 57 | 58 | # Parse and return the result 59 | response = json.loads(result[len(prefix) :]) 60 | return response["text"] 61 | elif result.startswith("Error:"): 62 | raise RuntimeError(f"SherpaOnnx failed: {result}") 63 | else: 64 | 
logger.info(result) 65 | if read is self.process.stderr: 66 | error_output = self.process.stderr.readline() 67 | if error_output: 68 | logger.error(f"stderr: {error_output.strip()}") 69 | -------------------------------------------------------------------------------- /audio_evals/models/asr/tencent.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import subprocess 4 | import tempfile 5 | from typing import Dict 6 | from tencentcloud.common import credential 7 | from tencentcloud.common.profile.client_profile import ClientProfile 8 | from tencentcloud.common.profile.http_profile import HttpProfile 9 | from tencentcloud.asr.v20190614 import asr_client, models 10 | from audio_evals.base import PromptStruct 11 | from audio_evals.models.model import APIModel 12 | import logging 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class TencentASRModel(APIModel): 19 | def __init__( 20 | self, 21 | secret_id: str, 22 | secret_key: str, 23 | region: str = "ap-guangzhou", 24 | sample_params: Dict[str, any] = None, 25 | ): 26 | super().__init__(False, sample_params) 27 | self.secret_id = secret_id 28 | self.secret_key = secret_key 29 | self.region = region 30 | 31 | # 初始化认证对象 32 | self.cred = credential.Credential(self.secret_id, self.secret_key) 33 | 34 | # 配置 HTTP 选项 35 | self.http_profile = HttpProfile() 36 | self.http_profile.endpoint = "asr.tencentcloudapi.com" 37 | 38 | # 配置客户端参数 39 | self.client_profile = ClientProfile() 40 | self.client_profile.httpProfile = self.http_profile 41 | 42 | # 初始化 ASR 客户端 43 | self.client = asr_client.AsrClient(self.cred, self.region, self.client_profile) 44 | 45 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 46 | audio = prompt["audio"] 47 | logger.debug(f"Processing audio file: {audio}") 48 | 49 | with tempfile.NamedTemporaryFile(suffix=".wav") as tmp_file: 50 | audio_path = tmp_file.name 51 | subprocess.run( 52 | ["ffmpeg", "-y", "-i", audio, "-ar", "16000", "-ac", "1", audio_path], 53 | capture_output=True, 54 | text=True, 55 | check=True, 56 | ) 57 | # 读取音频文件并进行 base64 编码 58 | with open(audio_path, "rb") as f: 59 | audio_data = f.read() 60 | audio_base64 = base64.b64encode(audio_data).decode("utf-8") 61 | 62 | # 创建请求对象 63 | req = models.SentenceRecognitionRequest() 64 | params = { 65 | "ProjectId": 0, 66 | "SubServiceType": 2, 67 | "SourceType": 1, 68 | "VoiceFormat": "wav", 69 | "UsrAudioKey": "session-123", 70 | "Data": audio_base64, 71 | "DataLen": len(audio_data), 72 | **kwargs, 73 | } 74 | req.from_json_string(json.dumps(params)) 75 | 76 | # 发送请求并获取响应 77 | resp = self.client.SentenceRecognition(req) 78 | return resp.Result 79 | -------------------------------------------------------------------------------- /audio_evals/models/bytedance/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/models/bytedance/__init__.py -------------------------------------------------------------------------------- /audio_evals/models/bytedance/doubao.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Any 3 | from openai import OpenAI 4 | 5 | from audio_evals.models.model import APIModel 6 | from audio_evals.base import PromptStruct 7 | 8 | 9 | API_KEY = os.getenv("DOUBAO_API_KEY") 10 | URL = os.getenv("DOUBAO_URL") 11 | 12 | 13 | class 
Doubao(APIModel): 14 | def __init__( 15 | self, model_name: str, api_key: str = None, sample_params: Dict[str, Any] = None 16 | ): 17 | super().__init__(True, sample_params) 18 | self.model_name = model_name 19 | assert "DOUBAO_API_KEY" in os.environ or api_key is not None, ValueError( 20 | "not found DOUBAO_API_KEY in your ENV" 21 | ) 22 | if api_key is None: 23 | api_key = os.environ.get("DOUBAO_API_KEY") 24 | self.client = OpenAI( 25 | # 此为默认路径,您可根据业务所在地域进行配置 26 | base_url="https://ark.cn-beijing.volces.com/api/v3", 27 | # 从环境变量中获取您的 API Key 28 | api_key=api_key, 29 | ) 30 | 31 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 32 | 33 | messages = [] 34 | for item in prompt: 35 | messages.append( 36 | {"role": item["role"], "content": item["contents"][0]["value"]} 37 | ) 38 | 39 | response = self.client.chat.completions.create( 40 | model=self.model_name, messages=messages, **kwargs 41 | ) 42 | 43 | return response.choices[0].message.content 44 | 45 | 46 | class DoubaoAudioPipeline(APIModel): 47 | def __init__(self, asr: str, llm: str): 48 | super().__init__(True) 49 | from audio_evals.registry import registry 50 | 51 | self.asr = registry.get_model(asr) 52 | self.llm = registry.get_model(llm) 53 | 54 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 55 | text = self.asr.inference(prompt) 56 | res = self.llm.inference(text) 57 | return res 58 | -------------------------------------------------------------------------------- /audio_evals/models/mini_cpm.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | import select 5 | from typing import Dict 6 | from audio_evals.base import PromptStruct 7 | from audio_evals.models.model import OfflineModel 8 | from audio_evals.isolate import isolated 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @isolated("audio_evals/lib/minicpm/main.py") 15 | class MiniCPMo(OfflineModel): 16 | def __init__( 17 | self, 18 | path: str, 19 | speech: bool = False, 20 | sample_params: Dict = None, 21 | *args, 22 | **kwargs, 23 | ): 24 | if path == "openbmb/MiniCPM-o-2_6" and not os.path.exists(path): 25 | path = self._download_model(path) 26 | 27 | self.command_args = { 28 | "path": path, 29 | } 30 | if speech: 31 | self.command_args["speech"] = "" 32 | super().__init__(is_chat=True, sample_params=sample_params) 33 | 34 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 35 | import uuid 36 | 37 | uid = str(uuid.uuid4()) 38 | prefix = f"{uid}->" 39 | 40 | input_o = {"prompt": prompt} 41 | input_o.update(kwargs) 42 | 43 | while True: 44 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 45 | if wlist: 46 | self.process.stdin.write(f"{prefix}{json.dumps(input_o)}\n") 47 | self.process.stdin.flush() 48 | print("already write in") 49 | break 50 | 51 | while True: 52 | reads, _, _ = select.select( 53 | [self.process.stdout, self.process.stderr], [], [], 1.0 54 | ) 55 | for read in reads: 56 | if read is self.process.stdout: 57 | result = self.process.stdout.readline() 58 | if result: 59 | if result.startswith(prefix): 60 | self.process.stdin.write("{}close\n".format(prefix)) 61 | self.process.stdin.flush() 62 | res = json.loads(result[len(prefix) :]) 63 | if len(res) == 1: 64 | return res["text"] 65 | return json.dumps(res, ensure_ascii=False) 66 | elif result.startswith("Error:"): 67 | raise RuntimeError( 68 | "mimicpm-o 2.6 failed: {}".format(result) 69 | ) 70 | else: 71 | logger.info(result) 72 | if read is 
self.process.stderr: 73 | error_output = self.process.stderr.readline() 74 | if error_output: 75 | print(f"stderr: {error_output.strip()}") 76 | -------------------------------------------------------------------------------- /audio_evals/models/mini_omni.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | from typing import Dict 4 | 5 | import requests 6 | 7 | from audio_evals.base import PromptStruct 8 | from audio_evals.models.model import APIModel 9 | from audio_evals.utils import get_base64_from_file 10 | import wave 11 | import numpy as np 12 | 13 | 14 | OUT_CHANNELS = 1 15 | 16 | 17 | def save_audio_response(response, output_file): 18 | """保存服务器返回的音频流为文件""" 19 | if response.status_code == 200: 20 | text = "" 21 | with wave.open(output_file, 'wb') as wf: 22 | wf.setnchannels(OUT_CHANNELS) 23 | wf.setsampwidth(2) # 2 bytes per sample (16-bit audio) 24 | wf.setframerate(24000) 25 | 26 | for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): 27 | if chunk: 28 | data = json.loads(chunk.decode()) 29 | text = data["text"] 30 | audio_data = np.frombuffer(bytes.fromhex(data["audio"]), dtype=np.int16) 31 | audio_data = audio_data.reshape(-1, OUT_CHANNELS) 32 | wf.writeframes(audio_data.tobytes()) 33 | return output_file, text 34 | else: 35 | raise Exception(f"下载失败,状态码: {response.status_code}") 36 | 37 | 38 | class MiniOmni(APIModel): 39 | def __init__( 40 | self, url: str, sample_params: Dict[str, any] = None 41 | ): 42 | super().__init__(True, sample_params) 43 | self.url = url 44 | 45 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 46 | 47 | audio_file = "" 48 | for content in prompt: 49 | if content["role"] == "user": 50 | for line in content["contents"]: 51 | if line["type"] == "audio": 52 | audio_file = line["value"] 53 | break 54 | 55 | audio_base64 = get_base64_from_file(audio_file) 56 | headers = { 57 | 'Content-Type': 'application/json', 58 | 'Connection': 'keep-alive', 59 | 'Upgrade-Insecure-Requests': '1' 60 | } 61 | data = { 62 | 'audio': audio_base64 63 | } 64 | response = requests.post(self.url, headers=headers, data=json.dumps(data), stream=True) 65 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 66 | audio, text = save_audio_response(response, f.name) 67 | return json.dumps({"audio": audio, "text": text}, ensure_ascii=False) 68 | 69 | -------------------------------------------------------------------------------- /audio_evals/models/openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Any 3 | from openai import OpenAI, AzureOpenAI 4 | from azure.core.credentials import AzureKeyCredential 5 | 6 | 7 | from audio_evals.models.model import APIModel 8 | from audio_evals.base import PromptStruct 9 | 10 | 11 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 12 | OPENAI_URL = os.getenv("OPENAI_URL", "https://api.openai.com") 13 | 14 | 15 | class GPT(APIModel): 16 | def __init__( 17 | self, 18 | model_name: str, 19 | is_azure: bool = False, 20 | sample_params: Dict[str, Any] = None, 21 | ): 22 | super().__init__(True, sample_params) 23 | self.model_name = model_name 24 | assert "OPENAI_API_KEY" in os.environ, ValueError( 25 | "not found OPENAI_API_KEY in your ENV" 26 | ) 27 | if is_azure: 28 | key = os.environ["AZURE_OPENAI_KEY"] 29 | endpoint = os.environ["AZURE_OPENAI_BASE"] 30 | print(f"Using Azure OpenAI with key {key} and endpoint {endpoint}") 31 | self.client = AzureOpenAI( 
32 | api_version="2025-03-01-preview", api_key=key, azure_endpoint=endpoint 33 | ) 34 | else: 35 | self.client = OpenAI() 36 | 37 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 38 | 39 | messages = [] 40 | for item in prompt: 41 | messages.append( 42 | {"role": item["role"], "content": item["contents"][0]["value"]} 43 | ) 44 | 45 | response = self.client.chat.completions.create( 46 | model=self.model_name, messages=messages, **kwargs 47 | ) 48 | 49 | return response.choices[0].message.content 50 | 51 | 52 | class AudioTranscribe(GPT): 53 | """ 54 | This model is used to transcribe audio to text. 55 | """ 56 | 57 | def _inference(self, prompt, **kwargs): 58 | audio_file = open(prompt["audio"], "rb") 59 | transcript = self.client.audio.transcriptions.create( 60 | model=self.model_name, file=audio_file 61 | ) 62 | return transcript["text"] 63 | -------------------------------------------------------------------------------- /audio_evals/models/step_audio.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import tempfile 4 | from typing import Dict 5 | 6 | import requests 7 | 8 | from audio_evals.base import PromptStruct 9 | from audio_evals.models.model import APIModel 10 | from audio_evals.utils import get_base64_from_file 11 | 12 | 13 | def save_audio_response(response, output_file): 14 | """保存服务器返回的音频流为文件""" 15 | if response.status_code == 200: 16 | audio_data = b"" 17 | text_response = "" 18 | 19 | for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): 20 | if chunk: 21 | try: 22 | event = json.loads(chunk.decode("utf-8")) 23 | text_response += event.get("text", "") 24 | if output_file: 25 | audio_data += base64.b64decode(event.get("audio", "")) 26 | except json.JSONDecodeError: 27 | continue 28 | if output_file: 29 | with open(output_file, "wb") as f: 30 | f.write(audio_data) 31 | return output_file, text_response 32 | else: 33 | raise Exception(f"下载失败,状态码: {response.status_code}") 34 | 35 | 36 | class StepAudioChat(APIModel): 37 | def __init__(self, url: str, s2t: bool, sample_params: Dict[str, any] = None): 38 | super().__init__(True, sample_params) 39 | self.url = url 40 | self.s2t = s2t 41 | 42 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 43 | text, audio_file = "", "" 44 | for content in prompt: 45 | if content["role"] == "user": 46 | for line in content["contents"]: 47 | if line["type"] == "audio": 48 | audio_file = line["value"] 49 | if line["type"] == "text": 50 | text = line["value"] 51 | endfix = audio_file.split(".")[-1] 52 | audio_base64 = get_base64_from_file(audio_file) 53 | headers = {"Content-Type": "application/json"} 54 | data = { 55 | "text": text, 56 | "audio": audio_base64, 57 | "audio_format": endfix, 58 | } 59 | response = requests.post( 60 | self.url, headers=headers, data=json.dumps(data), stream=True 61 | ) 62 | if self.s2t: 63 | _, text = save_audio_response(response, None) 64 | return text 65 | 66 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 67 | audio, text = save_audio_response(response, f.name) 68 | return json.dumps({"audio": audio, "text": text}, ensure_ascii=False) 69 | -------------------------------------------------------------------------------- /audio_evals/models/utmos.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict 4 | from audio_evals.base import PromptStruct 5 | from audio_evals.models.model import 
OfflineModel 6 | from audio_evals.isolate import isolated 7 | import select 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | @isolated( 14 | "audio_evals/lib/utmos/main.py", 15 | pre_command="pip install pip==24.0 &&export SACREBLEU_ROOT=envs/utmos/.sacrebleu", 16 | ) 17 | class UTMOS(OfflineModel): 18 | def __init__( 19 | self, 20 | path: str = "sarulab-speech/UTMOS-demo", 21 | sample_params: Dict = None, 22 | *args, 23 | **kwargs, 24 | ): 25 | if path == "sarulab-speech/UTMOS-demo" and not os.path.exists(path): 26 | path = self._download_model(path, repo_type="space") 27 | 28 | self.command_args = { 29 | "path": path, 30 | } 31 | super().__init__(is_chat=False, sample_params=sample_params) 32 | 33 | def _inference(self, prompt: PromptStruct, **kwargs) -> float: 34 | import uuid 35 | 36 | uid = str(uuid.uuid4()) 37 | prefix = f"{uid}->" 38 | 39 | while True: 40 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 41 | if wlist: 42 | self.process.stdin.write(f"{prefix}{prompt}\n") 43 | self.process.stdin.flush() 44 | logger.info("already write in") 45 | break 46 | 47 | while True: 48 | reads, _, _ = select.select( 49 | [self.process.stdout, self.process.stderr], [], [], 1.0 50 | ) 51 | for read in reads: 52 | if read is self.process.stdout: 53 | result = self.process.stdout.readline() 54 | if result: 55 | if result.startswith(prefix): 56 | self.process.stdin.write("{}close\n".format(prefix)) 57 | self.process.stdin.flush() 58 | return float(result[len(prefix) :]) 59 | elif result.startswith("Error:"): 60 | raise RuntimeError("utmos failed: {}".format(result)) 61 | else: 62 | logger.info(result) 63 | if read is self.process.stderr: 64 | error_output = self.process.stderr.readline() 65 | if error_output: 66 | print(f"stderr: {error_output.strip()}") 67 | -------------------------------------------------------------------------------- /audio_evals/models/whisper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import select 5 | from typing import Dict 6 | 7 | from audio_evals.base import PromptStruct 8 | from audio_evals.models.model import OfflineModel 9 | from audio_evals.constants import DEFAULT_MODEL_PATH 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | from audio_evals.isolate import isolated 15 | 16 | 17 | @isolated("audio_evals/lib/whisper/main.py") 18 | class WhisperModel(OfflineModel): 19 | def __init__( 20 | self, 21 | path: str = "openai/whisper-large-v3", 22 | sample_params: Dict[str, any] = None, 23 | ): 24 | if path.startswith("openai/") and not os.path.exists(path): 25 | path = self._download_model(path) 26 | 27 | self.command_args = { 28 | "path": path, 29 | } 30 | super().__init__(is_chat=True, sample_params=sample_params) 31 | 32 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 33 | import uuid 34 | 35 | uid = str(uuid.uuid4()) 36 | prefix = f"{uid}->" 37 | 38 | while True: 39 | _, wlist, _ = select.select([], [self.process.stdin], [], 60) 40 | if wlist: 41 | prompt["kwargs"] = kwargs 42 | self.process.stdin.write(f"{prefix}{json.dumps(prompt)}\n") 43 | self.process.stdin.flush() 44 | print("already write in") 45 | break 46 | while True: 47 | rlist, _, _ = select.select( 48 | [self.process.stdout, self.process.stderr], [], [], 1 49 | ) 50 | 51 | try: 52 | for stream in rlist: 53 | if stream == self.process.stdout: 54 | result = self.process.stdout.readline().strip() 55 | if not result: 56 | continue 57 | if result.startswith(prefix): 
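# Note on the handshake used here: the worker subprocess echoes the request's uuid prefix on the line that carries its answer; once that tagged line arrives, the caller writes "<prefix>close" so the worker can finish the request, and everything after the prefix is returned as the payload.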
58 | self.process.stdin.write("{}close\n".format(prefix)) 59 | self.process.stdin.flush() 60 | return result[len(prefix) :] 61 | elif result.startswith("Error:"): 62 | raise RuntimeError("WhisperModel failed: {}".format(result)) 63 | else: 64 | logger.info(result) 65 | elif stream == self.process.stderr: 66 | err = self.process.stderr.readline().strip() 67 | if err: 68 | logger.error(f"Process stderr: {err}") 69 | except BlockingIOError as e: 70 | logger.error(f"BlockingIOError occurred: {str(e)}") 71 | -------------------------------------------------------------------------------- /audio_evals/process/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/process/__init__.py -------------------------------------------------------------------------------- /audio_evals/process/base.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | import logging 4 | from abc import ABC, abstractmethod 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Process(ABC): 10 | @abstractmethod 11 | def __call__(self, answer: str) -> str: 12 | raise NotImplementedError() 13 | 14 | 15 | class ContentExtract(Process): 16 | 17 | def __call__(self, answer: str) -> str: 18 | try: 19 | answer = answer.strip() 20 | if answer.startswith("```json"): 21 | answer = answer[7:-3].strip() 22 | elif answer.startswith("```"): 23 | answer = answer[3:-3].strip() 24 | return json.loads(answer)["content"] 25 | except Exception as e: 26 | try: 27 | return ast.literal_eval(answer)["content"] 28 | except Exception as e: 29 | logger.warning(f"process {answer} fail: {str(e)}") 30 | return answer 31 | 32 | 33 | class JsonExtract(Process): 34 | """ 35 | Extract a specific key from a json string. 36 | the key is specified by the `extract_key` parameter. 37 | if the key is not found, return the `default_value` if specified, 38 | otherwise raise a KeyError. 39 | """ 40 | 41 | def __init__(self, extract_key: str = None, default_value: str = None): 42 | """ 43 | Initialize the JsonExtract process. 44 | Args: 45 | extract_key: required, the key to extract from the json string. 46 | default_value: optional, the default value to return if the key is not found. 47 | 48 | Returns: JsonExtract object. 49 | 50 | """ 51 | self.extract_key = extract_key 52 | self.default_value = default_value 53 | 54 | def __call__(self, answer: str) -> any: 55 | """ 56 | Extract the value of the `extract_key` from the json string `answer`. 57 | Args: 58 | answer: required, the json string to extract the value from. 59 | 60 | Returns: any, the value of the `extract_key` in the json string `answer`. 
61 | 62 | """ 63 | if isinstance(answer, str): 64 | try: 65 | d = json.loads(answer.strip()) 66 | except Exception as e: 67 | logger.warning(f"load json `{answer}` fail: {str(e)}") 68 | return answer 69 | elif isinstance(answer, dict): 70 | d = answer 71 | else: 72 | raise ValueError(f"Unsupported answer type: {type(answer)}") 73 | if self.extract_key is None: 74 | return d 75 | 76 | if self.default_value is not None: 77 | return d.get(self.extract_key, self.default_value) 78 | return d[self.extract_key] 79 | -------------------------------------------------------------------------------- /audio_evals/process/eliminate.py: -------------------------------------------------------------------------------- 1 | from audio_evals.process.base import Process 2 | 3 | 4 | class Eliminate(Process): 5 | 6 | def __init__(self, target: str): 7 | self.target = target 8 | 9 | def __call__(self, answer: str) -> str: 10 | return answer.replace(self.target, "") 11 | 12 | 13 | class ForceStop(Process): 14 | def __init__(self, target: str): 15 | self.target = target 16 | 17 | def __call__(self, answer: str) -> str: 18 | return answer.split(self.target)[0] 19 | 20 | 21 | class ExtractResponse(Process): 22 | def __init__(self, target: str): 23 | self.target = target 24 | 25 | def __call__(self, answer: str) -> str: 26 | return answer.split(self.target)[1] 27 | -------------------------------------------------------------------------------- /audio_evals/process/normalization.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from audio_evals.lib.text_normalization.basic import BasicTextNormalizer 4 | from audio_evals.lib.text_normalization.cn_tn import TextNorm 5 | from audio_evals.lib.text_normalization.en import EnglishTextNormalizer 6 | from audio_evals.process.base import Process 7 | 8 | 9 | class TextNormalization(Process): 10 | 11 | def __init__(self, lang: str = ""): 12 | if lang == "en": 13 | self.normalizer = EnglishTextNormalizer() 14 | elif lang == "zh": 15 | self.normalizer = TextNorm( 16 | to_banjiao=False, 17 | to_upper=False, 18 | to_lower=False, 19 | remove_fillers=False, 20 | remove_erhua=False, 21 | check_chars=False, 22 | remove_space=False, 23 | cc_mode="", 24 | ) 25 | else: 26 | self.normalizer = BasicTextNormalizer() 27 | 28 | def __call__(self, answer: str) -> str: 29 | return self.normalizer(answer) 30 | -------------------------------------------------------------------------------- /audio_evals/process/qwen.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from audio_evals.process.base import Process 4 | 5 | 6 | class QwenAudioASRExtract(Process): 7 | PUNCS = "!,.?;:" 8 | 9 | def __init__(self, lang: str): 10 | self.lang = lang 11 | 12 | def __call__(self, answer: str) -> str: 13 | gt = re.sub(r"<\|.*?\|>", " ", answer) 14 | gt = re.sub(rf"\s+", r" ", gt) # 将文本中的连续空格替换为单个空格 15 | gt = re.sub(f" ?([{self.PUNCS}])", r"\1", gt) 16 | gt = gt.lstrip(" ") 17 | if self.lang == "zh": 18 | gt = re.sub(rf"\s+", r"", gt) 19 | return gt 20 | -------------------------------------------------------------------------------- /audio_evals/process/speech.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from audio_evals.process.base import Process 3 | 4 | 5 | class Speech2text(Process): 6 | 7 | def __init__(self, model_name: str = "whisper", prompt_name: str = "whisper-asr"): 8 | from audio_evals.registry import registry 9 | 10 | 
self.model = registry.get_model(model_name) 11 | self.prompt = registry.get_prompt(prompt_name) 12 | 13 | def __call__(self, answer: str) -> str: 14 | assert os.path.exists(answer), "must be a valid audio file, but got {}".format( 15 | answer 16 | ) 17 | real_prompt = self.prompt.load(WavPath=answer) 18 | return self.model.inference(real_prompt) 19 | -------------------------------------------------------------------------------- /audio_evals/prompt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/audio_evals/prompt/__init__.py -------------------------------------------------------------------------------- /audio_evals/prompt/base.py: -------------------------------------------------------------------------------- 1 | from functools import singledispatch 2 | from typing import Any, Dict, List 3 | 4 | from jinja2 import StrictUndefined, Template 5 | from jinja2.exceptions import UndefinedError 6 | 7 | from audio_evals.base import PromptStruct 8 | 9 | 10 | @singledispatch 11 | def _load(t: Any, **kwargs: Any) -> Any: 12 | return t 13 | 14 | 15 | @_load.register 16 | def _(t: str, **kwargs: Any) -> str: 17 | template = Template(t, undefined=StrictUndefined) 18 | try: 19 | return template.render(**kwargs) 20 | except UndefinedError as e: 21 | raise ValueError("{}: template is {}\ndoc is {}".format(e, t, kwargs)) 22 | 23 | 24 | @_load.register 25 | def _(t: list, **kwargs: Any) -> List[Any]: 26 | return [_load(item, **kwargs) for item in t] 27 | 28 | 29 | @_load.register 30 | def _(t: dict, **kwargs: Any) -> Dict[Any, Any]: 31 | return {k: _load(v, **kwargs) for k, v in t.items()} 32 | 33 | 34 | class Prompt: 35 | def __init__(self, template: PromptStruct): 36 | self.prompt = template 37 | 38 | def load(self, **kwargs): 39 | return _load(self.prompt, **kwargs) 40 | -------------------------------------------------------------------------------- /audio_evals/recorder.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing 4 | 5 | 6 | class Recorder: 7 | def __init__(self, f_name: str): 8 | self.name = f_name 9 | directory = os.path.dirname(f_name) 10 | os.makedirs(directory, exist_ok=True) 11 | if os.path.exists(f_name): 12 | print(f"File {f_name} already exists, overwriting it.") 13 | os.remove(f_name) 14 | 15 | def add(self, data: typing.Dict[str, typing.Any]): 16 | with open(self.name, "a+") as f: 17 | f.write(json.dumps(data, ensure_ascii=False) + "\n") 18 | -------------------------------------------------------------------------------- /docs/Procedures for Restarting an Incomplete Evaluation.md: -------------------------------------------------------------------------------- 1 | # Resume Evaluation 2 | 3 | In practice, evaluation processes may occasionally fail due to various technical issues, such as model request network interruptions, system failures, or unexpected errors. To ensure the continuity and integrity of the evaluation, follow these steps to effectively restart and complete the process. 4 | 5 | **Example Scenario:** 6 | 7 | 8 | If the evaluation process for the `GPT-4o-Audio` model with the dataset `my_dataset` fails due to a model request network interruption, the last checkpoint is saved in the `res/gpt4o_audio/last_res.jsonl` file. 
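Before resuming, you can optionally inspect the checkpoint to see how far the previous run got. The recorder appends one JSON object per line, so counting lines gives a rough progress figure; the sketch below only assumes the checkpoint path from the scenario above and makes no assumption about the fields inside each record.

```python
import json

# Peek at the checkpoint before resuming. The recorder appends one JSON
# object per line; the exact fields depend on the eval task, so we only
# count records and show the keys of the most recent one.
ckpt = "res/gpt4o_audio/last_res.jsonl"  # checkpoint path from the scenario above

with open(ckpt, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f"{len(records)} records already saved")
if records:
    print("last record keys:", list(records[-1].keys()))
```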
9 | 10 | To restart the evaluation, run: 11 | 12 | ```shell 13 | python audio_evals/main.py --dataset my_dataset --model gpt4o_audio -r 14 | ``` 15 | which is equivalent to: 16 | 17 | ```shell 18 | python audio_evals/main.py --dataset my_dataset --model gpt4o_audio --resume res/gpt4o_audio/last_res.jsonl 19 | ``` 20 | 21 | This command will resume the evaluation from the last saved checkpoint, ensuring that the process continues seamlessly. 22 | -------------------------------------------------------------------------------- /docs/how add a dataset.md: -------------------------------------------------------------------------------- 1 | # how to add a dataset in AudioEvals? 2 | 3 | 4 | In practice, you may need to evaluate your custom audio dataset. 5 | 6 | Before that, you need to know how to launch a custom eval task: [how launch a custom eval task.md](how%20launch%20a%20custom%20eval%20task.md) 7 | 8 | Here are the steps: 9 | 10 | 11 | ## JSONL file: 12 | 13 | ### register the dataset 14 | 1. Make sure your dataset file is in `jsonl` format and has a `WavPath` column that specifies the audio file path. 15 | 2. Create a new `*.yaml` file in `registry/dataset/` 16 | with content like: 17 | ```yaml 18 | $name: # name after cli: --dataset $name 19 | class: audio_evals.dataset.dataset.JsonlFile 20 | args: 21 | default_task: alei_asr # specify a default eval task; valid tasks are listed in `registry/eval_task` 22 | f_name: # the file name 23 | ref_col: # the reference answer column name in file 24 | ``` 25 | After registering the dataset, you can evaluate it with `--dataset $name`, enjoy 😘 26 | 27 | Example: 28 | 29 | 1. Create a file `my_dataset.jsonl` with `WavPath` and `Transcript` columns; the content looks like this: 30 | ```json lines 31 | {"WavPath": "path/to/audio1.wav", "Transcript": "this is the first audio"} 32 | {"WavPath": "path/to/audio2.wav", "Transcript": "this is the second audio"} 33 | ``` 34 | 35 | 2. Create a file `my_dataset.yaml` in `registry/dataset/` with content: 36 | ```yaml 37 | my_dataset: 38 | class: audio_evals.dataset.dataset.JsonlFile 39 | args: 40 | default_task: asr 41 | f_name: my_dataset.jsonl # the file name 42 | ref_col: Transcript # the reference answer column name in file 43 | ``` 44 | 45 | 3. Evaluate your dataset with `--dataset my_dataset`: 46 | 47 | ```sh 48 | export PYTHONPATH=$PWD:$PYTHONPATH 49 | export OPENAI_API_KEY=$your-key 50 | python audio_evals/main.py --dataset my_dataset --model gpt4o_audio 51 | ``` 52 | -------------------------------------------------------------------------------- /docs/how eval your model.md: -------------------------------------------------------------------------------- 1 | 2 | In the QuickStart, it's easy to launch an eval task, but what if your model is not yet integrated into AudioEvals? How can you evaluate it? 3 | 4 | Here are the steps: 5 | 6 | # model api 7 | > Your model is deployed as a service. 8 | 9 | ## 1. add model inference code 10 | 11 | Add a Python file under `audio_evals/models/` with content like: 12 | 13 | ```python 14 | from audio_evals.models.model import APIModel 15 | from audio_evals.base import PromptStruct 16 | 17 | class MyAudioModel(APIModel): 18 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 19 | # TODO 20 | # my request code 21 | ``` 22 | 23 | Reference: `audio_evals/models/google.py` 24 | 25 | 26 | ## 2. 
register the model 27 | 28 | in the `registry/model/` path, new a yaml file, with content: 29 | 30 | ```yaml 31 | $name: # the name after command: --model $name 32 | class: audio_evals.models.$new_file.$MyAudioModel 33 | args: 34 | ... # your specific args. If not need args, just fill args: {} 35 | 36 | 37 | ``` 38 | 39 | 40 | # offline model 41 | 42 | 43 | ## 1. add model inference code (optional) 44 | > if your model is supported with huggingface AutoModelForCausalLM, you can skip this step. 45 | 46 | add a py-file in `audio_evals/models/` path, content like: 47 | ```PYTHON 48 | from audio_evals.models.offline_model import OfflineModel 49 | from audio_evals.base import PromptStruct 50 | from typing import Dict 51 | 52 | class MyAudioModel(OfflineModel): 53 | def __init__(self, is_chat: bool, sample_params: Dict[str, any] = None): 54 | super().__init__(is_chat, sample_params) 55 | # TODO 56 | # init code 57 | 58 | def _inference(self, prompt: PromptStruct, **kwargs) -> str: 59 | # TODO 60 | # inference code 61 | ``` 62 | 63 | ## 2. register the model 64 | 65 | the `registry/model/` path, new a yaml file, with content: 66 | 67 | ```yaml 68 | $name: # the name after command: --model $name 69 | class: audio_evals.models.offline_model.OfflineModel 70 | args: 71 | path: # the name of model from huggingface model or the download model path download from huggingface 72 | 73 | 74 | ``` 75 | 76 | 77 | after registry model, you can eval your model with `--model $name`, enjoy 😘 78 | -------------------------------------------------------------------------------- /registry/agg/air-bench.yaml: -------------------------------------------------------------------------------- 1 | airbench-chat: 2 | class: audio_evals.agg.air_chat.AirChat 3 | args: {} 4 | -------------------------------------------------------------------------------- /registry/agg/naive.yaml: -------------------------------------------------------------------------------- 1 | dump: 2 | class: audio_evals.agg.base.Dump 3 | args: {} 4 | 5 | acc: 6 | class: audio_evals.agg.base.ACC 7 | args: {} 8 | 9 | mean: 10 | class: audio_evals.agg.base.NaiveMean 11 | args: {} 12 | 13 | wer-zh: 14 | class: audio_evals.agg.base.PracticeWER 15 | args: 16 | lang: zh 17 | 18 | wer-yue: 19 | class: audio_evals.agg.base.PracticeWER 20 | args: 21 | lang: yue 22 | 23 | wer-jp: 24 | class: audio_evals.agg.base.PracticeWER 25 | args: 26 | lang: jp 27 | 28 | wer-kr: 29 | class: audio_evals.agg.base.PracticeWER 30 | args: 31 | lang: kr 32 | 33 | wer: 34 | class: audio_evals.agg.base.PracticeWER 35 | args: {} 36 | 37 | cer: 38 | class: audio_evals.agg.base.CER 39 | args: {} 40 | 41 | bleu: 42 | class: audio_evals.agg.base.BLEU 43 | args: {} 44 | 45 | bleu-zh: 46 | class: audio_evals.agg.base.BLEU 47 | args: 48 | lang: zh 49 | 50 | bleu-char: 51 | class: audio_evals.agg.base.BLEU 52 | args: 53 | lang: char 54 | 55 | bleu-jp: 56 | class: audio_evals.agg.base.BLEU 57 | args: 58 | lang: jp 59 | 60 | coco: 61 | class: audio_evals.agg.base.Coco 62 | args: {} 63 | 64 | naive-acc: 65 | class: audio_evals.agg.base.NaiveMean 66 | args: 67 | need_score_col: 68 | - acc 69 | 70 | geval: 71 | class: audio_evals.agg.base.NaiveMean 72 | args: 73 | need_score_col: 74 | - geval 75 | -------------------------------------------------------------------------------- /registry/dataset/AudioCaps.yaml: -------------------------------------------------------------------------------- 1 | audiocaps: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: caption 5 | 
name: TwinkStart/AudioCaps 6 | split: test 7 | ref_col: caption 8 | -------------------------------------------------------------------------------- /registry/dataset/COVID-recognizer.yaml: -------------------------------------------------------------------------------- 1 | COVID-recognizer: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: COVID-recognizer 5 | name: TwinkStart/COVID-recognizer 6 | split: test 7 | ref_col: status 8 | -------------------------------------------------------------------------------- /registry/dataset/CatDog.yaml: -------------------------------------------------------------------------------- 1 | catdog: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: catdog_identify 5 | name: TwinkStart/CatDog 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/ClothoAQA.yaml: -------------------------------------------------------------------------------- 1 | clotho-aqa-sample: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | name: TwinkStart/ClothoAQA 5 | split: sample 6 | default_task: aqa 7 | ref_col: answer 8 | 9 | clotho-aqa: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | name: TwinkStart/ClothoAQA 13 | split: test 14 | default_task: aqa 15 | ref_col: answer 16 | -------------------------------------------------------------------------------- /registry/dataset/CommonVoice.yaml: -------------------------------------------------------------------------------- 1 | cv-15-en: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr 5 | name: TwinkStart/CommonVoice_15 6 | split: en 7 | ref_col: sentence 8 | cv-15-zh: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: asr-zh 12 | name: TwinkStart/CommonVoice_15 13 | split: zh 14 | ref_col: sentence 15 | cv-15-fr: 16 | class: audio_evals.dataset.huggingface.Huggingface 17 | args: 18 | default_task: asr 19 | name: TwinkStart/CommonVoice_15 20 | split: fr 21 | ref_col: sentence 22 | cv-15-yue: 23 | class: audio_evals.dataset.huggingface.Huggingface 24 | args: 25 | default_task: asr-yue 26 | name: TwinkStart/CommonVoice_15 27 | split: yue 28 | ref_col: sentence 29 | -------------------------------------------------------------------------------- /registry/dataset/DESEDpublic_eval.yaml: -------------------------------------------------------------------------------- 1 | desed: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: desed_sound_analysis 5 | name: TwinkStart/DESEDpublic_eval 6 | split: test 7 | ref_col: event_label 8 | -------------------------------------------------------------------------------- /registry/dataset/GTZAN.yaml: -------------------------------------------------------------------------------- 1 | GTZAN: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: music_genre 5 | name: TwinkStart/GTZAN 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/GigaSpeech.yaml: -------------------------------------------------------------------------------- 1 | gigaspeech: 2 | class: audio_evals.dataset.giga.GigaSpeechDataset 3 | args: 4 | default_task: asr 5 | name: speechcolab/gigaspeech 6 | subset: test 7 | ref_col: text 8 | -------------------------------------------------------------------------------- 
/registry/dataset/KeSpeech.yaml: -------------------------------------------------------------------------------- 1 | KeSpeech: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr-zh 5 | name: TwinkStart/kespeech 6 | split: test 7 | ref_col: Text 8 | -------------------------------------------------------------------------------- /registry/dataset/MELD.yaml: -------------------------------------------------------------------------------- 1 | meld-emo: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | name: TwinkStart/MELD 5 | split: test 6 | default_task: emotion_analysis 7 | ref_col: Emotion 8 | 9 | meld-sentiment: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | name: TwinkStart/MELD 13 | split: test 14 | default_task: sentiment_analysis 15 | ref_col: Sentiment 16 | -------------------------------------------------------------------------------- /registry/dataset/MMAU.yaml: -------------------------------------------------------------------------------- 1 | mmau-test-mini: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: single_choice_with_answer 5 | name: TwinkStart/MMAU 6 | split: test_mini 7 | ref_col: answer 8 | -------------------------------------------------------------------------------- /registry/dataset/Nsynth.yaml: -------------------------------------------------------------------------------- 1 | nsynth: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: instrument_recognition 5 | name: TwinkStart/Nsynth 6 | split: test 7 | ref_col: instrument_family_str 8 | -------------------------------------------------------------------------------- /registry/dataset/RAVDESS.yaml: -------------------------------------------------------------------------------- 1 | ravdess-emo: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: ravdess_emotion_analysis 5 | name: TwinkStart/RAVDESS 6 | split: ravdess_emo 7 | ref_col: emotion 8 | ravdess-gender: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: gender_analysis 12 | name: TwinkStart/RAVDESS 13 | split: ravdess_gender 14 | ref_col: Gender 15 | -------------------------------------------------------------------------------- /registry/dataset/RespiratorySound.yaml: -------------------------------------------------------------------------------- 1 | respiratory-crackles: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: Respiratory-crackles-recognizer 5 | name: TwinkStart/RespiratorySound 6 | split: respiratory_crackles 7 | ref_col: Crackles 8 | respiratory-wheezes: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: Respiratory-wheezes-recognizer 12 | name: TwinkStart/RespiratorySound 13 | split: respiratory_wheezes 14 | ref_col: Wheezes 15 | -------------------------------------------------------------------------------- /registry/dataset/TESS.yaml: -------------------------------------------------------------------------------- 1 | TESS: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: emotion_analysis 5 | name: TwinkStart/TESS 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/VSC.yaml: -------------------------------------------------------------------------------- 1 | vocalsound: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | 
name: TwinkStart/vocalsound 5 | split: test 6 | default_task: vocalsound_analysis 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/VoxCeleb.yaml: -------------------------------------------------------------------------------- 1 | voxceleb1: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: gender_analysis 5 | name: TwinkStart/VoxCeleb 6 | split: voxceleb1 7 | ref_col: Gender 8 | voxceleb2: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: gender_analysis 12 | name: TwinkStart/VoxCeleb 13 | split: voxceleb2 14 | ref_col: Gender 15 | -------------------------------------------------------------------------------- /registry/dataset/WavCaps.yaml: -------------------------------------------------------------------------------- 1 | wavcaps-audioset: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: caption 5 | name: TwinkStart/wavcaps-audioset 6 | split: test 7 | ref_col: caption 8 | wavcaps-freesound: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: caption 12 | name: TwinkStart/wavcaps-freesound 13 | split: test 14 | ref_col: caption 15 | wavcaps-soundbible: 16 | class: audio_evals.dataset.huggingface.Huggingface 17 | args: 18 | default_task: caption 19 | name: TwinkStart/wavcaps-soundbible 20 | split: test 21 | ref_col: caption 22 | -------------------------------------------------------------------------------- /registry/dataset/WenetSpeech.yaml: -------------------------------------------------------------------------------- 1 | WenetSpeech-test-meeting: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr-zh 5 | name: TwinkStart/WenetSpeech 6 | split: test_meeting 7 | ref_col: text 8 | 9 | WenetSpeech-test-net: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | default_task: asr-zh 13 | name: TwinkStart/WenetSpeech 14 | split: test_net 15 | ref_col: text 16 | -------------------------------------------------------------------------------- /registry/dataset/air.yaml: -------------------------------------------------------------------------------- 1 | air-foundation: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: single_choice 5 | name: TwinkStart/air-foundation 6 | split: test 7 | ref_col: answer 8 | air-chat: 9 | class: audio_evals.dataset.huggingface.Huggingface 10 | args: 11 | default_task: air_chat 12 | name: TwinkStart/air_chat 13 | split: test 14 | ref_col: answer_gt 15 | -------------------------------------------------------------------------------- /registry/dataset/aishell.yaml: -------------------------------------------------------------------------------- 1 | aishell-1: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr-zh 5 | name: TwinkStart/AISHELL-1 6 | split: test 7 | ref_col: text 8 | -------------------------------------------------------------------------------- /registry/dataset/alpaca_eval.yaml: -------------------------------------------------------------------------------- 1 | speech-chatbot-alpaca-eval: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: glm-alpaca-eval 5 | name: TwinkStart/speech-chatbot-alpaca-eval 6 | split: test 7 | ref_col: output -------------------------------------------------------------------------------- /registry/dataset/audio-MNIST.yaml: 
-------------------------------------------------------------------------------- 1 | audio-MNIST: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: digit 5 | name: TwinkStart/audio-MNIST 6 | split: test 7 | ref_col: Digit 8 | -------------------------------------------------------------------------------- /registry/dataset/chord_recoganition.yaml: -------------------------------------------------------------------------------- 1 | chord-recognition: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: chord_recognition 5 | name: TwinkStart/chord_recoganition 6 | split: test 7 | ref_col: Label 8 | -------------------------------------------------------------------------------- /registry/dataset/fleurs.yaml: -------------------------------------------------------------------------------- 1 | fleurs-zh: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | subset: cmn_hans_cn 5 | default_task: asr-zh 6 | name: google/fleurs 7 | ref_col: raw_transcription 8 | split: test 9 | fleurs-hi_in: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | subset: hi_in 13 | default_task: asr 14 | name: google/fleurs 15 | ref_col: raw_transcription 16 | split: test 17 | fleurs-de_de: 18 | class: audio_evals.dataset.huggingface.Huggingface 19 | args: 20 | subset: de_de 21 | default_task: asr 22 | name: google/fleurs 23 | ref_col: raw_transcription 24 | split: test 25 | fleurs-ja_jp: 26 | class: audio_evals.dataset.huggingface.Huggingface 27 | args: 28 | subset: ja_jp 29 | default_task: asr-jp 30 | name: google/fleurs 31 | ref_col: raw_transcription 32 | split: test 33 | fleurs-ru_ru: 34 | class: audio_evals.dataset.huggingface.Huggingface 35 | args: 36 | subset: ru_ru 37 | default_task: asr 38 | name: google/fleurs 39 | ref_col: raw_transcription 40 | split: test 41 | fleurs-en_us: 42 | class: audio_evals.dataset.huggingface.Huggingface 43 | args: 44 | subset: en_us 45 | default_task: asr 46 | name: google/fleurs 47 | ref_col: raw_transcription 48 | split: test 49 | fleurs-fa_ir: 50 | class: audio_evals.dataset.huggingface.Huggingface 51 | args: 52 | subset: fa_ir 53 | default_task: asr 54 | name: google/fleurs 55 | ref_col: raw_transcription 56 | split: test 57 | fleurs-ar_eg: 58 | class: audio_evals.dataset.huggingface.Huggingface 59 | args: 60 | subset: ar_eg 61 | default_task: asr 62 | name: google/fleurs 63 | ref_col: raw_transcription 64 | split: test 65 | fleurs-fr_fr: 66 | class: audio_evals.dataset.huggingface.Huggingface 67 | args: 68 | subset: fr_fr 69 | default_task: asr 70 | name: google/fleurs 71 | ref_col: raw_transcription 72 | split: test 73 | fleurs-ko_kr: 74 | class: audio_evals.dataset.huggingface.Huggingface 75 | args: 76 | subset: ko_kr 77 | default_task: asr 78 | name: google/fleurs 79 | ref_col: raw_transcription 80 | split: test 81 | -------------------------------------------------------------------------------- /registry/dataset/heart_beat.yaml: -------------------------------------------------------------------------------- 1 | heartbeat_sound: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: Heartbeat-recognizer 5 | name: TwinkStart/heart_beat 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/librispeech.yaml: -------------------------------------------------------------------------------- 1 | librispeech-test-clean: 2 | class: 
audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr 5 | name: TwinkStart/librispeech 6 | split: test_clean 7 | ref_col: text 8 | 9 | librispeech-dev-clean: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | default_task: asr 13 | name: TwinkStart/librispeech 14 | split: dev_clean 15 | ref_col: text 16 | 17 | librispeech-test-other: 18 | class: audio_evals.dataset.huggingface.Huggingface 19 | args: 20 | default_task: asr 21 | name: TwinkStart/librispeech 22 | split: test_other 23 | ref_col: text 24 | 25 | librispeech-dev-other: 26 | class: audio_evals.dataset.huggingface.Huggingface 27 | args: 28 | default_task: asr 29 | name: TwinkStart/librispeech 30 | split: dev_other 31 | ref_col: text 32 | -------------------------------------------------------------------------------- /registry/dataset/llama_questions.yaml: -------------------------------------------------------------------------------- 1 | llama-questions: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: s2s-aqa 5 | name: TwinkStart/llama-questions 6 | split: test 7 | ref_col: Answer 8 | 9 | llama-questions-s2t: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | default_task: loose-aqa 13 | name: TwinkStart/llama-questions 14 | split: test 15 | ref_col: Answer -------------------------------------------------------------------------------- /registry/dataset/multilingual_librispeech.yaml: -------------------------------------------------------------------------------- 1 | mls_dutch: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr 5 | name: TwinkStart/facebook_multilingual_librispeech 6 | split: mls_dutch 7 | ref_col: Text 8 | 9 | mls_french: 10 | class: audio_evals.dataset.huggingface.Huggingface 11 | args: 12 | default_task: asr 13 | name: TwinkStart/facebook_multilingual_librispeech 14 | split: mls_french 15 | ref_col: Text 16 | 17 | mls_german: 18 | class: audio_evals.dataset.huggingface.Huggingface 19 | args: 20 | default_task: asr 21 | name: TwinkStart/facebook_multilingual_librispeech 22 | split: mls_german 23 | ref_col: Text 24 | 25 | mls_italian: 26 | class: audio_evals.dataset.huggingface.Huggingface 27 | args: 28 | default_task: asr 29 | name: TwinkStart/facebook_multilingual_librispeech 30 | split: mls_italian 31 | ref_col: Text 32 | 33 | mls_polish: 34 | class: audio_evals.dataset.huggingface.Huggingface 35 | args: 36 | default_task: asr 37 | name: TwinkStart/facebook_multilingual_librispeech 38 | split: mls_polish 39 | ref_col: Text 40 | 41 | mls_portuguese: 42 | class: audio_evals.dataset.huggingface.Huggingface 43 | args: 44 | default_task: asr 45 | name: TwinkStart/facebook_multilingual_librispeech 46 | split: mls_portuguese 47 | ref_col: Text 48 | 49 | mls_spanish: 50 | class: audio_evals.dataset.huggingface.Huggingface 51 | args: 52 | default_task: asr 53 | name: TwinkStart/facebook_multilingual_librispeech 54 | split: mls_spanish 55 | ref_col: Text 56 | -------------------------------------------------------------------------------- /registry/dataset/peoples_speech.yaml: -------------------------------------------------------------------------------- 1 | peoples-speech: 2 | class: audio_evals.dataset.huggingface.Huggingface 3 | args: 4 | default_task: asr 5 | name: TwinkStart/peoples_speech 6 | split: test 7 | ref_col: label 8 | -------------------------------------------------------------------------------- /registry/dataset/sample.yaml: 
--------------------------------------------------------------------------------
1 | sample:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     default_task: asr-zh
5 |     name: TwinkStart/sample
6 |     split: sample
7 |     ref_col: Text
8 |
9 | sample-en:
10 |   class: audio_evals.dataset.huggingface.Huggingface
11 |   args:
12 |     default_task: asr
13 |     name: TwinkStart/sample
14 |     split: sample-en
15 |     ref_col: Text
16 |
--------------------------------------------------------------------------------
/registry/dataset/tedlium.yaml:
--------------------------------------------------------------------------------
1 | tedlium-release1:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     default_task: asr
5 |     name: TwinkStart/tedlium
6 |     subset: release1
7 |     ref_col: text
8 | tedlium-release2:
9 |   class: audio_evals.dataset.huggingface.Huggingface
10 |   args:
11 |     default_task: asr
12 |     name: TwinkStart/tedlium
13 |     subset: release2
14 |     ref_col: text
15 | tedlium-release3:
16 |   class: audio_evals.dataset.huggingface.Huggingface
17 |   args:
18 |     default_task: asr
19 |     name: TwinkStart/tedlium
20 |     subset: release3
21 |     ref_col: text
22 |
--------------------------------------------------------------------------------
/registry/dataset/triviaqa.yaml:
--------------------------------------------------------------------------------
1 | speech-triviaqa:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     default_task: s2s-aqa
5 |     name: TwinkStart/speech-triavia-qa
6 |     ref_col: answer
7 |     split: test
8 |
9 | speech-triviaqa-s2t:
10 |   class: audio_evals.dataset.huggingface.Huggingface
11 |   args:
12 |     default_task: loose-aqa
13 |     name: TwinkStart/speech-triavia-qa
14 |     ref_col: answer
15 |     split: test
16 |
--------------------------------------------------------------------------------
/registry/dataset/voxpopuli.yaml:
--------------------------------------------------------------------------------
1 | voxpopuli-en:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     subset: en
5 |     default_task: asr
6 |     name: facebook/voxpopuli
7 |     ref_col: normalized_text
8 |     split: test
9 |
--------------------------------------------------------------------------------
/registry/dataset/webQ.yaml:
--------------------------------------------------------------------------------
1 | speech-web-questions:
2 |   class: audio_evals.dataset.huggingface.Huggingface
3 |   args:
4 |     default_task: s2s-aqa
5 |     name: TwinkStart/speech-web-questions
6 |     ref_col: answers
7 |     split: test
8 |
9 | speech-web-questions-s2t:
10 |   class: audio_evals.dataset.huggingface.Huggingface
11 |   args:
12 |     default_task: loose-aqa
13 |     name: TwinkStart/speech-web-questions
14 |     ref_col: answers
15 |     split: test
--------------------------------------------------------------------------------
/registry/eval_task/acoustics.yaml:
--------------------------------------------------------------------------------
1 | speech-quality:
2 |   class: audio_evals.base.EvalTaskCfg
3 |   args:
4 |     dataset: clotho-aqa
5 |     prompt: direct-aqa
6 |     model: qwen-audio-chat
7 |     post_process: ['extract_audio']
8 |     evaluator: speech_quality
9 |     agg: mean
10 |
--------------------------------------------------------------------------------
/registry/eval_task/air.yaml:
--------------------------------------------------------------------------------
1 | single_choice:
2 |   class: audio_evals.base.EvalTaskCfg
3 |   args:
4 |     dataset: air-foundation
5 |     prompt: single_choice
6 |     model: qwen-audio-chat
7 |     evaluator: prefix-match
8 | agg: acc 9 | 10 | air_chat: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: air-chat 14 | prompt: qa 15 | model: qwen-audio-chat 16 | evaluator: air-bench-geval 17 | agg: airbench-chat 18 | 19 | single_choice_with_answer: 20 | class: audio_evals.base.EvalTaskCfg 21 | args: 22 | dataset: mmau 23 | prompt: single_choice_with_answer 24 | model: qwen-audio-chat 25 | evaluator: choice-strings-match 26 | agg: acc 27 | -------------------------------------------------------------------------------- /registry/eval_task/alpaca.yaml: -------------------------------------------------------------------------------- 1 | alpaca-eval: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: alpaca-eval-audio 5 | prompt: direct-aqa 6 | model: qwen-audio-chat 7 | evaluator: alpaca_eval_gpt4 8 | agg: naive-acc 9 | 10 | glm-alpaca-eval: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: speech-chatbot-alpaca-eval 14 | prompt: direct-aqa 15 | model: qwen-audio-chat 16 | post_process: ['extract_audio', 'speech2text'] 17 | evaluator: chatbot_eval 18 | agg: geval 19 | 20 | glm-alpaca-eval-s2t: 21 | class: audio_evals.base.EvalTaskCfg 22 | args: 23 | dataset: speech-chatbot-alpaca-eval 24 | prompt: direct-aqa 25 | model: qwen-audio-chat 26 | post_process: [] 27 | evaluator: chatbot_eval 28 | agg: geval 29 | -------------------------------------------------------------------------------- /registry/eval_task/aqa.yaml: -------------------------------------------------------------------------------- 1 | aqa: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: clotho-aqa 5 | prompt: aqa 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | 10 | loose-aqa: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: clotho-aqa 14 | prompt: direct-aqa 15 | model: qwen-audio-chat 16 | post_process: ['extract_text'] 17 | evaluator: qa-exist-match 18 | agg: acc 19 | 20 | s2s-aqa: 21 | class: audio_evals.base.EvalTaskCfg 22 | args: 23 | dataset: clotho-aqa 24 | prompt: direct-aqa 25 | model: qwen-audio-chat 26 | post_process: ['extract_audio', 'speech2text'] 27 | evaluator: qa-exist-match 28 | agg: acc 29 | 30 | choice-aqa: 31 | class: audio_evals.base.EvalTaskCfg 32 | args: 33 | dataset: clotho-aqa 34 | prompt: direct-aqa 35 | model: qwen-audio-chat 36 | post_process: ['extract_audio', 'speech2text', 'first_option'] 37 | evaluator: em 38 | agg: acc 39 | 40 | s2t-choice-aqa: 41 | class: audio_evals.base.EvalTaskCfg 42 | args: 43 | dataset: clotho-aqa 44 | prompt: direct-aqa 45 | model: qwen-audio-chat 46 | post_process: ['first_option'] 47 | evaluator: em 48 | agg: acc 49 | -------------------------------------------------------------------------------- /registry/eval_task/asr.yaml: -------------------------------------------------------------------------------- 1 | asr: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: KeSpeech 5 | prompt: asr 6 | model: qwen-audio 7 | post_process: ['json_content'] 8 | evaluator: wer 9 | agg: wer 10 | 11 | asr-3o: 12 | class: audio_evals.base.EvalTaskCfg 13 | args: 14 | dataset: KeSpeech 15 | prompt: asr-en 16 | model: qwen-audio 17 | evaluator: wer 18 | agg: wer 19 | 20 | asr-zh-3o: 21 | class: audio_evals.base.EvalTaskCfg 22 | args: 23 | dataset: KeSpeech 24 | prompt: asr-zh 25 | model: qwen-audio 26 | evaluator: cer 27 | agg: wer-zh 28 | 29 | asr-zh: 30 | class: audio_evals.base.EvalTaskCfg 31 | args: 32 | dataset: KeSpeech 33 | prompt: asr 34 | model: qwen-audio 35 | post_process: 
['json_content'] 36 | evaluator: cer 37 | agg: wer-zh 38 | 39 | asr-jp: 40 | class: audio_evals.base.EvalTaskCfg 41 | args: 42 | dataset: fleurs-ja_jp 43 | prompt: asr 44 | model: qwen-audio 45 | post_process: ['json_content'] 46 | evaluator: wer-jp 47 | agg: wer-jp 48 | 49 | asr-yue: 50 | class: audio_evals.base.EvalTaskCfg 51 | args: 52 | dataset: KeSpeech 53 | prompt: asr 54 | model: qwen-audio 55 | post_process: ['json_content'] 56 | evaluator: wer-yue 57 | agg: wer-yue 58 | 59 | asr-kr: 60 | class: audio_evals.base.EvalTaskCfg 61 | args: 62 | dataset: KeSpeech 63 | prompt: asr 64 | model: qwen-audio 65 | post_process: ['json_content'] 66 | evaluator: wer-kr 67 | agg: wer-kr 68 | -------------------------------------------------------------------------------- /registry/eval_task/caption.yaml: -------------------------------------------------------------------------------- 1 | caption: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: air-foundation 5 | prompt: caption 6 | model: qwen-audio-chat 7 | evaluator: dump 8 | agg: coco 9 | -------------------------------------------------------------------------------- /registry/eval_task/digit.yaml: -------------------------------------------------------------------------------- 1 | digit: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: clotho-aqa 5 | prompt: digit 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | -------------------------------------------------------------------------------- /registry/eval_task/emo.yaml: -------------------------------------------------------------------------------- 1 | emotion_analysis: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: meld 5 | prompt: emo_analysis 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | 10 | ravdess_emotion_analysis: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: meld 14 | prompt: ravdess_emo_analysis 15 | model: qwen-audio-chat 16 | evaluator: prefix-match 17 | agg: acc 18 | 19 | sentiment_analysis: 20 | class: audio_evals.base.EvalTaskCfg 21 | args: 22 | dataset: meld 23 | prompt: sentiment_analysis 24 | model: qwen-audio-chat 25 | evaluator: prefix-match 26 | agg: acc 27 | 28 | desed_sound_analysis: 29 | class: audio_evals.base.EvalTaskCfg 30 | args: 31 | dataset: desed 32 | prompt: sound_analysis 33 | model: qwen-audio-chat 34 | evaluator: prefix-match 35 | agg: acc 36 | -------------------------------------------------------------------------------- /registry/eval_task/gender.yaml: -------------------------------------------------------------------------------- 1 | gender_analysis: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: meld 5 | prompt: gender_analysis 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | -------------------------------------------------------------------------------- /registry/eval_task/inference.yaml: -------------------------------------------------------------------------------- 1 | inference: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: covost2-local 5 | prompt: covost2-en-zh 6 | model: qwen-audio 7 | evaluator: dump 8 | agg: dump 9 | -------------------------------------------------------------------------------- /registry/eval_task/medicine.yaml: -------------------------------------------------------------------------------- 1 | COVID-recognizer: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: clotho-aqa 5 | prompt: COVID-recognizer 6 | model: qwen-audio-chat 7 | 
evaluator: prefix-match 8 | agg: acc 9 | 10 | Heartbeat-recognizer: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: clotho-aqa 14 | prompt: Heartbeat-recognizer 15 | model: qwen-audio-chat 16 | evaluator: prefix-match 17 | agg: acc 18 | 19 | Respiratory-crackles-recognizer: 20 | class: audio_evals.base.EvalTaskCfg 21 | args: 22 | dataset: clotho-aqa 23 | prompt: Respiratory-crackles-recognizer 24 | model: qwen-audio-chat 25 | evaluator: prefix-match 26 | agg: acc 27 | 28 | Respiratory-wheezes-recognizer: 29 | class: audio_evals.base.EvalTaskCfg 30 | args: 31 | dataset: clotho-aqa 32 | prompt: Respiratory-wheezes-recognizer 33 | model: qwen-audio-chat 34 | evaluator: prefix-match 35 | agg: acc 36 | -------------------------------------------------------------------------------- /registry/eval_task/music.yaml: -------------------------------------------------------------------------------- 1 | instrument_recognition: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: nsyth 5 | prompt: instrument_recognition 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | 10 | chord_recognition: 11 | class: audio_evals.base.EvalTaskCfg 12 | args: 13 | dataset: nsyth 14 | prompt: chord_recognition 15 | model: qwen-audio-chat 16 | evaluator: prefix-match 17 | agg: acc 18 | 19 | music_genre: 20 | class: audio_evals.base.EvalTaskCfg 21 | args: 22 | dataset: nsyth 23 | prompt: music_genre 24 | model: qwen-audio-chat 25 | evaluator: prefix-match 26 | agg: acc 27 | 28 | 29 | music_tempo: 30 | class: audio_evals.base.EvalTaskCfg 31 | args: 32 | dataset: nsyth 33 | prompt: music_tempo 34 | model: qwen-audio-chat 35 | evaluator: em 36 | agg: acc 37 | -------------------------------------------------------------------------------- /registry/eval_task/sound_identify.yaml: -------------------------------------------------------------------------------- 1 | catdog_identify: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: vocalsound 5 | prompt: catdog_identify 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | -------------------------------------------------------------------------------- /registry/eval_task/vsc.yaml: -------------------------------------------------------------------------------- 1 | vocalsound_analysis: 2 | class: audio_evals.base.EvalTaskCfg 3 | args: 4 | dataset: vocalsound 5 | prompt: vocal_sound_analysis 6 | model: qwen-audio-chat 7 | evaluator: prefix-match 8 | agg: acc 9 | -------------------------------------------------------------------------------- /registry/evaluator/air-bench.yaml: -------------------------------------------------------------------------------- 1 | air-bench-geval: 2 | class: audio_evals.evaluator.air_chat.AIRChatEvaluator 3 | args: 4 | model_name: gpt4o-mini 5 | -------------------------------------------------------------------------------- /registry/evaluator/alpaca.yaml: -------------------------------------------------------------------------------- 1 | alpaca_eval_gpt4: 2 | class: audio_evals.evaluator.alpaca_eval.AlpacaEvaluator 3 | args: 4 | model_name: gpt4o-mini 5 | 6 | chatbot_eval: 7 | class: audio_evals.evaluator.alpaca_eval.ChatbotEvaluator 8 | args: 9 | model_name: gpt4o-mini 10 | 11 | ref_qa_geval: 12 | class: audio_evals.evaluator.ref_qa_geval.RefQAGEval 13 | args: 14 | model_name: mb-gpt4o-mini 15 | -------------------------------------------------------------------------------- /registry/evaluator/choice-with-ans.yaml: 
-------------------------------------------------------------------------------- 1 | choice-strings-match: 2 | class: audio_evals.evaluator.string_match.ChoiceStringMatch 3 | args: {} 4 | -------------------------------------------------------------------------------- /registry/evaluator/common.yaml: -------------------------------------------------------------------------------- 1 | dump: 2 | class: audio_evals.evaluator.base.Dump 3 | args: {} 4 | 5 | em: 6 | class: audio_evals.evaluator.base.EM 7 | args: {} 8 | 9 | exist-match: 10 | class: audio_evals.evaluator.base.ExistMatch 11 | args: {} 12 | 13 | prefix-match: 14 | class: audio_evals.evaluator.base.PrefixMatch 15 | args: {} 16 | 17 | wer: 18 | class: audio_evals.evaluator.wer.WER 19 | args: 20 | ignore_case: true 21 | 22 | wer-jp: 23 | class: audio_evals.evaluator.wer.WER 24 | args: 25 | ignore_case: true 26 | lang: jp 27 | 28 | wer-kr: 29 | class: audio_evals.evaluator.wer.WER 30 | args: 31 | ignore_case: true 32 | lang: kr 33 | 34 | wer-yue: 35 | class: audio_evals.evaluator.wer.WER 36 | args: 37 | ignore_case: true 38 | lang: yue 39 | 40 | wer-sensitive-case: 41 | class: audio_evals.evaluator.wer.WER 42 | args: {} 43 | 44 | cer: 45 | class: audio_evals.evaluator.wer.CER 46 | args: {} 47 | 48 | bleu: 49 | class: audio_evals.evaluator.bleu.BLEU 50 | args: {} 51 | 52 | bleu-zh: 53 | class: audio_evals.evaluator.bleu.BLEU 54 | args: 55 | lang: zh 56 | 57 | bleu-jp: 58 | class: audio_evals.evaluator.bleu.BLEU 59 | args: 60 | lang: jp 61 | 62 | bleu-char: 63 | class: audio_evals.evaluator.bleu.BLEU 64 | args: 65 | lang: char 66 | 67 | coco: 68 | class: audio_evals.evaluator.coco.Coco 69 | args: {} 70 | -------------------------------------------------------------------------------- /registry/evaluator/dnsmos.yaml: -------------------------------------------------------------------------------- 1 | dnsmos: 2 | class: audio_evals.evaluator.dnsmos.DNSMOS 3 | args: 4 | model_name: dnsmos 5 | -------------------------------------------------------------------------------- /registry/evaluator/llama-speech.yaml: -------------------------------------------------------------------------------- 1 | llama_speech_eval_gpt4: 2 | class: audio_evals.evaluator.alpaca_eval.AlpacaEvaluator 3 | args: 4 | model_name: gpt4o-mini 5 | -------------------------------------------------------------------------------- /registry/evaluator/qa.yaml: -------------------------------------------------------------------------------- 1 | qa-exist-match: 2 | class: audio_evals.evaluator.qa_exact_match.QAExistMatchEvaluator 3 | args: {} 4 | 5 | -------------------------------------------------------------------------------- /registry/evaluator/simo.yaml: -------------------------------------------------------------------------------- 1 | simo: 2 | class: audio_evals.evaluator.simo.Simo 3 | args: 4 | model_name: wavlm_large 5 | -------------------------------------------------------------------------------- /registry/evaluator/speech_qulity.yaml: -------------------------------------------------------------------------------- 1 | speech_quality: 2 | class: audio_evals.evaluator.ensemble.Ensemble 3 | args: 4 | components: 5 | - dnsmos 6 | - utmos 7 | 8 | 9 | vc_quality: 10 | class: audio_evals.evaluator.ensemble.Ensemble 11 | args: 12 | components: 13 | - dnsmos 14 | - utmos 15 | - simo 16 | -------------------------------------------------------------------------------- /registry/evaluator/utmos.yaml: 
-------------------------------------------------------------------------------- 1 | utmos: 2 | class: audio_evals.evaluator.utmos.UTMOS 3 | args: {} 4 | -------------------------------------------------------------------------------- /registry/model/ali.yaml: -------------------------------------------------------------------------------- 1 | qwen-audio: 2 | class: audio_evals.models.ali.AliApi 3 | args: 4 | model_name: 'qwen-audio-chat' 5 | -------------------------------------------------------------------------------- /registry/model/dnsmos.yaml: -------------------------------------------------------------------------------- 1 | dnsmos: 2 | class: audio_evals.models.dnsmos.DNSMOS 3 | args: 4 | model_path: 5 | p_model_path: 6 | p808_model_path: 7 | env_path: envs/dnsmos 8 | requirements_path: audio_evals/lib/DNSMOS/requirements.txt 9 | -------------------------------------------------------------------------------- /registry/model/gemini.yaml: -------------------------------------------------------------------------------- 1 | gemini-pro: 2 | class: audio_evals.models.google.Gemini 3 | args: 4 | model_name: 'gemini-pro' 5 | 6 | gemini-1.5-pro: 7 | class: audio_evals.models.google.Gemini 8 | args: 9 | model_name: 'gemini-1.5-pro' 10 | 11 | gemini-1.5-flash: 12 | class: audio_evals.models.google.Gemini 13 | args: 14 | model_name: 'gemini-1.5-flash' 15 | 16 | gemini-2.0-flash-exp: 17 | class: audio_evals.models.google.Gemini 18 | args: 19 | model_name: 'gemini-2.0-flash-exp' 20 | 21 | gemini-2.5-flash: 22 | class: audio_evals.models.google.Gemini 23 | args: 24 | model_name: 'gemini-2.5-flash-preview-04-17' 25 | 26 | gemini-2.5-pro: 27 | class: audio_evals.models.google.Gemini 28 | args: 29 | model_name: 'gemini-2.5-pro-preview-05-06' 30 | -------------------------------------------------------------------------------- /registry/model/minicpmo.yaml: -------------------------------------------------------------------------------- 1 | MiniCPMo2_6-audio: 2 | class: audio_evals.models.mini_cpm.MiniCPMo 3 | args: 4 | path: openbmb/MiniCPM-o-2_6 5 | speech: false 6 | env_path: envs/minicpmo2_6 7 | requirements_path: audio_evals/lib/minicpm/requirements.txt 8 | sample_params: 9 | sampling: false 10 | num_beams: 5 11 | max_new_tokens: 128 12 | 13 | MiniCPMo2_6-speech: 14 | class: audio_evals.models.mini_cpm.MiniCPMo 15 | args: 16 | path: openbmb/MiniCPM-o-2_6 17 | speech: true 18 | env_path: envs/minicpmo2_6 19 | requirements_path: audio_evals/lib/minicpm/requirements.txt 20 | sample_params: 21 | sampling: false 22 | num_beams: 5 23 | max_new_tokens: 128 24 | -------------------------------------------------------------------------------- /registry/model/moonshot.yaml: -------------------------------------------------------------------------------- 1 | kimiaudio: 2 | class: audio_evals.models.moonshot.KimiAudioModel 3 | args: 4 | model_path: moonshotai/Kimi-Audio-7B-Instruct 5 | env_path: envs/kimiaudio 6 | requirements_path: audio_evals/lib/Kimi-Audio/requirements.txt 7 | 8 | kimiaudio-speech: 9 | class: audio_evals.models.moonshot.KimiAudioModel 10 | args: 11 | model_path: /data/shiqundong/model/Kimi-Audio-7B-Instruct 12 | speech: True 13 | env_path: envs/kimiaudio 14 | requirements_path: audio_evals/lib/Kimi-Audio/requirements.txt 15 | -------------------------------------------------------------------------------- /registry/model/offline.yaml: -------------------------------------------------------------------------------- 1 | qwen2-audio-offline: 2 | class: 
audio_evals.models.qwen.Qwen2audioPretrain 3 | args: 4 | path: Qwen/Qwen2-Audio-7B 5 | sample_params: 6 | do_sample: false 7 | max_new_tokens: 256 8 | min_new_tokens: 1 9 | length_penalty: 1.0 10 | num_return_sequences: 1 11 | repetition_penalty: 1.0 12 | use_cache: True 13 | 14 | qwen2-audio-chat: 15 | class: audio_evals.models.qwen.Qwen2audio 16 | args: 17 | path: Qwen/Qwen2-Audio-7B-Instruct 18 | sample_params: 19 | do_sample: false 20 | max_new_tokens: 256 21 | min_new_tokens: 1 22 | length_penalty: 1.0 23 | num_return_sequences: 1 24 | repetition_penalty: 1.0 25 | use_cache: True 26 | 27 | qwen-audio-chat-offline: 28 | class: audio_evals.models.offline_model.OfflineModel 29 | args: 30 | is_chat: True 31 | path: Qwen/Qwen-Audio-Chat 32 | sample_params: 33 | do_sample: false 34 | max_new_tokens: 256 35 | min_new_tokens: 1 36 | length_penalty: 1.0 37 | num_return_sequences: 1 38 | repetition_penalty: 1.0 39 | use_cache: True 40 | 41 | qwen-audio-pretrain-offline: 42 | class: audio_evals.models.offline_model.OfflinePretrainModel 43 | args: 44 | is_chat: False 45 | path: Qwen/Qwen-Audio 46 | padding_side: left 47 | sample_params: 48 | do_sample: false 49 | max_new_tokens: 256 50 | min_new_tokens: 1 51 | length_penalty: 1.0 52 | num_return_sequences: 1 53 | repetition_penalty: 1.0 54 | use_cache: True 55 | -------------------------------------------------------------------------------- /registry/model/ola.yaml: -------------------------------------------------------------------------------- 1 | ola-7b: 2 | class: audio_evals.models.ola.OlaModel 3 | args: 4 | path: THUdyh/Ola-7b 5 | env_path: envs/ola 6 | requirements_path: audio_evals/lib/Ola/requirements.txt 7 | -------------------------------------------------------------------------------- /registry/model/paraformer.yaml: -------------------------------------------------------------------------------- 1 | paraformer-zh: 2 | class: audio_evals.models.asr.paraformer.Paraformer 3 | args: 4 | path: funasr/paraformer-zh 5 | env_path: envs/paraformer 6 | requirements_path: audio_evals/lib/paraformer/requirements.txt 7 | 8 | paraformer-large: 9 | class: audio_evals.models.asr.paraformer.Paraformer 10 | args: 11 | path: funasr/Paraformer-large 12 | env_path: envs/paraformer 13 | requirements_path: audio_evals/lib/paraformer/requirements.txt 14 | 15 | paraformer-zh-streaming: 16 | class: audio_evals.models.asr.paraformer.Paraformer 17 | args: 18 | path: funasr/paraformer-zh-streaming 19 | env_path: envs/paraformer 20 | requirements_path: audio_evals/lib/paraformer/requirements.txt 21 | 22 | paraformer-en: 23 | class: audio_evals.models.asr.paraformer.Paraformer 24 | args: 25 | path: funasr/paraformer-en 26 | env_path: envs/paraformer 27 | requirements_path: audio_evals/lib/paraformer/requirements.txt 28 | 29 | conformer-en: 30 | class: audio_evals.models.asr.paraformer.Paraformer 31 | args: 32 | path: funasr/conformer-en 33 | env_path: envs/paraformer 34 | requirements_path: audio_evals/lib/paraformer/requirements.txt 35 | -------------------------------------------------------------------------------- /registry/model/qwen2.5.yaml: -------------------------------------------------------------------------------- 1 | qwen2.5-omni-audio: 2 | class: audio_evals.models.qwen2_5.QwenOmni 3 | args: 4 | path: Qwen/Qwen2.5-Omni-7B 5 | env_path: envs/qwen2.5-omni 6 | requirements_path: audio_evals/lib/qwen2-5omni/requirements.txt 7 | 8 | qwen2.5-omni-speech: 9 | class: audio_evals.models.qwen2_5.QwenOmni 10 | args: 11 | path: Qwen/Qwen2.5-Omni-7B 12 | 
speech: true 13 | env_path: envs/qwen2.5-omni 14 | requirements_path: audio_evals/lib/qwen2-5omni/requirements.txt 15 | -------------------------------------------------------------------------------- /registry/model/speechLLM.yaml: -------------------------------------------------------------------------------- 1 | glm-4-voice: 2 | class: audio_evals.models.glm4voice.GLM4Voice 3 | args: 4 | url: http://127.0.0.1:10000/generate_stream 5 | sr: 22500 6 | volume: 32767 7 | 8 | speech-gpt: 9 | class: audio_evals.models.glm4voice.GLM4Voice 10 | args: 11 | url: http://127.0.0.1:31505/chat 12 | sr: 16000 13 | volume: 32767 14 | 15 | moshi: 16 | class: audio_evals.models.glm4voice.GLM4Voice 17 | args: 18 | url: http://127.0.0.1:31610/chat 19 | sr: 22500 20 | cut_greeting: True 21 | 22 | 23 | llama-omni: 24 | class: audio_evals.models.llama_omni.LlamaOmni 25 | args: 26 | url: http://127.0.0.1:32039/worker_generate_stream 27 | 28 | mini-omni: 29 | class: audio_evals.models.mini_omni.MiniOmni 30 | args: 31 | url: http://127.0.0.1:32213/chat 32 | -------------------------------------------------------------------------------- /registry/model/step.yaml: -------------------------------------------------------------------------------- 1 | step-audio: 2 | class: audio_evals.models.step_audio.StepAudioChat 3 | args: 4 | url: http://127.0.0.1:5000/inference 5 | s2t: true 6 | 7 | step-speech: 8 | class: audio_evals.models.step_audio.StepAudioChat 9 | args: 10 | url: http://127.0.0.1:5000/inference 11 | s2t: false 12 | -------------------------------------------------------------------------------- /registry/model/tencent.yaml: -------------------------------------------------------------------------------- 1 | tencent-zh: 2 | class: audio_evals.models.asr.tencent.TencentASRModel 3 | args: 4 | secret_id: 5 | secret_key: 6 | sample_params: 7 | EngSerViceType: 16k_zh 8 | 9 | tencent-en: 10 | class: audio_evals.models.asr.tencent.TencentASRModel 11 | args: 12 | secret_id: 13 | secret_key: 14 | sample_params: 15 | EngSerViceType: 16k_en 16 | -------------------------------------------------------------------------------- /registry/model/ultravox.yaml: -------------------------------------------------------------------------------- 1 | ultravox: 2 | class: audio_evals.models.UltraVOX.UltraVOX 3 | args: 4 | path: fixie-ai/ultravox-v0_4 5 | sample_params: 6 | max_new_tokens: 256 -------------------------------------------------------------------------------- /registry/model/utmos.yaml: -------------------------------------------------------------------------------- 1 | utmos-en: 2 | class: audio_evals.models.utmos.UTMOS 3 | args: 4 | path: sarulab-speech/UTMOS-demo 5 | env_path: envs/utmos 6 | requirements_path: audio_evals/lib/utmos/requirements.txt 7 | -------------------------------------------------------------------------------- /registry/model/wavlm.yaml: -------------------------------------------------------------------------------- 1 | wavlm_large: 2 | class: audio_evals.models.wavlm.WavLM 3 | args: 4 | path: https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view 5 | env_path: envs/simo 6 | requirements_path: audio_evals/lib/simo/requirements.txt 7 | -------------------------------------------------------------------------------- /registry/process/base.yaml: -------------------------------------------------------------------------------- 1 | json_content: 2 | class: audio_evals.process.base.ContentExtract 3 | args: {} 4 | 5 | 6 | qwen_pretrain_asr_tractor_zh: 7 | class: 
audio_evals.process.qwen.QwenAudioASRExtract 8 | args: 9 | lang: zh 10 | 11 | qwen_pretrain_asr_tractor: 12 | class: audio_evals.process.qwen.QwenAudioASRExtract 13 | args: 14 | lang: en 15 | 16 | zh_text_normalizer: 17 | class: audio_evals.process.normalization.TextNormalization 18 | args: 19 | lang: zh 20 | 21 | en_text_normalizer: 22 | class: audio_evals.process.normalization.TextNormalization 23 | args: 24 | lang: en 25 | 26 | text_normalizer: 27 | class: audio_evals.process.normalization.TextNormalization 28 | args: {} 29 | 30 | trivia_qa_normalizer: 31 | class: audio_evals.process.triviaqa.TriviaQaNormalizer 32 | args: {} -------------------------------------------------------------------------------- /registry/process/choice.yaml: -------------------------------------------------------------------------------- 1 | first_option: 2 | class: audio_evals.process.firstoption.FirstOption 3 | args: 4 | options: ABCD 5 | -------------------------------------------------------------------------------- /registry/process/speech_model_output.yaml: -------------------------------------------------------------------------------- 1 | extract_audio: 2 | class: audio_evals.process.base.JsonExtract 3 | args: 4 | extract_key: audio 5 | 6 | extract_text: 7 | class: audio_evals.process.base.JsonExtract 8 | args: 9 | extract_key: text 10 | 11 | speech2text: 12 | class: audio_evals.process.speech.Speech2text 13 | args: 14 | model_name: whisper 15 | 16 | speech2text-zh: 17 | class: audio_evals.process.speech.Speech2text 18 | args: 19 | model_name: paraformer-zh 20 | prompt_name: simple-asr 21 | -------------------------------------------------------------------------------- /registry/prompt/aqa.yaml: -------------------------------------------------------------------------------- 1 | aqa: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "answer the question without explain\n# Question: {{QuestionText}}" 11 | 12 | direct-aqa: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: audio 19 | value: "{{WavPath}}" 20 | 21 | text-aqa: 22 | class: audio_evals.prompt.base.Prompt 23 | args: 24 | template: 25 | - role: user 26 | contents: 27 | - type: text 28 | value: "{{question_text}}" 29 | -------------------------------------------------------------------------------- /registry/prompt/asr.yaml: -------------------------------------------------------------------------------- 1 | asr: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "listen the audio, output the audio content with format {\"content\": \"\"}" 11 | 12 | simple-asr: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | audio: "{{WavPath}}" -------------------------------------------------------------------------------- /registry/prompt/caption.yaml: -------------------------------------------------------------------------------- 1 | caption: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "describe the audio:" 11 | -------------------------------------------------------------------------------- /registry/prompt/chatbot.yaml: -------------------------------------------------------------------------------- 1 | 
chatbot-eval: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: '[Instruction] 5 | Please act as an impartial judge and evaluate the quality of the response provided 6 | by an AI assistant to the user question displayed below. Your evaluation should 7 | consider factors such as the helpfulness, relevance, accuracy, depth, creativity, 8 | and level of detail of the response. Begin your evaluation by providing a short 9 | explanation. Be as objective as possible. After providing your explanation, you 10 | must rate the response on a scale of 1 to 10 by strictly following this format: 11 | "[[rating]]", for example: "Rating: [[5]]". 12 | [Question] 13 | {{instruction}} 14 | [The Start of Assistant’s Answer] 15 | {{response}} 16 | [The End of Assistant’s Answer]' 17 | 18 | 19 | -------------------------------------------------------------------------------- /registry/prompt/choice.yaml: -------------------------------------------------------------------------------- 1 | single_choice: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "Choose the most suitable answer from options A, B, C, and D to 11 | respond the question in next line, you may only choose A or B or C or D 12 | .\n{{question}}\nA. {{choice_a}}\nB. {{choice_b}}\nC. {{choice_c}}\nD. {{choice_d}}" 13 | 14 | single_choice_with_answer: 15 | class: audio_evals.prompt.base.Prompt 16 | args: 17 | template: 18 | - role: user 19 | contents: 20 | - type: audio 21 | value: "{{WavPath}}" 22 | - type: text 23 | value: "{{question}} Select one option from the provided choices.\n{{choices}}" 24 | -------------------------------------------------------------------------------- /registry/prompt/digit.yaml: -------------------------------------------------------------------------------- 1 | digit: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "Identify which digit [0-9] is spoken in the provided audio clip, answer one of [0,1,2,3,4,5,6,7,8,9] without explain" 11 | -------------------------------------------------------------------------------- /registry/prompt/emotion_anlysis.yaml: -------------------------------------------------------------------------------- 1 | emo_analysis: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "listen the audio and judge the emotion of the speaker, the answer must be one of [surprise,anger,neutral,joy,sadness,fear,disgust], answer without explain" 11 | 12 | ravdess_emo_analysis: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: audio 19 | value: "{{WavPath}}" 20 | - type: text 21 | value: "listen the audio and judge the emotion of the speaker, the answer must be one of [neutral,calm,happy,sad,angry,fearful,disgust,surprised], answer without explain" 22 | 23 | sentiment_analysis: 24 | class: audio_evals.prompt.base.Prompt 25 | args: 26 | template: 27 | - role: user 28 | contents: 29 | - type: audio 30 | value: "{{WavPath}}" 31 | - type: text 32 | value: "listen the audio and judge the sentiment of the speaker, the answer must be one of [positive,negative,neutral], answer without explain" 33 | 34 | vocal_sound_analysis: 35 | class: audio_evals.prompt.base.Prompt 36 | args: 37 | 
template: 38 | - role: user 39 | contents: 40 | - type: audio 41 | value: "{{WavPath}}" 42 | - type: text 43 | value: "listen the audio and judge the vocal sound, the answer must be one of [Cough,Sigh,Throat clearing,Sneeze,Laughter,Sniff], answer without explain" 44 | 45 | sound_analysis: 46 | class: audio_evals.prompt.base.Prompt 47 | args: 48 | template: 49 | - role: user 50 | contents: 51 | - type: audio 52 | value: "{{WavPath}}" 53 | - type: text 54 | value: "listen the audio and judge the sound, the answer must be one of ['Speech', 'Frying', 'Dishes', 55 | 'Running_water', 'Blender', 'Electric_shaver_toothbrush', 'Cat', 56 | 'Alarm_bell_ringing', 'Dog', 'Vacuum_cleaner'], answer without explain" 57 | -------------------------------------------------------------------------------- /registry/prompt/gender_anlysis.yaml: -------------------------------------------------------------------------------- 1 | gender_analysis: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "listen the audio and judge the gender of the speaker, the answer must be one of [female, male], answer without explain" 11 | -------------------------------------------------------------------------------- /registry/prompt/geval.yaml: -------------------------------------------------------------------------------- 1 | yes_no_judge: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: system 6 | contents: 7 | - type: text 8 | value: "You are a helpful assistant who tries to help answer the user's question." 9 | - role: user 10 | contents: 11 | - type: text 12 | value: "{{real_prompt}}" 13 | -------------------------------------------------------------------------------- /registry/prompt/kimi-audio.yaml: -------------------------------------------------------------------------------- 1 | kimi-audio-asr-en: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | message_type: text 7 | content: Please transcribe the spoken content into written text. 
8 | - role: user 9 | message_type: audio 10 | content: '{{WavPath}}' 11 | 12 | kimi-audio-asr-zh: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | message_type: text 18 | content: 请把这段语音转录成文本。 19 | - role: user 20 | message_type: audio 21 | content: '{{WavPath}}' 22 | -------------------------------------------------------------------------------- /registry/prompt/medicine.yaml: -------------------------------------------------------------------------------- 1 | COVID-recognizer: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "Listen to the provided audio and determine the health status, answer one of ['healthy', 'symptomatic', 'COVID-19'] without explain" 11 | 12 | Heartbeat-recognizer: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: audio 19 | value: "{{WavPath}}" 20 | - type: text 21 | value: "Listen to the heartbeat sound and determine the type of heart sound present, answer one of ['normal', 'murmur', 'extrastole'] without explain" 22 | 23 | 24 | Respiratory-crackles-recognizer: 25 | class: audio_evals.prompt.base.Prompt 26 | args: 27 | template: 28 | - role: user 29 | contents: 30 | - type: audio 31 | value: "{{WavPath}}" 32 | - type: text 33 | value: "Listen to the respiratory sound and determine if crackles are present. Answer with either 'present' or 'absent' without explanation." 34 | 35 | Respiratory-wheezes-recognizer: 36 | class: audio_evals.prompt.base.Prompt 37 | args: 38 | template: 39 | - role: user 40 | contents: 41 | - type: audio 42 | value: "{{WavPath}}" 43 | - type: text 44 | value: "Listen to the respiratory sound and determine if wheezes are present. Answer with either 'present' or 'absent' without explanation." 45 | -------------------------------------------------------------------------------- /registry/prompt/music.yaml: -------------------------------------------------------------------------------- 1 | instrument_recognition: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "listen the music and judge instrument of the music, the answer must be one of [Bass,Brass,Flute,Guitar,Keyboard,Mallet,Organ,Reed,String,Synth Lead,Vocal], answer without explain" 11 | 12 | chord_recognition: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: audio 19 | value: "{{WavPath}}" 20 | - type: text 21 | value: "Listen to the music and determine the chord quality. The answer should be either 'Major' or 'Minor', answer without explain" 22 | 23 | music_genre: 24 | class: audio_evals.prompt.base.Prompt 25 | args: 26 | template: 27 | - role: user 28 | contents: 29 | - type: audio 30 | value: "{{WavPath}}" 31 | - type: text 32 | value: "Listen to the provided music clip and identify the genre. the answer must be one of [blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock], answer without explain" 33 | 34 | 35 | music_tempo: 36 | class: audio_evals.prompt.base.Prompt 37 | args: 38 | template: 39 | - role: user 40 | contents: 41 | - type: audio 42 | value: "{{WavPath}}" 43 | - type: text 44 | value: "Listen to the audio clip and determine the exact tempo (BPM). 
Respond only with a numerical value without explain" 45 | -------------------------------------------------------------------------------- /registry/prompt/ola.yaml: -------------------------------------------------------------------------------- 1 | ola-asr: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: text 8 | value: 'Please give the ASR results of the given speech.' 9 | - type: audio 10 | value: '{{WavPath}}' 11 | 12 | ola-aqa: 13 | class: audio_evals.prompt.base.Prompt 14 | args: 15 | template: 16 | - role: user 17 | contents: 18 | - type: text 19 | value: "Please directly answer the questions in the user's speech." 20 | - type: audio 21 | value: '{{WavPath}}' 22 | -------------------------------------------------------------------------------- /registry/prompt/qa.yaml: -------------------------------------------------------------------------------- 1 | qa: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "{{question}}" 11 | -------------------------------------------------------------------------------- /registry/prompt/sound_identify.yaml: -------------------------------------------------------------------------------- 1 | catdog_identify: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | - role: user 6 | contents: 7 | - type: audio 8 | value: "{{WavPath}}" 9 | - type: text 10 | value: "Listen to the audio and determine if it's a dog or a cat. The answer must be one of the following: [dog, cat]. Provide the answer without explanation." 11 | -------------------------------------------------------------------------------- /registry/prompt/whisper-pretrain.yaml: -------------------------------------------------------------------------------- 1 | whisper-asr: 2 | class: audio_evals.prompt.base.Prompt 3 | args: 4 | template: 5 | audio: '{{WavPath}}' 6 | generate_kwargs: {} 7 | 8 | whisper-asr-zh: 9 | class: audio_evals.prompt.base.Prompt 10 | args: 11 | template: 12 | audio: '{{WavPath}}' 13 | generate_kwargs: 14 | language: chinese 15 | 16 | whisper-asr-en: 17 | class: audio_evals.prompt.base.Prompt 18 | args: 19 | template: 20 | audio: '{{WavPath}}' 21 | generate_kwargs: 22 | language: english 23 | 24 | whisper-asr-fr: 25 | class: audio_evals.prompt.base.Prompt 26 | args: 27 | template: 28 | audio: '{{WavPath}}' 29 | generate_kwargs: 30 | language: french 31 | 32 | whisper-asr-yue: 33 | class: audio_evals.prompt.base.Prompt 34 | args: 35 | template: 36 | audio: '{{WavPath}}' 37 | generate_kwargs: 38 | language: yue 39 | 40 | whisper-sst-zh2en: 41 | class: audio_evals.prompt.base.Prompt 42 | args: 43 | template: 44 | audio: '{{WavPath}}' 45 | generate_kwargs: 46 | language: chinese 47 | task: translate 48 | whisper-sst-de2en: 49 | class: audio_evals.prompt.base.Prompt 50 | args: 51 | template: 52 | audio: '{{WavPath}}' 53 | generate_kwargs: 54 | language: german 55 | task: translate 56 | whisper-sst-es2en: 57 | class: audio_evals.prompt.base.Prompt 58 | args: 59 | template: 60 | audio: '{{WavPath}}' 61 | generate_kwargs: 62 | language: spanish 63 | task: translate 64 | 65 | whisper-sst-fr2en: 66 | class: audio_evals.prompt.base.Prompt 67 | args: 68 | template: 69 | audio: '{{WavPath}}' 70 | generate_kwargs: 71 | language: french 72 | task: translate 73 | whisper-sst-it2en: 74 | class: audio_evals.prompt.base.Prompt 75 | args: 76 | template: 77 | audio: '{{WavPath}}' 78 | generate_kwargs: 
79 |         language: italian
80 |         task: translate
81 |
--------------------------------------------------------------------------------
/registry/recorder/local.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraEval-Audio/26f2ba808ba455d6d4ea342f6b8d3aae123ecf39/registry/recorder/local.yaml
--------------------------------------------------------------------------------
/requirments-offline-model.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | accelerate>=0.20.3
3 | einops
4 | matplotlib
5 | pillow
6 | scipy
7 | tensorboard
8 | tiktoken
9 | openai
10 | transformers_stream_generator==0.0.4
11 |
--------------------------------------------------------------------------------
/requirments.txt:
--------------------------------------------------------------------------------
1 | jinja2
2 | tqdm
3 | requests
4 | aiohttp
5 | pyyaml
6 | pytest
7 | jiwer
8 | sacrebleu==1.5.1
9 | editdistance
10 | scikit-learn
11 | librosa
12 | soundfile
13 | dashscope
14 | datasets
15 | pre-commit
16 | more_itertools
17 | pandas
18 | zhconv
19 | pycocoevalcap
20 | regex
21 | openai>=1.0.0
22 | websockets==12.0
23 | pydub
24 | openpyxl
25 | gdown
26 |
--------------------------------------------------------------------------------
/requirments/minicpm_o2_6.txt:
--------------------------------------------------------------------------------
1 | Pillow==10.1.0
2 | torch==2.2.0
3 | torchaudio==2.2.0
4 | torchvision==0.17.0
5 | transformers==4.44.2
6 | librosa==0.9.0
7 | soundfile==0.12.1
8 | vector-quantize-pytorch==1.18.5
9 | vocos==0.1.0
10 | decord
11 | moviepy
12 | numpy==1.26
13 |
--------------------------------------------------------------------------------
/tests/test_audio_evals_registry.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import pytest
4 |
5 | from audio_evals.eval_task import EvalTask
6 | from audio_evals.recorder import Recorder
7 | from audio_evals.registry import registry
8 |
9 | # Configure the root logger
10 | logging.basicConfig(
11 |     level=logging.DEBUG,
12 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
13 |     handlers=[logging.StreamHandler()],
14 | )
15 |
16 |
17 | def test_registry_model():
18 |     model = registry.get_model("gpt4o")
19 |     print(model.inference("how are you"))
20 |
21 |
22 | def test_prompt():
23 |     prompt = registry.get_prompt("asr")
24 |     model = registry.get_model("qwen-audio-offline")
25 |     real_prompt = prompt.load(
26 |         a="/Users/a1/Downloads/语音转文字/嘉德罗斯/嘉德罗斯_12.wav"
27 |     )
28 |     print(model.inference(real_prompt))
29 |
30 |
31 | def test_evaluator():
32 |     e = registry.get_evaluator("em")
33 |     assert e("0", 0)["match"]
34 |     assert e(0, "0")["match"]
35 |     assert e(1, "0")["match"] == 0
36 |
37 |     e = registry.get_evaluator("cer")
38 |     print(
39 |         e(
40 |             "买一张万能卡也有不少好处带着这张卡你可以进入南非的一些公园或全部的国家公园",
41 |             "买一张万能卡(Wild Card)也有不少好处。带着这张卡,你可以进入南非的一些公园或全部的国家公园。",
42 |         )
43 |     )
44 |
45 |     e = registry.get_evaluator("wer")
46 |     print(e("It is good", "it is good"))
47 |
48 |
49 | def test_agg():
50 |     a = registry.get_agg("acc")
51 |     assert a([{"match": 0}])["acc"] == 0
52 |     assert a([{"match": 1}])["acc"] == 1
53 |     assert a([])["acc"] == 0
54 |     with pytest.raises(Exception):
55 |         a([{"count": 1}])
56 |
57 |
58 | def test_task():
59 |     task_cfg = registry.get_eval_task("alei_asr")
60 |
61 |     t = EvalTask(
62 |         dataset=registry.get_dataset("KeSpeech"),
63 |         prompt=registry.get_prompt("KeSpeech"),
64 |         predictor=registry.get_model(task_cfg.model),
65 |         evaluator=registry.get_evaluator(task_cfg.evaluator),
66 |         post_process=[registry.get_process(item) for item in task_cfg.post_process],
67 |         agg=registry.get_agg(task_cfg.agg),
68 |         recorder=Recorder("log/KeSpeech.jsonl"),
69 |     )
70 |     res = t.run()
71 |     print(res)
72 |
--------------------------------------------------------------------------------
/tests/test_dataset.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from audio_evals.registry import registry
4 |
5 | # Configure the root logger
6 | logging.basicConfig(
7 |     level=logging.DEBUG,
8 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
9 |     handlers=[logging.StreamHandler()],
10 | )
11 |
12 |
13 | def test_huggingface_dataset():
14 |     a = registry.get_dataset("KeSpeech-hf")
15 |     b = a.load()
16 |     b = list(b)
17 |     with open(b[0]["audio"]["path"], "rb") as f:
18 |         content = f.read()
19 |     print(content)
20 |     print(a)
21 |
--------------------------------------------------------------------------------
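For readers wiring these registry entries together by hand, the following is a minimal sketch (not a file from this repository) of how the YAML names above resolve at runtime, mirroring test_task() in tests/test_audio_evals_registry.py. All registry names are taken from the files shown above (dataset librispeech-test-clean, prompt asr, model qwen2-audio-chat, evaluator wer, post-processor json_content, and the aggregator name wer referenced by registry/eval_task/asr.yaml); the log path is an arbitrary example.

from audio_evals.eval_task import EvalTask
from audio_evals.recorder import Recorder
from audio_evals.registry import registry

# Resolve each component by the name it is registered under in registry/**/*.yaml.
dataset = registry.get_dataset("librispeech-test-clean")   # registry/dataset/librispeech.yaml
prompt = registry.get_prompt("asr")                        # registry/prompt/asr.yaml
model = registry.get_model("qwen2-audio-chat")             # registry/model/offline.yaml
evaluator = registry.get_evaluator("wer")                  # registry/evaluator/common.yaml
post_process = [registry.get_process("json_content")]      # registry/process/base.yaml
agg = registry.get_agg("wer")                              # aggregator named in registry/eval_task/asr.yaml

# Assemble and run the task; results are recorded to a JSONL log (path chosen here for illustration).
task = EvalTask(
    dataset=dataset,
    prompt=prompt,
    predictor=model,
    evaluator=evaluator,
    post_process=post_process,
    agg=agg,
    recorder=Recorder("log/librispeech-test-clean.jsonl"),
)
print(task.run())

This hand-wired form is exactly what an entry such as asr in registry/eval_task/asr.yaml declares; the task configs simply name the same components so they can be resolved through the registry instead of being constructed manually.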