├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── LICENSE ├── README.md ├── README_zh-CN.md ├── data_scorer ├── README.md ├── README_zh-CN.md ├── data_process │ ├── example_input.jsonl │ ├── example_input_w_answer.jsonl │ ├── example_input_wo_output.jsonl │ ├── example_upload.jsonl │ ├── normalize.py │ └── utils_jsonl.py ├── heuristic │ ├── README.md │ ├── README_zh-CN.md │ ├── configs │ │ └── Length.yaml │ ├── main_para.py │ ├── results │ │ └── example_output_TokenLengthScorer │ │ │ ├── output.jsonl │ │ │ └── temp │ │ │ ├── OutputTokenLengthScorer.jsonl │ │ │ └── data.jsonl │ ├── scorers │ │ ├── LengthScorer.py │ │ ├── __init__.py │ │ ├── base_scorer.py │ │ ├── scorer_factory.py │ │ ├── scores_info.json │ │ └── utils.py │ ├── sh │ │ └── Length.sh │ └── utils │ │ ├── __init__.py │ │ ├── config_loader.py │ │ ├── data_loader.py │ │ └── utils_jsonl.py ├── llm_as_judge │ ├── README.md │ ├── README_zh-CN.md │ ├── config.yaml │ ├── llm_as_judge │ │ ├── __init__.py │ │ ├── config.py │ │ ├── evaluator.py │ │ ├── main.py │ │ ├── utils.py │ │ └── validators.py │ ├── output │ │ ├── example_input.jsonl │ │ ├── example_input_scored.jsonl │ │ └── scored_ids.txt │ ├── prompts │ │ ├── QA_All.txt │ │ ├── QA_Clarity.txt │ │ ├── QA_Coherence.txt │ │ ├── QA_Completeness.txt │ │ ├── QA_Complexity.txt │ │ ├── QA_Correctness.txt │ │ ├── QA_Meaningness.txt │ │ ├── QA_Relevance.txt │ │ ├── Q_All.txt │ │ ├── Q_Clarity.txt │ │ ├── Q_Code_Difficulty.txt │ │ ├── Q_Coherence.txt │ │ ├── Q_Completeness.txt │ │ ├── Q_Complexity.txt │ │ ├── Q_Correctness.txt │ │ ├── Q_Math_Difficulty.txt │ │ └── Q_Meaningness.txt │ ├── requirements.txt │ └── tools │ │ ├── add_empty_key.py │ │ ├── merge_tracks.py │ │ └── process_scores.py ├── model_based │ ├── README.md │ ├── README_zh-CN.md │ ├── configs │ │ ├── DeitaCScorer.yaml │ │ ├── DeitaQScorer.yaml │ │ ├── FailRateScorer.yaml │ │ ├── IFDScorer.yaml │ │ ├── MultiScorer.yaml │ │ ├── RewardModel.yaml │ │ └── ThinkingProbScorer.yaml │ ├── fail_rate │ │ ├── pyproject.toml │ │ ├── setup.py │ │ └── src │ │ │ └── lighteval │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── config │ │ │ └── lighteval_config.py │ │ │ ├── data.py │ │ │ ├── logging │ │ │ ├── evaluation_tracker.py │ │ │ └── info_loggers.py │ │ │ ├── main_accelerate.py │ │ │ ├── main_baseline.py │ │ │ ├── main_custom.py │ │ │ ├── main_endpoint.py │ │ │ ├── main_nanotron.py │ │ │ ├── main_sglang.py │ │ │ ├── main_tasks.py │ │ │ ├── main_vllm.py │ │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ ├── dynamic_metrics.py │ │ │ ├── harness_compatibility │ │ │ │ ├── drop.py │ │ │ │ └── truthful_qa.py │ │ │ ├── imports │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_scorer.py │ │ │ │ ├── data_stats_metric.py │ │ │ │ ├── data_stats_utils.py │ │ │ │ └── summac.py │ │ │ ├── judge_prompts.jsonl │ │ │ ├── llm_as_judge.py │ │ │ ├── metrics.py │ │ │ ├── metrics_corpus.py │ │ │ ├── metrics_sample.py │ │ │ ├── normalizations.py │ │ │ ├── sample_preparator.py │ │ │ ├── stderr.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── extractive_match_utils.py │ │ │ │ ├── judge_utils.py │ │ │ │ ├── linguistic_tokenizers.py │ │ │ │ ├── math_comparison.py │ │ │ │ └── metric_utils.py │ │ │ ├── models │ │ │ ├── abstract_model.py │ │ │ ├── custom │ │ │ │ └── custom_model.py │ │ │ ├── dummy │ │ │ │ └── dummy_model.py │ │ │ ├── endpoints │ │ │ │ ├── endpoint_model.py │ │ │ │ ├── inference_providers_model.py │ │ │ │ ├── openai_model.py │ │ │ │ └── tgi_model.py │ │ │ ├── litellm_model.py │ │ │ ├── model_input.py │ │ │ ├── model_loader.py │ │ │ ├── model_output.py │ │ │ ├── nanotron │ │ │ │ └── nanotron_model.py │ │ │ ├── sglang │ │ │ │ └── sglang_model.py │ │ │ ├── transformers │ │ │ │ ├── adapter_model.py │ │ │ │ ├── delta_model.py │ │ │ │ ├── transformers_model.py │ │ │ │ └── vlm_transformers_model.py │ │ │ ├── utils.py │ │ │ └── vllm │ │ │ │ └── vllm_model.py │ │ │ ├── pipeline.py │ │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── default_prompts.py │ │ │ ├── default_tasks.py │ │ │ ├── extended │ │ │ │ ├── __init__.py │ │ │ │ ├── hle │ │ │ │ │ └── main.py │ │ │ │ ├── ifeval │ │ │ │ │ ├── instructions.py │ │ │ │ │ ├── instructions_registry.py │ │ │ │ │ ├── instructions_utils.py │ │ │ │ │ └── main.py │ │ │ │ ├── lcb │ │ │ │ │ ├── codegen_metrics.py │ │ │ │ │ └── main.py │ │ │ │ ├── mix_eval │ │ │ │ │ ├── judge_prompts.py │ │ │ │ │ ├── main.py │ │ │ │ │ └── prompts.py │ │ │ │ ├── mt_bench │ │ │ │ │ ├── judge_prompt_templates.py │ │ │ │ │ └── main.py │ │ │ │ ├── olympiade_bench │ │ │ │ │ └── main.py │ │ │ │ └── tiny_benchmarks │ │ │ │ │ ├── main.py │ │ │ │ │ └── tinyBenchmarks.pkl │ │ │ ├── lighteval_task.py │ │ │ ├── multilingual │ │ │ │ ├── __init__.py │ │ │ │ ├── adapters.py │ │ │ │ ├── tasks.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── adapters_utils.py │ │ │ │ │ └── task_utils.py │ │ │ ├── prompt_manager.py │ │ │ ├── registry.py │ │ │ ├── requests.py │ │ │ └── templates │ │ │ │ ├── __init__.py │ │ │ │ ├── boolq.py │ │ │ │ ├── continuation.py │ │ │ │ ├── copa.py │ │ │ │ ├── hellaswag.py │ │ │ │ ├── multichoice.py │ │ │ │ ├── nli.py │ │ │ │ ├── qa.py │ │ │ │ ├── translation.py │ │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── adapter_utils.py │ │ │ │ ├── formatting_utils.py │ │ │ │ ├── formulation.py │ │ │ │ └── translation_literals.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── imports.py │ │ │ ├── language.py │ │ │ ├── parallelism.py │ │ │ ├── timeout.py │ │ │ └── utils.py │ ├── main_para.py │ ├── run.sh │ ├── scorers │ │ ├── DeitaCScorer.py │ │ ├── DeitaQScorer.py │ │ ├── FailRateScorer.py │ │ ├── IFDScorer.py │ │ ├── ModelAwareMarginScorer.py │ │ ├── SkyworkRewardScorer.py │ │ ├── ThinkingProbScorer.py │ │ ├── __init__.py │ │ ├── base_scorer.py │ │ ├── scorer_factory.py │ │ ├── scores_info.json │ │ └── utils.py │ └── utils │ │ ├── __init__.py │ │ ├── config_loader.py │ │ ├── data_loader.py │ │ └── utils_jsonl.py └── requirements.txt ├── docs ├── avatars_circle │ ├── ChampionZhong.svg │ ├── GX-XinGao.svg │ ├── LHL3341.svg │ ├── Leey21.svg │ ├── MySunX.svg │ ├── QizhiPei.svg │ ├── Word2VecT.svg │ ├── apeterswu.svg │ ├── conghui.svg │ ├── gavinwxy.svg │ ├── ming-bot.svg │ ├── orangeadegit.svg │ └── pzs19.svg ├── heuristic.md ├── imgs │ ├── OpenDataArena.svg │ ├── OpenDataLab.png │ ├── oda_comp.png │ ├── oda_comp_zh-CN.png │ ├── oda_first.png │ ├── oda_first_zh-CN.png │ ├── oda_overview.png │ └── oda_overview_zh-CN.png ├── index.md ├── llm-as-judge.md ├── model-based-evaluation.md ├── requirements.in └── requirements.txt ├── lumache.py ├── mkdocs.yml ├── model_eval ├── README.md ├── README_zh-CN.md ├── eval_script │ ├── test_llama.sh │ ├── test_llama_all_benchmarks.sh │ ├── test_llama_code.sh │ ├── test_llama_general.sh │ ├── test_llama_math.sh │ ├── test_llama_reasoning.sh │ ├── test_qwen.sh │ ├── test_qwen_all_benchmarks.sh │ ├── test_qwen_code.sh │ ├── test_qwen_general.sh │ ├── test_qwen_math.sh │ └── test_qwen_reasoning.sh └── summary_scores │ └── run_summary.py ├── model_train ├── README.md ├── README_zh-CN.md └── train_config │ ├── llama_config.yaml │ ├── llama_long_config.yaml │ ├── qwen_config.yaml │ └── qwen_long_config.yaml └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/.gitmodules -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/.readthedocs.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/README.md -------------------------------------------------------------------------------- /README_zh-CN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/README_zh-CN.md -------------------------------------------------------------------------------- /data_scorer/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/README.md -------------------------------------------------------------------------------- /data_scorer/README_zh-CN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/README_zh-CN.md -------------------------------------------------------------------------------- /data_scorer/data_process/example_input.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/data_process/example_input.jsonl -------------------------------------------------------------------------------- /data_scorer/data_process/example_input_w_answer.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/data_process/example_input_w_answer.jsonl -------------------------------------------------------------------------------- /data_scorer/data_process/example_input_wo_output.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/data_process/example_input_wo_output.jsonl -------------------------------------------------------------------------------- /data_scorer/data_process/example_upload.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/data_process/example_upload.jsonl -------------------------------------------------------------------------------- /data_scorer/data_process/normalize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/data_process/normalize.py -------------------------------------------------------------------------------- /data_scorer/data_process/utils_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/data_process/utils_jsonl.py -------------------------------------------------------------------------------- /data_scorer/heuristic/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/README.md -------------------------------------------------------------------------------- /data_scorer/heuristic/README_zh-CN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/README_zh-CN.md -------------------------------------------------------------------------------- /data_scorer/heuristic/configs/Length.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/configs/Length.yaml -------------------------------------------------------------------------------- /data_scorer/heuristic/main_para.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/main_para.py -------------------------------------------------------------------------------- /data_scorer/heuristic/results/example_output_TokenLengthScorer/output.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/results/example_output_TokenLengthScorer/output.jsonl -------------------------------------------------------------------------------- /data_scorer/heuristic/results/example_output_TokenLengthScorer/temp/OutputTokenLengthScorer.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/results/example_output_TokenLengthScorer/temp/OutputTokenLengthScorer.jsonl -------------------------------------------------------------------------------- /data_scorer/heuristic/results/example_output_TokenLengthScorer/temp/data.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/results/example_output_TokenLengthScorer/temp/data.jsonl -------------------------------------------------------------------------------- /data_scorer/heuristic/scorers/LengthScorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/scorers/LengthScorer.py -------------------------------------------------------------------------------- /data_scorer/heuristic/scorers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_scorer/heuristic/scorers/base_scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/scorers/base_scorer.py -------------------------------------------------------------------------------- /data_scorer/heuristic/scorers/scorer_factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/scorers/scorer_factory.py -------------------------------------------------------------------------------- /data_scorer/heuristic/scorers/scores_info.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/scorers/scores_info.json -------------------------------------------------------------------------------- /data_scorer/heuristic/scorers/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/scorers/utils.py -------------------------------------------------------------------------------- /data_scorer/heuristic/sh/Length.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/sh/Length.sh -------------------------------------------------------------------------------- /data_scorer/heuristic/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_scorer/heuristic/utils/config_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/utils/config_loader.py -------------------------------------------------------------------------------- /data_scorer/heuristic/utils/data_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/utils/data_loader.py -------------------------------------------------------------------------------- /data_scorer/heuristic/utils/utils_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/heuristic/utils/utils_jsonl.py -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/README.md -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/README_zh-CN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/README_zh-CN.md -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/config.yaml -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/llm_as_judge/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/llm_as_judge/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/llm_as_judge/config.py -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/llm_as_judge/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/llm_as_judge/evaluator.py -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/llm_as_judge/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/llm_as_judge/main.py -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/llm_as_judge/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/llm_as_judge/utils.py -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/llm_as_judge/validators.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/llm_as_judge/validators.py -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/output/example_input.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/output/example_input.jsonl -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/output/example_input_scored.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/output/example_input_scored.jsonl -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/output/scored_ids.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/output/scored_ids.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/QA_All.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/QA_All.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/QA_Clarity.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/QA_Clarity.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/QA_Coherence.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/QA_Coherence.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/QA_Completeness.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/QA_Completeness.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/QA_Complexity.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/QA_Complexity.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/QA_Correctness.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/QA_Correctness.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/QA_Meaningness.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/QA_Meaningness.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/QA_Relevance.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/QA_Relevance.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_All.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_All.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_Clarity.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_Clarity.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_Code_Difficulty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_Code_Difficulty.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_Coherence.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_Coherence.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_Completeness.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_Completeness.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_Complexity.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_Complexity.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_Correctness.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_Correctness.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_Math_Difficulty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_Math_Difficulty.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/prompts/Q_Meaningness.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/prompts/Q_Meaningness.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/requirements.txt -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/tools/add_empty_key.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/tools/add_empty_key.py -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/tools/merge_tracks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/tools/merge_tracks.py -------------------------------------------------------------------------------- /data_scorer/llm_as_judge/tools/process_scores.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/llm_as_judge/tools/process_scores.py -------------------------------------------------------------------------------- /data_scorer/model_based/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/README.md -------------------------------------------------------------------------------- /data_scorer/model_based/README_zh-CN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/README_zh-CN.md -------------------------------------------------------------------------------- /data_scorer/model_based/configs/DeitaCScorer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/configs/DeitaCScorer.yaml -------------------------------------------------------------------------------- /data_scorer/model_based/configs/DeitaQScorer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/configs/DeitaQScorer.yaml -------------------------------------------------------------------------------- /data_scorer/model_based/configs/FailRateScorer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/configs/FailRateScorer.yaml -------------------------------------------------------------------------------- /data_scorer/model_based/configs/IFDScorer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/configs/IFDScorer.yaml -------------------------------------------------------------------------------- /data_scorer/model_based/configs/MultiScorer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/configs/MultiScorer.yaml -------------------------------------------------------------------------------- /data_scorer/model_based/configs/RewardModel.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/configs/RewardModel.yaml -------------------------------------------------------------------------------- /data_scorer/model_based/configs/ThinkingProbScorer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/configs/ThinkingProbScorer.yaml -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/pyproject.toml -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/setup.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/__main__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/config/lighteval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/config/lighteval_config.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/data.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/logging/evaluation_tracker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/logging/evaluation_tracker.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/logging/info_loggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/logging/info_loggers.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/main_accelerate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/main_accelerate.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/main_baseline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/main_baseline.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/main_custom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/main_custom.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/main_endpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/main_endpoint.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/main_nanotron.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/main_nanotron.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/main_sglang.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/main_sglang.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/main_tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/main_tasks.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/main_vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/main_vllm.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/dynamic_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/dynamic_metrics.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/harness_compatibility/drop.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/harness_compatibility/drop.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/harness_compatibility/truthful_qa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/harness_compatibility/truthful_qa.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/bert_scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/bert_scorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/data_stats_metric.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/data_stats_metric.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/data_stats_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/data_stats_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/summac.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/imports/summac.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/judge_prompts.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/judge_prompts.jsonl -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/llm_as_judge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/llm_as_judge.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/metrics.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/metrics_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/metrics_corpus.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/metrics_sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/metrics_sample.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/normalizations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/normalizations.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/sample_preparator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/sample_preparator.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/stderr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/stderr.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/extractive_match_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/extractive_match_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/judge_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/judge_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/linguistic_tokenizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/linguistic_tokenizers.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/math_comparison.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/math_comparison.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/metric_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/metrics/utils/metric_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/abstract_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/abstract_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/custom/custom_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/custom/custom_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/dummy/dummy_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/dummy/dummy_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/endpoints/endpoint_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/endpoints/endpoint_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/endpoints/inference_providers_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/endpoints/inference_providers_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/endpoints/openai_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/endpoints/openai_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/endpoints/tgi_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/endpoints/tgi_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/litellm_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/litellm_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/model_input.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/model_input.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/model_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/model_loader.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/model_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/model_output.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/nanotron/nanotron_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/nanotron/nanotron_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/sglang/sglang_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/sglang/sglang_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/transformers/adapter_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/transformers/adapter_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/transformers/delta_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/transformers/delta_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/transformers/transformers_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/transformers/transformers_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/transformers/vlm_transformers_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/transformers/vlm_transformers_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/models/vllm/vllm_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/models/vllm/vllm_model.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/pipeline.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/default_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/default_prompts.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/default_tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/default_tasks.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/hle/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/hle/main.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/ifeval/instructions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/ifeval/instructions.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/ifeval/instructions_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/ifeval/instructions_registry.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/ifeval/instructions_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/ifeval/instructions_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/ifeval/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/ifeval/main.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/lcb/codegen_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/lcb/codegen_metrics.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/lcb/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/lcb/main.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mix_eval/judge_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mix_eval/judge_prompts.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mix_eval/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mix_eval/main.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mix_eval/prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mix_eval/prompts.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mt_bench/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/mt_bench/main.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/olympiade_bench/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/olympiade_bench/main.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/tiny_benchmarks/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/tiny_benchmarks/main.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/tiny_benchmarks/tinyBenchmarks.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/extended/tiny_benchmarks/tinyBenchmarks.pkl -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/lighteval_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/lighteval_task.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/adapters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/adapters.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/tasks.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/utils/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/utils/adapters_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/utils/adapters_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/utils/task_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/multilingual/utils/task_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/prompt_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/prompt_manager.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/registry.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/requests.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/requests.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/boolq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/boolq.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/continuation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/continuation.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/copa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/copa.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/hellaswag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/hellaswag.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/multichoice.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/multichoice.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/nli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/nli.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/qa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/qa.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/translation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/translation.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/adapter_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/adapter_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/formatting_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/formatting_utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/formulation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/formulation.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/translation_literals.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/tasks/templates/utils/translation_literals.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/utils/__init__.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/utils/imports.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/utils/imports.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/utils/language.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/utils/language.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/utils/parallelism.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/utils/parallelism.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/utils/timeout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/utils/timeout.py -------------------------------------------------------------------------------- /data_scorer/model_based/fail_rate/src/lighteval/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/fail_rate/src/lighteval/utils/utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/main_para.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/main_para.py -------------------------------------------------------------------------------- /data_scorer/model_based/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/run.sh -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/DeitaCScorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/DeitaCScorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/DeitaQScorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/DeitaQScorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/FailRateScorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/FailRateScorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/IFDScorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/IFDScorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/ModelAwareMarginScorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/ModelAwareMarginScorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/SkyworkRewardScorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/SkyworkRewardScorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/ThinkingProbScorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/ThinkingProbScorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/base_scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/base_scorer.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/scorer_factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/scorer_factory.py -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/scores_info.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/scores_info.json -------------------------------------------------------------------------------- /data_scorer/model_based/scorers/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/scorers/utils.py -------------------------------------------------------------------------------- /data_scorer/model_based/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_scorer/model_based/utils/config_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/utils/config_loader.py -------------------------------------------------------------------------------- /data_scorer/model_based/utils/data_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/utils/data_loader.py -------------------------------------------------------------------------------- /data_scorer/model_based/utils/utils_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/model_based/utils/utils_jsonl.py -------------------------------------------------------------------------------- /data_scorer/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/data_scorer/requirements.txt -------------------------------------------------------------------------------- /docs/avatars_circle/ChampionZhong.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/ChampionZhong.svg -------------------------------------------------------------------------------- /docs/avatars_circle/GX-XinGao.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/GX-XinGao.svg -------------------------------------------------------------------------------- /docs/avatars_circle/LHL3341.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/LHL3341.svg -------------------------------------------------------------------------------- /docs/avatars_circle/Leey21.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/Leey21.svg -------------------------------------------------------------------------------- /docs/avatars_circle/MySunX.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/MySunX.svg -------------------------------------------------------------------------------- /docs/avatars_circle/QizhiPei.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/QizhiPei.svg -------------------------------------------------------------------------------- /docs/avatars_circle/Word2VecT.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/Word2VecT.svg -------------------------------------------------------------------------------- /docs/avatars_circle/apeterswu.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/apeterswu.svg -------------------------------------------------------------------------------- /docs/avatars_circle/conghui.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/conghui.svg -------------------------------------------------------------------------------- /docs/avatars_circle/gavinwxy.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/gavinwxy.svg -------------------------------------------------------------------------------- /docs/avatars_circle/ming-bot.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/ming-bot.svg -------------------------------------------------------------------------------- /docs/avatars_circle/orangeadegit.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/orangeadegit.svg -------------------------------------------------------------------------------- /docs/avatars_circle/pzs19.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/avatars_circle/pzs19.svg -------------------------------------------------------------------------------- /docs/heuristic.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/heuristic.md -------------------------------------------------------------------------------- /docs/imgs/OpenDataArena.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/imgs/OpenDataArena.svg -------------------------------------------------------------------------------- /docs/imgs/OpenDataLab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/imgs/OpenDataLab.png -------------------------------------------------------------------------------- /docs/imgs/oda_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/imgs/oda_comp.png -------------------------------------------------------------------------------- /docs/imgs/oda_comp_zh-CN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/imgs/oda_comp_zh-CN.png -------------------------------------------------------------------------------- /docs/imgs/oda_first.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/imgs/oda_first.png -------------------------------------------------------------------------------- /docs/imgs/oda_first_zh-CN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/imgs/oda_first_zh-CN.png -------------------------------------------------------------------------------- /docs/imgs/oda_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/imgs/oda_overview.png -------------------------------------------------------------------------------- /docs/imgs/oda_overview_zh-CN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/imgs/oda_overview_zh-CN.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/index.md -------------------------------------------------------------------------------- /docs/llm-as-judge.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/llm-as-judge.md -------------------------------------------------------------------------------- /docs/model-based-evaluation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/model-based-evaluation.md -------------------------------------------------------------------------------- /docs/requirements.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/requirements.in -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/docs/requirements.txt -------------------------------------------------------------------------------- /lumache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/lumache.py -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/mkdocs.yml -------------------------------------------------------------------------------- /model_eval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/README.md -------------------------------------------------------------------------------- /model_eval/README_zh-CN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/README_zh-CN.md -------------------------------------------------------------------------------- /model_eval/eval_script/test_llama.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_llama.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_llama_all_benchmarks.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_llama_all_benchmarks.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_llama_code.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_llama_code.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_llama_general.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_llama_general.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_llama_math.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_llama_math.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_llama_reasoning.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_llama_reasoning.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_qwen.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_qwen.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_qwen_all_benchmarks.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_qwen_all_benchmarks.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_qwen_code.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_qwen_code.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_qwen_general.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_qwen_general.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_qwen_math.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_qwen_math.sh -------------------------------------------------------------------------------- /model_eval/eval_script/test_qwen_reasoning.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/eval_script/test_qwen_reasoning.sh -------------------------------------------------------------------------------- /model_eval/summary_scores/run_summary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_eval/summary_scores/run_summary.py -------------------------------------------------------------------------------- /model_train/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_train/README.md -------------------------------------------------------------------------------- /model_train/README_zh-CN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_train/README_zh-CN.md -------------------------------------------------------------------------------- /model_train/train_config/llama_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_train/train_config/llama_config.yaml -------------------------------------------------------------------------------- /model_train/train_config/llama_long_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_train/train_config/llama_long_config.yaml -------------------------------------------------------------------------------- /model_train/train_config/qwen_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_train/train_config/qwen_config.yaml -------------------------------------------------------------------------------- /model_train/train_config/qwen_long_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/model_train/train_config/qwen_long_config.yaml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDataArena/OpenDataArena-Tool/HEAD/pyproject.toml --------------------------------------------------------------------------------