├── README.md ├── code ├── evalaute.py ├── inference.py └── models │ ├── __pycache__ │ ├── abab.cpython-311.pyc │ ├── deepseek_v2.cpython-311.pyc │ ├── ernie35.cpython-311.pyc │ ├── ernie4.cpython-311.pyc │ ├── ernie4turbo.cpython-311.pyc │ ├── glm4.cpython-311.pyc │ ├── gpt35_turbo_1106.cpython-311.pyc │ ├── gpt4_0125_preview.cpython-311.pyc │ ├── gpt4_turbo_0409.cpython-311.pyc │ ├── gpt4o.cpython-311.pyc │ ├── moonshot.cpython-311.pyc │ ├── qwen2_72b_instruct.cpython-311.pyc │ ├── qwen_2_72b_instruct.cpython-311.pyc │ └── yi_large.cpython-311.pyc │ ├── abab.py │ ├── claude_3_5_sonnet.py │ ├── deepseek_v2.py │ ├── deepseek_v2_lite_chat.py │ ├── ernie35.py │ ├── ernie4.py │ ├── ernie4turbo.py │ ├── glm4.py │ ├── gpt35_turbo_1106.py │ ├── gpt4_0125_preview.py │ ├── gpt4_turbo_0409.py │ ├── gpt4o.py │ ├── moonshot.py │ ├── qwen15_110b_chat.py │ ├── qwen2_72b_instruct.py │ ├── yi_15_34b_chat.py │ └── yi_large.py ├── data └── cfbench_data.json ├── output ├── judge │ ├── abab_eval.json │ ├── deepseek_v2_eval.json │ ├── ernie35_eval.json │ ├── ernie4_eval.json │ ├── glm4_eval.json │ ├── gpt35_turbo_1106_eval.json │ ├── gpt4_0125_preview_eval.json │ ├── gpt4_turbo_0409_eval.json │ ├── gpt4o_eval.json │ ├── moonshot_eval.json │ └── yi_large_eval.json ├── response │ ├── abab_infer.json │ ├── deepseek_v2_infer.json │ ├── ernie35_infer.json │ ├── ernie4_infer.json │ ├── ernie4turbo_infer.json │ ├── glm4_infer.json │ ├── gpt35_turbo_1106_infer.json │ ├── gpt4_0125_preview_infer.json │ ├── gpt4_turbo_0409_infer.json │ ├── gpt4o_infer.json │ ├── moonshot_infer.json │ └── yi_large_infer.json └── scores.xlsx ├── requirements.txt ├── resources ├── img │ ├── 1_introduction_case.png │ ├── 2_pipline.png │ ├── 4_constraints_results.png │ ├── 5_domain_nlp_results.png │ └── leaderboard.png └── paper │ └── CFBench- A Comprehensive Constraints-Following Benchmark for LLMs.pdf └── run.sh /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/README.md -------------------------------------------------------------------------------- /code/evalaute.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/evalaute.py -------------------------------------------------------------------------------- /code/inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/inference.py -------------------------------------------------------------------------------- /code/models/__pycache__/abab.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/abab.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/deepseek_v2.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/deepseek_v2.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/ernie35.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/ernie35.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/ernie4.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/ernie4.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/ernie4turbo.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/ernie4turbo.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/glm4.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/glm4.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/gpt35_turbo_1106.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/gpt35_turbo_1106.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/gpt4_0125_preview.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/gpt4_0125_preview.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/gpt4_turbo_0409.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/gpt4_turbo_0409.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/gpt4o.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/gpt4o.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/moonshot.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/moonshot.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/qwen2_72b_instruct.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/qwen2_72b_instruct.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/qwen_2_72b_instruct.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/qwen_2_72b_instruct.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/__pycache__/yi_large.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/__pycache__/yi_large.cpython-311.pyc -------------------------------------------------------------------------------- /code/models/abab.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/abab.py -------------------------------------------------------------------------------- /code/models/claude_3_5_sonnet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/claude_3_5_sonnet.py -------------------------------------------------------------------------------- /code/models/deepseek_v2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/deepseek_v2.py -------------------------------------------------------------------------------- /code/models/deepseek_v2_lite_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/deepseek_v2_lite_chat.py -------------------------------------------------------------------------------- /code/models/ernie35.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/ernie35.py -------------------------------------------------------------------------------- /code/models/ernie4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/ernie4.py -------------------------------------------------------------------------------- /code/models/ernie4turbo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/ernie4turbo.py -------------------------------------------------------------------------------- /code/models/glm4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/glm4.py -------------------------------------------------------------------------------- /code/models/gpt35_turbo_1106.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/gpt35_turbo_1106.py -------------------------------------------------------------------------------- /code/models/gpt4_0125_preview.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/gpt4_0125_preview.py -------------------------------------------------------------------------------- /code/models/gpt4_turbo_0409.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/gpt4_turbo_0409.py -------------------------------------------------------------------------------- /code/models/gpt4o.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/gpt4o.py -------------------------------------------------------------------------------- /code/models/moonshot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/moonshot.py -------------------------------------------------------------------------------- /code/models/qwen15_110b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/qwen15_110b_chat.py -------------------------------------------------------------------------------- /code/models/qwen2_72b_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/qwen2_72b_instruct.py -------------------------------------------------------------------------------- /code/models/yi_15_34b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/yi_15_34b_chat.py -------------------------------------------------------------------------------- /code/models/yi_large.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/code/models/yi_large.py -------------------------------------------------------------------------------- /data/cfbench_data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/data/cfbench_data.json -------------------------------------------------------------------------------- /output/judge/abab_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/abab_eval.json -------------------------------------------------------------------------------- /output/judge/deepseek_v2_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/deepseek_v2_eval.json -------------------------------------------------------------------------------- /output/judge/ernie35_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/ernie35_eval.json -------------------------------------------------------------------------------- /output/judge/ernie4_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/ernie4_eval.json -------------------------------------------------------------------------------- /output/judge/glm4_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/glm4_eval.json -------------------------------------------------------------------------------- /output/judge/gpt35_turbo_1106_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/gpt35_turbo_1106_eval.json -------------------------------------------------------------------------------- /output/judge/gpt4_0125_preview_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/gpt4_0125_preview_eval.json -------------------------------------------------------------------------------- /output/judge/gpt4_turbo_0409_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/gpt4_turbo_0409_eval.json -------------------------------------------------------------------------------- /output/judge/gpt4o_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/gpt4o_eval.json -------------------------------------------------------------------------------- /output/judge/moonshot_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/moonshot_eval.json -------------------------------------------------------------------------------- /output/judge/yi_large_eval.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/judge/yi_large_eval.json -------------------------------------------------------------------------------- /output/response/abab_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/abab_infer.json -------------------------------------------------------------------------------- /output/response/deepseek_v2_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/deepseek_v2_infer.json -------------------------------------------------------------------------------- /output/response/ernie35_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/ernie35_infer.json -------------------------------------------------------------------------------- /output/response/ernie4_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/ernie4_infer.json -------------------------------------------------------------------------------- /output/response/ernie4turbo_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/ernie4turbo_infer.json -------------------------------------------------------------------------------- /output/response/glm4_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/glm4_infer.json -------------------------------------------------------------------------------- /output/response/gpt35_turbo_1106_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/gpt35_turbo_1106_infer.json -------------------------------------------------------------------------------- /output/response/gpt4_0125_preview_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/gpt4_0125_preview_infer.json -------------------------------------------------------------------------------- /output/response/gpt4_turbo_0409_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/gpt4_turbo_0409_infer.json -------------------------------------------------------------------------------- /output/response/gpt4o_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/gpt4o_infer.json -------------------------------------------------------------------------------- /output/response/moonshot_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/moonshot_infer.json -------------------------------------------------------------------------------- /output/response/yi_large_infer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/response/yi_large_infer.json -------------------------------------------------------------------------------- /output/scores.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/output/scores.xlsx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/requirements.txt -------------------------------------------------------------------------------- /resources/img/1_introduction_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/resources/img/1_introduction_case.png -------------------------------------------------------------------------------- /resources/img/2_pipline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/resources/img/2_pipline.png -------------------------------------------------------------------------------- /resources/img/4_constraints_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/resources/img/4_constraints_results.png -------------------------------------------------------------------------------- /resources/img/5_domain_nlp_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/resources/img/5_domain_nlp_results.png -------------------------------------------------------------------------------- /resources/img/leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/resources/img/leaderboard.png -------------------------------------------------------------------------------- /resources/paper/CFBench- A Comprehensive Constraints-Following Benchmark for LLMs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/resources/paper/CFBench- A Comprehensive Constraints-Following Benchmark for LLMs.pdf -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PKU-Baichuan-MLSystemLab/CFBench/HEAD/run.sh --------------------------------------------------------------------------------