├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── tools ├── download_data │ └── download_cft_data_hf.py ├── evaluate_gpqa │ ├── __init__.py │ ├── code_utils.py │ ├── compute_accuracy.py │ ├── data_loader.py │ ├── dataset │ │ ├── bbh │ │ │ ├── bbh.json │ │ │ └── template.json │ │ ├── gpqa │ │ │ ├── gpqa.jsonl │ │ │ ├── gpqa_diamond.jsonl │ │ │ ├── gpqa_experts.jsonl │ │ │ ├── gpqa_extended.jsonl │ │ │ └── gpqa_main.jsonl │ │ └── mmlu_pro │ │ │ ├── prompt.json │ │ │ └── test.json │ ├── eval_gpqa.sh │ ├── number_utils.py │ ├── prompt_utils.py │ ├── run_choice.py │ ├── run_open.py │ ├── run_open_sc.py │ ├── scripts │ │ └── evaluate_gpqa.sh │ └── utils.py ├── evaluate_math │ ├── LICENSE │ ├── data │ │ ├── aime24 │ │ │ └── test.jsonl │ │ ├── amc23 │ │ │ └── test.jsonl │ │ ├── gsm8k │ │ │ ├── test.jsonl │ │ │ └── train.jsonl │ │ ├── math-500 │ │ │ └── test.jsonl │ │ ├── math │ │ │ ├── test.jsonl │ │ │ └── train.jsonl │ │ ├── minerva_math │ │ │ ├── README.md │ │ │ └── test.jsonl │ │ ├── olympiadbench │ │ │ ├── test.json │ │ │ └── test.jsonl │ │ └── theoremqa │ │ │ ├── test.json │ │ │ ├── test.jsonl │ │ │ └── trans_format.py │ ├── data_loader.py │ ├── evaluate.py │ ├── examples.py │ ├── grader.py │ ├── latex2sympy │ │ ├── .coveragerc │ │ ├── .gitignore │ │ ├── LICENSE.txt │ │ ├── PS.g4 │ │ ├── README.md │ │ ├── __init__.py │ │ ├── antlr-4.11.1-complete.jar │ │ ├── asciimath_printer.py │ │ ├── description.txt │ │ ├── dev-requirements.in │ │ ├── dev-requirements.txt │ │ ├── gen │ │ │ ├── PS.interp │ │ │ ├── PS.tokens │ │ │ ├── PSLexer.interp │ │ │ ├── PSLexer.py │ │ │ ├── PSLexer.tokens │ │ │ ├── PSListener.py │ │ │ ├── PSParser.py │ │ │ └── __init__.py │ │ ├── icon.png │ │ ├── latex2sympy2.py │ │ ├── requirements.in │ │ ├── requirements.txt │ │ ├── sandbox │ │ │ ├── linalg_equations.py │ │ │ ├── linalg_span.py │ │ │ ├── matrix.py │ │ │ ├── matrix_placeholders.py │ │ │ ├── sandbox.py │ │ │ ├── sandbox_equality.py │ │ │ ├── sectan.py │ │ │ └── vector.py │ │ ├── 
scripts │ │ │ ├── compile.sh │ │ │ ├── coverage-ci.sh │ │ │ ├── coverage.sh │ │ │ ├── pre-commit │ │ │ ├── pre-push │ │ │ ├── publish.sh │ │ │ ├── setup-hooks.sh │ │ │ ├── setup.sh │ │ │ └── test.sh │ │ ├── setup.cfg │ │ ├── setup.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── abs_test.py │ │ │ ├── all_bad_test.py │ │ │ ├── all_good_test.py │ │ │ ├── atom_expr_test.py │ │ │ ├── binomial_test.py │ │ │ ├── ceil_test.py │ │ │ ├── complex_test.py │ │ │ ├── context.py │ │ │ ├── exp_test.py │ │ │ ├── floor_test.py │ │ │ ├── gcd_test.py │ │ │ ├── greek_test.py │ │ │ ├── grouping_test.py │ │ │ ├── lcm_test.py │ │ │ ├── left_right_cdot_test.py │ │ │ ├── linalg_test.py │ │ │ ├── max_test.py │ │ │ ├── min_test.py │ │ │ ├── mod_test.py │ │ │ ├── overline_test.py │ │ │ ├── pi_test.py │ │ │ ├── trig_test.py │ │ │ └── variable_test.py │ ├── math_eval.py │ ├── math_utils.py │ ├── model_utils.py │ ├── parser.py │ ├── python_executor.py │ ├── requirements.txt │ ├── rm_maj_eval.py │ ├── scripts │ │ ├── evaluate_deepseek.sh │ │ └── evaluate_qwen.sh │ ├── trajectory.py │ └── utils.py ├── evaluate_mmlu-pro │ ├── cot_prompt_lib │ │ ├── initial_prompt.txt │ │ ├── initial_prompt_1.txt │ │ └── initial_prompt_2.txt │ ├── evaluate_from_local.py │ └── mmlu-pro-eval.sh ├── scripts │ ├── download_data.sh │ └── evaluate.sh └── self_construct_critique_data │ ├── generate_critique_by_api.py │ └── run.sh └── train ├── LLaMA-Factory ├── .env.local ├── .gitattributes ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── README_zh.md ├── assets │ ├── benchmark.svg │ ├── logo.png │ ├── wechat.jpg │ └── wechat_npu.jpg ├── data │ ├── README.md │ ├── README_zh.md │ ├── alpaca_en_demo.json │ ├── alpaca_zh_demo.json │ ├── belle_multiturn │ │ └── belle_multiturn.py │ ├── c4_demo.json │ ├── dataset_info.json │ ├── dpo_en_demo.json │ ├── dpo_zh_demo.json │ ├── glaive_toolcall_en_demo.json │ ├── glaive_toolcall_zh_demo.json │ ├── hh_rlhf_en │ │ └── 
hh_rlhf_en.py │ ├── identity.json │ ├── kto_en_demo.json │ ├── mllm_demo.json │ ├── mllm_demo_data │ │ ├── 1.jpg │ │ ├── 1.mp4 │ │ ├── 2.avi │ │ ├── 2.jpg │ │ ├── 3.jpg │ │ └── 3.mp4 │ ├── mllm_video_demo.json │ ├── ultra_chat │ │ └── ultra_chat.py │ └── wiki_demo.txt ├── docker │ ├── docker-cuda │ │ ├── Dockerfile │ │ └── docker-compose.yml │ ├── docker-npu │ │ ├── Dockerfile │ │ └── docker-compose.yml │ └── docker-rocm │ │ ├── Dockerfile │ │ └── docker-compose.yml ├── evaluation │ ├── ceval │ │ ├── ceval.py │ │ ├── ceval.zip │ │ └── mapping.json │ ├── cmmlu │ │ ├── cmmlu.py │ │ ├── cmmlu.zip │ │ └── mapping.json │ └── mmlu │ │ ├── mapping.json │ │ ├── mmlu.py │ │ └── mmlu.zip ├── examples │ ├── README.md │ ├── README_zh.md │ ├── accelerate │ │ └── fsdp_config.yaml │ ├── deepspeed │ │ ├── ds_z0_config.json │ │ ├── ds_z2_config.json │ │ ├── ds_z2_offload_config.json │ │ ├── ds_z3_config.json │ │ └── ds_z3_offload_config.json │ ├── extras │ │ ├── adam_mini │ │ │ └── qwen2_full_sft.yaml │ │ ├── badam │ │ │ └── llama3_full_sft.yaml │ │ ├── fsdp_qlora │ │ │ ├── llama3_lora_sft.yaml │ │ │ └── train.sh │ │ ├── galore │ │ │ └── llama3_full_sft.yaml │ │ ├── llama_pro │ │ │ ├── expand.sh │ │ │ └── llama3_freeze_sft.yaml │ │ ├── loraplus │ │ │ └── llama3_lora_sft.yaml │ │ ├── mod │ │ │ └── llama3_full_sft.yaml │ │ ├── nlg_eval │ │ │ └── llama3_lora_predict.yaml │ │ └── pissa │ │ │ ├── init.sh │ │ │ └── llama3_lora_sft.yaml │ ├── inference │ │ ├── llama3.yaml │ │ ├── llama3_full_sft.yaml │ │ ├── llama3_lora_sft.yaml │ │ ├── llama3_vllm.yaml │ │ ├── llava1_5.yaml │ │ └── qwen2_vl.yaml │ ├── merge_lora │ │ ├── llama3_gptq.yaml │ │ ├── llama3_lora_sft.yaml │ │ └── qwen2vl_lora_sft.yaml │ ├── train_full │ │ ├── llama3_full_sft.yaml │ │ └── qwen2vl_full_sft.yaml │ ├── train_lora │ │ ├── llama3_lora_dpo.yaml │ │ ├── llama3_lora_eval.yaml │ │ ├── llama3_lora_kto.yaml │ │ ├── llama3_lora_ppo.yaml │ │ ├── llama3_lora_pretrain.yaml │ │ ├── llama3_lora_reward.yaml │ │ ├── 
llama3_lora_sft.yaml │ │ ├── llama3_lora_sft_ds3.yaml │ │ ├── llama3_preprocess.yaml │ │ ├── llava1_5_lora_sft.yaml │ │ ├── qwen2vl_lora_dpo.yaml │ │ └── qwen2vl_lora_sft.yaml │ └── train_qlora │ │ ├── llama3_lora_sft_aqlm.yaml │ │ ├── llama3_lora_sft_awq.yaml │ │ ├── llama3_lora_sft_gptq.yaml │ │ └── llama3_lora_sft_otfq.yaml ├── pyproject.toml ├── requirements.txt ├── scripts │ ├── api_example │ │ ├── test_image.py │ │ └── test_toolcall.py │ ├── convert_ckpt │ │ ├── llamafy_baichuan2.py │ │ └── llamafy_qwen.py │ ├── llama_pro.py │ ├── loftq_init.py │ ├── pissa_init.py │ ├── stat_utils │ │ ├── cal_flops.py │ │ ├── cal_lr.py │ │ ├── cal_mfu.py │ │ ├── cal_ppl.py │ │ └── length_cdf.py │ └── vllm_infer.py ├── setup.py ├── src │ ├── api.py │ ├── llamafactory.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── entry_points.txt │ │ ├── requires.txt │ │ └── top_level.txt │ ├── llamafactory │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-311.pyc │ │ │ ├── cli.cpython-311.pyc │ │ │ └── launcher.cpython-311.pyc │ │ ├── api │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ ├── app.cpython-311.pyc │ │ │ │ ├── chat.cpython-311.pyc │ │ │ │ ├── common.cpython-311.pyc │ │ │ │ └── protocol.cpython-311.pyc │ │ │ ├── app.py │ │ │ ├── chat.py │ │ │ ├── common.py │ │ │ └── protocol.py │ │ ├── chat │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ ├── base_engine.cpython-311.pyc │ │ │ │ ├── chat_model.cpython-311.pyc │ │ │ │ ├── hf_engine.cpython-311.pyc │ │ │ │ └── vllm_engine.cpython-311.pyc │ │ │ ├── base_engine.py │ │ │ ├── chat_model.py │ │ │ ├── hf_engine.py │ │ │ └── vllm_engine.py │ │ ├── cli.py │ │ ├── eval │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ ├── evaluator.cpython-311.pyc │ │ │ │ └── template.cpython-311.pyc │ │ │ ├── evaluator.py │ │ │ └── template.py │ │ ├── extras │ │ │ ├── __init__.py │ │ │ ├── 
__pycache__ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ ├── constants.cpython-311.pyc │ │ │ │ ├── env.cpython-311.pyc │ │ │ │ ├── logging.cpython-311.pyc │ │ │ │ ├── misc.cpython-311.pyc │ │ │ │ ├── packages.cpython-311.pyc │ │ │ │ └── ploting.cpython-311.pyc │ │ │ ├── constants.py │ │ │ ├── env.py │ │ │ ├── logging.py │ │ │ ├── misc.py │ │ │ ├── packages.py │ │ │ └── ploting.py │ │ ├── hparams │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ ├── data_args.cpython-311.pyc │ │ │ │ ├── evaluation_args.cpython-311.pyc │ │ │ │ ├── finetuning_args.cpython-311.pyc │ │ │ │ ├── generating_args.cpython-311.pyc │ │ │ │ ├── model_args.cpython-311.pyc │ │ │ │ └── parser.cpython-311.pyc │ │ │ ├── data_args.py │ │ │ ├── evaluation_args.py │ │ │ ├── finetuning_args.py │ │ │ ├── generating_args.py │ │ │ ├── model_args.py │ │ │ └── parser.py │ │ ├── launcher.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ ├── adapter.cpython-311.pyc │ │ │ │ ├── loader.cpython-311.pyc │ │ │ │ └── patcher.cpython-311.pyc │ │ │ ├── adapter.py │ │ │ ├── loader.py │ │ │ ├── model_utils │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ │ ├── attention.cpython-311.pyc │ │ │ │ │ ├── checkpointing.cpython-311.pyc │ │ │ │ │ ├── embedding.cpython-311.pyc │ │ │ │ │ ├── liger_kernel.cpython-311.pyc │ │ │ │ │ ├── longlora.cpython-311.pyc │ │ │ │ │ ├── misc.cpython-311.pyc │ │ │ │ │ ├── mod.cpython-311.pyc │ │ │ │ │ ├── moe.cpython-311.pyc │ │ │ │ │ ├── packing.cpython-311.pyc │ │ │ │ │ ├── quantization.cpython-311.pyc │ │ │ │ │ ├── rope.cpython-311.pyc │ │ │ │ │ ├── unsloth.cpython-311.pyc │ │ │ │ │ ├── valuehead.cpython-311.pyc │ │ │ │ │ └── visual.cpython-311.pyc │ │ │ │ ├── attention.py │ │ │ │ ├── checkpointing.py │ │ │ │ ├── embedding.py │ │ │ │ ├── liger_kernel.py │ │ │ │ ├── longlora.py │ │ │ │ ├── misc.py │ │ │ │ ├── mod.py │ │ │ │ ├── moe.py │ │ │ │ ├── 
packing.py │ │ │ │ ├── quantization.py │ │ │ │ ├── rope.py │ │ │ │ ├── unsloth.py │ │ │ │ ├── valuehead.py │ │ │ │ └── visual.py │ │ │ └── patcher.py │ │ ├── train │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ ├── callbacks.cpython-311.pyc │ │ │ │ ├── trainer_utils.cpython-311.pyc │ │ │ │ └── tuner.cpython-311.pyc │ │ │ ├── callbacks.py │ │ │ ├── dpo │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ │ ├── trainer.cpython-311.pyc │ │ │ │ │ └── workflow.cpython-311.pyc │ │ │ │ ├── trainer.py │ │ │ │ └── workflow.py │ │ │ ├── kto │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ │ ├── trainer.cpython-311.pyc │ │ │ │ │ └── workflow.cpython-311.pyc │ │ │ │ ├── trainer.py │ │ │ │ └── workflow.py │ │ │ ├── ppo │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ │ ├── ppo_utils.cpython-311.pyc │ │ │ │ │ ├── trainer.cpython-311.pyc │ │ │ │ │ └── workflow.cpython-311.pyc │ │ │ │ ├── ppo_utils.py │ │ │ │ ├── trainer.py │ │ │ │ └── workflow.py │ │ │ ├── pt │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ │ ├── trainer.cpython-311.pyc │ │ │ │ │ └── workflow.cpython-311.pyc │ │ │ │ ├── trainer.py │ │ │ │ └── workflow.py │ │ │ ├── rm │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ │ ├── metric.cpython-311.pyc │ │ │ │ │ ├── trainer.cpython-311.pyc │ │ │ │ │ └── workflow.cpython-311.pyc │ │ │ │ ├── metric.py │ │ │ │ ├── trainer.py │ │ │ │ └── workflow.py │ │ │ ├── sft │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ │ ├── metric.cpython-311.pyc │ │ │ │ │ ├── trainer.cpython-311.pyc │ │ │ │ │ └── workflow.cpython-311.pyc │ │ │ │ ├── metric.py │ │ │ │ ├── trainer.py │ │ │ │ └── workflow.py │ │ │ ├── test_utils.py │ │ │ ├── trainer_utils.py │ │ │ └── tuner.py │ │ └── webui │ 
│ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-311.pyc │ │ │ ├── chatter.cpython-311.pyc │ │ │ ├── common.cpython-311.pyc │ │ │ ├── css.cpython-311.pyc │ │ │ ├── engine.cpython-311.pyc │ │ │ ├── interface.cpython-311.pyc │ │ │ ├── manager.cpython-311.pyc │ │ │ ├── runner.cpython-311.pyc │ │ │ └── utils.cpython-311.pyc │ │ │ ├── chatter.py │ │ │ ├── common.py │ │ │ ├── components │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-311.pyc │ │ │ │ ├── chatbot.cpython-311.pyc │ │ │ │ ├── data.cpython-311.pyc │ │ │ │ ├── eval.cpython-311.pyc │ │ │ │ ├── export.cpython-311.pyc │ │ │ │ ├── infer.cpython-311.pyc │ │ │ │ ├── top.cpython-311.pyc │ │ │ │ └── train.cpython-311.pyc │ │ │ ├── chatbot.py │ │ │ ├── data.py │ │ │ ├── eval.py │ │ │ ├── export.py │ │ │ ├── infer.py │ │ │ ├── top.py │ │ │ └── train.py │ │ │ ├── css.py │ │ │ ├── engine.py │ │ │ ├── interface.py │ │ │ ├── manager.py │ │ │ ├── runner.py │ │ │ └── utils.py │ ├── train.py │ └── webui.py └── tests │ ├── e2e │ ├── test_chat.py │ └── test_train.py │ ├── eval │ └── test_eval_template.py │ ├── model │ ├── model_utils │ │ ├── test_attention.py │ │ ├── test_checkpointing.py │ │ └── test_packing.py │ ├── test_base.py │ ├── test_freeze.py │ ├── test_full.py │ ├── test_lora.py │ └── test_pissa.py │ └── train │ └── test_sft_trainer.py ├── Validation ├── start_validate.sh ├── validate_single.sh └── validation_on_math-500.py └── scripts ├── train_qwen2_5-32b-instruct-cft ├── qwen2.5-32b-cft-webinstruct-4k.yaml └── train.sh └── train_qwen2_5-math-7b-cft ├── qwen2.5-math-7b-cft-webinstruct-50k.yaml └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .vscode/ 3 | *.swp 4 | *.swo 5 | .DS_Store 6 | local* 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright 
(c) 2025 TIGER Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | vllm==0.6.6 2 | tqdm 3 | datasets 4 | python_dateutil 5 | flash_attn 6 | 7 | # math_eval 8 | sympy==1.12 9 | antlr4-python3-runtime==4.11.1 # ! The version needs to be compatible with sympy. 
def format_code(code_str: str):
    """Wrap a code snippet in a ``run_it()`` function followed by a call to it.

    Every line of ``code_str`` is indented and placed under ``def run_it():``;
    a trailing ``run_it()`` invokes it, so a ``return`` inside the snippet
    simply ends execution.

    Args:
        code_str: Source code to wrap.

    Returns:
        The wrapped source as a single string.
    """
    # NOTE(review): the extracted source shows a one-space body indent; confirm
    # the exact width against the original file before applying.
    body = ''.join(' ' + line + '\n' for line in code_str.split('\n'))
    return 'def run_it():\n' + body + 'run_it()'


def _run_and_capture(code: str) -> str:
    """Execute ``code`` and return what it printed, without outer newlines.

    ``redirect_stdout`` swaps ``sys.stdout`` for the whole process, so output
    printed by any other thread while the snippet runs is captured as well.
    """
    buffer = StringIO()
    with redirect_stdout(buffer):
        # SECURITY: exec() runs arbitrary code inside this interpreter.
        # NOTE(review): the snippets are presumably model-generated; only run
        # this in a sandboxed environment.
        exec(code, globals(), locals())
    return buffer.getvalue().strip('\n')


class CodeExecutor:
    """Run a code snippet with a time limit and return its captured stdout.

    Attributes are plain fields:
      code        -- the snippet after wrapping by ``format_code``
      timeout     -- seconds to wait for the snippet
      error       -- '' until a run times out or the snippet raises
      use_process -- run in a child process (True) or a thread (False)
    """

    def __init__(self, code: str, timeout: int, use_process: bool):
        self.code = format_code(code)
        self.timeout = timeout
        self.error = ''
        self.use_process = use_process

    def execute_code(self, return_val):
        """Run ``self.code`` and store the outcome in ``return_val``.

        On success ``return_val['result']`` holds the captured stdout. On
        failure the exception is summarised under ``return_val['error']``
        instead of being silently dropped; ``'result'`` is left unset, so
        callers that only look at ``'result'`` behave as before.
        """
        try:
            return_val['result'] = _run_and_capture(self.code)
        except Exception as exc:
            return_val['error'] = f'{type(exc).__name__}: {exc}'

    @staticmethod
    def execute_code_with_string(code, index, return_val):
        """Wrap and run ``code``; store its stdout under ``return_val[index]``.

        Deliberately best-effort: if the snippet raises, ``return_val`` is
        left untouched, because the mapping is keyed by index only.
        """
        code = format_code(code)
        try:
            return_val[index] = _run_and_capture(code)
        except Exception:
            pass

    def run(self):
        """Execute the snippet and return its stdout ('' on failure).

        With ``use_process`` the snippet runs in a child process that is
        terminated once ``timeout`` seconds have passed. Without it the
        snippet runs in a thread; Python threads cannot be stopped, so after
        the timed wait this method still blocks until the thread finishes and
        the timeout is only reported, not enforced.

        ``self.error`` is set to ``'Execution timed out'`` on timeout, or to
        the snippet's exception summary when it raised.
        """
        if self.use_process:
            # The context manager shuts the manager's server process down
            # deterministically; copy the proxy's contents out before that.
            with multiprocessing.Manager() as manager:
                shared = manager.dict()
                process = multiprocessing.Process(
                    target=self.execute_code, args=(shared,))
                process.start()
                process.join(timeout=self.timeout)
                if process.is_alive():
                    process.terminate()
                    process.join(timeout=1)
                    if process.is_alive():
                        # SIGTERM was ignored; force the child down.
                        process.kill()
                        process.join()
                    self.error = 'Execution timed out'
                outcome = dict(shared)
        else:
            outcome = {}
            thread = threading.Thread(
                target=self.execute_code, args=(outcome,))
            thread.start()
            thread.join(timeout=self.timeout)
            if thread.is_alive():
                # Wait for completion: the thread still holds the stdout
                # redirect, so continuing early would swallow later prints.
                thread.join()
                print('time out!')
                self.error = 'Execution timed out'

        if not self.error and 'error' in outcome:
            self.error = outcome['error']
        return outcome.get('result', '')
# Evaluate a model on the GPQA-diamond split with run_open.py.
#
# Usage: bash eval_gpqa.sh <model_path> <output_dir> <summary_path> <n_shot>
set -ex

# All four positional arguments are forwarded to run_open.py; fail early
# instead of passing an empty value to an option that expects one.
if [ "$#" -lt 4 ]; then
    echo "usage: $0 <model_path> <output_dir> <summary_path> <n_shot>" >&2
    exit 1
fi

model_path=$1
output_dir=$2
summary_path=$3
n_shot=$4

datasets=("gpqa_diamond")

for dataset in "${datasets[@]}"; do
    echo "Processing dataset: $dataset"
    # Expansions are quoted so paths containing spaces stay one argument.
    python run_open.py \
        --model "$model_path" \
        --shots "$n_shot" \
        --dataset "$dataset" \
        --form "gpqa" \
        --output_dir "$output_dir" \
        --summary_path "$summary_path"
done
9 | for dataset in "${datasets[@]}"; do 10 | echo "Processing dataset: $dataset" 11 | python run_open.py \ 12 | --model $model_path \ 13 | --dataset $dataset \ 14 | --form qwen \ 15 | --output_dir $output_dir \ 16 | --summary_path $summary_path 17 | done 18 | 19 | -------------------------------------------------------------------------------- /tools/evaluate_math/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Zhibin Gou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/evaluate_math/data/minerva_math/README.md: -------------------------------------------------------------------------------- 1 | MIT OpenCourseWare: 2 | - Solving Quantitative Reasoning Problems with Language Models. 
def main():
    """Convert ``test.json`` into line-delimited ``test.jsonl``.

    Reads a JSON array from ``test.json`` in the current working directory.
    Each record's ``"Question"`` and ``"Answer"`` become ``"problem"`` and
    ``"answer"`` (the answer is stringified). Records whose answer is
    ``True``/``False`` get an instruction appended asking for a boxed
    boolean. One JSON object per line is written to ``test.jsonl``.

    Raises:
        FileNotFoundError: if ``test.json`` is missing.
        KeyError: if a record lacks ``"Question"`` or ``"Answer"``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open("test.json", "r", encoding="utf-8") as fi:
        data = json.load(fi)

    output_data = []
    for each in data:
        question = each["Question"]
        answer = str(each["Answer"])
        if answer in ("True", "False"):
            question += " Answer with \\boxed{True} or \\boxed{False}."
        output_data.append({"problem": question, "answer": answer})

    with open("test.jsonl", "w", encoding="utf-8") as fo:
        fo.writelines(json.dumps(each) + "\n" for each in output_data)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
__pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .antlr 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # Azure Functions artifacts 127 | bin 128 | obj 129 | appsettings.json 130 | local.settings.json 131 | .python_packages 132 | stemgen-solution-engine.zip -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright 2016, latex2sympy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
class AsciiMathPrinter(StrPrinter):
    """String printer that renders selected SymPy nodes in AsciiMath notation.

    Node types without an override here fall back to ``StrPrinter``.
    """

    def _print_Limit(self, expr):
        """Render ``Limit(e, z, z0, dir)`` as ``lim_(z -> z0) e``."""
        # Limit.args is (expression, variable, point, direction); the
        # direction is not expressed in the output.
        e, z, z0 = expr.args[:3]
        return "lim_(%s -> %s) %s" % (self._print(z), self._print(z0), self._print(e))

    def _print_Integral(self, expr):
        """Render a single-variable integral as ``int f dx`` or ``int_(a)^(b) f dx``.

        Multiple integrals and the two-element limit form ``(x, a)`` have no
        mapping here and are delegated to ``StrPrinter``.
        """
        if len(expr.limits) != 1:
            return super()._print_Integral(expr)

        e = expr.function
        lims = expr.limits[0]
        if len(lims) == 3:
            return "int_(%s)^(%s) %s d%s" % (
                self._print(lims[1]), self._print(lims[2]), self._print(e), self._print(lims[0]))
        if len(lims) == 1:
            # Print the variable itself, not the enclosing one-element tuple.
            return "int %s d%s" % (self._print(e), self._print(lims[0]))
        return super()._print_Integral(expr)

    def _print_Sum(self, expr):
        """Render a single-limit sum as ``sum_(i = a)^(b) e``."""
        e, lims = expr.args
        return "sum_(%s = %s)^(%s) %s" % (
            self._print(lims[0]), self._print(lims[1]), self._print(lims[2]), self._print(e))

    def _print_Product(self, expr):
        """Render a single-limit product as ``prod_(i = a)^(b) e``."""
        e, lims = expr.args
        return "prod_(%s = %s)^(%s) %s" % (
            self._print(lims[0]), self._print(lims[1]), self._print(lims[2]), self._print(e))

    def _print_factorial(self, expr):
        """Render ``factorial(n)`` as ``n!``."""
        return "%s!" % self._print(expr.args[0])

    def _print_Derivative(self, expr):
        """Render a derivative as one ``d/dv`` prefix per differentiation.

        ``Derivative.args[1]`` is a ``(variable, count)`` tuple in current
        SymPy, so the variables are taken from ``expr.variables`` (which
        repeats each variable ``count`` times) instead of printing that tuple.
        """
        e = expr.args[0]
        prefix = "".join("d/d%s " % self._print(v) for v in expr.variables)
        return prefix + self._print(e)

    def _print_Abs(self, expr):
        """Render ``Abs(x)`` as ``|x|``."""
        return "|%s|" % self._print(expr.args[0])

    def _print_Equality(self, expr):
        """Render ``Eq(a, b)`` as ``a = b``."""
        return "%s = %s" % (self._print(expr.args[0]), self._print(expr.args[1]))

    def _print_Pow(self, expr):
        """Render powers, special-casing square roots and reciprocals."""
        b = self._print(expr.base)
        # SymPy singletons (S.Half, S.NegativeOne) are compared by identity.
        if expr.exp is S.Half:
            return "sqrt(%s)" % b

        if (-expr.exp) is S.Half:
            return "1/sqrt(%s)" % b
        if expr.exp is -S.One:
            return "1/%s" % b

        return "%s^(%s)" % (b, self._print(expr.exp))
9 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/dev-requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.10 3 | # by the following command: 4 | # 5 | # pip-compile dev-requirements.in 6 | # 7 | # via -r dev-requirements.in 8 | antlr4-python3-runtime==4.11.1 9 | # via 10 | # -r requirements.txt 11 | # latex2sympy2 12 | atomicwrites==1.3.0 13 | # via pytest 14 | attrs==19.3.0 15 | # via pytest 16 | autopep8==1.4.4 17 | # via -r dev-requirements.in 18 | click==7.0 19 | # via pip-tools 20 | coverage==4.5.4 21 | # via pytest-cov 22 | more-itertools==7.2.0 23 | # via pytest 24 | mpmath==1.3.0 25 | # via 26 | # -r requirements.txt 27 | # sympy 28 | packaging==19.2 29 | # via pytest 30 | pip-tools==4.2.0 31 | # via -r dev-requirements.in 32 | pluggy==0.13.0 33 | # via pytest 34 | py==1.8.0 35 | # via pytest 36 | pycodestyle==2.5.0 37 | # via 38 | # -r dev-requirements.in 39 | # autopep8 40 | pyparsing==2.4.4 41 | # via packaging 42 | pytest==5.2.2 43 | # via 44 | # -r dev-requirements.in 45 | # pytest-cov 46 | pytest-cov==2.8.1 47 | # via -r dev-requirements.in 48 | six==1.13.0 49 | # via 50 | # packaging 51 | # pip-tools 52 | sympy==1.12 53 | # via 54 | # -r requirements.txt 55 | # latex2sympy2 56 | wcwidth==0.1.7 57 | # via pytest 58 | 59 | # THIS MUST BE MAINTAINED AS-IS 60 | -e . 
-------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/tools/evaluate_math/latex2sympy/gen/__init__.py -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/tools/evaluate_math/latex2sympy/icon.png -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/requirements.in: -------------------------------------------------------------------------------- 1 | sympy 2 | antlr4-python3-runtime 3 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.10 3 | # by the following command: 4 | # 5 | # pip-compile requirements.in 6 | # 7 | antlr4-python3-runtime==4.11.1 8 | # via -r requirements.in 9 | mpmath==1.3.0 10 | # via sympy 11 | sympy==1.12 12 | # via -r requirements.in 13 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/sandbox/linalg_equations.py: -------------------------------------------------------------------------------- 1 | from latex2sympy import process_sympy 2 | import sys 3 | sys.path.append("..") 4 | 5 | # latex = "2\\begin{pmatrix}1&1&1\\\\0&1&1\\\\0&0&1\\end{pmatrix}\\begin{pmatrix}1&1&1\\\\0&1&1\\\\0&0&1\\end{pmatrix}" 6 | latex = "\\frac{a^{2} \\left(3 \\pi - 4 \\sin{\\left(\\pi \\right)} + 
\\frac{\\sin{\\left(2 \\pi \\right)}}{2}\\right)}{2}" 7 | math = process_sympy(latex) 8 | 9 | print(type(math)) 10 | print("latex: %s to math: %s" % (latex, math)) 11 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/sandbox/linalg_span.py: -------------------------------------------------------------------------------- 1 | from latex2sympy import process_sympy 2 | import sys 3 | sys.path.append("..") 4 | 5 | latex = "\\begin{pmatrix}1\\\\2\\\\3\\end{pmatrix}" 6 | math = process_sympy(latex) 7 | print("latex: %s to math: %s" % (latex, math)) 8 | 9 | latex = "\\begin{pmatrix}1\\\\2\\\\3\\end{pmatrix},\\begin{pmatrix}4\\\\3\\\\1\\end{pmatrix}" 10 | math = process_sympy(latex) 11 | print("latex: %s to math: %s" % (latex, math)) 12 | 13 | latex = "[\\begin{pmatrix}1\\\\2\\\\3\\end{pmatrix},\\begin{pmatrix}4\\\\3\\\\1\\end{pmatrix}]" 14 | math = process_sympy(latex) 15 | print("latex: %s to math: %s" % (latex, math)) 16 | 17 | latex = "\\left\\{\\begin{pmatrix}1\\\\2\\\\3\\end{pmatrix},\\begin{pmatrix}4\\\\3\\\\1\\end{pmatrix}\\right\\}" 18 | math = process_sympy(latex) 19 | print("latex: %s to math: %s" % (latex, math)) 20 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/sandbox/matrix.py: -------------------------------------------------------------------------------- 1 | from latex2sympy import process_sympy 2 | from sympy import * 3 | import sys 4 | sys.path.append("..") 5 | 6 | theta = Symbol('theta', real=True) 7 | 8 | latex = "\\begin{matrix}1&2\\\\3&4\\end{matrix}" 9 | math = process_sympy(latex) 10 | print("latex: %s to math: %s" % (latex, math)) 11 | 12 | latex = "\\begin{matrix}1&2\\\\3&4\\\\5&6\\end{matrix}" 13 | math = process_sympy(latex) 14 | print("latex: %s to math: %s" % (latex, math)) 15 | 16 | latex = "\\begin{matrix}1&2&3\\\\4&5&6\\\\7&8&9\\end{matrix}" 17 | math = process_sympy(latex) 18 | print("latex: %s to math: 
%s" % (latex, math)) 19 | 20 | latex = "\\begin{matrix}x^1&x^2&x^3\\\\y^1&y^2&y^3\\\\z^1&z^2&z^3\\end{matrix}" 21 | math = process_sympy(latex) 22 | print("latex: %s to math: %s" % (latex, math)) 23 | 24 | latex = "\\begin{matrix}x\\\\y\\end{matrix}" 25 | math = process_sympy(latex) 26 | print("latex: %s to math: %s" % (latex, math)) 27 | 28 | latex = "2\\cdot\\begin{matrix}x\\\\y\\end{matrix}" 29 | math = process_sympy(latex) 30 | print("latex: %s to math: %s" % (latex, math)) 31 | 32 | latex = "2\\cdot\\begin{matrix}x\\\\y\\end{matrix} + \\begin{matrix}2\\\\3\\end{matrix}" 33 | math = process_sympy(latex) 34 | print("latex: %s to math: %s" % (latex, math)) 35 | 36 | latex = "-2\\begin{matrix}1&2\\\\3&4\\end{matrix}" 37 | math = process_sympy(latex) 38 | print("latex: %s to math: %s" % (latex, math)) 39 | 40 | latex = "2\\cdot\\theta\\begin{matrix}x\\\\y\\end{matrix} + \\begin{matrix}2\\\\3\\end{matrix}" 41 | math = process_sympy(latex) 42 | print("latex: %s to math: %s" % (latex, math)) 43 | 44 | latex = "\\theta\\begin{matrix}1\\\\3\\end{matrix} - \\begin{matrix}-1\\\\2\\end{matrix}" 45 | math = process_sympy(latex) 46 | print("latex: %s to math: %s" % (latex, math)) 47 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/sandbox/matrix_placeholders.py: -------------------------------------------------------------------------------- 1 | from latex2sympy import process_sympy 2 | from sympy import * 3 | import sys 4 | import hashlib 5 | import time 6 | 7 | sys.path.append("..") 8 | 9 | 10 | M = Matrix([[1, 2], [3, 4]]) 11 | v = Matrix([1, 2]) 12 | 13 | # sub settings 14 | sub_settings_symbols = {} 15 | sub_settings_symbols[Symbol('M' + hashlib.md5('M'.encode()).hexdigest(), real=True)] = M 16 | sub_settings_symbols[Symbol('v' + hashlib.md5('v'.encode()).hexdigest(), real=True)] = v 17 | 18 | 19 | # one parameters 20 | latex = "\\begin{matrix}1&2\\\\3&4\\end{matrix}\\cdot[!v!]" 21 | equation_sympy_check 
= MatMul(M, Symbol('v' + hashlib.md5('v'.encode()).hexdigest(), real=True)) 22 | equation_sympy_subs_check = MatMul(M, v) 23 | # placeholders 24 | equation_sympy = process_sympy(latex) 25 | print('latex = %s' % latex) 26 | print('equation_sympy = %s' % equation_sympy) 27 | print('equation_sympy_check = %s' % equation_sympy_check) 28 | print('equation_sympy = %s' % (srepr(equation_sympy))) 29 | 30 | equation_sympy_subs = equation_sympy.subs(sub_settings_symbols, evaluate=False) 31 | print('equation_sympy_subs = %s' % equation_sympy_subs) 32 | print('equation_sympy_subs_check = %s' % equation_sympy_subs_check) 33 | 34 | 35 | # two parameters 36 | 37 | # sub settings 38 | print('') 39 | print('============== Two Parameters -> M*v = Matrix*Vector =============') 40 | sub_settings_symbols = {} 41 | sub_settings_symbols[Symbol('M' + hashlib.md5('M'.encode()).hexdigest(), commutative=False)] = M 42 | sub_settings_symbols[Symbol('v' + hashlib.md5('v'.encode()).hexdigest(), commutative=False)] = v 43 | 44 | latex = "[!M!]\\cdot[!v!]" 45 | math_check = Mul(Symbol('M' + hashlib.md5('M'.encode()).hexdigest(), commutative=False), Symbol('v' + hashlib.md5('v'.encode()).hexdigest(), commutative=False)) 46 | # placeholders 47 | equation_sympy = process_sympy(latex) 48 | print(latex) 49 | print(math_check) 50 | print(equation_sympy) 51 | print(srepr(equation_sympy)) 52 | 53 | # performance 54 | t0 = time.time() 55 | 56 | # process_sympy and substitute at the same time 57 | # Only needed for linalg input 58 | placeholder_values = {'M': M, 'v': v} 59 | equation_sympy_subs = process_sympy(latex, variable_values=placeholder_values) 60 | 61 | t1 = time.time() 62 | print('equation with substituted placeholders = %s' % (str(equation_sympy_subs))) 63 | print('time to process to sympy with placeholders = %s s' % (t1 - t0)) 64 | print('') 65 | print('============== Two Parameters -> M*v = Matrix*Vector =============') 66 | 
-------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/sandbox/sandbox.py: -------------------------------------------------------------------------------- 1 | from sympy import * 2 | from latex2sympy import process_sympy 3 | 4 | 5 | # latex = '\\variable{a}^{\\variable{b}}' 6 | # variables = {'a': process_sympy('658.95998'), 'b': process_sympy('185083.8060')} 7 | # c_ans_expr = process_sympy(latex, variables) 8 | # print(c_ans_expr) 9 | # print(srepr(c_ans_expr)) 10 | # c_ans = c_ans_expr.doit(deep=False).evalf(chop=True) 11 | # print(c_ans) 12 | # print(srepr(c_ans)) 13 | 14 | 15 | # numeric_responses = ['1', '1.0', '-1', '-1.0', '.5', '-.5', '3x10^3', '3E3', '3,000x10^{-3}', '0.5E-1', '\\frac{1}{3}', '(5\\times 3)^3', '\\sin(1)'] 16 | # for latex in numeric_responses: 17 | # parsed = process_sympy(latex) 18 | # print('latex: ', latex) 19 | # print('sympy: ', parsed) 20 | # print('is_number: ', parsed.is_number) 21 | # print('is_Number: ', parsed.is_Number) 22 | # print('srepr: ', srepr(parsed)) 23 | # print('-----------------------------------------------------') 24 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/sandbox/sandbox_equality.py: -------------------------------------------------------------------------------- 1 | from sympy import * 2 | from latex2sympy import process_sympy 3 | 4 | 5 | # 6 | # Equality Testing 7 | # 8 | 9 | answer_sets = [ 10 | { 11 | 'correct_answer': '(x-y)(x+2y)', 12 | 'student_answers': [ 13 | 'x^2+xy-2y^2', 14 | '(x-y)(x+2y)', 15 | '(x+2y)(x-y)', 16 | '(2\\times y+x)(-y+x)', 17 | '(y\\cdot 2+x)(-y+x)' 18 | ] 19 | }, 20 | { 21 | 'correct_answer': '2\\pi \\variable{r}^2', 22 | 'student_answers': [ 23 | '2\\pi \\variable{r}^2', 24 | '\\pi 2\\variable{r}^2', 25 | '2\\times \\pi \\times \\variable{r}^2', 26 | '2\\pi \\variable{r} \\times \\variable{r}' 27 | ] 28 | }, 29 | { 30 | 'correct_answer': '2x - 3y', 
31 | 'student_answers': [ 32 | '-3y + 2x' 33 | ] 34 | }, 35 | { 36 | 'correct_answer': 'x\\times x', 37 | 'student_answers': [ 38 | 'x\\times x', 39 | 'x\\cdot x', 40 | 'x^2', 41 | '(\\sqrt{x})^{4}' 42 | ] 43 | }, 44 | { 45 | 'correct_answer': '23e^{-1\\times \\sqrt{t^2}}', 46 | 'student_answers': [ 47 | '23e^{-t}' 48 | ] 49 | }, 50 | { 51 | 'correct_answer': 'a=x^2+1', 52 | 'student_answers': [ 53 | 'x^2+1=a' 54 | ] 55 | } 56 | ] 57 | 58 | for answer_set in answer_sets: 59 | correct_answer = answer_set['correct_answer'] 60 | correct_answer_parsed = process_sympy(answer_set['correct_answer']) 61 | for student_answer in answer_set['student_answers']: 62 | student_answer_parsed = process_sympy(student_answer) 63 | print('correct_answer (c): ', correct_answer, correct_answer_parsed) 64 | print('student_answer (a): ', student_answer, student_answer_parsed) 65 | print('') 66 | print('Expression Tree (srepr(c) == srepr(a)) =>', srepr(correct_answer_parsed) == srepr(student_answer_parsed)) 67 | print('srepr(c) =>', srepr(correct_answer_parsed)) 68 | print('srepr(a) =>', srepr(student_answer_parsed)) 69 | print('') 70 | # print('Structural (c == a) =>', correct_answer_parsed == student_answer_parsed) 71 | print('Symbolic (simplify(c - s) == 0) =>', simplify(correct_answer_parsed - student_answer_parsed) == 0) 72 | print('simplified =>', simplify(correct_answer_parsed - student_answer_parsed)) 73 | print('') 74 | print('Numeric Substitution (c.equals(s)) =>', correct_answer_parsed.equals(student_answer_parsed)) 75 | print('-----------------------------------------------------') 76 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/sandbox/sectan.py: -------------------------------------------------------------------------------- 1 | from sympy import * 2 | import sys 3 | sys.path.append("..") 4 | 5 | # # x^2\cdot \left(3\cdot \tan \left([!a!]\cdot x+[!c!]\right)+[!a!]\cdot x\left(\sec \left([!a!]\cdot 
x+[!c!]\right)\right)^2\right) 6 | # latex1 = "x^2\\cdot \\left(3\\cdot \\tan \\left(2\\cdot x+5\\right)+2\\cdot x\\left(\\sec \\left(2\\cdot x+5\\right)\\right)^2\\right)" 7 | # math1 = process_sympy(latex1) 8 | # print("latex: %s to math: %s" %(latex1,math1)) 9 | # 10 | # latex2 = "x^2\\cdot \\left(3\\cdot \\tan \\left(2\\cdot x+5\\right)+2\\cdot x\\left(\\sec \\left(2\\cdot x+5\\right)^2\\right)\\right)" 11 | # math2 = process_sympy(latex2) 12 | # print("latex: %s to math: %s" %(latex2,math2)) 13 | # 14 | # latex3 = "x^2\\cdot \\left(3\\cdot \\tan \\left(2\\cdot x+5\\right)+2\\cdot x\\left(1+\\tan \\left(2\\cdot x+5\\right)^2\\right)\\right)" 15 | # math3 = process_sympy(latex3) 16 | # print("latex: %s to math: %s" %(latex3,math3)) 17 | # 18 | # print(simplify(math1 - math2)) 19 | # print(simplify(math1 - math3)) 20 | 21 | # 22 | # latex1 = "\\sec^2(2\\cdot x+5)" 23 | # math1 = process_sympy(latex1) 24 | # print("latex: %s to math: %s" %(latex1,math1)) 25 | # 26 | # latex2 = "1+\\tan^2(2\\cdot x+5)" 27 | # math2 = process_sympy(latex2) 28 | # print("latex: %s to math: %s" %(latex2,math2)) 29 | # print(simplify(math1 - math2)) 30 | 31 | 32 | x = Symbol('x', real=True) 33 | y = Symbol('y', real=True) 34 | 35 | # BUG: 1 + tan^2(x+1) should be == sec^2(x+1) but isnt 36 | lhs = (1 + (tan(x + 1))**2) 37 | rhs = (sec(x + 1))**2 38 | eq = lhs - rhs 39 | print(simplify(lhs)) 40 | print(simplify(rhs)) 41 | print(simplify(eq)) 42 | print(simplify(lhs) == simplify(rhs)) 43 | 44 | # 1 + tan^2(x) == sec^2(x) but isnt 45 | lhs = (1 + (tan(x))**2) 46 | rhs = (sec(x))**2 47 | eq = lhs - rhs 48 | print(simplify(lhs)) 49 | print(simplify(rhs)) 50 | print(simplify(eq)) 51 | print(simplify(lhs) == simplify(rhs)) 52 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/sandbox/vector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sympy import * 3 | 
import sys 4 | sys.path.append("..") 5 | 6 | # single row matrix = vector 7 | v = [1, 2, 3] 8 | 9 | # single column matrix = vector 10 | m = Matrix([1, 2, 3]) 11 | print(m[:, 0]) 12 | 13 | # a three row and 2 column matrix 14 | m = Matrix([[1, 2], [3, 4], [5, 6]]) 15 | print(m[:, 0]) 16 | 17 | # determinant of lin indp system != 0 18 | m = Matrix([[1, 1], [1, 2]]) 19 | print(m.det()) 20 | 21 | # determinant of lin dep system = 0 22 | m = Matrix([[1, 1], [2, 2]]) 23 | print(m.det()) 24 | 25 | # determinant of lin dep system = 0 26 | x = Symbol('x') 27 | y = Symbol('y') 28 | m = Matrix([[x, y], [x, y]]) 29 | print(m.det()) 30 | # Reduced Row-Echelon Form 31 | _, ind = m.rref() 32 | print(len(ind)) 33 | 34 | # determinant of lin indp system != 0 35 | m = Matrix([[x, y], [y, x]]) 36 | print(m.det()) 37 | # Reduced Row-Echelon Form 38 | _, ind = m.rref() 39 | print(len(ind)) 40 | 41 | # non-square system has no determinant; count the rref pivots instead 42 | # Reduced Row-Echelon Form 43 | m = Matrix([[x, x, y], [y, y, y]]) 44 | _, ind = m.rref() 45 | # Reduced Row-Echelon Form 46 | print(len(ind)) 47 | 48 | #==================# 49 | #===== Numpy ======# 50 | #==================# 51 | # http://kitchingroup.cheme.cmu.edu/blog/2013/03/01/Determining-linear-independence-of-a-set-of-vectors/ 52 | # Lin Indp of set of numerical vectors 53 | TOLERANCE = 1e-14 54 | v1 = [6, 0, 3, 1, 4, 2] 55 | v2 = [0, -1, 2, 7, 0, 5] 56 | v3 = [12, 3, 0, -19, 8, -11] 57 | 58 | A = np.vstack([v1, v2, v3]) 59 | 60 | U, s, V = np.linalg.svd(A) 61 | print(s) 62 | print(np.sum(s > TOLERANCE)) 63 | 64 | v1 = [1, 1] 65 | v2 = [4, 4] 66 | 67 | A = np.vstack([v1, v2]) 68 | U, s, V = np.linalg.svd(A) 69 | print(s) 70 | print(np.sum(s > TOLERANCE)) 71 | 72 | 73 | latex = "\\begin{matrix}1&2\\\\3&4\\end{matrix}" 74 | # math = process_sympy(latex) 75 | print("latex: %s to math: %s" % (latex, 1)) 76 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/compile.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Get relative path of the root directory of the project 4 | rdir=`git rev-parse --git-dir` 5 | rel_path="$(dirname "$rdir")" 6 | # Change to that path and run the file 7 | cd $rel_path 8 | 9 | java -jar antlr-4.11.1-complete.jar PS.g4 -o gen 10 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/coverage-ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | pytest --doctest-modules --junitxml=junit/test-results.xml --cov-report=xml --cov-config=.coveragerc --cov=latex2sympy tests -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Get relative path of the root directory of the project 4 | rdir=`git rev-parse --git-dir` 5 | rel_path="$(dirname "$rdir")" 6 | # Change to that path and run the file 7 | cd $rel_path 8 | 9 | # Activate virtual environment 10 | echo "activating venv..." 11 | if test -f .env/bin/activate 12 | then . .env/bin/activate && echo "venv activated (bin)" 13 | elif test -f .env/Scripts/activate 14 | then . .env/Scripts/activate && echo "venv activated (Scripts)" 15 | else exit 1 16 | fi 17 | 18 | # Run unit test coverage 19 | echo "starting coverage..." 
20 | if pytest --doctest-modules --cov-report=html --cov-config=.coveragerc --cov=latex2sympy tests 21 | then echo "coverage finished" 22 | else exit 1 23 | fi 24 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Get relative path of the root directory of the project 4 | rdir=`git rev-parse --git-dir` 5 | rel_path="$(dirname "$rdir")" 6 | 7 | # Change to that path and run the file 8 | cd $rel_path 9 | 10 | echo "pre-commit hook started..." 11 | 12 | # Activate virtual environment 13 | echo "activating venv..." 14 | if test -f .env/bin/activate 15 | then . .env/bin/activate && echo "venv activated." 16 | elif test -f .env/Scripts/activate 17 | then . .env/Scripts/activate && echo "venv activated." 18 | else exit 1 19 | fi 20 | 21 | # Run auto formatting on all staged python files, then add those changes 22 | echo "auto-formatting code..." 23 | if autopep8 --in-place `git diff --name-status --cached | grep '.py' | awk 'match($1, "A|M"){print $2}'` && git add `git diff --name-status --cached | grep '.py' | awk 'match($1, "A|M"){print $2}'` 24 | then echo "code was auto-formatted." 25 | else echo "no code was auto-formatted." 26 | fi 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/pre-push: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Get relative path of the root directory of the project 4 | rdir=`git rev-parse --git-dir` 5 | rel_path="$(dirname "$rdir")" 6 | 7 | # Change to that path and run the file 8 | cd $rel_path 9 | 10 | echo "pre-push hook started..." 11 | 12 | # Activate virtual environment 13 | echo "activating venv..." 
14 | if test -f .env/bin/activate 15 | then . .env/bin/activate && echo "venv activated." 16 | elif test -f .env/Scripts/activate 17 | then . .env/Scripts/activate && echo "venv activated." 18 | else exit 1 19 | fi 20 | 21 | # Run unit tests 22 | echo "starting tests..." 23 | # if pytest tests 24 | # then echo "tests finished." 25 | # else exit 1 26 | # fi 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/publish.sh: -------------------------------------------------------------------------------- 1 | rm ./dist/* 2 | python3 setup.py bdist_wheel 3 | twine upload dist/* 4 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/setup-hooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cp scripts/pre-push .git/hooks/ 3 | cp scripts/pre-commit .git/hooks/ -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Get relative path of the root directory of the project 4 | rdir=`git rev-parse --git-dir` 5 | rel_path="$(dirname "$rdir")" 6 | # Change to that path and run the file 7 | cd $rel_path 8 | 9 | echo "creating venv..." 10 | if test -d .env 11 | then echo "venv exists" 12 | else python3 -m venv .env && echo "venv created" 13 | fi 14 | 15 | echo '' 16 | # Activate virtual environment 17 | echo "activating venv..." 18 | if test -f .env/bin/activate 19 | then . .env/bin/activate && echo "venv activated (bin)" 20 | elif test -f .env/Scripts/activate 21 | then . .env/Scripts/activate && echo "venv activated (Scripts)" 22 | else exit 1 23 | fi 24 | 25 | echo '' 26 | echo "installing requirements..." 
27 | if pip install -r dev-requirements.txt 28 | then echo "requirements installed" 29 | else exit 1 30 | fi 31 | 32 | echo '' 33 | echo "compiling parser..." 34 | sh scripts/compile.sh 35 | echo "parser compiled" 36 | 37 | echo '' 38 | echo "setup git hooks..." 39 | sh scripts/setup-hooks.sh 40 | echo "git hooks setup" 41 | 42 | exit 0 43 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Get relative path of the root directory of the project 4 | rdir=`git rev-parse --git-dir` 5 | rel_path="$(dirname "$rdir")" 6 | # Change to that path and run the file 7 | cd $rel_path 8 | 9 | # Activate virtual environment 10 | echo "activating venv..." 11 | if test -f .env/bin/activate 12 | then . .env/bin/activate && echo "venv activated (bin)" 13 | elif test -f .env/Scripts/activate 14 | then . .env/Scripts/activate && echo "venv activated (Scripts)" 15 | else exit 1 16 | fi 17 | 18 | echo '' 19 | echo "compiling parser..." 20 | sh scripts/compile.sh 21 | echo "parser compiled" 22 | 23 | echo '' 24 | # Run unit tests 25 | echo "starting tests..." 
26 | if pytest tests 27 | then echo "tests finished" 28 | else exit 1 29 | fi 30 | 31 | exit 0 32 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/setup.cfg: -------------------------------------------------------------------------------- 1 | [pycodestyle] 2 | max-line-length = 120 3 | ignore = E501 4 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from codecs import open 3 | from os import path 4 | here = path.abspath(path.dirname(__file__)) 5 | 6 | 7 | setup( 8 | name="latex2sympy2", 9 | version="1.9.0", 10 | description='Convert latex to sympy with ANTLR and support Matrix, Linear Algebra and CAS functions.', 11 | long_description_content_type='text/markdown', 12 | long_description=open(path.join(here, "README.md"), encoding='utf-8').read(), 13 | # The project's main homepage. 
14 | url='https://github.com/ZubinGou/latex2sympy', 15 | # Author details 16 | author='ZubinGou', 17 | author_email='zebgou@gmail.com', 18 | # Choose your license 19 | license='MIT', 20 | classifiers=[ 21 | 'Development Status :: 4 - Beta', 22 | 'Intended Audience :: Developers', 23 | 'Intended Audience :: Education', 24 | 'Intended Audience :: Science/Research', 25 | 'License :: OSI Approved :: MIT License', 26 | 'Topic :: Education', 27 | 'Topic :: Scientific/Engineering :: Mathematics', 28 | 'Topic :: Software Development :: Compilers', 29 | 'Topic :: Text Processing :: Markup :: LaTeX', 30 | 'Topic :: Text Processing :: Markup :: Markdown', 31 | 'Programming Language :: Python :: 3', 32 | 'Programming Language :: Python :: 3.3', 33 | 'Programming Language :: Python :: 3.4', 34 | 'Programming Language :: Python :: 3.5', 35 | 'Programming Language :: Python :: 3.6', 36 | 'Programming Language :: Python :: 3.7', 37 | 'Programming Language :: Python :: 3.8', 38 | ], 39 | packages=find_packages(exclude=('tests',)), 40 | py_modules=['asciimath_printer', 'latex2sympy2'], 41 | install_requires=[ 42 | 'sympy>=1.4', 43 | 'antlr4-python3-runtime==4.11.1' 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/tools/evaluate_math/latex2sympy/tests/__init__.py -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/abs_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal, get_simple_examples 2 | import pytest 3 | from sympy import Abs 4 | 5 | examples = get_simple_examples(Abs) 6 | 7 | delimiter_pairs = { 8 | '|': '|', 9 | '\\vert': '\\vert', 10 | '\\lvert': 
'\\rvert' 11 | } 12 | 13 | 14 | @pytest.mark.parametrize('input, output, symbolically', examples) 15 | def test_abs(input, output, symbolically): 16 | for left, right in delimiter_pairs.items(): 17 | assert_equal("{left}{input}{right}".format(left=left, right=right, input=input), output, symbolically=symbolically) 18 | assert_equal("\\left{left}{input}\\right{right}".format(left=left, right=right, input=input), output, symbolically=symbolically) 19 | assert_equal("\\mleft{left}{input}\\mright{right}".format(left=left, right=right, input=input), output, symbolically=symbolically) 20 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/all_bad_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal, process_sympy 2 | import pytest 3 | 4 | 5 | def pytest_generate_tests(metafunc): 6 | metafunc.parametrize('s', metafunc.cls.BAD_STRINGS) 7 | 8 | 9 | class TestAllBad(object): 10 | # These bad latex strings should raise an exception when parsed 11 | BAD_STRINGS = [ 12 | "(", 13 | ")", 14 | # "a / b /", 15 | "\\frac{d}{dx}", 16 | "(\\frac{d}{dx})", 17 | "\\sqrt{}", 18 | "\\sqrt", 19 | "{", 20 | "}", 21 | # "1.1.1", 22 | "\\mathit{TEST}", 23 | "\\frac{2}{}", 24 | "\\frac{}{2}", 25 | "\\int", 26 | # "1 +", 27 | # "a +", 28 | "!", 29 | "!0", 30 | "_", 31 | "^", 32 | # "a // b", 33 | # "a \\cdot \\cdot b", 34 | # "a \\div \\div b", 35 | "a\\mod \\begin{matrix}b\\end{matrix}", 36 | "|", 37 | "||x|", 38 | "\\lfloor x", 39 | "\\lfloor a \\rceil", 40 | "\\operatorname{floor}(12.3, 123.4)", 41 | "()", 42 | "((((((((((((((((()))))))))))))))))", 43 | "-", 44 | "\\frac{d}{dx} + \\frac{d}{dt}", 45 | # "f()", 46 | # "f(,", 47 | # "f(x,,y)", 48 | # "f(x,y,", 49 | "\\sin^x", 50 | "\\cos^2", 51 | # "\\cos 1 \\cos", 52 | # "\\gcd(3)", 53 | # "\\lcm(2)", 54 | "@", "#", "$", "%", "&", "*", 55 | "\\", 56 | "~", 57 | "\\frac{(2 + x}{1 - x)}", 58 | "\\lim_{\\pi 
\\to 3} a", 59 | # because mix of COMMA and SEMICOLON 60 | "\\left\\{\\begin{pmatrix}1\\\\2\\\\3\\end{pmatrix},\\begin{pmatrix}4\\\\3\\\\1\\end{pmatrix};\\begin{pmatrix}1\\\\1\\\\1\\end{pmatrix}\\right\\}", 61 | # percentages without numbers before-hand 62 | "a\\%", 63 | "\\%100", 64 | # dollar signs without numbers after 65 | "\\$" 66 | ] 67 | 68 | def test_bad_string(self, s): 69 | with pytest.raises(Exception): 70 | process_sympy(s) 71 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/atom_expr_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal 2 | import pytest 3 | from sympy import Symbol, Integer, Pow 4 | 5 | # label, text, symbol_text 6 | symbols = [ 7 | ('letter', 'x', 'x'), 8 | ('greek letter', '\\lambda', 'lambda'), 9 | ('greek letter w/ space', '\\alpha ', 'alpha'), 10 | ('accented letter', '\\overline{x}', 'xbar') 11 | ] 12 | 13 | subscripts = [ 14 | ('2'), 15 | ('{23}'), 16 | ('i'), 17 | ('{ij}'), 18 | ('{i,j}'), 19 | ('{good}'), 20 | ('{x^2}') 21 | ] 22 | 23 | examples = [] 24 | for symbol in symbols: 25 | for subscript in subscripts: 26 | examples.append(tuple(list(symbol) + [subscript])) 27 | 28 | 29 | @pytest.mark.parametrize('label, text, symbol_text, subscript', examples) 30 | def test_with_supexpr(label, text, symbol_text, subscript): 31 | assert_equal(text + '^2', Pow(Symbol(symbol_text, real=True), Integer(2))) 32 | 33 | 34 | @pytest.mark.parametrize('label, text, symbol_text, subscript', examples) 35 | def test_with_subexpr(label, text, symbol_text, subscript): 36 | assert_equal(text + '_' + subscript, Symbol(symbol_text + '_' + subscript, real=True)) 37 | 38 | 39 | @pytest.mark.parametrize('label, text, symbol_text, subscript', examples) 40 | def test_with_subexpr_before_supexpr(label, text, symbol_text, subscript): 41 | assert_equal(text + '_' + subscript + '^2', Pow(Symbol(symbol_text + '_' + 
subscript, real=True), Integer(2))) 42 | 43 | 44 | @pytest.mark.parametrize('label, text, symbol_text, subscript', examples) 45 | def test_with_subexpr_before_supexpr_with_braces(label, text, symbol_text, subscript): 46 | wrapped_subscript = subscript if '{' in subscript else '{' + subscript + '}' 47 | assert_equal(text + '_' + wrapped_subscript + '^{2}', Pow(Symbol(symbol_text + '_' + subscript, real=True), Integer(2))) 48 | 49 | 50 | @pytest.mark.parametrize('label, text, symbol_text, subscript', examples) 51 | def test_with_supexpr_before_subexpr(label, text, symbol_text, subscript): 52 | assert_equal(text + '^2_' + subscript, Pow(Symbol(symbol_text + '_' + subscript, real=True), Integer(2))) 53 | 54 | 55 | @pytest.mark.parametrize('label, text, symbol_text, subscript', examples) 56 | def test_with_supexpr_before_subexpr_with_braces(label, text, symbol_text, subscript): 57 | wrapped_subscript = subscript if '{' in subscript else '{' + subscript + '}' 58 | assert_equal(text + '^{2}_' + wrapped_subscript, Pow(Symbol(symbol_text + '_' + subscript, real=True), Integer(2))) 59 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/binomial_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal, _Add, _Mul, _Pow 2 | import pytest 3 | from sympy import binomial, Symbol 4 | 5 | x = Symbol('x', real=True) 6 | y = Symbol('y', real=True) 7 | theta = Symbol('theta', real=True) 8 | gamma = Symbol('gamma', real=True) 9 | 10 | 11 | def test_binomial_numeric(): 12 | assert_equal("\\binom{16}{2}", binomial(16, 2)) 13 | 14 | 15 | def test_binomial_symbols(): 16 | assert_equal("\\binom{x}{y}", binomial(x, y)) 17 | 18 | 19 | def test_binomial_greek_symbols(): 20 | assert_equal("\\binom{\\theta}{\\gamma}", binomial(theta, gamma)) 21 | 22 | 23 | def test_binomial_expr(): 24 | assert_equal("\\binom{16+2}{\\frac{4}{2}}", binomial(_Add(16, 2), 
_Mul(4, _Pow(2, -1)), evaluate=False)) 25 | 26 | 27 | def test_choose_numeric(): 28 | assert_equal("\\choose{16}{2}", binomial(16, 2)) 29 | 30 | 31 | def test_choose_symbols(): 32 | assert_equal("\\choose{x}{y}", binomial(x, y)) 33 | 34 | 35 | def test_choose_greek_symbols(): 36 | assert_equal("\\choose{\\theta}{\\gamma}", binomial(theta, gamma)) 37 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/ceil_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal, get_simple_examples 2 | import pytest 3 | from sympy import ceiling 4 | 5 | examples = get_simple_examples(ceiling) 6 | 7 | 8 | @pytest.mark.parametrize('input, output, symbolically', examples) 9 | def test_ceil_func(input, output, symbolically): 10 | assert_equal("\\ceil({input})".format(input=input), output, symbolically=symbolically) 11 | 12 | 13 | @pytest.mark.parametrize('input, output, symbolically', examples) 14 | def test_ceil_operatorname(input, output, symbolically): 15 | assert_equal("\\operatorname{{ceil}}({input})".format(input=input), output, symbolically=symbolically) 16 | 17 | 18 | @pytest.mark.parametrize('input, output, symbolically', examples) 19 | def test_ceil_cmd(input, output, symbolically): 20 | assert_equal("\\lceil {input}\\rceil".format(input=input), output, symbolically=symbolically) 21 | assert_equal("\\left\\lceil {input}\\right\\rceil".format(input=input), output, symbolically=symbolically) 22 | assert_equal("\\mleft\\lceil {input}\\mright\\rceil".format(input=input), output, symbolically=symbolically) 23 | 24 | 25 | @pytest.mark.parametrize('input, output, symbolically', examples) 26 | def test_ceil_corners(input, output, symbolically): 27 | assert_equal("\\ulcorner {input}\\urcorner".format(input=input), output, symbolically=symbolically) 28 | assert_equal("\\left\\ulcorner {input}\\right\\urcorner".format(input=input), output, 
symbolically=symbolically) 29 | assert_equal("\\mleft\\ulcorner {input}\\mright\\urcorner".format(input=input), output, symbolically=symbolically) 30 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/complex_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal 2 | import pytest 3 | from sympy import Sum, I, Symbol, Integer 4 | 5 | a = Symbol('a', real=True) 6 | b = Symbol('b', real=True) 7 | i = Symbol('i', real=True) 8 | n = Symbol('n', real=True) 9 | x = Symbol('x', real=True) 10 | 11 | 12 | def test_complex(): 13 | assert_equal("a+Ib", a + I * b) 14 | 15 | 16 | def test_complex_e(): 17 | assert_equal("e^{I\\pi}", Integer(-1)) 18 | 19 | 20 | def test_complex_sum(): 21 | assert_equal("\\sum_{i=0}^{n} i \\cdot x", Sum(i * x, (i, 0, n))) 22 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/exp_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal 2 | import pytest 3 | from sympy import exp, sin, Symbol, E 4 | 5 | x = Symbol('x', real=True) 6 | y = Symbol('y', real=True) 7 | 8 | 9 | def test_exp_letter(): 10 | assert_equal("e", E) 11 | assert_equal("e", exp(1)) 12 | 13 | 14 | def test_exp_func(): 15 | assert_equal("\\exp(3)", exp(3)) 16 | 17 | 18 | def test_exp_func_no_delim(): 19 | assert_equal("\\exp3", exp(3)) 20 | 21 | 22 | def test_exp_command_symbol(): 23 | assert_equal("\\exponentialE", E) 24 | assert_equal("\\exponentialE", exp(1)) 25 | 26 | 27 | def test_exp_command_symbol_expression(): 28 | assert_equal("\\exponentialE^{3}", exp(3)) 29 | 30 | 31 | def test_exp_command_symbol_multiplied(): 32 | ''' 33 | \\exponentialE is NOT a function, so using the following notation equates to multiplication 34 | ''' 35 | assert_equal("\\exponentialE (3)", E * 3) 36 | 
assert_equal("\\exponentialE \\left( 3\\right)", E * 3) 37 | assert_equal("\\exponentialE \\times 3", E * 3) 38 | 39 | 40 | def test_exp_numeric(): 41 | assert_equal("e^3", exp(3)) 42 | 43 | 44 | def test_exp_symbol(): 45 | assert_equal("e^x", exp(x)) 46 | 47 | 48 | def test_exp_symbol_expr(): 49 | assert_equal("e^{x+y}", exp(x + y)) 50 | 51 | 52 | def test_exp_symbol_expr_group(): 53 | assert_equal("e^{(x+y)}", exp(x + y)) 54 | 55 | 56 | def test_exp_expr(): 57 | assert_equal("\\sin(x)*e^x", sin(x) * exp(x)) 58 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/floor_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal, get_simple_examples 2 | import pytest 3 | from sympy import floor 4 | 5 | examples = get_simple_examples(floor) 6 | 7 | 8 | @pytest.mark.parametrize('input, output, symbolically', examples) 9 | def test_floor_func(input, output, symbolically): 10 | assert_equal("\\floor({input})".format(input=input), output, symbolically=symbolically) 11 | 12 | 13 | @pytest.mark.parametrize('input, output, symbolically', examples) 14 | def test_floor_operatorname(input, output, symbolically): 15 | assert_equal("\\operatorname{{floor}}({input})".format(input=input), output, symbolically=symbolically) 16 | 17 | 18 | @pytest.mark.parametrize('input, output, symbolically', examples) 19 | def test_floor_cmd(input, output, symbolically): 20 | assert_equal("\\lfloor {input}\\rfloor".format(input=input), output, symbolically=symbolically) 21 | assert_equal("\\left\\lfloor {input}\\right\\rfloor".format(input=input), output, symbolically=symbolically) 22 | assert_equal("\\mleft\\lfloor {input}\\mright\\rfloor".format(input=input), output, symbolically=symbolically) 23 | 24 | 25 | @pytest.mark.parametrize('input, output, symbolically', examples) 26 | def test_floor_corners(input, output, symbolically): 27 | assert_equal("\\llcorner 
{input}\\lrcorner".format(input=input), output, symbolically=symbolically) 28 | assert_equal("\\left\\llcorner {input}\\right\\lrcorner".format(input=input), output, symbolically=symbolically) 29 | assert_equal("\\mleft\\llcorner {input}\\mright\\lrcorner".format(input=input), output, symbolically=symbolically) 30 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/greek_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal 2 | import pytest 3 | from sympy import Symbol 4 | 5 | epsilon_upper = Symbol('char"000190', real=True) 6 | epsilon_lower = Symbol('epsilon', real=True) 7 | varepsilon = Symbol('varepsilon', real=True) 8 | 9 | 10 | def test_greek_epsilon(): 11 | assert_equal("\\epsilon", epsilon_lower) 12 | 13 | 14 | def test_greek_epsilon_upper(): 15 | assert_equal('\\char"000190', epsilon_upper) 16 | 17 | 18 | def test_greek_varepsilon(): 19 | assert_equal('\\varepsilon', varepsilon) 20 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/grouping_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal, _Pow, _Add, _Mul 2 | import pytest 3 | from sympy import Integral, sin, Symbol, Mul, Integer, Pow 4 | from latex2sympy.latex2sympy2 import latex2sympy as process_sympy 5 | 6 | a = Symbol('a', real=True) 7 | b = Symbol('b', real=True) 8 | x = Symbol('x', real=True) 9 | theta = Symbol('theta', real=True) 10 | 11 | 12 | func_arg_examples = [ 13 | ('\\int ', 'x dx', Integral(x, x)), 14 | ('\\sin', '\\theta ', sin(theta)) 15 | ] 16 | 17 | example_groups = [ 18 | ('1+2', '3-4', _Mul(_Add(1, 2), _Add(3, _Mul(-1, 4)))) 19 | ] 20 | 21 | modifiable_delimiter_pairs = { 22 | '(': ')', 23 | '\\lgroup': '\\rgroup', 24 | '\\{': '\\}', 25 | '\\lbrace': '\\rbrace', 26 | '[': ']', 27 | '\\lbrack': 
'\\rbrack', 28 | } 29 | 30 | 31 | @pytest.mark.parametrize('func, args, output', func_arg_examples) 32 | def test_func_arg_groupings(func, args, output): 33 | # none 34 | assert_equal("{func} {args}".format(func=func, args=args), output) 35 | # normal brace (not modifiable) 36 | assert_equal("{func}{{{args}}}".format(func=func, args=args), output) 37 | # rest of delimiters, with modifications 38 | for left, right in modifiable_delimiter_pairs.items(): 39 | assert_equal("{func}{left}{args}{right}".format(left=left, right=right, func=func, args=args), output) 40 | assert_equal("{func}\\left{left}{args}\\right{right}".format(left=left, right=right, func=func, args=args), output) 41 | assert_equal("{func}\\mleft{left}{args}\\mright{right}".format(left=left, right=right, func=func, args=args), output) 42 | 43 | 44 | @pytest.mark.parametrize('group1, group2, output', example_groups) 45 | def test_delimiter_groupings(group1, group2, output): 46 | # normal brace (not modifiable) 47 | assert_equal("{{{group1}}}{{{group2}}}".format(group1=group1, group2=group2), output) 48 | # rest of delimiters, with modifications 49 | for left, right in modifiable_delimiter_pairs.items(): 50 | assert_equal("{left}{group1}{right}{left}{group2}{right}".format(left=left, right=right, group1=group1, group2=group2), output) 51 | assert_equal("\\left{left}{group1}\\right{right}\\left{left}{group2}\\right{right}".format(left=left, right=right, group1=group1, group2=group2), output) 52 | assert_equal("\\mleft{left}{group1}\\mright{right}\\mleft{left}{group2}\\mright{right}".format(left=left, right=right, group1=group1, group2=group2), output) 53 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/left_right_cdot_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal 2 | import pytest 3 | from sympy import sin, Symbol 4 | 5 | x = Symbol('x', real=True) 6 | 7 
| 8 | def test_left_right_cdot(): 9 | assert_equal("\\sin\\left(x\\right)\\cdot x", sin(x) * x) 10 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/linalg_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal 2 | import pytest 3 | from sympy import MatMul, Matrix 4 | 5 | 6 | def test_linalg_placeholder(): 7 | assert_equal("\\begin{pmatrix}1&2\\\\3&4\\end{pmatrix}\\cdot\\variable{v}", MatMul(Matrix([[1, 2], [3, 4]]), Matrix([1, 2])), {'v': Matrix([1, 2])}) 8 | 9 | 10 | def test_linalg_placeholder_multiple(): 11 | assert_equal("\\variable{M}\\cdot\\variable{v}", MatMul(Matrix([[1, 2], [3, 4]]), Matrix([1, 2])), {'M': Matrix([[1, 2], [3, 4]]), 'v': Matrix([1, 2])}) 12 | 13 | 14 | def test_linalg_placeholder_multiple_mul(): 15 | assert_equal("\\begin{pmatrix}3&-1\\end{pmatrix}\\cdot\\variable{M}\\cdot\\variable{v}", MatMul(Matrix([[3, -1]]), Matrix([[1, 2], [3, 4]]), Matrix([1, 2])), {'M': Matrix([[1, 2], [3, 4]]), 'v': Matrix([1, 2])}) 16 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/overline_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal 2 | import pytest 3 | from sympy import sin, Symbol 4 | 5 | x = Symbol('x', real=True) 6 | 7 | 8 | def test_overline(): 9 | assert_equal("\\frac{\\sin(x)}{\\overline{x}_n}", sin(x) / Symbol('xbar_n', real=True)) 10 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/pi_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal, _Mul, _Pow 2 | import pytest 3 | from sympy import pi, Symbol, acos, cos 4 | 5 | 6 | def test_pi_frac(): 7 | assert_equal("\\frac{\\pi}{3}", _Mul(pi, _Pow(3, -1))) 8 | 9 | 10 
| def test_pi_nested(): 11 | assert_equal("\\arccos{\\cos{\\frac{\\pi}{3}}}", acos(cos(_Mul(pi, _Pow(3, -1)), evaluate=False), evaluate=False)) 12 | 13 | 14 | def test_pi_arccos(): 15 | assert_equal("\\arccos{-1}", pi, symbolically=True) 16 | -------------------------------------------------------------------------------- /tools/evaluate_math/latex2sympy/tests/trig_test.py: -------------------------------------------------------------------------------- 1 | from .context import assert_equal 2 | import pytest 3 | from sympy import asinh, Symbol 4 | 5 | # x = Symbol('x', real=True); 6 | 7 | # latex = "\\sinh(x)" 8 | # math = process_sympy(latex) 9 | # print("latex: %s to math: %s" %(latex,math)) 10 | # 11 | # latex = "\\arcsinh(x)" 12 | # math = process_sympy(latex) 13 | # print("latex: %s to math: %s" %(latex,math)) 14 | # 15 | # latex = "\\arsinh(x)" 16 | # math = process_sympy(latex) 17 | # print("latex: %s to math: %s" %(latex,math)) 18 | 19 | 20 | def test_arcsinh(): 21 | assert_equal("\\operatorname{arcsinh}\\left(1\\right)", asinh(1, evaluate=False)) 22 | -------------------------------------------------------------------------------- /tools/evaluate_math/requirements.txt: -------------------------------------------------------------------------------- 1 | # common 2 | vllm 3 | tqdm 4 | datasets 5 | torch 6 | transformers 7 | python_dateutil 8 | flash_attn 9 | 10 | # math_eval 11 | sympy==1.12 12 | antlr4-python3-runtime==4.11.1 # ! The version needs to be compatible with sympy. 13 | word2number 14 | Pebble 15 | timeout-decorator -------------------------------------------------------------------------------- /tools/evaluate_math/scripts/evaluate_deepseek.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | PROMPT_TYPE="deepseek-math" 4 | MODEL_NAME_OR_PATH=$1 5 | OUTPUT_DIR=$2 6 | SUMMARY_PATH=$3 7 | SPLIT="test" 8 | NUM_TEST_SAMPLE=-1 9 | # Change to the parent directory (the one math_eval.py is run from) BEFORE creating OUTPUT_DIR, so a relative OUTPUT_DIR names the same directory for mkdir and for the evaluator. 10 | cd .. 11 | mkdir -p "$OUTPUT_DIR"
12 | 13 | DATA_NAME="math,minerva_math,gsm8k,olympiadbench,aime24,amc23,theoremqa" 14 | TOKENIZERS_PARALLELISM=false \ 15 | python3 -u math_eval.py \ 16 | --model_name_or_path ${MODEL_NAME_OR_PATH} \ 17 | --data_name ${DATA_NAME} \ 18 | --output_dir ${OUTPUT_DIR} \ 19 | --summary_path ${SUMMARY_PATH} \ 20 | --split ${SPLIT} \ 21 | --prompt_type ${PROMPT_TYPE} \ 22 | --num_test_sample ${NUM_TEST_SAMPLE} \ 23 | --seed 0 \ 24 | --temperature 0 \ 25 | --n_sampling 1 \ 26 | --top_p 1 \ 27 | --start 0 \ 28 | --end -1 \ 29 | --use_vllm \ 30 | --save_outputs \ 31 | # --overwrite \ -------------------------------------------------------------------------------- /tools/evaluate_math/scripts/evaluate_qwen.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | PROMPT_TYPE="qwen25-math-cot" 4 | MODEL_NAME_OR_PATH=$1 5 | OUTPUT_DIR=$2 6 | SUMMARY_PATH=$3 7 | SPLIT="test" 8 | NUM_TEST_SAMPLE=-1 9 | # Change to the parent directory (the one math_eval.py is run from) BEFORE creating OUTPUT_DIR, so a relative OUTPUT_DIR names the same directory for mkdir and for the evaluator. 10 | cd .. 11 | mkdir -p "$OUTPUT_DIR" 12 | 13 | DATA_NAME="math,minerva_math,gsm8k,olympiadbench,amc23,aime24,theoremqa" 14 | # DATA_NAME="minerva_math" 15 | TOKENIZERS_PARALLELISM=false \ 16 | python3 -u math_eval.py \ 17 | --model_name_or_path ${MODEL_NAME_OR_PATH} \ 18 | --data_name ${DATA_NAME} \ 19 | --output_dir ${OUTPUT_DIR} \ 20 | --summary_path ${SUMMARY_PATH} \ 21 | --split ${SPLIT} \ 22 | --prompt_type ${PROMPT_TYPE} \ 23 | --num_test_sample ${NUM_TEST_SAMPLE} \ 24 | --seed 0 \ 25 | --temperature 0 \ 26 | --n_sampling 1 \ 27 | --top_p 1 \ 28 | --start 0 \ 29 | --end -1 \ 30 | --use_vllm \ 31 | --save_outputs \ 32 | # --overwrite \ 33 | 34 | 35 | #DATA_NAME="aime24" 36 | #TOKENIZERS_PARALLELISM=false \ 37 | #python3 -u math_eval.py \ 38 | # --model_name_or_path ${MODEL_NAME_OR_PATH} \ 39 | # --data_name ${DATA_NAME} \ 40 | # --output_dir ${OUTPUT_DIR} \ 41 | # --summary_path ${SUMMARY_PATH} \ 42 | # --split ${SPLIT} \ 43 | # --prompt_type ${PROMPT_TYPE} \ 44 | # --num_test_sample ${NUM_TEST_SAMPLE} \ 45 | # --seed 0 \ 46 | #
--temperature 0.8 \ 47 | # --n_sampling 1 \ 48 | # --top_p 1 \ 49 | # --start 0 \ 50 | # --end -1 \ 51 | # --use_vllm \ 52 | # --save_outputs \ 53 | # # --overwrite \ 54 | -------------------------------------------------------------------------------- /tools/evaluate_mmlu-pro/cot_prompt_lib/initial_prompt.txt: -------------------------------------------------------------------------------- 1 | The following are multiple choice questions (with answers) about {$}. Think step by step and then finish your answer with "the answer is (X)" where X is the correct letter choice. 2 | 3 | 4 | -------------------------------------------------------------------------------- /tools/evaluate_mmlu-pro/cot_prompt_lib/initial_prompt_1.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>system 2 | Please reason step by step, and put your answer with "the answer is (X)" where X is the correct letter choice.<|im_end|> 3 | -------------------------------------------------------------------------------- /tools/evaluate_mmlu-pro/cot_prompt_lib/initial_prompt_2.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>system 2 | The following are multiple choice questions (with answers) about {$}. 
Think step by step and then finish your answer with "the answer is (X)" where X is the correct letter choice.<|im_end|> 3 | -------------------------------------------------------------------------------- /tools/evaluate_mmlu-pro/mmlu-pro-eval.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | model_path=$1 4 | output_dir=$2 5 | summary_path=$3 6 | n_shot=$4 7 | # Quote every expansion so a path containing spaces reaches the evaluator as a single argument. 8 | python evaluate_from_local.py \ 9 | --ntrain "$n_shot" \ 10 | --model "$model_path" \ 11 | --save_dir "$output_dir" \ 12 | --global_record_file "$summary_path" 13 | 14 | -------------------------------------------------------------------------------- /tools/scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd "$(dirname "${BASH_SOURCE[0]}")" # resolve the relative cd below from this script's directory, not the caller's 4 | cd ../download_data 5 | python download_cft_data_hf.py --config 4k 50k 6 | -------------------------------------------------------------------------------- /tools/scripts/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd "$(dirname "${BASH_SOURCE[0]}")" # the relative cd commands below start from this script's directory, not the caller's 4 | model_path="/path/to/model" 5 | output_dir="../evaluation_output" 6 | summary_path="../evaluation_summary.txt" 7 | 8 | export CUDA_VISIBLE_DEVICES=0,1,2,3 9 | 10 | cd ../evaluate_math/scripts 11 | bash evaluate_qwen.sh "${model_path}" "${output_dir}" "${summary_path}" 12 | 13 | cd ../../evaluate_gpqa/scripts 14 | bash evaluate_gpqa.sh "${model_path}" "${output_dir}" "${summary_path}" 15 | 16 | cd ../../evaluate_mmlu-pro 17 | bash mmlu-pro-eval.sh "${model_path}" "${output_dir}" "${summary_path}" 0 18 | -------------------------------------------------------------------------------- /tools/self_construct_critique_data/run.sh: -------------------------------------------------------------------------------- 1 | # Keep an OPENAI_API_KEY already exported in the environment; otherwise replace the placeholder below with a real key. 2 | export OPENAI_API_KEY="${OPENAI_API_KEY:-YOUR_API_KEY}" 3 | 4 | python generate_critique_by_api.py --model_name "gpt-4o-2024-11-20" --num_processes 20 5 |
-------------------------------------------------------------------------------- /train/LLaMA-Factory/.env.local: -------------------------------------------------------------------------------- 1 | # Note: actually we do not support .env, just for reference 2 | # api 3 | API_HOST= 4 | API_PORT= 5 | API_KEY= 6 | API_MODEL_NAME= 7 | FASTAPI_ROOT_PATH= 8 | MAX_CONCURRENT= 9 | # general 10 | DISABLE_VERSION_CHECK= 11 | FORCE_CHECK_IMPORTS= 12 | LLAMAFACTORY_VERBOSITY= 13 | USE_MODELSCOPE_HUB= 14 | USE_OPENMIND_HUB= 15 | RECORD_VRAM= 16 | # torchrun 17 | FORCE_TORCHRUN= 18 | MASTER_ADDR= 19 | MASTER_PORT= 20 | NNODES= 21 | NODE_RANK= 22 | NPROC_PER_NODE= 23 | # wandb 24 | WANDB_DISABLED= 25 | WANDB_PROJECT= 26 | WANDB_API_KEY= 27 | # gradio ui 28 | GRADIO_SHARE= 29 | GRADIO_SERVER_NAME= 30 | GRADIO_SERVER_PORT= 31 | GRADIO_ROOT_PATH= 32 | GRADIO_IPV6= 33 | # setup 34 | ENABLE_SHORT_CONSOLE=1 35 | # reserved (do not use) 36 | LLAMABOARD_ENABLED= 37 | LLAMABOARD_WORKDIR= 38 | -------------------------------------------------------------------------------- /train/LLaMA-Factory/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /train/LLaMA-Factory/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: check-ast 6 | - id: check-added-large-files 7 | args: ['--maxkb=25000'] 8 | - id: check-merge-conflict 9 | - id: check-yaml 10 | - id: debug-statements 11 | - id: end-of-file-fixer 12 | - id: trailing-whitespace 13 | args: [--markdown-linebreak-ext=md] 14 | - id: no-commit-to-branch 15 | args: ['--branch', 'main'] 16 | 17 | - repo: https://github.com/asottile/pyupgrade 18 | rev: v3.17.0 19 | hooks: 20 | - id: pyupgrade 
21 | args: [--py38-plus] 22 | 23 | - repo: https://github.com/astral-sh/ruff-pre-commit 24 | rev: v0.6.9 25 | hooks: 26 | - id: ruff 27 | args: [--fix] 28 | - id: ruff-format 29 | -------------------------------------------------------------------------------- /train/LLaMA-Factory/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | date-released: 2024-03 3 | message: "If you use this software, please cite it as below." 4 | authors: 5 | - family-names: "Zheng" 6 | given-names: "Yaowei" 7 | - family-names: "Zhang" 8 | given-names: "Richong" 9 | - family-names: "Zhang" 10 | given-names: "Junhao" 11 | - family-names: "Ye" 12 | given-names: "Yanhan" 13 | - family-names: "Luo" 14 | given-names: "Zheyan" 15 | - family-names: "Feng" 16 | given-names: "Zhangchi" 17 | - family-names: "Ma" 18 | given-names: "Yongqiang" 19 | title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" 20 | url: "https://arxiv.org/abs/2403.13372" 21 | preferred-citation: 22 | type: conference-paper 23 | conference: 24 | name: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" 25 | authors: 26 | - family-names: "Zheng" 27 | given-names: "Yaowei" 28 | - family-names: "Zhang" 29 | given-names: "Richong" 30 | - family-names: "Zhang" 31 | given-names: "Junhao" 32 | - family-names: "Ye" 33 | given-names: "Yanhan" 34 | - family-names: "Luo" 35 | given-names: "Zheyan" 36 | - family-names: "Feng" 37 | given-names: "Zhangchi" 38 | - family-names: "Ma" 39 | given-names: "Yongqiang" 40 | title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" 41 | url: "https://arxiv.org/abs/2403.13372" 42 | year: 2024 43 | publisher: "Association for Computational Linguistics" 44 | address: "Bangkok, Thailand" 45 | -------------------------------------------------------------------------------- /train/LLaMA-Factory/MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include LICENSE requirements.txt 2 | -------------------------------------------------------------------------------- /train/LLaMA-Factory/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build commit quality style test 2 | 3 | check_dirs := scripts src tests setup.py 4 | 5 | build: 6 | pip install build && python -m build 7 | 8 | commit: 9 | pre-commit install 10 | pre-commit run --all-files 11 | 12 | quality: 13 | ruff check $(check_dirs) 14 | ruff format --check $(check_dirs) 15 | 16 | style: 17 | ruff check $(check_dirs) --fix 18 | ruff format $(check_dirs) 19 | 20 | test: 21 | CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest -vv tests/ 22 | -------------------------------------------------------------------------------- /train/LLaMA-Factory/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/assets/logo.png -------------------------------------------------------------------------------- /train/LLaMA-Factory/assets/wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/assets/wechat.jpg -------------------------------------------------------------------------------- /train/LLaMA-Factory/assets/wechat_npu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/assets/wechat_npu.jpg -------------------------------------------------------------------------------- /train/LLaMA-Factory/data/mllm_demo_data/1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/data/mllm_demo_data/1.jpg -------------------------------------------------------------------------------- /train/LLaMA-Factory/data/mllm_demo_data/1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/data/mllm_demo_data/1.mp4 -------------------------------------------------------------------------------- /train/LLaMA-Factory/data/mllm_demo_data/2.avi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/data/mllm_demo_data/2.avi -------------------------------------------------------------------------------- /train/LLaMA-Factory/data/mllm_demo_data/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/data/mllm_demo_data/2.jpg -------------------------------------------------------------------------------- /train/LLaMA-Factory/data/mllm_demo_data/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/data/mllm_demo_data/3.jpg -------------------------------------------------------------------------------- /train/LLaMA-Factory/data/mllm_demo_data/3.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TIGER-AI-Lab/CritiqueFineTuning/6715b3ca3606b86141515f1918cc95b353609607/train/LLaMA-Factory/data/mllm_demo_data/3.mp4 -------------------------------------------------------------------------------- /train/LLaMA-Factory/data/mllm_video_demo.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "messages": [ 4 | { 5 | "content": "