├── .env.example ├── .gitignore ├── AI_ETHICS.md ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── average_results.py ├── configs ├── batch_config.yaml └── batch_config.yaml.example ├── data └── reference_reports │ ├── qid_infoco2F10Hm9IIrzZsKJR_report.md │ ├── qid_infocoAHb8t0CYTuzTCznB_report.md │ ├── qid_infocoBp2TDUwPt62Iqzak_report.md │ ├── qid_infocoK5coAaUHa2SScywL_report.md │ ├── qid_infocoTspDptS95JzUHtfS_report.md │ ├── qid_infocoUaNfGu8KK80qMmpg_report.md │ ├── qid_infocoXlRepzUFvAfGkcSn_report.md │ ├── qid_infocomcw7CBtHAmVbvEYm_report.md │ ├── qid_lifesp2DJmlulFMDdLb7hA_report.md │ ├── qid_lifesp3KjIoCKZxJ10IgVh_report.md │ ├── qid_lifesp3dNgib6HFZgsHUFR_report.md │ ├── qid_lifespOPMqhxL5pPFJsWd8_report.md │ ├── qid_lifespUycUsYiR4GtRfrKQ_report.md │ ├── qid_lifespX6WTipV09EAveX19_report.md │ ├── qid_lifespjtHZwYQLi8NU6SdU_report.md │ ├── qid_lifespwgR2QorAuHgkzask_report.md │ ├── qid_market6VWmPyxptfK47civ_report.md │ ├── qid_marketCb3JeqbQuoNtoO6e_report.md │ ├── qid_marketDEJL1uDRvx5UqKKm_report.md │ ├── qid_marketJg4461KKm2uAPb7X_report.md │ ├── qid_marketONPr2EPqd6edkLv7_report.md │ ├── qid_marketOZ8DcbjNbI7D7gkQ_report.md │ ├── qid_marketY2oyfQ0PA24CMjb4_report.md │ ├── qid_marketcmQtxteZNgUdSpUE_report.md │ ├── qid_policy0nSp8LBMkdYR1dN4_report.md │ ├── qid_policy6ukPGIQni5apWCMS_report.md │ ├── qid_policyDAYZvZSnnBh32igN_report.md │ ├── qid_policyGkZ8xushXFtVcZHF_report.md │ ├── qid_policyRtVmcbZHm8DO6rmR_report.md │ ├── qid_policyfX8vyl8vSaEC7Jue_report.md │ ├── qid_policymqmG9gpR6KDuxp5F_report.md │ ├── qid_policytDVXqY4eQXxyrtLC_report.md │ ├── qid_procon2EqeEXA31JZn71e7_report.md │ ├── qid_proconX7zAJLOPgcyZRDpZ_report.md │ ├── qid_proconh2ZdDUYEw7hj6yTY_report.md │ ├── qid_proconhy0bVrN2JhZ2Y4GE_report.md │ ├── qid_proconoaEUC24gzbaHqsB9_report.md │ ├── qid_procony5ewSmxa6EiwByts_report.md │ ├── qid_proconyXgOxCYl7uWArBmZ_report.md │ ├── qid_prodcp1BJglRaf2jODTVcl_report.md │ ├── qid_prodcp5SZd3x5Kd51QEyJC_report.md │ ├── qid_prodcpGSsSCKpQAvSZhczP_report.md │ ├── qid_prodcpJ5QMHZrBMvFuCKbY_report.md │ ├── qid_prodcpQVPUCCeIbgfrg01Y_report.md │ ├── qid_prodcpk7qXxlpmrrTy0GyZ_report.md │ ├── qid_prodcponmUtiKpSBpdwMUq_report.md │ ├── qid_prodcpslATZmS6ESv2yDln_report.md │ ├── qid_rankli824vjs9DMOPbb7HD_report.md │ ├── qid_rankliDwyJ6ePJgm8znsQh_report.md │ ├── qid_rankliFmBTzbMqx2kaw22n_report.md │ ├── qid_rankliNmeXNx8zhLK17tZm_report.md │ ├── qid_rankliQW3nT9D69Y9z2IwY_report.md │ ├── qid_ranklidfdj3RVVH31L5VPo_report.md │ ├── qid_rankliqLKYjYMVUuUxPsQk_report.md │ ├── qid_rankliscQtnu8LeS5DQPDu_report.md │ ├── qid_review5q6iS0uJfnrUKg5h_report.md │ ├── qid_review7WR9DY30D57MosM2_report.md │ ├── qid_reviewDuKImcS9AcUMBZGy_report.md │ ├── qid_reviewK0nAC9ggA3ijafYR_report.md │ ├── qid_reviewUSC6Z2qYt8BXkwBa_report.md │ ├── qid_reviewes8I18LtI6ioOQum_report.md │ ├── qid_reviewkF4ySyG6kBGDm8gm_report.md │ ├── qid_reviewvl0BjYy6KRNoqDNQ_report.md │ ├── qid_techsp5dgyRDlZ9iuOOEpT_report.md │ ├── qid_techsp9FOJRdQzS5k3RUpc_report.md │ ├── qid_techspH03dNoqnVQ9MahGu_report.md │ ├── qid_techspSZIG8XzmicGFYvHD_report.md │ ├── qid_techspkCUk3fQGVjhIjenV_report.md │ ├── qid_techspnypcYIutGeysr17X_report.md │ ├── qid_techsppQVl7qtQ5HVWUJKp_report.md │ ├── qid_techspsK4Re3Jq5v6Zj5AB_report.md │ ├── qid_topic0wgEjhAfQW51cyiZ_report.md │ ├── qid_topicBSSGEhfWR3DVNsA2_report.md │ ├── qid_topicFUrl0vQ0wz5kkSHx_report.md │ ├── qid_topicU5Om4qeJWDc9Pn2A_report.md │ ├── qid_topicd2PZ70chNQZVYofJ_report.md │ ├── qid_topicnMbVhT4zxQSZHUx2_report.md │ ├── qid_topicqwjvYGldTBetIKqZ_report.md │ ├── qid_topicxQpauRxmf4wtVEVl_report.md │ └── qid_topiczL71JengsGofPr0A_report.md ├── docs └── DATASET.md ├── imgs └── task_domain_dist.png ├── liveresearchbench ├── __init__.py ├── batch_evaluator.py ├── common │ ├── __init__.py │ ├── io_utils.py │ ├── model_clients.py │ └── reference_reports.py ├── criteria │ ├── __init__.py │ ├── citation.py │ ├── consistency.py │ ├── coverage.py │ ├── depth.py │ └── presentation.py └── graders │ ├── __init__.py │ ├── base_grader.py │ ├── checklist_grader.py │ ├── pairwise_grader.py │ └── pointwise_grader.py ├── main.py ├── preprocess.py ├── pyproject.toml ├── scripts ├── batch_grade.sh ├── batch_grade_multi_provider.sh └── preprocess_all.sh ├── tests └── test_grading.py └── uv.lock /.env.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/.env.example -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/.gitignore -------------------------------------------------------------------------------- /AI_ETHICS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/AI_ETHICS.md -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/CODEOWNERS -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/LICENSE.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/README.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/SECURITY.md -------------------------------------------------------------------------------- /average_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/average_results.py -------------------------------------------------------------------------------- /configs/batch_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/configs/batch_config.yaml -------------------------------------------------------------------------------- /configs/batch_config.yaml.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/configs/batch_config.yaml.example -------------------------------------------------------------------------------- /data/reference_reports/qid_infoco2F10Hm9IIrzZsKJR_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_infoco2F10Hm9IIrzZsKJR_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_infocoAHb8t0CYTuzTCznB_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_infocoAHb8t0CYTuzTCznB_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_infocoBp2TDUwPt62Iqzak_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_infocoBp2TDUwPt62Iqzak_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_infocoK5coAaUHa2SScywL_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_infocoK5coAaUHa2SScywL_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_infocoTspDptS95JzUHtfS_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_infocoTspDptS95JzUHtfS_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_infocoUaNfGu8KK80qMmpg_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_infocoUaNfGu8KK80qMmpg_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_infocoXlRepzUFvAfGkcSn_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_infocoXlRepzUFvAfGkcSn_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_infocomcw7CBtHAmVbvEYm_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_infocomcw7CBtHAmVbvEYm_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_lifesp2DJmlulFMDdLb7hA_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_lifesp2DJmlulFMDdLb7hA_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_lifesp3KjIoCKZxJ10IgVh_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_lifesp3KjIoCKZxJ10IgVh_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_lifesp3dNgib6HFZgsHUFR_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_lifesp3dNgib6HFZgsHUFR_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_lifespOPMqhxL5pPFJsWd8_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_lifespOPMqhxL5pPFJsWd8_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_lifespUycUsYiR4GtRfrKQ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_lifespUycUsYiR4GtRfrKQ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_lifespX6WTipV09EAveX19_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_lifespX6WTipV09EAveX19_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_lifespjtHZwYQLi8NU6SdU_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_lifespjtHZwYQLi8NU6SdU_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_lifespwgR2QorAuHgkzask_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_lifespwgR2QorAuHgkzask_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_market6VWmPyxptfK47civ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_market6VWmPyxptfK47civ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_marketCb3JeqbQuoNtoO6e_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_marketCb3JeqbQuoNtoO6e_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_marketDEJL1uDRvx5UqKKm_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_marketDEJL1uDRvx5UqKKm_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_marketJg4461KKm2uAPb7X_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_marketJg4461KKm2uAPb7X_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_marketONPr2EPqd6edkLv7_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_marketONPr2EPqd6edkLv7_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_marketOZ8DcbjNbI7D7gkQ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_marketOZ8DcbjNbI7D7gkQ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_marketY2oyfQ0PA24CMjb4_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_marketY2oyfQ0PA24CMjb4_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_marketcmQtxteZNgUdSpUE_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_marketcmQtxteZNgUdSpUE_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_policy0nSp8LBMkdYR1dN4_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_policy0nSp8LBMkdYR1dN4_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_policy6ukPGIQni5apWCMS_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_policy6ukPGIQni5apWCMS_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_policyDAYZvZSnnBh32igN_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_policyDAYZvZSnnBh32igN_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_policyGkZ8xushXFtVcZHF_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_policyGkZ8xushXFtVcZHF_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_policyRtVmcbZHm8DO6rmR_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_policyRtVmcbZHm8DO6rmR_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_policyfX8vyl8vSaEC7Jue_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_policyfX8vyl8vSaEC7Jue_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_policymqmG9gpR6KDuxp5F_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_policymqmG9gpR6KDuxp5F_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_policytDVXqY4eQXxyrtLC_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_policytDVXqY4eQXxyrtLC_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_procon2EqeEXA31JZn71e7_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_procon2EqeEXA31JZn71e7_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_proconX7zAJLOPgcyZRDpZ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_proconX7zAJLOPgcyZRDpZ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_proconh2ZdDUYEw7hj6yTY_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_proconh2ZdDUYEw7hj6yTY_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_proconhy0bVrN2JhZ2Y4GE_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_proconhy0bVrN2JhZ2Y4GE_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_proconoaEUC24gzbaHqsB9_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_proconoaEUC24gzbaHqsB9_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_procony5ewSmxa6EiwByts_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_procony5ewSmxa6EiwByts_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_proconyXgOxCYl7uWArBmZ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_proconyXgOxCYl7uWArBmZ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_prodcp1BJglRaf2jODTVcl_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_prodcp1BJglRaf2jODTVcl_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_prodcp5SZd3x5Kd51QEyJC_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_prodcp5SZd3x5Kd51QEyJC_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_prodcpGSsSCKpQAvSZhczP_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_prodcpGSsSCKpQAvSZhczP_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_prodcpJ5QMHZrBMvFuCKbY_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_prodcpJ5QMHZrBMvFuCKbY_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_prodcpQVPUCCeIbgfrg01Y_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_prodcpQVPUCCeIbgfrg01Y_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_prodcpk7qXxlpmrrTy0GyZ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_prodcpk7qXxlpmrrTy0GyZ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_prodcponmUtiKpSBpdwMUq_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_prodcponmUtiKpSBpdwMUq_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_prodcpslATZmS6ESv2yDln_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_prodcpslATZmS6ESv2yDln_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_rankli824vjs9DMOPbb7HD_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_rankli824vjs9DMOPbb7HD_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_rankliDwyJ6ePJgm8znsQh_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_rankliDwyJ6ePJgm8znsQh_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_rankliFmBTzbMqx2kaw22n_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_rankliFmBTzbMqx2kaw22n_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_rankliNmeXNx8zhLK17tZm_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_rankliNmeXNx8zhLK17tZm_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_rankliQW3nT9D69Y9z2IwY_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_rankliQW3nT9D69Y9z2IwY_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_ranklidfdj3RVVH31L5VPo_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_ranklidfdj3RVVH31L5VPo_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_rankliqLKYjYMVUuUxPsQk_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_rankliqLKYjYMVUuUxPsQk_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_rankliscQtnu8LeS5DQPDu_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_rankliscQtnu8LeS5DQPDu_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_review5q6iS0uJfnrUKg5h_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_review5q6iS0uJfnrUKg5h_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_review7WR9DY30D57MosM2_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_review7WR9DY30D57MosM2_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_reviewDuKImcS9AcUMBZGy_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_reviewDuKImcS9AcUMBZGy_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_reviewK0nAC9ggA3ijafYR_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_reviewK0nAC9ggA3ijafYR_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_reviewUSC6Z2qYt8BXkwBa_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_reviewUSC6Z2qYt8BXkwBa_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_reviewes8I18LtI6ioOQum_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_reviewes8I18LtI6ioOQum_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_reviewkF4ySyG6kBGDm8gm_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_reviewkF4ySyG6kBGDm8gm_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_reviewvl0BjYy6KRNoqDNQ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_reviewvl0BjYy6KRNoqDNQ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_techsp5dgyRDlZ9iuOOEpT_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_techsp5dgyRDlZ9iuOOEpT_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_techsp9FOJRdQzS5k3RUpc_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_techsp9FOJRdQzS5k3RUpc_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_techspH03dNoqnVQ9MahGu_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_techspH03dNoqnVQ9MahGu_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_techspSZIG8XzmicGFYvHD_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_techspSZIG8XzmicGFYvHD_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_techspkCUk3fQGVjhIjenV_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_techspkCUk3fQGVjhIjenV_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_techspnypcYIutGeysr17X_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_techspnypcYIutGeysr17X_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_techsppQVl7qtQ5HVWUJKp_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_techsppQVl7qtQ5HVWUJKp_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_techspsK4Re3Jq5v6Zj5AB_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_techspsK4Re3Jq5v6Zj5AB_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topic0wgEjhAfQW51cyiZ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topic0wgEjhAfQW51cyiZ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topicBSSGEhfWR3DVNsA2_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topicBSSGEhfWR3DVNsA2_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topicFUrl0vQ0wz5kkSHx_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topicFUrl0vQ0wz5kkSHx_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topicU5Om4qeJWDc9Pn2A_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topicU5Om4qeJWDc9Pn2A_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topicd2PZ70chNQZVYofJ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topicd2PZ70chNQZVYofJ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topicnMbVhT4zxQSZHUx2_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topicnMbVhT4zxQSZHUx2_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topicqwjvYGldTBetIKqZ_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topicqwjvYGldTBetIKqZ_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topicxQpauRxmf4wtVEVl_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topicxQpauRxmf4wtVEVl_report.md -------------------------------------------------------------------------------- /data/reference_reports/qid_topiczL71JengsGofPr0A_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/data/reference_reports/qid_topiczL71JengsGofPr0A_report.md -------------------------------------------------------------------------------- /docs/DATASET.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/docs/DATASET.md -------------------------------------------------------------------------------- /imgs/task_domain_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/imgs/task_domain_dist.png -------------------------------------------------------------------------------- /liveresearchbench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/__init__.py -------------------------------------------------------------------------------- /liveresearchbench/batch_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/batch_evaluator.py -------------------------------------------------------------------------------- /liveresearchbench/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/common/__init__.py -------------------------------------------------------------------------------- /liveresearchbench/common/io_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/common/io_utils.py -------------------------------------------------------------------------------- /liveresearchbench/common/model_clients.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/common/model_clients.py -------------------------------------------------------------------------------- /liveresearchbench/common/reference_reports.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/common/reference_reports.py -------------------------------------------------------------------------------- /liveresearchbench/criteria/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/criteria/__init__.py -------------------------------------------------------------------------------- /liveresearchbench/criteria/citation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/criteria/citation.py -------------------------------------------------------------------------------- /liveresearchbench/criteria/consistency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/criteria/consistency.py -------------------------------------------------------------------------------- /liveresearchbench/criteria/coverage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/criteria/coverage.py -------------------------------------------------------------------------------- /liveresearchbench/criteria/depth.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/criteria/depth.py -------------------------------------------------------------------------------- /liveresearchbench/criteria/presentation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/criteria/presentation.py -------------------------------------------------------------------------------- /liveresearchbench/graders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/graders/__init__.py -------------------------------------------------------------------------------- /liveresearchbench/graders/base_grader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/graders/base_grader.py -------------------------------------------------------------------------------- /liveresearchbench/graders/checklist_grader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/graders/checklist_grader.py -------------------------------------------------------------------------------- /liveresearchbench/graders/pairwise_grader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/graders/pairwise_grader.py -------------------------------------------------------------------------------- /liveresearchbench/graders/pointwise_grader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/liveresearchbench/graders/pointwise_grader.py -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/main.py -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/preprocess.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/pyproject.toml -------------------------------------------------------------------------------- /scripts/batch_grade.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/scripts/batch_grade.sh -------------------------------------------------------------------------------- /scripts/batch_grade_multi_provider.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/scripts/batch_grade_multi_provider.sh -------------------------------------------------------------------------------- /scripts/preprocess_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/scripts/preprocess_all.sh -------------------------------------------------------------------------------- /tests/test_grading.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/tests/test_grading.py -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SalesforceAIResearch/LiveResearchBench/HEAD/uv.lock --------------------------------------------------------------------------------