├── .github └── workflows │ ├── manual-dispatch.yml │ └── push.yml ├── .gitignore ├── LICENSE ├── README.md ├── example ├── data │ ├── plots │ │ ├── rpd_a_arp.pdf │ │ ├── rpd_a_ktu.pdf │ │ ├── rpd_a_rmse.pdf │ │ ├── rpd_b_arp.pdf │ │ ├── rpd_b_ktu.pdf │ │ ├── rpd_b_rmse.pdf │ │ ├── rpd_dri_vs_er.pdf │ │ ├── rpd_er.pdf │ │ ├── rpl_dri_vs_er.pdf │ │ └── rpl_er.pdf │ └── qrels │ │ ├── core17.txt │ │ ├── core18.txt │ │ ├── robust04.txt │ │ └── robust05.txt ├── demo.ipynb ├── get_data.sh ├── intro.ipynb ├── requirements.txt ├── rpd_arp.py ├── rpd_dri_vs_er.py ├── rpd_er.py ├── rpd_eval.py ├── rpd_ktu.py ├── rpd_rmse.py ├── rpl_dri_vs_er.py ├── rpl_er.py └── rpl_eval.py ├── repro_eval ├── Evaluator.py ├── __init__.py ├── __main__.py ├── config.py ├── measure │ ├── __init__.py │ ├── document_order.py │ ├── effectiveness.py │ ├── external │ │ ├── __init__.py │ │ └── rbo.py │ ├── overall_effects.py │ └── statistics.py ├── metadata.py ├── resources │ └── extensions.json ├── test │ ├── __init__.py │ ├── test_empty_rpd.py │ ├── test_empty_rpl.py │ ├── test_kwargs.py │ ├── test_path_param.py │ ├── test_rbo.py │ ├── test_rpd.py │ ├── test_rpl.py │ └── test_ttest.py └── util.py └── setup.py /.github/workflows/manual-dispatch.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install pytest pytrec_eval numpy scipy tqdm 23 | - name: Test with pytest 24 | run: | 25 | pytest 26 | -------------------------------------------------------------------------------- /.github/workflows/push.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | wget -P example/ https://gist.githubusercontent.com/breuert/95d01401a1ea767ca83592beeb8d7785/raw/9073fc0dd3cd118655f9a7e4f74116dd8da09df0/orig_b.txt https://gist.githubusercontent.com/breuert/c71c2c6bad1cda7ed121bb7d0f64e471/raw/d54404d960a0decddda1f19711d4cffc71614ecf/orig_a.txt https://gist.githubusercontent.com/breuert/5973d391a4bc38643264366299e2c3de/raw/d9d5eeeecffc9861113a1eeb044fc225da4e0f00/rpd_b.txt https://gist.githubusercontent.com/breuert/8e9bfb7aac30fa044da23fdd95174b92/raw/e1f1d1a84d9a8834a4d25f772ee409ada42b5eaa/rpd_a.txt https://gist.githubusercontent.com/breuert/a39373be8ec0e0b15844dcfe9f26f8cc/raw/ad2ea6db8ff1bec3a3ca6d488c3bbcc13ca1b05b/rpl_b.txt https://gist.githubusercontent.com/breuert/14d5eef9a1d51e337a4c3cd44f5212a3/raw/50f2f21c5902ed13f7550d0e588fada7015089ab/rpl_a.txt 22 | python -m pip install --upgrade pip 23 | python -m pip install pytest pytrec_eval numpy scipy tqdm 24 | - name: Test with pytest 25 | run: | 26 | pytest 27 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | build/ 4 | dist/ 5 | repro_eval.egg-info/ 6 | playground.py 7 | example/data/runs/ 8 | example/orig_a.txt 9 | example/orig_b.txt 10 | example/rpd_a.txt 11 | example/rpd_b.txt 12 | example/rpl_a.txt 13 | example/rpl_b.txt 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Timo Breuer, Nicola Ferro, Maria Maistro, Philipp Schaer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /example/data/plots/rpd_a_arp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_a_arp.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_a_ktu.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_a_ktu.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_a_rmse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_a_rmse.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_b_arp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_b_arp.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_b_ktu.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_b_ktu.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_b_rmse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_b_rmse.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_dri_vs_er.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_dri_vs_er.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_er.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_er.pdf -------------------------------------------------------------------------------- /example/data/plots/rpl_dri_vs_er.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpl_dri_vs_er.pdf -------------------------------------------------------------------------------- /example/data/plots/rpl_er.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpl_er.pdf -------------------------------------------------------------------------------- /example/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 
3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.3" 21 | }, 22 | "colab": { 23 | "name": "intro.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "0dy3GpaAVtDJ" 32 | }, 33 | "source": [ 34 | "# An Introduction to `repro_eval`" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "2vP09TvRbZnI" 41 | }, 42 | "source": [ 43 | "This notebook introduces the functionalities of `repro_eval`. We provide sample data that has to be downloaded in advance, but it is also possible to upload your runs and evaluate the reproducibilty of your experiments with this notebook." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "id": "7HvZTRuDb0FC" 50 | }, 51 | "source": [ 52 | "#### Install `repro_eval` via PyPI" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "metadata": { 58 | "id": "X1Odv7-WVt4o" 59 | }, 60 | "source": [ 61 | "!pip install repro_eval==0.1" 62 | ], 63 | "execution_count": null, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "id": "wUkSqYmgb4yD" 70 | }, 71 | "source": [ 72 | "#### Download the sample data and extract it\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "Sw2nFqDZWRyP" 79 | }, 80 | "source": [ 81 | "!wget https://www.dropbox.com/s/ncu49e91mosidei/data.tar.gz\n", 82 | "!tar -xzvf ./data.tar.gz " 83 | ], 84 | "execution_count": null, 85 | "outputs": [] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "id": "-SN1XavoVtDL" 91 | }, 92 | "source": [ 93 | "### Imports\n", 94 | "\n", 95 | "Once installed, the Evaluator classes for the evaluation of reproducibility and replicability can be imported. In this notebook, we also include other Python packages that are not necessarily required when using `repro_eval` for your experiments." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "kCVSY0rGVtDM" 102 | }, 103 | "source": [ 104 | "from repro_eval.Evaluator import RpdEvaluator, RplEvaluator\n", 105 | "from repro_eval.util import arp, arp_scores, print_base_adv, print_simple_line, trim\n", 106 | "\n", 107 | "import pytrec_eval\n", 108 | "import pandas as pd\n", 109 | "from matplotlib import pyplot as plt\n", 110 | "import seaborn as sns\n", 111 | "sns.set()\n", 112 | "sns.set_style('whitegrid')\n", 113 | "colors = sns.color_palette()" 114 | ], 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "rRZlhiToVtDS" 122 | }, 123 | "source": [ 124 | "### Path definition\n", 125 | "You can modify these paths and adapt them to your experiments. The entire notebook should be usable with your experiments when they comply with the given evaluation scenario. First, we need two kind of runs - a baseline run and an advanced run (that outperforms the baseline run). Second, for the evaluation of replicability, the replicated runs should be derived from another target collection. The dictionaries `runs_rpd` and `runs_rpl` contain runs with different parametrizations, but it should also be possible to include just one version for both the baseline and advanced run." 
126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "qyFqKV8KVtDT" 132 | }, 133 | "source": [ 134 | "QREL = './data/qrels/core17.txt'\n", 135 | "QREL_RPL = './data/qrels/core18.txt'\n", 136 | "ORIG_B = './data/runs/orig/input.WCrobust04'\n", 137 | "ORIG_A = './data/runs/orig/input.WCrobust0405'\n", 138 | "RPD_B = './data/runs/rpd/14/irc_task1_WCrobust04_001'\n", 139 | "RPD_A = './data/runs/rpd/14/irc_task1_WCrobust0405_001'\n", 140 | "RPL_B = './data/runs/rpl/14/irc_task2_WCrobust04_001'\n", 141 | "RPL_A = './data/runs/rpl/14/irc_task2_WCrobust0405_001'\n", 142 | "MEASURE = 'ndcg'\n", 143 | "\n", 144 | "runs_rpd = {\n", 145 | " 'rpd_wcr04_tf_1':\n", 146 | " {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'},\n", 147 | " 'rpd_wcr0405_tf_1':\n", 148 | " {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'},\n", 149 | " 'rpd_wcr04_tf_2':\n", 150 | " {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'},\n", 151 | " 'rpd_wcr0405_tf_2':\n", 152 | " {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'},\n", 153 | " 'rpd_wcr04_tf_3':\n", 154 | " {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'},\n", 155 | " 'rpd_wcr0405_tf_3':\n", 156 | " {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'},\n", 157 | " 'rpd_wcr04_tf_4':\n", 158 | " {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'},\n", 159 | " 'rpd_wcr0405_tf_4':\n", 160 | " {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'},\n", 161 | " 'rpd_wcr04_tf_5':\n", 162 | " {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'},\n", 163 | " 'rpd_wcr0405_tf_5':\n", 164 | " {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'}\n", 165 | "}\n", 166 | "\n", 167 | "runs_rpl = {\n", 168 | " 'rpl_wcr04_tf_1':\n", 169 | " {'path': './data/runs/rpl/45/irc_task2_WCrobust04_001'},\n", 170 | " 'rpl_wcr0405_tf_1':\n", 171 | " {'path': './data/runs/rpl/45/irc_task2_WCrobust0405_001'},\n", 172 | " 'rpl_wcr04_tf_2':\n", 173 | " {'path': './data/runs/rpl/46/irc_task2_WCrobust04_001'},\n", 174 | " 'rpl_wcr0405_tf_2':\n", 175 | " {'path': './data/runs/rpl/46/irc_task2_WCrobust0405_001'},\n", 176 | " 'rpl_wcr04_tf_3':\n", 177 | " {'path': './data/runs/rpl/47/irc_task2_WCrobust04_001'},\n", 178 | " 'rpl_wcr0405_tf_3':\n", 179 | " {'path': './data/runs/rpl/47/irc_task2_WCrobust0405_001'},\n", 180 | " 'rpl_wcr04_tf_4':\n", 181 | " {'path': './data/runs/rpl/48/irc_task2_WCrobust04_001'},\n", 182 | " 'rpl_wcr0405_tf_4':\n", 183 | " {'path': './data/runs/rpl/48/irc_task2_WCrobust0405_001'},\n", 184 | " 'rpl_wcr04_tf_5':\n", 185 | " {'path': './data/runs/rpl/49/irc_task2_WCrobust04_001'},\n", 186 | " 'rpl_wcr0405_tf_5':\n", 187 | " {'path': './data/runs/rpl/49/irc_task2_WCrobust0405_001'}\n", 188 | "}" 189 | ], 190 | "execution_count": null, 191 | "outputs": [] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "id": "I5YPFEL5VtDa" 197 | }, 198 | "source": [ 199 | "Define a helping function for plotting the average retrieval performance (ARP) later in the notebook." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "metadata": { 205 | "id": "8-AdKB2-VtDb" 206 | }, 207 | "source": [ 208 | "def average_retrieval_performance(baseline_scores, reproduced_scores: dict, measures: list, xlabel: str, ylabel: str):\n", 209 | " reproduced_scores_arp = [arp_scores(topic_scores) for idx, topic_scores in reproduced_scores.items()]\n", 210 | " baseline_scores_arp = arp_scores(baseline_scores)\n", 211 | " index = list(reproduced_scores.keys())\n", 212 | " df_content = {}\n", 213 | " for measure in measures:\n", 214 | " df_content[measure] = [scores.get(measure) for scores in reproduced_scores_arp]\n", 215 | " df = pd.DataFrame(df_content, index=index)\n", 216 | "\n", 217 | " plt.figure()\n", 218 | " ax = df.plot.bar(rot=0, figsize=(10, 6))\n", 219 | " for num, measure in enumerate(measures):\n", 220 | " orig_val = baseline_scores_arp.get(measure)\n", 221 | " ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color=colors[num])\n", 222 | " ax.annotate(' ', (num, orig_val), color=colors[num])\n", 223 | " ax.set_ylim(0.0, 1.0)\n", 224 | "\n", 225 | " legend_content = [measure + ' (orig)' for measure in measures] + [measure + ' (rpl)' for measure in measures]\n", 226 | " ax.legend(legend_content, loc='center left', bbox_to_anchor=(1, 0.5))\n", 227 | "\n", 228 | " ax.set_xlabel(xlabel)\n", 229 | " ax.set_ylabel(ylabel)\n", 230 | " plt.show()" 231 | ], 232 | "execution_count": null, 233 | "outputs": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "id": "9Vn-UoX3VtDg" 239 | }, 240 | "source": [ 241 | "### Evaluating Reproducibility\n", 242 | "The following code snippet instantiates a reproducibility evaluator `RpdEvaluator` and determines Kendall's tau Union (KTU), the Rank-biased Overlap (RBO), the Root-Mean-Square-Error (RMSE), the Effect Ratio (ER), the Delta Relative Improvement (DRI) and the p-values of the paired t-test. Please be aware that it takes some time to compute the RBO. We've included a progress bar to give you some feedback."
243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "metadata": { 248 | "id": "dFybABY7VtDh" 249 | }, 250 | "source": [ 251 | "rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 252 | " run_b_orig_path=ORIG_B,\n", 253 | " run_a_orig_path=ORIG_A,\n", 254 | " run_b_rep_path=RPD_B,\n", 255 | " run_a_rep_path=RPD_A)\n", 256 | "\n", 257 | "rpd_eval.trim()\n", 258 | "rpd_eval.evaluate()\n", 259 | "\n", 260 | "# KTU\n", 261 | "ktau = rpd_eval.ktau_union()\n", 262 | "print(\"Kendall's tau Union (KTU)\")\n", 263 | "print('------------------------------------------------------------------')\n", 264 | "for topic, value in ktau.get('baseline').items():\n", 265 | " print_base_adv(topic, 'KTU', value, ktau.get('advanced').get(topic))\n", 266 | "print_base_adv('ARP', 'KTU', arp(ktau.get('baseline')), arp(ktau.get('advanced')))\n", 267 | "\n", 268 | "# RBO\n", 269 | "rbo = rpd_eval.rbo(print_feedback=True)\n", 270 | "print(\"Rank-biased Overlap (RBO)\")\n", 271 | "print('------------------------------------------------------------------')\n", 272 | "for topic, value in rbo.get('baseline').items():\n", 273 | " print_base_adv(topic, 'RBO', value, rbo.get('advanced').get(topic))\n", 274 | "print_base_adv('ARP', 'RBO', arp(rbo.get('baseline')), arp(rbo.get('advanced')))\n", 275 | "\n", 276 | "# RMSE\n", 277 | "rmse = rpd_eval.rmse()\n", 278 | "print(\"Root mean square error (RMSE)\")\n", 279 | "print('------------------------------------------------------------------')\n", 280 | "for measure, value in rmse.get('baseline').items():\n", 281 | " print_base_adv(measure, 'RMSE', value, rmse.get('advanced').get(measure))\n", 282 | "\n", 283 | "# ER\n", 284 | "print(\"Effect ratio (ER)\")\n", 285 | "print('------------------------------------------------------------------')\n", 286 | "er = rpd_eval.er()\n", 287 | "for measure, value in er.items():\n", 288 | " print_simple_line(measure, 'ER', value)\n", 289 | "\n", 290 | "# DRI\n", 291 | "print(\"Delta Relative Improvement (DRI)\")\n", 292 | "print('------------------------------------------------------------------')\n", 293 | "dri = rpd_eval.dri()\n", 294 | "for measure, value in dri.items():\n", 295 | " print_simple_line(measure, 'DRI', value)\n", 296 | "\n", 297 | "# ttest\n", 298 | "pvals = rpd_eval.ttest()\n", 299 | "print(\"Two-tailed paired t-test (p-value)\")\n", 300 | "print('------------------------------------------------------------------')\n", 301 | "for measure, value in pvals.get('baseline').items():\n", 302 | " print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure))" 303 | ], 304 | "execution_count": null, 305 | "outputs": [] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": { 310 | "id": "9acgVo-CVtDm" 311 | }, 312 | "source": [ 313 | "### Comparing the Average Retrieval Performance (ARP) of different parametrizations \n", 314 | "The following code snippet determines the ARP scores and compares them via a bar plot." 
315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "id": "YtASw_fMVtDn" 321 | }, 322 | "source": [ 323 | "rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 324 | " run_b_orig_path=ORIG_B,\n", 325 | " run_a_orig_path=ORIG_A,\n", 326 | " run_b_rep_path=None,\n", 327 | " run_a_rep_path=None)\n", 328 | "\n", 329 | "rpd_eval.trim()\n", 330 | "rpd_eval.evaluate()\n", 331 | "\n", 332 | "for run_name, info in runs_rpd.items():\n", 333 | " with open(info.get('path')) as run_file:\n", 334 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 335 | " trim(info['run'])\n", 336 | " info['scores'] = rpd_eval.evaluate(info['run'])\n", 337 | "\n", 338 | "average_retrieval_performance(rpd_eval.run_b_orig_score,\n", 339 | " {\n", 340 | " 'tf_1': runs_rpd.get('rpd_wcr04_tf_1').get('scores'),\n", 341 | " 'tf_2': runs_rpd.get('rpd_wcr04_tf_2').get('scores'),\n", 342 | " 'tf_3': runs_rpd.get('rpd_wcr04_tf_3').get('scores'),\n", 343 | " 'tf_4': runs_rpd.get('rpd_wcr04_tf_4').get('scores'),\n", 344 | " 'tf_5': runs_rpd.get('rpd_wcr04_tf_5').get('scores'),\n", 345 | " },\n", 346 | " measures=['P_10', 'ndcg', 'bpref', 'map'],\n", 347 | " xlabel='Reproduced run (wcr04)',\n", 348 | " ylabel='Score')\n", 349 | "\n", 350 | "average_retrieval_performance(rpd_eval.run_a_orig_score,\n", 351 | " {\n", 352 | " 'tf_1': runs_rpd.get('rpd_wcr0405_tf_1').get('scores'),\n", 353 | " 'tf_2': runs_rpd.get('rpd_wcr0405_tf_2').get('scores'),\n", 354 | " 'tf_3': runs_rpd.get('rpd_wcr0405_tf_3').get('scores'),\n", 355 | " 'tf_4': runs_rpd.get('rpd_wcr0405_tf_4').get('scores'),\n", 356 | " 'tf_5': runs_rpd.get('rpd_wcr0405_tf_5').get('scores'),\n", 357 | " },\n", 358 | " measures=['P_10', 'ndcg', 'bpref', 'map'],\n", 359 | " xlabel='Reproduced run (wcr0405)',\n", 360 | " ylabel='Score')" 361 | ], 362 | "execution_count": null, 363 | "outputs": [] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "id": "AQVD43jBVtDs" 369 | }, 370 | "source": [ 371 | "### Kendall's tau Union (KTU) across different cut-offs\n", 372 | "The following code snippet compares the ordering of documents for the reproduced runs across different cut-off ranks." 
373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "metadata": { 378 | "id": "k8COriFZVtDt" 379 | }, 380 | "source": [ 381 | "cutoffs = [1000, 100, 50, 20, 10, 5]\n", 382 | "\n", 383 | "# BASELINE\n", 384 | "for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]):\n", 385 | " rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 386 | " run_b_orig_path=ORIG_B,\n", 387 | " run_a_orig_path=ORIG_A,\n", 388 | " run_b_rep_path=None,\n", 389 | " run_a_rep_path=None)\n", 390 | "\n", 391 | " rpd_eval.trim()\n", 392 | " rpd_eval.evaluate()\n", 393 | "\n", 394 | " with open(info.get('path')) as run_file:\n", 395 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 396 | " for cutoff in cutoffs:\n", 397 | " rpd_eval.trim(cutoff)\n", 398 | " rpd_eval.trim(cutoff, info['run'])\n", 399 | " info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline'])\n", 400 | "\n", 401 | "df_content = {}\n", 402 | "for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]):\n", 403 | " df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]]\n", 404 | "\n", 405 | "plt.figure()\n", 406 | "ax = pd.DataFrame(data=df_content, index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='o-', figsize=(10, 6))\n", 407 | "ax.set_xlabel('Cut-off values')\n", 408 | "ax.set_ylabel(r\"Kendall's $\\tau$\")\n", 409 | "plt.show()\n", 410 | "\n", 411 | "# ADVANCED\n", 412 | "for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]):\n", 413 | " rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 414 | " run_b_orig_path=ORIG_B,\n", 415 | " run_a_orig_path=ORIG_A,\n", 416 | " run_b_rep_path=None,\n", 417 | " run_a_rep_path=None)\n", 418 | "\n", 419 | " rpd_eval.trim()\n", 420 | " rpd_eval.evaluate()\n", 421 | "\n", 422 | " with open(info.get('path')) as run_file:\n", 423 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 424 | " for cutoff in cutoffs:\n", 425 | " rpd_eval.trim(cutoff)\n", 426 | " rpd_eval.trim(cutoff, info['run'])\n", 427 | " # scores = rpl_eval.evaluate(info['run'])\n", 428 | " info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline'])\n", 429 | "\n", 430 | "df_content = {}\n", 431 | "for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]):\n", 432 | " df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]]\n", 433 | "\n", 434 | "plt.figure()\n", 435 | "ax = pd.DataFrame(data=df_content, index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='o-', figsize=(10, 6))\n", 436 | "ax.set_xlabel('Cut-off values')\n", 437 | "ax.set_ylabel(r\"Kendall's $\\tau$\")\n", 438 | "plt.show()" 439 | ], 440 | "execution_count": null, 441 | "outputs": [] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": { 446 | "id": "C_Prc5aqVtDz" 447 | }, 448 | "source": [ 449 | "## Root-Mean-Square-Error (RMSE) across different cut-offs\n", 450 | "The following code snippet compares the reproduced runs at the level of effectiveness by determining the RMSE across different cut-off ranks." 
451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "metadata": { 456 | "id": "f2hsEkeGVtD0" 457 | }, 458 | "source": [ 459 | "rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 460 | " run_b_orig_path=ORIG_B,\n", 461 | " run_a_orig_path=ORIG_A,\n", 462 | " run_b_rep_path=None,\n", 463 | " run_a_rep_path=None)\n", 464 | "\n", 465 | "rpd_eval.trim()\n", 466 | "rpd_eval.evaluate()\n", 467 | "\n", 468 | "for run_name, info in runs_rpd.items():\n", 469 | " with open(info.get('path')) as run_file:\n", 470 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 471 | " trim(info['run'])\n", 472 | " info['scores'] = rpd_eval.evaluate(info['run'])\n", 473 | " info['rmse'] = rpd_eval.rmse(run_b_score=info['scores'])\n", 474 | "\n", 475 | "\n", 476 | "baseline_runs = ['rpd_wcr04_tf_1', 'rpd_wcr04_tf_2', 'rpd_wcr04_tf_3', 'rpd_wcr04_tf_4', 'rpd_wcr04_tf_5']\n", 477 | "advanced_runs = ['rpd_wcr0405_tf_1', 'rpd_wcr0405_tf_2', 'rpd_wcr0405_tf_3', 'rpd_wcr0405_tf_4', 'rpd_wcr0405_tf_5']\n", 478 | "cutoffs = ['5', '10', '15', '20', '30', '100', '200', '500', '1000']\n", 479 | "\n", 480 | "df_content = {}\n", 481 | "for run_name in baseline_runs:\n", 482 | " df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co] for co in cutoffs]\n", 483 | "\n", 484 | "df = pd.DataFrame(df_content, index=cutoffs)\n", 485 | "plt.figure()\n", 486 | "ax = df.plot.line(style='o-', figsize=(10, 6))\n", 487 | "ax.set_xlabel('Cut-off values')\n", 488 | "ax.set_ylabel('RMSE')\n", 489 | "plt.show()\n", 490 | "\n", 491 | "df_content = {}\n", 492 | "for run_name in advanced_runs:\n", 493 | " df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co] for co in cutoffs]\n", 494 | "\n", 495 | "df = pd.DataFrame(df_content, index=cutoffs)\n", 496 | "plt.figure()\n", 497 | "ax = df.plot.line(style='o-', figsize=(10, 6))\n", 498 | "ax.set_xlabel('Cut-off values')\n", 499 | "ax.set_ylabel('RMSE')\n", 500 | "plt.show()" 501 | ], 502 | "execution_count": null, 503 | "outputs": [] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "id": "MKj4uBanVtD5" 509 | }, 510 | "source": [ 511 | "## Exploring the space of reproducibility at the level of overall effects\n", 512 | "The following code snippet plots the Delta Relative Improvement (DRI) against the Effect Ratio (ER). Having runs with different parametrizations at hand, we can compare them in the cartesian plane. As a rule of thumb, we can say the closer a point to (ER 1, DRI 0), the better the reproduction." 
513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "metadata": { 518 | "id": "scmfh0ZfVtD5" 519 | }, 520 | "source": [ 521 | "rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 522 | " run_b_orig_path=ORIG_B,\n", 523 | " run_a_orig_path=ORIG_A,\n", 524 | " run_b_rep_path=None,\n", 525 | " run_a_rep_path=None)\n", 526 | "\n", 527 | "rpd_eval.trim()\n", 528 | "rpd_eval.evaluate()\n", 529 | "\n", 530 | "for run_name, info in runs_rpd.items():\n", 531 | " with open(info.get('path')) as run_file:\n", 532 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 533 | " trim(info['run'])\n", 534 | " info['scores'] = rpd_eval.evaluate(info['run'])\n", 535 | "\n", 536 | "dri_er = {\n", 537 | " 'wcr_tf_1': {\n", 538 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_1']['scores'], runs_rpd['rpd_wcr0405_tf_1']['scores']),\n", 539 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_1']['scores'], runs_rpd['rpd_wcr0405_tf_1']['scores'])\n", 540 | " },\n", 541 | " 'wcr_tf_2': {\n", 542 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_2']['scores'], runs_rpd['rpd_wcr0405_tf_2']['scores']),\n", 543 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_2']['scores'], runs_rpd['rpd_wcr0405_tf_2']['scores'])\n", 544 | " },\n", 545 | " 'wcr_tf_3': {\n", 546 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_3']['scores'], runs_rpd['rpd_wcr0405_tf_3']['scores']),\n", 547 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_3']['scores'], runs_rpd['rpd_wcr0405_tf_3']['scores'])\n", 548 | " },\n", 549 | " 'wcr_tf_4': {\n", 550 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_4']['scores'], runs_rpd['rpd_wcr0405_tf_4']['scores']),\n", 551 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_4']['scores'], runs_rpd['rpd_wcr0405_tf_4']['scores'])\n", 552 | " },\n", 553 | " 'wcr_tf_5': {\n", 554 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_5']['scores'], runs_rpd['rpd_wcr0405_tf_5']['scores']),\n", 555 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_5']['scores'], runs_rpd['rpd_wcr0405_tf_5']['scores'])\n", 556 | " },\n", 557 | "\n", 558 | "}\n", 559 | "\n", 560 | "measures = ['P_10', 'map', 'ndcg']\n", 561 | "marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')]\n", 562 | "\n", 563 | "fig, ax1 = plt.subplots(figsize=(10, 10))\n", 564 | "ax1.set_xlabel('Effect Ratio (ER)')\n", 565 | "ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)')\n", 566 | "\n", 567 | "for measure, mk in zip(measures, marker_color):\n", 568 | " ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()],\n", 569 | " [dri_er[r]['dri'][measure] for r in dri_er.keys()],\n", 570 | " marker=mk[0], color=mk[1], linestyle='None', label=measure)\n", 571 | "\n", 572 | "ax1.tick_params(axis='y', labelcolor='k')\n", 573 | "fig.tight_layout()\n", 574 | "plt.axhline(0, color='grey')\n", 575 | "plt.axvline(1, color='grey')\n", 576 | "plt.legend()\n", 577 | "plt.title('Reproducibility')\n", 578 | "\n", 579 | "for m in measures:\n", 580 | " for r in dri_er.keys():\n", 581 | " plt.text(x = dri_er[r]['er'][m], \n", 582 | " y = dri_er[r]['dri'][m],\n", 583 | " s = r) \n", 584 | "\n", 585 | "plt.show()" 586 | ], 587 | "execution_count": null, 588 | "outputs": [] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": { 593 | "id": "YGck2XO0VtD9" 594 | }, 595 | "source": [ 596 | "## Evaluating Replicability\n", 597 | "The following code snippet instantiates a replicability evaluator `RplEvaluator` and determines the Effect Ratio (ER), the Delta Relative Improvement (DRI) and the p-values of the unpaired t-test." 
598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "metadata": { 603 | "id": "hz8ExjJOVtD-" 604 | }, 605 | "source": [ 606 | "rpl_eval = RplEvaluator(qrel_orig_path=QREL,\n", 607 | " run_b_orig_path=ORIG_B,\n", 608 | " run_a_orig_path=ORIG_A,\n", 609 | " run_b_rep_path=RPL_B,\n", 610 | " run_a_rep_path=RPL_A,\n", 611 | " qrel_rpl_path=QREL_RPL)\n", 612 | "\n", 613 | "rpl_eval.trim()\n", 614 | "rpl_eval.evaluate()\n", 615 | "\n", 616 | "# ER\n", 617 | "print(\"Effect ratio (ER)\")\n", 618 | "print('------------------------------------------------------------------')\n", 619 | "er = rpl_eval.er()\n", 620 | "for measure, value in er.items():\n", 621 | " print_simple_line(measure, 'ER', value)\n", 622 | "\n", 623 | "# DRI\n", 624 | "print(\"Delta Relative Improvement (DRI)\")\n", 625 | "print('------------------------------------------------------------------')\n", 626 | "dri = rpl_eval.dri()\n", 627 | "for measure, value in dri.items():\n", 628 | " print_simple_line(measure, 'DRI', value)\n", 629 | "\n", 630 | "# ttest\n", 631 | "pvals = rpl_eval.ttest()\n", 632 | "print(\"Two-tailed unpaired t-test (p-value)\")\n", 633 | "print('------------------------------------------------------------------')\n", 634 | "for measure, value in pvals.get('baseline').items():\n", 635 | " print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure))" 636 | ], 637 | "execution_count": null, 638 | "outputs": [] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "metadata": { 643 | "id": "YU4QlxI0VtEC" 644 | }, 645 | "source": [ 646 | "## Exploring the space of replicability at the level of overall effects\n", 647 | "The following code snippet plots the Delta Relative Improvement (DRI) against the Effect Ratio (ER). Having runs with different parametrizations at hand, we can compare them in the cartesian plane. As a rule of thumb, we can say the closer a point to (ER 1, DRI 0), the better the replication." 
648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "metadata": { 653 | "id": "GFvR5NAIVtED" 654 | }, 655 | "source": [ 656 | "rpl_eval = RplEvaluator(qrel_orig_path=QREL,\n", 657 | " run_b_orig_path=ORIG_B,\n", 658 | " run_a_orig_path=ORIG_A,\n", 659 | " run_b_rep_path=None,\n", 660 | " run_a_rep_path=None,\n", 661 | " qrel_rpl_path=QREL_RPL)\n", 662 | "\n", 663 | "rpl_eval.trim()\n", 664 | "rpl_eval.evaluate()\n", 665 | "\n", 666 | "for run_name, info in runs_rpl.items():\n", 667 | " with open(info.get('path')) as run_file:\n", 668 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 669 | " trim(info['run'])\n", 670 | " info['scores'] = rpl_eval.evaluate(info['run'])\n", 671 | "\n", 672 | "dri_er = {\n", 673 | " 'wcr_tf_1': {\n", 674 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_1']['scores'], runs_rpl['rpl_wcr0405_tf_1']['scores']),\n", 675 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_1']['scores'], runs_rpl['rpl_wcr0405_tf_1']['scores'])\n", 676 | " },\n", 677 | " 'wcr_tf_2': {\n", 678 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_2']['scores'], runs_rpl['rpl_wcr0405_tf_2']['scores']),\n", 679 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_2']['scores'], runs_rpl['rpl_wcr0405_tf_2']['scores'])\n", 680 | " },\n", 681 | " 'wcr_tf_3': {\n", 682 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_3']['scores'], runs_rpl['rpl_wcr0405_tf_3']['scores']),\n", 683 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_3']['scores'], runs_rpl['rpl_wcr0405_tf_3']['scores'])\n", 684 | " },\n", 685 | " 'wcr_tf_4': {\n", 686 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_4']['scores'], runs_rpl['rpl_wcr0405_tf_4']['scores']),\n", 687 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_4']['scores'], runs_rpl['rpl_wcr0405_tf_4']['scores'])\n", 688 | " },\n", 689 | " 'wcr_tf_5': {\n", 690 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_5']['scores'], runs_rpl['rpl_wcr0405_tf_5']['scores']),\n", 691 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_5']['scores'], runs_rpl['rpl_wcr0405_tf_5']['scores'])\n", 692 | " },\n", 693 | "\n", 694 | "}\n", 695 | "\n", 696 | "measures = ['P_10', 'map', 'ndcg']\n", 697 | "marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')]\n", 698 | "\n", 699 | "fig, ax1 = plt.subplots(figsize=(10, 10))\n", 700 | "ax1.set_xlabel('Effect Ratio (ER)')\n", 701 | "ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)')\n", 702 | "\n", 703 | "for measure, mk in zip(measures, marker_color):\n", 704 | " ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()],\n", 705 | " [dri_er[r]['dri'][measure] for r in dri_er.keys()],\n", 706 | " marker=mk[0], color=mk[1], linestyle='None', label=measure)\n", 707 | "\n", 708 | "ax1.tick_params(axis='y', labelcolor='k')\n", 709 | "fig.tight_layout()\n", 710 | "plt.axhline(0, color='grey')\n", 711 | "plt.axvline(1, color='grey')\n", 712 | "plt.legend()\n", 713 | "plt.title('Replicability')\n", 714 | "\n", 715 | "for m in measures:\n", 716 | " for r in dri_er.keys():\n", 717 | " plt.text(x = dri_er[r]['er'][m], \n", 718 | " y = dri_er[r]['dri'][m],\n", 719 | " s = r) \n", 720 | "\n", 721 | "plt.show()" 722 | ], 723 | "execution_count": null, 724 | "outputs": [] 725 | } 726 | ] 727 | } -------------------------------------------------------------------------------- /example/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://www.dropbox.com/s/p1wwqqka1n3el6b/runs.tar.gz 4 | tar -xzvf runs.tar.gz -C ./example/data/ 5 | -------------------------------------------------------------------------------- 
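A minimal sketch of how the example scripts in this directory are typically invoked, assuming the sample runs are fetched with get_data.sh from the repository root and the scripts are then run from inside example/ (the run and qrel paths in the scripts, e.g. ./data/runs/orig/input.WCrobust04 and ./data/qrels/core17.txt, are relative to that directory). These commands are illustrative only and not part of the repository:

    pip install repro_eval                   # the notebook above pins repro_eval==0.1
    pip install -r example/requirements.txt  # pytrec_eval, numpy, scipy, pandas, matplotlib, seaborn
    bash example/get_data.sh                 # downloads runs.tar.gz and extracts it into example/data/
    cd example
    python rpd_eval.py                       # reproducibility measures: KTU, RBO, RMSE, ER, DRI, paired t-test
    python rpl_eval.py                       # replicability counterpart: ER, DRI, unpaired t-test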
/example/requirements.txt: -------------------------------------------------------------------------------- 1 | pytrec_eval 2 | numpy 3 | scipy 4 | pandas 5 | matplotlib 6 | seaborn -------------------------------------------------------------------------------- /example/rpd_arp.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.Evaluator import RpdEvaluator 3 | from repro_eval.util import arp, arp_scores 4 | from repro_eval.util import trim 5 | import pandas as pd 6 | from matplotlib import pyplot as plt 7 | import seaborn as sns 8 | sns.set() 9 | sns.set_style('whitegrid') 10 | # palette = sns.color_palette("GnBu_d") 11 | # sns.set_palette(palette) 12 | colors = sns.color_palette() 13 | 14 | ORIG_B = './data/runs/orig/input.WCrobust04' 15 | ORIG_A = './data/runs/orig/input.WCrobust0405' 16 | QREL = 'data/qrels/core17.txt' 17 | 18 | runs_rpd = { 19 | 'rpd_wcr04_tf_1': 20 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 21 | 'rpd_wcr0405_tf_1': 22 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 23 | 'rpd_wcr04_tf_2': 24 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 25 | 'rpd_wcr0405_tf_2': 26 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 27 | 'rpd_wcr04_tf_3': 28 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 29 | 'rpd_wcr0405_tf_3': 30 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 31 | 'rpd_wcr04_tf_4': 32 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 33 | 'rpd_wcr0405_tf_4': 34 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 35 | 'rpd_wcr04_tf_5': 36 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 37 | 'rpd_wcr0405_tf_5': 38 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 39 | } 40 | 41 | 42 | 43 | def average_retrieval_performance(baseline_scores, reproduced_scores: dict, measures: list, xlabel: str, ylabel: str, outfile: str): 44 | reproduced_scores_arp = [arp_scores(topic_scores) for idx, topic_scores in reproduced_scores.items()] 45 | baseline_scores_arp = arp_scores(baseline_scores) 46 | index = list(reproduced_scores.keys()) 47 | df_content = {} 48 | for measure in measures: 49 | df_content[measure] = [scores.get(measure) for scores in reproduced_scores_arp] 50 | df = pd.DataFrame(df_content, index=index) 51 | 52 | ax = df.plot.bar(rot=0) 53 | for num, measure in enumerate(measures): 54 | orig_val = baseline_scores_arp.get(measure) 55 | ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color=colors[num]) 56 | ax.annotate(' ', (num, orig_val), color=colors[num]) 57 | ax.set_ylim(0.0, 1.0) 58 | 59 | legend_content = [measure + ' (orig)' for measure in measures] + [measure + ' (rpl)' for measure in measures] 60 | ax.legend(legend_content, loc='lower left') 61 | 62 | ax.set_xlabel(xlabel) 63 | ax.set_ylabel(ylabel) 64 | ax.get_figure().savefig(outfile, format='pdf', bbox_inches='tight') 65 | plt.show() 66 | 67 | 68 | def main(): 69 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 70 | run_b_orig_path=ORIG_B, 71 | run_a_orig_path=ORIG_A, 72 | run_b_rep_path=None, 73 | run_a_rep_path=None) 74 | 75 | rpd_eval.trim() 76 | rpd_eval.evaluate() 77 | 78 | for run_name, info in runs_rpd.items(): 79 | with open(info.get('path')) as run_file: 80 | info['run'] = pytrec_eval.parse_run(run_file) 81 | trim(info['run']) 82 | info['scores'] = rpd_eval.evaluate(info['run']) 83 | 84 | average_retrieval_performance(rpd_eval.run_b_orig_score, 85 | { 86 | 'tf_1': runs_rpd.get('rpd_wcr04_tf_1').get('scores'), 
87 | 'tf_2': runs_rpd.get('rpd_wcr04_tf_2').get('scores'), 88 | 'tf_3': runs_rpd.get('rpd_wcr04_tf_3').get('scores'), 89 | 'tf_4': runs_rpd.get('rpd_wcr04_tf_4').get('scores'), 90 | 'tf_5': runs_rpd.get('rpd_wcr04_tf_5').get('scores'), 91 | }, 92 | measures=['P_10', 'ndcg', 'bpref', 'map'], 93 | xlabel='Reproduced run (wcr04)', 94 | ylabel='Score', 95 | outfile='data/plots/rpd_b_arp.pdf') 96 | 97 | average_retrieval_performance(rpd_eval.run_a_orig_score, 98 | { 99 | 'tf_1': runs_rpd.get('rpd_wcr0405_tf_1').get('scores'), 100 | 'tf_2': runs_rpd.get('rpd_wcr0405_tf_2').get('scores'), 101 | 'tf_3': runs_rpd.get('rpd_wcr0405_tf_3').get('scores'), 102 | 'tf_4': runs_rpd.get('rpd_wcr0405_tf_4').get('scores'), 103 | 'tf_5': runs_rpd.get('rpd_wcr0405_tf_5').get('scores'), 104 | }, 105 | measures=['P_10', 'ndcg', 'bpref', 'map'], 106 | xlabel='Reproduced run (wcr0405)', 107 | ylabel='Score', 108 | outfile='data/plots/rpd_a_arp.pdf') 109 | 110 | 111 | if __name__ == "__main__": 112 | main() -------------------------------------------------------------------------------- /example/rpd_dri_vs_er.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RpdEvaluator 2 | from repro_eval.util import trim 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | sns.set(style="darkgrid") 6 | 7 | import pytrec_eval 8 | 9 | QREL = './data/qrels/core17.txt' 10 | ORIG_B = './data/runs/orig/input.WCrobust04' 11 | ORIG_A = './data/runs/orig/input.WCrobust0405' 12 | 13 | 14 | runs_rpd = { 15 | 'rpd_wcr04_tf_1': 16 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 17 | 'rpd_wcr0405_tf_1': 18 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 19 | 'rpd_wcr04_tf_2': 20 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 21 | 'rpd_wcr0405_tf_2': 22 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 23 | 'rpd_wcr04_tf_3': 24 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 25 | 'rpd_wcr0405_tf_3': 26 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 27 | 'rpd_wcr04_tf_4': 28 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 29 | 'rpd_wcr0405_tf_4': 30 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 31 | 'rpd_wcr04_tf_5': 32 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 33 | 'rpd_wcr0405_tf_5': 34 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 35 | } 36 | 37 | 38 | def main(): 39 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 40 | run_b_orig_path=ORIG_B, 41 | run_a_orig_path=ORIG_A, 42 | run_b_rep_path=None, 43 | run_a_rep_path=None) 44 | 45 | rpd_eval.trim() 46 | rpd_eval.evaluate() 47 | 48 | for run_name, info in runs_rpd.items(): 49 | with open(info.get('path')) as run_file: 50 | info['run'] = pytrec_eval.parse_run(run_file) 51 | trim(info['run']) 52 | info['scores'] = rpd_eval.evaluate(info['run']) 53 | 54 | dri_er = { 55 | 'wcr_tf_1': { 56 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_1']['scores'], runs_rpd['rpd_wcr0405_tf_1']['scores']), 57 | 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_1']['scores'], runs_rpd['rpd_wcr0405_tf_1']['scores']) 58 | }, 59 | 'wcr_tf_2': { 60 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_2']['scores'], runs_rpd['rpd_wcr0405_tf_2']['scores']), 61 | 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_2']['scores'], runs_rpd['rpd_wcr0405_tf_2']['scores']) 62 | }, 63 | 'wcr_tf_3': { 64 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_3']['scores'], runs_rpd['rpd_wcr0405_tf_3']['scores']), 65 | 'dri': 
rpd_eval.dri(runs_rpd['rpd_wcr04_tf_3']['scores'], runs_rpd['rpd_wcr0405_tf_3']['scores']) 66 | }, 67 | 'wcr_tf_4': { 68 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_4']['scores'], runs_rpd['rpd_wcr0405_tf_4']['scores']), 69 | 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_4']['scores'], runs_rpd['rpd_wcr0405_tf_4']['scores']) 70 | }, 71 | 'wcr_tf_5': { 72 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_5']['scores'], runs_rpd['rpd_wcr0405_tf_5']['scores']), 73 | 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_5']['scores'], runs_rpd['rpd_wcr0405_tf_5']['scores']) 74 | }, 75 | 76 | } 77 | 78 | measures = ['P_10', 'map', 'ndcg'] 79 | marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')] 80 | 81 | fig, ax1 = plt.subplots() 82 | ax1.set_xlabel('Effect Ratio (ER)') 83 | ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)') 84 | 85 | for measure, mk in zip(measures, marker_color): 86 | ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()], 87 | [dri_er[r]['dri'][measure] for r in dri_er.keys()], 88 | marker=mk[0], color=mk[1], linestyle='None', label=measure) 89 | 90 | ax1.tick_params(axis='y', labelcolor='k') 91 | fig.tight_layout() 92 | plt.axhline(0, color='grey') 93 | plt.axvline(1, color='grey') 94 | plt.legend() 95 | plt.title('Reproducibility') 96 | plt.savefig('data/plots/rpd_dri_vs_er.pdf', format='pdf', bbox_inches='tight') 97 | plt.show() 98 | 99 | 100 | if __name__ == "__main__": 101 | main() 102 | -------------------------------------------------------------------------------- /example/rpd_er.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.Evaluator import RpdEvaluator 3 | from repro_eval.util import trim 4 | import pandas as pd 5 | from matplotlib import pyplot as plt 6 | import seaborn as sns 7 | sns.set() 8 | sns.set_style('whitegrid') 9 | palette = sns.color_palette("GnBu_d") 10 | sns.set_palette(palette) 11 | colors = sns.color_palette() 12 | 13 | ORIG_B = './data/runs/orig/input.WCrobust04' 14 | ORIG_A = './data/runs/orig/input.WCrobust0405' 15 | QREL = 'data/qrels/core17.txt' 16 | 17 | runs_rpd = { 18 | 'rpd_wcr04_tf_1': 19 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 20 | 'rpd_wcr0405_tf_1': 21 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 22 | 'rpd_wcr04_tf_2': 23 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 24 | 'rpd_wcr0405_tf_2': 25 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 26 | 'rpd_wcr04_tf_3': 27 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 28 | 'rpd_wcr0405_tf_3': 29 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 30 | 'rpd_wcr04_tf_4': 31 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 32 | 'rpd_wcr0405_tf_4': 33 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 34 | 'rpd_wcr04_tf_5': 35 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 36 | 'rpd_wcr0405_tf_5': 37 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 38 | } 39 | 40 | 41 | def main(): 42 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 43 | run_b_orig_path=ORIG_B, 44 | run_a_orig_path=ORIG_A, 45 | run_b_rep_path=None, 46 | run_a_rep_path=None) 47 | 48 | rpd_eval.trim() 49 | rpd_eval.evaluate() 50 | 51 | for run_name, info in runs_rpd.items(): 52 | with open(info.get('path')) as run_file: 53 | info['run'] = pytrec_eval.parse_run(run_file) 54 | trim(info['run']) 55 | info['scores'] = rpd_eval.evaluate(info['run']) 56 | 57 | pairs = [('rpd_wcr04_tf_1', 'rpd_wcr0405_tf_1'), 58 | ('rpd_wcr04_tf_2', 'rpd_wcr0405_tf_2'), 
59 | ('rpd_wcr04_tf_3', 'rpd_wcr0405_tf_3'), 60 | ('rpd_wcr04_tf_4', 'rpd_wcr0405_tf_4'), 61 | ('rpd_wcr04_tf_5', 'rpd_wcr0405_tf_5')] 62 | 63 | df_content = { 64 | 'P_10': [rpd_eval.er(run_b_score=runs_rpd[pair[0]]['scores'], run_a_score=runs_rpd[pair[1]]['scores'])['P_10'] for pair in pairs], 65 | 'ndcg': [rpd_eval.er(run_b_score=runs_rpd[pair[0]]['scores'], run_a_score=runs_rpd[pair[1]]['scores'])['ndcg'] for pair in pairs], 66 | 'map': [rpd_eval.er(run_b_score=runs_rpd[pair[0]]['scores'], run_a_score=runs_rpd[pair[1]]['scores'])['map'] for pair in pairs], 67 | } 68 | 69 | df = pd.DataFrame(df_content, index=['tf_1', 'tf_2', 'tf_3', 'tf_4', 'tf_5']) 70 | orig_val = 1 71 | ax = df.plot.bar(rot=0) 72 | ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color='black') 73 | ax.annotate(' ', (3, orig_val), color='black') 74 | ax.set_xlabel("Reproduced Run") 75 | ax.set_ylabel("Effect Ratio (ER)") 76 | ax.get_figure().savefig('data/plots/rpd_er.pdf', format='pdf', bbox_inches='tight') 77 | plt.show() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /example/rpd_eval.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RpdEvaluator 2 | from repro_eval.util import arp 3 | from repro_eval.util import print_base_adv, print_simple_line 4 | 5 | QREL = './data/qrels/core17.txt' 6 | ORIG_B = './data/runs/orig/input.WCrobust04' 7 | ORIG_A = './data/runs/orig/input.WCrobust0405' 8 | RPD_B = './data/runs/rpd/14/irc_task1_WCrobust04_001' 9 | RPD_A = './data/runs/rpd/14/irc_task1_WCrobust0405_001' 10 | MEASURE = 'ndcg' 11 | 12 | 13 | def main(): 14 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 15 | run_b_orig_path=ORIG_B, 16 | run_a_orig_path=ORIG_A, 17 | run_b_rep_path=RPD_B, 18 | run_a_rep_path=RPD_A) 19 | 20 | rpd_eval.trim() 21 | rpd_eval.evaluate() 22 | 23 | # KTU 24 | ktau = rpd_eval.ktau_union() 25 | print("Kendall's tau Union (KTU)") 26 | print('------------------------------------------------------------------') 27 | for topic, value in ktau.get('baseline').items(): 28 | print_base_adv(topic, 'KTU', value, ktau.get('advanced').get(topic)) 29 | print_base_adv('ARP', 'KTU', arp(ktau.get('baseline')), arp(ktau.get('advanced'))) 30 | 31 | # RBO 32 | rbo = rpd_eval.rbo() 33 | print("Rank-biased Overlap (RBO)") 34 | print('------------------------------------------------------------------') 35 | for topic, value in rbo.get('baseline').items(): 36 | print_base_adv(topic, 'RBO', value, rbo.get('advanced').get(topic)) 37 | print_base_adv('ARP', 'RBO', arp(rbo.get('baseline')), arp(rbo.get('advanced'))) 38 | 39 | # RMSE 40 | rmse = rpd_eval.rmse() 41 | print("Root mean square error (RMSE)") 42 | print('------------------------------------------------------------------') 43 | for measure, value in rmse.get('baseline').items(): 44 | print_base_adv(measure, 'RMSE', value, rmse.get('advanced').get(measure)) 45 | 46 | # ER 47 | print("Effect ratio (ER)") 48 | print('------------------------------------------------------------------') 49 | er = rpd_eval.er() 50 | for measure, value in er.items(): 51 | print_simple_line(measure, 'ER', value) 52 | 53 | # DRI 54 | print("Delta Relative Improvement (DRI)") 55 | print('------------------------------------------------------------------') 56 | dri = rpd_eval.dri() 57 | for measure, value in dri.items(): 58 | print_simple_line(measure, 'DRI', value) 59 | 60 | # ttest 61 | pvals = rpd_eval.ttest() 62 | 
print("Two-tailed paired t-test (p-value)") 63 | print('------------------------------------------------------------------') 64 | for measure, value in pvals.get('baseline').items(): 65 | print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure)) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /example/rpd_ktu.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RpdEvaluator 2 | from repro_eval.util import print_base_adv, print_simple_line, trim, arp 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | sns.set(style="darkgrid") 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | import pytrec_eval 9 | 10 | QREL = './data/qrels/core17.txt' 11 | ORIG_B = './data/runs/orig/input.WCrobust04' 12 | ORIG_A = './data/runs/orig/input.WCrobust0405' 13 | RPD_B = './data/runs/rpd/14/irc_task1_WCrobust04_001' 14 | RPD_A = './data/runs/rpd/14/irc_task1_WCrobust0405_001' 15 | MEASURE = 'ndcg' 16 | 17 | 18 | runs_rpd = { 19 | 'rpd_wcr04_tf_1': 20 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 21 | 'rpd_wcr0405_tf_1': 22 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 23 | 'rpd_wcr04_tf_2': 24 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 25 | 'rpd_wcr0405_tf_2': 26 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 27 | 'rpd_wcr04_tf_3': 28 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 29 | 'rpd_wcr0405_tf_3': 30 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 31 | 'rpd_wcr04_tf_4': 32 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 33 | 'rpd_wcr0405_tf_4': 34 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 35 | 'rpd_wcr04_tf_5': 36 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 37 | 'rpd_wcr0405_tf_5': 38 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 39 | } 40 | 41 | 42 | def main(): 43 | cutoffs = [1000, 100, 50, 20, 10, 5] 44 | 45 | # BASELINE 46 | for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]): 47 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 48 | run_b_orig_path=ORIG_B, 49 | run_a_orig_path=ORIG_A, 50 | run_b_rep_path=None, 51 | run_a_rep_path=None) 52 | 53 | rpd_eval.trim() 54 | rpd_eval.evaluate() 55 | 56 | with open(info.get('path')) as run_file: 57 | info['run'] = pytrec_eval.parse_run(run_file) 58 | for cutoff in cutoffs: 59 | rpd_eval.trim(cutoff) 60 | rpd_eval.trim(cutoff, info['run']) 61 | info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline']) 62 | 63 | df_content = {} 64 | for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]): 65 | df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]] 66 | 67 | ax = pd.DataFrame(data=df_content, index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='-*') 68 | ax.set_xlabel('Cut-off values') 69 | ax.set_ylabel(r"Kendall's $\tau$") 70 | ax.get_figure().savefig('data/plots/rpd_b_ktu.pdf', format='pdf', bbox_inches='tight') 71 | plt.show() 72 | 73 | # ADVANCED 74 | for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]): 75 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 76 | run_b_orig_path=ORIG_B, 77 | run_a_orig_path=ORIG_A, 78 | run_b_rep_path=None, 79 | run_a_rep_path=None) 80 | 81 | rpd_eval.trim() 82 | rpd_eval.evaluate() 83 | 84 | with open(info.get('path')) as run_file: 85 | 
info['run'] = pytrec_eval.parse_run(run_file) 86 | for cutoff in cutoffs: 87 | rpd_eval.trim(cutoff) 88 | rpd_eval.trim(cutoff, info['run']) 89 | # scores = rpl_eval.evaluate(info['run']) 90 | info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline']) 91 | 92 | df_content = {} 93 | for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]): 94 | df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]] 95 | 96 | ax = pd.DataFrame(data=df_content, index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='-*') 97 | ax.set_xlabel('Cut-off values') 98 | ax.set_ylabel(r"Kendall's $\tau$") 99 | ax.get_figure().savefig('data/plots/rpd_a_ktu.pdf', format='pdf', bbox_inches='tight') 100 | plt.show() 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /example/rpd_rmse.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.Evaluator import RpdEvaluator 3 | from repro_eval.util import arp, arp_scores 4 | from repro_eval.util import trim 5 | import pandas as pd 6 | from matplotlib import pyplot as plt 7 | import seaborn as sns 8 | sns.set() 9 | sns.set_style('white') 10 | palette = sns.color_palette("GnBu_d") 11 | sns.set_palette(palette) 12 | colors = sns.color_palette() 13 | 14 | ORIG_B = './data/runs/orig/input.WCrobust04' 15 | ORIG_A = './data/runs/orig/input.WCrobust0405' 16 | QREL = 'data/qrels/core17.txt' 17 | 18 | 19 | runs_rpd = { 20 | 'rpd_wcr04_tf_1': 21 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 22 | 'rpd_wcr0405_tf_1': 23 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 24 | 'rpd_wcr04_tf_2': 25 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 26 | 'rpd_wcr0405_tf_2': 27 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 28 | 'rpd_wcr04_tf_3': 29 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 30 | 'rpd_wcr0405_tf_3': 31 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 32 | 'rpd_wcr04_tf_4': 33 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 34 | 'rpd_wcr0405_tf_4': 35 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 36 | 'rpd_wcr04_tf_5': 37 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 38 | 'rpd_wcr0405_tf_5': 39 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 40 | } 41 | 42 | 43 | def average_retrieval_performance(baseline_scores, reproduced_scores: dict, measures: list, xlabel: str, ylabel: str, outfile: str): 44 | reproduced_scores_arp = [arp_scores(topic_scores) for idx, topic_scores in reproduced_scores.items()] 45 | baseline_scores_arp = arp_scores(baseline_scores) 46 | index = list(reproduced_scores.keys()) 47 | df_content = {} 48 | for measure in measures: 49 | df_content[measure] = [scores.get(measure) for scores in reproduced_scores_arp] 50 | df = pd.DataFrame(df_content, index=index) 51 | 52 | ax = df.plot.bar(rot=0) 53 | for num, measure in enumerate(measures): 54 | orig_val = baseline_scores_arp.get(measure) 55 | ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color=colors[num]) 56 | ax.annotate(' ', (num, orig_val), color=colors[num]) 57 | ax.set_ylim(0.0, 1.0) 58 | 59 | legend_content = [measure + ' (orig)' for measure in measures] + [measure + ' (rpd)' for measure in measures] 60 | ax.legend(legend_content, loc='lower left') 61 | 62 | ax.set_xlabel(xlabel) 63 | ax.set_ylabel(ylabel) 64 | 
ax.get_figure().savefig(outfile, format='pdf', bbox_inches='tight') 65 | plt.show() 66 | 67 | 68 | def main(): 69 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 70 | run_b_orig_path=ORIG_B, 71 | run_a_orig_path=ORIG_A, 72 | run_b_rep_path=None, 73 | run_a_rep_path=None) 74 | 75 | rpd_eval.trim() 76 | rpd_eval.evaluate() 77 | 78 | for run_name, info in runs_rpd.items(): 79 | with open(info.get('path')) as run_file: 80 | info['run'] = pytrec_eval.parse_run(run_file) 81 | trim(info['run']) 82 | info['scores'] = rpd_eval.evaluate(info['run']) 83 | info['rmse'] = rpd_eval.rmse(run_b_score=info['scores']) 84 | 85 | 86 | baseline_runs = ['rpd_wcr04_tf_1', 'rpd_wcr04_tf_2', 'rpd_wcr04_tf_3', 'rpd_wcr04_tf_4', 'rpd_wcr04_tf_5'] 87 | advanced_runs = ['rpd_wcr0405_tf_1', 'rpd_wcr0405_tf_2', 'rpd_wcr0405_tf_3', 'rpd_wcr0405_tf_4', 'rpd_wcr0405_tf_5'] 88 | cutoffs = ['5', '10', '15', '20', '30', '100', '200', '500', '1000'] 89 | 90 | df_content = {} 91 | for run_name in baseline_runs: 92 | df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co] for co in cutoffs] 93 | 94 | df = pd.DataFrame(df_content, index=cutoffs) 95 | ax = df.plot.line(style='o-') 96 | ax.set_xlabel('Cut-off values') 97 | ax.set_ylabel('RMSE') 98 | ax.get_figure().savefig('data/plots/rpd_b_rmse.pdf', format='pdf', bbox_inches='tight') 99 | plt.show() 100 | 101 | df_content = {} 102 | for run_name in advanced_runs: 103 | df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co] for co in cutoffs] 104 | 105 | df = pd.DataFrame(df_content, index=cutoffs) 106 | ax = df.plot.line(style='o-') 107 | ax.set_xlabel('Cut-off values') 108 | ax.set_ylabel('RMSE') 109 | ax.get_figure().savefig('data/plots/rpd_a_rmse.pdf', format='pdf', bbox_inches='tight') 110 | plt.show() 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /example/rpl_dri_vs_er.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RplEvaluator 2 | from repro_eval.util import trim 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | sns.set(style="darkgrid") 6 | 7 | import pytrec_eval 8 | 9 | QREL = './data/qrels/core17.txt' 10 | QREL_RPL = './data/qrels/core18.txt' 11 | ORIG_B = './data/runs/orig/input.WCrobust04' 12 | ORIG_A = './data/runs/orig/input.WCrobust0405' 13 | 14 | 15 | runs_rpl = { 16 | 'rpl_wcr04_tf_1': 17 | {'path': './data/runs/rpl/45/irc_task2_WCrobust04_001'}, 18 | 'rpl_wcr0405_tf_1': 19 | {'path': './data/runs/rpl/45/irc_task2_WCrobust0405_001'}, 20 | 'rpl_wcr04_tf_2': 21 | {'path': './data/runs/rpl/46/irc_task2_WCrobust04_001'}, 22 | 'rpl_wcr0405_tf_2': 23 | {'path': './data/runs/rpl/46/irc_task2_WCrobust0405_001'}, 24 | 'rpl_wcr04_tf_3': 25 | {'path': './data/runs/rpl/47/irc_task2_WCrobust04_001'}, 26 | 'rpl_wcr0405_tf_3': 27 | {'path': './data/runs/rpl/47/irc_task2_WCrobust0405_001'}, 28 | 'rpl_wcr04_tf_4': 29 | {'path': './data/runs/rpl/48/irc_task2_WCrobust04_001'}, 30 | 'rpl_wcr0405_tf_4': 31 | {'path': './data/runs/rpl/48/irc_task2_WCrobust0405_001'}, 32 | 'rpl_wcr04_tf_5': 33 | {'path': './data/runs/rpl/49/irc_task2_WCrobust04_001'}, 34 | 'rpl_wcr0405_tf_5': 35 | {'path': './data/runs/rpl/49/irc_task2_WCrobust0405_001'} 36 | } 37 | 38 | 39 | def main(): 40 | rpl_eval = RplEvaluator(qrel_orig_path=QREL, 41 | run_b_orig_path=ORIG_B, 42 | run_a_orig_path=ORIG_A, 43 | run_b_rep_path=None, 44 | run_a_rep_path=None, 45 | 
qrel_rpd_path=QREL_RPL) 46 | 47 | rpl_eval.trim() 48 | rpl_eval.evaluate() 49 | 50 | for run_name, info in runs_rpl.items(): 51 | with open(info.get('path')) as run_file: 52 | info['run'] = pytrec_eval.parse_run(run_file) 53 | trim(info['run']) 54 | info['scores'] = rpl_eval.evaluate(info['run']) 55 | 56 | dri_er = { 57 | 'wcr_tf_1': { 58 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_1']['scores'], runs_rpl['rpl_wcr0405_tf_1']['scores']), 59 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_1']['scores'], runs_rpl['rpl_wcr0405_tf_1']['scores']) 60 | }, 61 | 'wcr_tf_2': { 62 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_2']['scores'], runs_rpl['rpl_wcr0405_tf_2']['scores']), 63 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_2']['scores'], runs_rpl['rpl_wcr0405_tf_2']['scores']) 64 | }, 65 | 'wcr_tf_3': { 66 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_3']['scores'], runs_rpl['rpl_wcr0405_tf_3']['scores']), 67 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_3']['scores'], runs_rpl['rpl_wcr0405_tf_3']['scores']) 68 | }, 69 | 'wcr_tf_4': { 70 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_4']['scores'], runs_rpl['rpl_wcr0405_tf_4']['scores']), 71 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_4']['scores'], runs_rpl['rpl_wcr0405_tf_4']['scores']) 72 | }, 73 | 'wcr_tf_5': { 74 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_5']['scores'], runs_rpl['rpl_wcr0405_tf_5']['scores']), 75 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_5']['scores'], runs_rpl['rpl_wcr0405_tf_5']['scores']) 76 | }, 77 | 78 | } 79 | 80 | measures = ['P_10', 'map', 'ndcg'] 81 | marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')] 82 | 83 | fig, ax1 = plt.subplots() 84 | ax1.set_xlabel('Effect Ratio (ER)') 85 | ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)') 86 | 87 | for measure, mk in zip(measures, marker_color): 88 | ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()], 89 | [dri_er[r]['dri'][measure] for r in dri_er.keys()], 90 | marker=mk[0], color=mk[1], linestyle='None', label=measure) 91 | 92 | ax1.tick_params(axis='y', labelcolor='k') 93 | fig.tight_layout() 94 | plt.axhline(0, color='grey') 95 | plt.axvline(1, color='grey') 96 | plt.legend() 97 | plt.title('Replicability') 98 | plt.savefig('data/plots/rpl_dri_vs_er.pdf', format='pdf', bbox_inches='tight') 99 | plt.show() 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /example/rpl_er.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.Evaluator import RplEvaluator 3 | from repro_eval.util import trim 4 | import pandas as pd 5 | from matplotlib import pyplot as plt 6 | import seaborn as sns 7 | sns.set() 8 | sns.set_style('whitegrid') 9 | palette = sns.color_palette("GnBu_d") 10 | sns.set_palette(palette) 11 | colors = sns.color_palette() 12 | 13 | ORIG_B = './data/runs/orig/input.WCrobust04' 14 | ORIG_A = './data/runs/orig/input.WCrobust0405' 15 | QREL = 'data/qrels/core17.txt' 16 | QREL_RPL = 'data/qrels/core18.txt' 17 | 18 | runs_rpl = { 19 | 'rpl_wcr04_tf_1': 20 | {'path': './data/runs/rpl/45/irc_task2_WCrobust04_001'}, 21 | 'rpl_wcr0405_tf_1': 22 | {'path': './data/runs/rpl/45/irc_task2_WCrobust0405_001'}, 23 | 'rpl_wcr04_tf_2': 24 | {'path': './data/runs/rpl/46/irc_task2_WCrobust04_001'}, 25 | 'rpl_wcr0405_tf_2': 26 | {'path': './data/runs/rpl/46/irc_task2_WCrobust0405_001'}, 27 | 'rpl_wcr04_tf_3': 28 | {'path': './data/runs/rpl/47/irc_task2_WCrobust04_001'}, 29 | 'rpl_wcr0405_tf_3': 30 | {'path': 
'./data/runs/rpl/47/irc_task2_WCrobust0405_001'}, 31 | 'rpl_wcr04_tf_4': 32 | {'path': './data/runs/rpl/48/irc_task2_WCrobust04_001'}, 33 | 'rpl_wcr0405_tf_4': 34 | {'path': './data/runs/rpl/48/irc_task2_WCrobust0405_001'}, 35 | 'rpl_wcr04_tf_5': 36 | {'path': './data/runs/rpl/49/irc_task2_WCrobust04_001'}, 37 | 'rpl_wcr0405_tf_5': 38 | {'path': './data/runs/rpl/49/irc_task2_WCrobust0405_001'} 39 | } 40 | 41 | 42 | def main(): 43 | rpl_eval = RplEvaluator(qrel_orig_path=QREL, 44 | run_b_orig_path=ORIG_B, 45 | run_a_orig_path=ORIG_A, 46 | run_b_rep_path=None, 47 | run_a_rep_path=None, 48 | qrel_rpd_path=QREL_RPL) 49 | 50 | rpl_eval.trim() 51 | rpl_eval.evaluate() 52 | 53 | for run_name, info in runs_rpl.items(): 54 | with open(info.get('path')) as run_file: 55 | info['run'] = pytrec_eval.parse_run(run_file) 56 | trim(info['run']) 57 | info['scores'] = rpl_eval.evaluate(info['run']) 58 | 59 | pairs = [('rpl_wcr04_tf_1', 'rpl_wcr0405_tf_1'), 60 | ('rpl_wcr04_tf_2', 'rpl_wcr0405_tf_2'), 61 | ('rpl_wcr04_tf_3', 'rpl_wcr0405_tf_3'), 62 | ('rpl_wcr04_tf_4', 'rpl_wcr0405_tf_4'), 63 | ('rpl_wcr04_tf_5', 'rpl_wcr0405_tf_5')] 64 | 65 | df_content = { 66 | 'P_10': [rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'], run_a_score=runs_rpl[pair[1]]['scores'])['P_10'] for pair in pairs], 67 | 'ndcg': [rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'], run_a_score=runs_rpl[pair[1]]['scores'])['ndcg'] for pair in pairs], 68 | 'map': [rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'], run_a_score=runs_rpl[pair[1]]['scores'])['map'] for pair in pairs], 69 | } 70 | 71 | df = pd.DataFrame(df_content, index=['tf_1', 'tf_2', 'tf_3', 'tf_4', 'tf_5']) 72 | orig_val = 1 73 | ax = df.plot.bar(rot=0) 74 | ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color='black') 75 | ax.annotate(' ', (3, orig_val), color='black') 76 | ax.set_xlabel("Replicated Run") 77 | ax.set_ylabel("Effect Ratio (ER)") 78 | ax.get_figure().savefig('data/plots/rpl_er.pdf', format='pdf', bbox_inches='tight') 79 | plt.show() 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /example/rpl_eval.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RplEvaluator 2 | from repro_eval.util import print_base_adv, print_simple_line 3 | 4 | QREL = './data/qrels/core17.txt' 5 | QREL_RPL = './data/qrels/core18.txt' 6 | ORIG_B = './data/runs/orig/input.WCrobust04' 7 | ORIG_A = './data/runs/orig/input.WCrobust0405' 8 | RPL_B = './data/runs/rpl/14/irc_task2_WCrobust04_001' 9 | RPL_A = './data/runs/rpl/14/irc_task2_WCrobust0405_001' 10 | MEASURE = 'ndcg' 11 | 12 | 13 | def main(): 14 | rpl_eval = RplEvaluator(qrel_orig_path=QREL, 15 | run_b_orig_path=ORIG_B, 16 | run_a_orig_path=ORIG_A, 17 | run_b_rep_path=RPL_B, 18 | run_a_rep_path=RPL_A, 19 | qrel_rpd_path=QREL_RPL) 20 | 21 | rpl_eval.trim() 22 | rpl_eval.evaluate() 23 | 24 | # ER 25 | print("Effect ratio (ER)") 26 | print('------------------------------------------------------------------') 27 | er = rpl_eval.er() 28 | for measure, value in er.items(): 29 | print_simple_line(measure, 'ER', value) 30 | 31 | # DRI 32 | print("Delta Relative Improvement (DRI)") 33 | print('------------------------------------------------------------------') 34 | dri = rpl_eval.dri() 35 | for measure, value in dri.items(): 36 | print_simple_line(measure, 'DRI', value) 37 | 38 | # ttest 39 | pvals = rpl_eval.ttest() 40 | print("Two-tailed unpaired t-test (p-value)") 41 
| print('------------------------------------------------------------------') 42 | for measure, value in pvals.get('baseline').items(): 43 | print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure)) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /repro_eval/Evaluator.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.util import trim, break_ties 3 | from repro_eval.measure.statistics import ttest 4 | from repro_eval.measure.overall_effects import ER, deltaRI 5 | from repro_eval.measure.document_order import ktau_union as ktu, RBO 6 | from repro_eval.measure.effectiveness import rmse as RMSE, nrmse as nRMSE 7 | from repro_eval.config import ERR_MSG 8 | 9 | 10 | class Evaluator(object): 11 | """ 12 | An abstract evaluator that holds the original baseline and advanced run as well as 13 | the reproduced/replicated baseline and advanced run. 14 | """ 15 | 16 | def __init__(self, **kwargs): 17 | self.qrel_orig_path = kwargs.get('qrel_orig_path', None) 18 | self.run_b_orig_path = kwargs.get('run_b_orig_path', None) 19 | self.run_a_orig_path = kwargs.get('run_a_orig_path', None) 20 | self.run_b_rep_path = kwargs.get('run_b_rep_path', None) 21 | self.run_a_rep_path = kwargs.get('run_a_rep_path', None) 22 | self.run_b_orig = None 23 | self.run_a_orig = None 24 | self.run_b_rep = None 25 | self.run_a_rep = None 26 | self.run_b_orig_score = None 27 | self.run_a_orig_score = None 28 | self.run_b_rep_score = None 29 | self.run_a_rep_score = None 30 | 31 | if self.qrel_orig_path: 32 | with open(self.qrel_orig_path, 'r') as f_qrel: 33 | qrel_orig = pytrec_eval.parse_qrel(f_qrel) 34 | self.rel_eval = pytrec_eval.RelevanceEvaluator(qrel_orig, pytrec_eval.supported_measures) 35 | 36 | if self.run_b_orig_path: 37 | with open(self.run_b_orig_path, 'r') as f_run: 38 | self.run_b_orig = pytrec_eval.parse_run(f_run) 39 | self.run_b_orig = {t: self.run_b_orig[t] for t in sorted(self.run_b_orig)} 40 | 41 | if self.run_a_orig_path: 42 | with open(self.run_a_orig_path, 'r') as f_run: 43 | self.run_a_orig = pytrec_eval.parse_run(f_run) 44 | self.run_a_orig = {t: self.run_a_orig[t] for t in sorted(self.run_a_orig)} 45 | 46 | if self.run_b_rep_path: 47 | with open(self.run_b_rep_path, 'r') as f_run: 48 | self.run_b_rep = pytrec_eval.parse_run(f_run) 49 | self.run_b_rep = {t: self.run_b_rep[t] for t in sorted(self.run_b_rep)} 50 | 51 | if self.run_a_rep_path: 52 | with open(self.run_a_rep_path, 'r') as f_run: 53 | self.run_a_rep = pytrec_eval.parse_run(f_run) 54 | self.run_a_rep = {t: self.run_a_rep[t] for t in sorted(self.run_a_rep)} 55 | 56 | def trim(self, t=None, run=None): 57 | """ 58 | Trims all runs of the Evaluator to the length specified by the threshold value t. 59 | 60 | @param t: Threshold parameter or number of top-k documents to be considered. 61 | @param run: If run is not None, only the provided run will be trimmed. 
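        A minimal usage sketch (the file paths below are placeholders, not files
        shipped with the package):

            rpd_eval = RpdEvaluator(qrel_orig_path='qrels.txt',
                                    run_b_orig_path='orig_b.txt',
                                    run_a_orig_path='orig_a.txt',
                                    run_b_rep_path='rpd_b.txt',
                                    run_a_rep_path='rpd_a.txt')
            rpd_eval.trim()       # trim all runs to the default threshold
            rpd_eval.trim(10)     # keep only the top-10 documents of every run
            # trim only an externally parsed run:
            # rpd_eval.trim(10, run=pytrec_eval.parse_run(open('other_run.txt')))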
62 | """ 63 | if run: 64 | run = break_ties(run) 65 | if t: 66 | trim(run, thresh=t) 67 | else: 68 | trim(run) 69 | return 70 | 71 | if self.run_b_orig: 72 | self.run_b_orig = break_ties(self.run_b_orig) 73 | if t: 74 | trim(self.run_b_orig, thresh=t) 75 | else: 76 | trim(self.run_b_orig) 77 | 78 | if self.run_a_orig: 79 | self.run_a_orig = break_ties(self.run_a_orig) 80 | if t: 81 | trim(self.run_a_orig, thresh=t) 82 | else: 83 | trim(self.run_a_orig) 84 | 85 | if self.run_b_rep: 86 | self.run_b_rep = break_ties(self.run_b_rep) 87 | if t: 88 | trim(self.run_b_rep, thresh=t) 89 | else: 90 | trim(self.run_b_rep) 91 | 92 | if self.run_a_rep: 93 | self.run_a_rep = break_ties(self.run_a_rep) 94 | if t: 95 | trim(self.run_a_rep, thresh=t) 96 | else: 97 | trim(self.run_a_rep) 98 | 99 | def evaluate(self, run=None): 100 | """ 101 | Evaluates the original baseline and advanced run if available. 102 | 103 | @param run: Reproduced or replicated run that will be evaluated. 104 | """ 105 | if self.run_b_orig: 106 | self.run_b_orig = break_ties(self.run_b_orig) 107 | self.run_b_orig_score = self.rel_eval.evaluate(self.run_b_orig) 108 | if self.run_a_orig: 109 | self.run_a_orig = break_ties(self.run_a_orig) 110 | self.run_a_orig_score = self.rel_eval.evaluate(self.run_a_orig) 111 | 112 | def er(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 113 | """ 114 | Determines the Effect Ratio (ER) according to the following paper: 115 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 116 | How to Measure the Reproducibility of System-oriented IR Experiments. 117 | Proceedings of SIGIR, pages 349-358, 2020. 118 | 119 | The ER value is determined by the ratio between the mean improvements 120 | of the original and reproduced/replicated experiments. 121 | 122 | @param run_b_score: Scores of the baseline run, 123 | if not provided the scores of the RpdEvaluator object will be used instead. 124 | @param run_a_score: Scores of the advanced run, 125 | if not provided the scores of the RpdEvaluator object will be used instead. 126 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 127 | @return: Dictionary containing the ER values for the specified run combination. 
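        In pseudocode, the "ratio between the mean improvements" described above
        corresponds to the following (per evaluation measure m, averaged over
        topics t; a paraphrase of the definition, not the exact library code):

            mean_delta_rep[m]  = mean over t of (rep_score_a[t][m]  - rep_score_b[t][m])
            mean_delta_orig[m] = mean over t of (orig_score_a[t][m] - orig_score_b[t][m])
            ER[m] = mean_delta_rep[m] / mean_delta_orig[m]

        After evaluate() has been called, a typical call is er = rpd_eval.er(),
        which returns a dictionary mapping measure names to ER values, as in the
        example scripts.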
128 | """ 129 | if print_feedback: 130 | print('Determining Effect Ratio (ER)') 131 | 132 | if self.run_b_orig_score and self.run_a_orig_score and run_b_path and run_a_path: 133 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 134 | run_b_rep = pytrec_eval.parse_run(b_run) 135 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 136 | run_b_rep_score = self.rel_eval_rpl.evaluate(run_b_rep) if hasattr(self, 'rel_eval_rpl') else self.rel_eval.evaluate(run_b_rep) 137 | run_a_rep = pytrec_eval.parse_run(a_run) 138 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 139 | run_a_rep_score = self.rel_eval_rpl.evaluate(run_a_rep) if hasattr(self, 'rel_eval_rpl') else self.rel_eval.evaluate(run_a_rep) 140 | return ER(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 141 | rep_score_b=run_b_rep_score, rep_score_a=run_a_rep_score, pbar=print_feedback) 142 | 143 | if self.run_b_orig_score and self.run_a_orig_score and run_b_score and run_a_score: 144 | return ER(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 145 | rep_score_b=run_b_score, rep_score_a=run_a_score, pbar=print_feedback) 146 | 147 | if self.run_b_orig_score and self.run_a_orig_score and self.run_b_rep_score and self.run_a_rep_score: 148 | return ER(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 149 | rep_score_b=self.run_b_rep_score, rep_score_a=self.run_a_rep_score, pbar=print_feedback) 150 | else: 151 | print(ERR_MSG) 152 | 153 | def dri(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 154 | """ 155 | Determines the Delta Relative Improvement (DeltaRI) according to the following paper: 156 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 157 | How to Measure the Reproducibility of System-oriented IR Experiments. 158 | Proceedings of SIGIR, pages 349-358, 2020. 159 | 160 | The DeltaRI value is determined by the difference between the relative improvements 161 | of the original and reproduced/replicated experiments. 162 | 163 | @param run_b_score: Scores of the baseline run, 164 | if not provided the scores of the RpdEvaluator object will be used instead. 165 | @param run_a_score: Scores of the advanced run, 166 | if not provided the scores of the RpdEvaluator object will be used instead. 167 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 168 | @return: Dictionary containing the DRI values for the specified run combination. 
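        In pseudocode, the "difference between the relative improvements"
        described above corresponds to the following (per measure m, using mean
        topic scores; a paraphrase of the paper's definition, not the exact
        library code):

            RI_orig[m] = (mean(orig_score_a[m]) - mean(orig_score_b[m])) / mean(orig_score_b[m])
            RI_rep[m]  = (mean(rep_score_a[m])  - mean(rep_score_b[m]))  / mean(rep_score_b[m])
            DRI[m]     = RI_orig[m] - RI_rep[m]

        A DRI close to zero (together with an ER close to one) indicates that the
        relative improvement of the advanced over the baseline run was preserved,
        cf. the rpd_dri_vs_er.py and rpl_dri_vs_er.py example plots.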
169 | """ 170 | if print_feedback: 171 | print('Determining Delta Relative Improvement (DRI)') 172 | 173 | if self.run_b_orig_score and self.run_a_orig_score and run_b_path and run_a_path: 174 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 175 | run_b_rep = pytrec_eval.parse_run(b_run) 176 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 177 | run_b_rep_score = self.rel_eval_rpl.evaluate(run_b_rep) if hasattr(self, 'rel_eval_rpl') else self.rel_eval.evaluate(run_b_rep) 178 | run_a_rep = pytrec_eval.parse_run(a_run) 179 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 180 | run_a_rep_score = self.rel_eval_rpl.evaluate(run_a_rep) if hasattr(self, 'rel_eval_rpl') else self.rel_eval.evaluate(run_a_rep) 181 | return deltaRI(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 182 | rep_score_b=run_b_rep_score, rep_score_a=run_a_rep_score, pbar=print_feedback) 183 | 184 | if self.run_b_orig_score and self.run_a_orig_score and run_b_score and run_a_score: 185 | return deltaRI(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 186 | rep_score_b=run_b_score, rep_score_a=run_a_score, pbar=print_feedback) 187 | 188 | if self.run_b_orig_score and self.run_a_orig_score and self.run_b_rep_score and self.run_a_rep_score: 189 | return deltaRI(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 190 | rep_score_b=self.run_b_rep_score, rep_score_a=self.run_a_rep_score, pbar=print_feedback) 191 | else: 192 | print(ERR_MSG) 193 | 194 | def _ttest(self, rpd=True, run_b_score=None, run_a_score=None, print_feedback=False): 195 | """ 196 | Conducts either a paired (reproducibility) or unpaired (replicability) two-sided t-test according to the following paper: 197 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 198 | How to Measure the Reproducibility of System-oriented IR Experiments. 199 | Proceedings of SIGIR, pages 349-358, 2020. 200 | 201 | @param rpd: Boolean indicating if the evaluated runs are reproduced. 202 | @param run_b_score: Scores of the baseline run, 203 | if not provided the scores of the RpdEvaluator object will be used instead. 204 | @param run_a_score: Scores of the advanced run, 205 | if not provided the scores of the RpdEvaluator object will be used instead. 206 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 207 | @return: Dictionary with p-values that compare the score distributions of the baseline and advanced run. 
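        The returned dictionary has the same shape as the one printed by
        __main__.py and the example scripts, e.g. (p-values are illustrative):

            {'baseline': {'map': 0.87, 'ndcg': 0.52, ...},
             'advanced': {'map': 0.91, 'ndcg': 0.47, ...}}

        The 'advanced' entry is only present when scores of an advanced run are
        available.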
208 | """ 209 | if self.run_b_orig_score and (self.run_b_rep_score or run_b_score): 210 | if run_b_score and run_a_score: 211 | if print_feedback: 212 | print('Determining p-values of t-test for baseline and advanced run.') 213 | return {'baseline': ttest(self.run_b_orig_score, run_b_score, rpd=rpd, pbar=print_feedback), 214 | 'advanced': ttest(self.run_a_orig_score, run_a_score, rpd=rpd, pbar=print_feedback)} 215 | if run_b_score: 216 | if print_feedback: 217 | print('Determining p-values of t-test for baseline run.') 218 | return {'baseline': ttest(self.run_b_orig_score, run_b_score, rpd=rpd, pbar=print_feedback)} 219 | if self.run_a_orig_score and self.run_a_rep_score: 220 | if print_feedback: 221 | print('Determining p-values of t-test for baseline and advanced run.') 222 | return {'baseline': ttest(self.run_b_orig_score, self.run_b_rep_score, rpd=rpd, pbar=print_feedback), 223 | 'advanced': ttest(self.run_a_orig_score, self.run_a_rep_score, rpd=rpd, pbar=print_feedback)} 224 | else: 225 | if print_feedback: 226 | print('Determining p-values of t-test for baseline run.') 227 | return {'baseline': ttest(self.run_b_orig_score, self.run_b_rep_score, rpd=rpd, pbar=print_feedback)} 228 | else: 229 | print(ERR_MSG) 230 | 231 | 232 | class RpdEvaluator(Evaluator): 233 | """ 234 | The Reproducibility Evaluator is used for quantifying the different levels of reproduction for runs that were 235 | derived from the same test collection used in the original experiment. 236 | """ 237 | 238 | def evaluate(self, run=None): 239 | """ 240 | Evaluates the scores of the original and reproduced baseline and advanced runs. 241 | If a (reproduced) run is provided only this one will be evaluated and a dictionary with the corresponding 242 | scores is returned. 243 | @param run: A reproduced run. If not specified, the original and reproduced runs of the the RpdEvaluator will 244 | be used instead. 245 | @return: If run is specified, a dictionary with the corresponding scores is returned. 246 | """ 247 | if run: 248 | return self.rel_eval.evaluate(run) 249 | 250 | super(RpdEvaluator, self).evaluate() 251 | if self.run_b_rep: 252 | self.run_b_rep = break_ties(self.run_b_rep) 253 | self.run_b_rep_score = self.rel_eval.evaluate(self.run_b_rep) 254 | if self.run_a_rep: 255 | self.run_a_rep = break_ties(self.run_a_rep) 256 | self.run_a_rep_score = self.rel_eval.evaluate(self.run_a_rep) 257 | 258 | def ktau_union(self, run_b_rep=None, run_a_rep=None, run_b_path=None, run_a_path=None, print_feedback=False): 259 | """ 260 | Determines Kendall's tau Union (KTU) between the original and reproduced document orderings 261 | according to the following paper: 262 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 263 | How to Measure the Reproducibility of System-oriented IR Experiments. 264 | Proceedings of SIGIR, pages 349-358, 2020. 265 | 266 | @param run_b_rep: Scores of the baseline run, 267 | if not provided the scores of the RpdEvaluator object will be used instead. 268 | @param run_a_rep: Scores of the advanced run, 269 | if not provided the scores of the RpdEvaluator object will be used instead. 270 | @param run_b_path: Path to another reproduced baseline run, 271 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 272 | @param run_a_path: Path to another reproduced advanced run, 273 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 
274 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 275 | @return: Dictionary with KTU values that compare the document orderings of the original and reproduced runs. 276 | """ 277 | if self.run_b_orig and run_b_path: 278 | if self.run_a_orig and run_a_path: 279 | if print_feedback: 280 | print("Determining Kendall's tau Union (KTU) for baseline and advanced run.") 281 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 282 | run_b_rep = pytrec_eval.parse_run(b_run) 283 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 284 | run_a_rep = pytrec_eval.parse_run(a_run) 285 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 286 | return {'baseline': ktu(self.run_b_orig, run_b_rep, pbar=print_feedback), 287 | 'advanced': ktu(self.run_a_orig, run_a_rep, pbar=print_feedback)} 288 | else: 289 | if print_feedback: 290 | print("Determining Kendall's tau Union (KTU) for baseline run.") 291 | with open(run_b_path, 'r') as b_run: 292 | run_b_rep = pytrec_eval.parse_run(b_run) 293 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 294 | return {'baseline': ktu(self.run_b_orig, run_b_rep, pbar=print_feedback)} 295 | 296 | if self.run_b_orig and run_b_rep: 297 | if self.run_a_orig and run_a_rep: 298 | if print_feedback: 299 | print("Determining Kendall's tau Union (KTU) for baseline and advanced run.") 300 | return {'baseline': ktu(self.run_b_orig, run_b_rep, pbar=print_feedback), 301 | 'advanced': ktu(self.run_a_orig, run_a_rep, pbar=print_feedback)} 302 | else: 303 | if print_feedback: 304 | print("Determining Kendall's tau Union (KTU) for baseline run.") 305 | return {'baseline': ktu(self.run_b_orig, run_b_rep, pbar=print_feedback)} 306 | 307 | if self.run_b_orig and self.run_b_rep: 308 | if self.run_a_orig and self.run_a_rep: 309 | if print_feedback: 310 | print("Determining Kendall's tau Union (KTU) for baseline and advanced run.") 311 | return {'baseline': ktu(self.run_b_orig, self.run_b_rep, pbar=print_feedback), 312 | 'advanced': ktu(self.run_a_orig, self.run_a_rep, pbar=print_feedback)} 313 | else: 314 | if print_feedback: 315 | print("Determining Kendall's tau Union (KTU) for baseline run.") 316 | return {'baseline': ktu(self.run_b_orig, self.run_b_rep, pbar=print_feedback)} 317 | else: 318 | print(ERR_MSG) 319 | 320 | def rbo(self, run_b_rep=None, run_a_rep=None, run_b_path=None, run_a_path=None, print_feedback=False, misinfo=True): 321 | """ 322 | Determines the Rank-Biased Overlap (RBO) between the original and reproduced document orderings 323 | according to the following paper: 324 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 325 | How to Measure the Reproducibility of System-oriented IR Experiments. 326 | Proceedings of SIGIR, pages 349-358, 2020. 327 | 328 | @param run_b_rep: Scores of the baseline run, 329 | if not provided the scores of the RpdEvaluator object will be used instead. 330 | @param run_a_rep: Scores of the advanced run, 331 | if not provided the scores of the RpdEvaluator object will be used instead. 332 | @param run_b_path: Path to another reproduced baseline run, 333 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 334 | @param run_a_path: Path to another reproduced advanced run, 335 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 336 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 
337 | @param misinfo: Use the RBO implementation that is also used in the TREC Health Misinformation Track. 338 | See also: https://github.com/claclark/Compatibility 339 | @return: Dictionary with RBO values that compare the document orderings of the original and reproduced runs. 340 | """ 341 | if self.run_b_orig and run_b_path: 342 | if self.run_a_orig and run_a_path: 343 | if print_feedback: 344 | print("Determining Rank-biased Overlap (RBO) for baseline and advanced run.") 345 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 346 | run_b_rep = pytrec_eval.parse_run(b_run) 347 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 348 | run_a_rep = pytrec_eval.parse_run(a_run) 349 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 350 | return {'baseline': RBO(self.run_b_orig, run_b_rep, pbar=print_feedback, misinfo=misinfo), 351 | 'advanced': RBO(self.run_a_orig, run_a_rep, pbar=print_feedback, misinfo=misinfo)} 352 | else: 353 | if print_feedback: 354 | print("Determining Rank-biased Overlap (RBO) for baseline run.") 355 | with open(run_b_path, 'r') as b_run: 356 | run_b_rep = pytrec_eval.parse_run(b_run) 357 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 358 | return {'baseline': RBO(self.run_b_orig, run_b_rep, pbar=print_feedback, misinfo=misinfo)} 359 | 360 | if self.run_b_orig and run_b_rep: 361 | if self.run_a_orig and run_a_rep: 362 | if print_feedback: 363 | print("Determining Rank-biased Overlap (RBO) for baseline and advanced run.") 364 | return {'baseline': RBO(self.run_b_orig, run_b_rep, pbar=print_feedback, misinfo=misinfo), 365 | 'advanced': RBO(self.run_a_orig, run_a_rep, pbar=print_feedback, misinfo=misinfo)} 366 | else: 367 | if print_feedback: 368 | print("Determining Rank-biased Overlap (RBO) for baseline run.") 369 | return {'baseline': RBO(self.run_b_orig, run_b_rep, pbar=print_feedback, misinfo=misinfo)} 370 | if self.run_b_orig and self.run_b_rep: 371 | if self.run_a_orig and self.run_a_rep: 372 | if print_feedback: 373 | print("Determining Rank-biased Overlap (RBO) for baseline and advanced run.") 374 | return {'baseline': RBO(self.run_b_orig, self.run_b_rep, pbar=print_feedback, misinfo=misinfo), 375 | 'advanced': RBO(self.run_a_orig, self.run_a_rep, pbar=print_feedback, misinfo=misinfo)} 376 | else: 377 | if print_feedback: 378 | print("Determining Rank-biased Overlap (RBO) for baseline run.") 379 | return {'baseline': RBO(self.run_b_orig, self.run_b_rep, pbar=print_feedback, misinfo=misinfo)} 380 | else: 381 | print(ERR_MSG) 382 | 383 | def rmse(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 384 | """ 385 | Determines the Root Mean Square Error (RMSE) according to the following paper: 386 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 387 | How to Measure the Reproducibility of System-oriented IR Experiments. 388 | Proceedings of SIGIR, pages 349-358, 2020. 389 | 390 | @param run_b_score: Scores of the baseline run, 391 | if not provided the scores of the RpdEvaluator object will be used instead. 392 | @param run_a_score: Scores of the advanced run, 393 | if not provided the scores of the RpdEvaluator object will be used instead. 394 | @param run_b_path: Path to another reproduced baseline run, 395 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 
396 | @param run_a_path: Path to another reproduced advanced run, 397 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 398 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 399 | @return: Dictionary with RMSE values that measure the closeness 400 | between the topics scores of the original and reproduced runs. 401 | """ 402 | if self.run_b_orig and run_b_path: 403 | if self.run_a_orig and run_a_path: 404 | if print_feedback: 405 | print("Determining Root Mean Square Error (RMSE) for baseline and advanced run.") 406 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 407 | run_b_rep = pytrec_eval.parse_run(b_run) 408 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 409 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 410 | run_a_rep = pytrec_eval.parse_run(a_run) 411 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 412 | run_a_rep_score = self.rel_eval.evaluate(run_a_rep) 413 | return {'baseline': RMSE(self.run_b_orig_score, run_b_rep_score, pbar=print_feedback), 414 | 'advanced': RMSE(self.run_a_orig_score, run_a_rep_score, pbar=print_feedback)} 415 | else: 416 | if print_feedback: 417 | print("Determining Root Mean Square Error (RMSE) for baseline run.") 418 | with open(run_b_path, 'r') as b_run: 419 | run_b_rep = pytrec_eval.parse_run(b_run) 420 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 421 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 422 | return {'baseline': RMSE(self.run_b_orig_score, run_b_rep_score, pbar=print_feedback)} 423 | 424 | if self.run_b_orig_score and run_b_score: 425 | if self.run_a_orig_score and run_a_score: 426 | if print_feedback: 427 | print("Determining Root Mean Square Error (RMSE) for baseline and advanced run.") 428 | return {'baseline': RMSE(self.run_b_orig_score, run_b_score, pbar=print_feedback), 429 | 'advanced': RMSE(self.run_a_orig_score, run_a_score, pbar=print_feedback)} 430 | else: 431 | if print_feedback: 432 | print("Determining Root Mean Square Error (RMSE) for baseline run.") 433 | return {'baseline': RMSE(self.run_b_orig_score, run_b_score, pbar=print_feedback)} 434 | if self.run_b_orig_score and self.run_b_rep_score: 435 | if self.run_a_orig_score and self.run_a_rep_score: 436 | if print_feedback: 437 | print("Determining Root Mean Square Error (RMSE) for baseline and advanced run.") 438 | return {'baseline': RMSE(self.run_b_orig_score, self.run_b_rep_score, pbar=print_feedback), 439 | 'advanced': RMSE(self.run_a_orig_score, self.run_a_rep_score, pbar=print_feedback)} 440 | else: 441 | if print_feedback: 442 | print("Determining Root Mean Square Error (RMSE) for baseline run.") 443 | return {'baseline': RMSE(self.run_b_orig_score, self.run_b_rep_score, pbar=print_feedback)} 444 | else: 445 | print(ERR_MSG) 446 | 447 | def nrmse(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 448 | """ 449 | Determines the normalized Root Mean Square Error (RMSE). 450 | 451 | @param run_b_score: Scores of the baseline run, 452 | if not provided the scores of the RpdEvaluator object will be used instead. 453 | @param run_a_score: Scores of the advanced run, 454 | if not provided the scores of the RpdEvaluator object will be used instead. 455 | @param run_b_path: Path to another reproduced baseline run, 456 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 
457 | @param run_a_path: Path to another reproduced advanced run, 458 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 459 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 460 | @return: Dictionary with nRMSE values that measure the closeness 461 | between the topics scores of the original and reproduced runs. 462 | """ 463 | if self.run_b_orig and run_b_path: 464 | if self.run_a_orig and run_a_path: 465 | if print_feedback: 466 | print("Determining normalized Root Mean Square Error (RMSE) for baseline and advanced run.") 467 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 468 | run_b_rep = pytrec_eval.parse_run(b_run) 469 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 470 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 471 | run_a_rep = pytrec_eval.parse_run(a_run) 472 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 473 | run_a_rep_score = self.rel_eval.evaluate(run_a_rep) 474 | return {'baseline': nRMSE(self.run_b_orig_score, run_b_rep_score, pbar=print_feedback), 475 | 'advanced': nRMSE(self.run_a_orig_score, run_a_rep_score, pbar=print_feedback)} 476 | else: 477 | if print_feedback: 478 | print("Determining normalized Root Mean Square Error (RMSE) for baseline run.") 479 | with open(run_b_path, 'r') as b_run: 480 | run_b_rep = pytrec_eval.parse_run(b_run) 481 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 482 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 483 | return {'baseline': nRMSE(self.run_b_orig_score, run_b_rep_score, pbar=print_feedback)} 484 | 485 | if self.run_b_orig_score and run_b_score: 486 | if self.run_a_orig_score and run_a_score: 487 | if print_feedback: 488 | print("Determining normalized Root Mean Square Error (RMSE) for baseline and advanced run.") 489 | return {'baseline': nRMSE(self.run_b_orig_score, run_b_score, pbar=print_feedback), 490 | 'advanced': nRMSE(self.run_a_orig_score, run_a_score, pbar=print_feedback)} 491 | else: 492 | if print_feedback: 493 | print("Determining normalized Root Mean Square Error (RMSE) for baseline run.") 494 | return {'baseline': nRMSE(self.run_b_orig_score, run_b_score, pbar=print_feedback)} 495 | if self.run_b_orig_score and self.run_b_rep_score: 496 | if self.run_a_orig_score and self.run_a_rep_score: 497 | if print_feedback: 498 | print("Determining Root Mean Square Error (RMSE) for baseline and advanced run.") 499 | return {'baseline': nRMSE(self.run_b_orig_score, self.run_b_rep_score, pbar=print_feedback), 500 | 'advanced': nRMSE(self.run_a_orig_score, self.run_a_rep_score, pbar=print_feedback)} 501 | else: 502 | if print_feedback: 503 | print("Determining normalized Root Mean Square Error (RMSE) for baseline run.") 504 | return {'baseline': nRMSE(self.run_b_orig_score, self.run_b_rep_score, pbar=print_feedback)} 505 | else: 506 | print(ERR_MSG) 507 | 508 | def ttest(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 509 | """ 510 | Conducts a paired two-tailed t-test for reproduced runs that were derived from the same test collection 511 | as in the original experiment. 512 | 513 | @param run_b_score: Scores of the baseline run, 514 | if not provided the scores of the RpdEvaluator object will be used instead. 515 | @param run_a_score: Scores of the advanced run, 516 | if not provided the scores of the RpdEvaluator object will be used instead. 
517 | @param run_b_path: Path to another reproduced baseline run, 518 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 519 | @param run_a_path: Path to another reproduced advanced run, 520 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 521 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 522 | @return: Dictionary with p-values that compare the score distributions of the baseline and advanced run. 523 | """ 524 | if run_b_path: 525 | if run_a_path: 526 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 527 | run_b_rep = pytrec_eval.parse_run(b_run) 528 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 529 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 530 | run_a_rep = pytrec_eval.parse_run(a_run) 531 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 532 | run_a_rep_score = self.rel_eval.evaluate(run_a_rep) 533 | return self._ttest(run_b_score=run_b_rep_score, run_a_score=run_a_rep_score, print_feedback=print_feedback) 534 | else: 535 | with open(run_b_path, 'r') as b_run: 536 | run_b_rep = pytrec_eval.parse_run(b_run) 537 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 538 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 539 | return self._ttest(run_b_score=run_b_rep_score, run_a_score=None, print_feedback=print_feedback) 540 | 541 | return self._ttest(run_b_score=run_b_score, run_a_score=run_a_score, print_feedback=print_feedback) 542 | 543 | 544 | class RplEvaluator(Evaluator): 545 | """ 546 | The Replicability Evaluator is used for quantifying the different levels of replication for runs that were 547 | derived from a test collection not used in the original experiment. 548 | """ 549 | def __init__(self, **kwargs): 550 | super(RplEvaluator, self).__init__(**kwargs) 551 | self.qrel_rpl_path = kwargs.get('qrel_rpl_path', None) 552 | 553 | if self.qrel_rpl_path: 554 | with open(self.qrel_rpl_path, 'r') as f_qrel: 555 | qrel_rpl = pytrec_eval.parse_qrel(f_qrel) 556 | self.rel_eval_rpl = pytrec_eval.RelevanceEvaluator(qrel_rpl, pytrec_eval.supported_measures) 557 | 558 | def evaluate(self, run=None): 559 | """ 560 | Evaluates the scores of the original and replicated baseline and advanced runs. 561 | If a (replicated) run is provided only this one will be evaluated and a dictionary with the corresponding 562 | scores is returned. 563 | @param run: A replicated run. If not specified, the original and replicated runs of the the RplEvaluator will 564 | be used instead. 565 | @return: If run is specified, a dictionary with the corresponding scores is returned. 566 | """ 567 | if run: 568 | return self.rel_eval_rpl.evaluate(run) 569 | 570 | super(RplEvaluator, self).evaluate() 571 | if self.run_b_rep: 572 | self.run_b_rep = break_ties(self.run_b_rep) 573 | self.run_b_rep_score = self.rel_eval_rpl.evaluate(self.run_b_rep) 574 | if self.run_a_rep: 575 | self.run_a_rep = break_ties(self.run_a_rep) 576 | self.run_a_rep_score = self.rel_eval_rpl.evaluate(self.run_a_rep) 577 | 578 | def ttest(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 579 | """ 580 | Conducts an un-paired two-tailed t-test for replicated runs that were derived from a test collection 581 | not used in the original experiment. 582 | 583 | @param run_b_score: Scores of the baseline run, 584 | if not provided the scores of the RpdEvaluator object will be used instead. 
585 | @param run_a_score: Scores of the advanced run, 586 | if not provided the scores of the RpdEvaluator object will be used instead. 587 | @param run_b_path: Path to another replicated baseline run, 588 | if not provided the replicated baseline run of the RplEvaluator object will be used instead. 589 | @param run_a_path: Path to another replicated advanced run, 590 | if not provided the replicated advanced run of the RplEvaluator object will be used instead. 591 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 592 | @return: Dictionary with p-values that compare the score distributions of the baseline and advanced run. 593 | """ 594 | if run_b_path: 595 | if run_a_path: 596 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 597 | run_b_rep = pytrec_eval.parse_run(b_run) 598 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 599 | run_b_rep_score = self.rel_eval_rpl.evaluate(run_b_rep) 600 | run_a_rep = pytrec_eval.parse_run(a_run) 601 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 602 | run_a_rep_score = self.rel_eval_rpl.evaluate(run_a_rep) 603 | return self._ttest(rpd=False, run_b_score=run_b_rep_score, run_a_score=run_a_rep_score, print_feedback=print_feedback) 604 | else: 605 | with open(run_b_path, 'r') as b_run: 606 | run_b_rep = pytrec_eval.parse_run(b_run) 607 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 608 | run_b_rep_score = self.rel_eval_rpl.evaluate(run_b_rep) 609 | return self._ttest(rpd=False, run_b_score=run_b_rep_score, run_a_score=None, print_feedback=print_feedback) 610 | 611 | return self._ttest(rpd=False, run_b_score=run_b_score, run_a_score=run_a_score, print_feedback=print_feedback) 612 | -------------------------------------------------------------------------------- /repro_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/repro_eval/__init__.py -------------------------------------------------------------------------------- /repro_eval/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Use repro_eval from the command line with e.g. 3 | 4 | python -m repro_eval -t rpd -q qrel_orig -r orig_b rpd_b 5 | 6 | python -m repro_eval -t rpd -q qrel_orig -r orig_b orig_a rpd_b rpd_a 7 | 8 | python -m repro_eval -t rpd -m rmse -q qrel_orig -r orig_b rpd_b 9 | 10 | python -m repro_eval -t rpl -q qrel_orig qrel_rpl -r orig_b rpl_b 11 | 12 | python -m repro_eval -t rpl -q qrel_orig qrel_rpl -r orig_b orig_a rpl_b rpl_a 13 | 14 | after having installed the Python package. 15 | For other more specific examples also have a look at the README file. 16 | Depending on the provided parameters and input run files, 17 | evaluation measures will be printed. 
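Individual measures can be selected explicitly with the -m flag, which accepts
one or more of ktu, rbo, rmse, er, dri and ttest (cf. the argument parsing
below), for instance

python -m repro_eval -t rpd -m ktu rbo ttest -q qrel_orig -r orig_b orig_a rpd_b rpd_a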
18 | """ 19 | 20 | import argparse 21 | from repro_eval.Evaluator import RpdEvaluator, RplEvaluator 22 | from repro_eval.util import print_simple_line, print_base_adv 23 | from repro_eval.util import arp 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('-t', '--type') 30 | parser.add_argument('-m', '--measure', nargs='+') 31 | parser.add_argument('-q', '--qrels', nargs='+') 32 | parser.add_argument('-r', '--runs', nargs='+') 33 | 34 | args = parser.parse_args() 35 | 36 | if args.type in ['rpd', 'reproducibility']: 37 | if len(args.runs) == 4: 38 | rpd_eval = RpdEvaluator(qrel_orig_path=args.qrels[0], 39 | run_b_orig_path=args.runs[0], 40 | run_a_orig_path=args.runs[1], 41 | run_b_rep_path=args.runs[2], 42 | run_a_rep_path=args.runs[3]) 43 | 44 | if len(args.runs) == 2: 45 | rpd_eval = RpdEvaluator(qrel_orig_path=args.qrels[0], 46 | run_b_orig_path=args.runs[0], 47 | run_a_orig_path=None, 48 | run_b_rep_path=args.runs[1], 49 | run_a_rep_path=None) 50 | 51 | rpd_eval.trim() 52 | rpd_eval.evaluate() 53 | 54 | measure_list = args.measure if args.measure is not None else [] 55 | 56 | # KTU 57 | if 'ktu' in measure_list or args.measure is None: 58 | ktu = rpd_eval.ktau_union() 59 | print("Kendall's tau Union (KTU)") 60 | print('------------------------------------------------------------------') 61 | for topic, value in ktu.get('baseline').items(): 62 | value_adv = ktu.get('advanced').get(topic) if ktu.get('advanced') is not None else None 63 | print_base_adv(topic, 'KTU', value, value_adv) 64 | value_adv = arp(ktu.get('advanced')) if ktu.get('advanced') is not None else None 65 | print_base_adv('ARP', 'KTU', arp(ktu.get('baseline')), value_adv) 66 | print() 67 | 68 | # RBO 69 | if 'rbo' in measure_list or args.measure is None: 70 | rbo = rpd_eval.rbo() 71 | print("Rank-biased Overlap (RBO)") 72 | print('------------------------------------------------------------------') 73 | for topic, value in rbo.get('baseline').items(): 74 | value_adv = rbo.get('advanced').get(topic) if rbo.get('advanced') is not None else None 75 | print_base_adv(topic, 'RBO', value, value_adv) 76 | value_adv = arp(rbo.get('advanced')) if rbo.get('advanced') is not None else None 77 | print_base_adv('ARP', 'RBO', arp(rbo.get('baseline')), value_adv) 78 | print() 79 | 80 | # RMSE 81 | if 'rmse' in measure_list or args.measure is None: 82 | rmse = rpd_eval.rmse() 83 | print("Root mean square error (RMSE)") 84 | print('------------------------------------------------------------------') 85 | for measure, value in rmse.get('baseline').items(): 86 | value_adv = rmse.get('advanced').get(measure) if rmse.get('advanced') is not None else None 87 | print_base_adv(measure, 'RMSE', value, value_adv) 88 | print() 89 | 90 | # ER 91 | if 'er' in measure_list or args.measure is None and len(args.runs) == 4: 92 | print("Effect ratio (ER)") 93 | print('------------------------------------------------------------------') 94 | er = rpd_eval.er() 95 | for measure, value in er.items(): 96 | print_simple_line(measure, 'ER', value) 97 | print() 98 | 99 | # DRI 100 | if 'dri' in measure_list or args.measure is None and len(args.runs) == 4: 101 | print("Delta Relative Improvement (DRI)") 102 | print('------------------------------------------------------------------') 103 | dri = rpd_eval.dri() 104 | for measure, value in dri.items(): 105 | print_simple_line(measure, 'DRI', value) 106 | print() 107 | 108 | # ttest 109 | if 'ttest' in measure_list or args.measure is None: 110 | pvals = 
rpd_eval.ttest() 111 | print("Two-tailed paired t-test (p-value)") 112 | print('------------------------------------------------------------------') 113 | for measure, value in pvals.get('baseline').items(): 114 | value_adv = pvals.get('advanced').get(measure) if pvals.get('advanced') is not None else None 115 | print_base_adv(measure, 'PVAL', value, value_adv) 116 | print() 117 | 118 | if args.type in ['rpl', 'replicability']: 119 | if len(args.runs) == 4: 120 | rpl_eval = RplEvaluator(qrel_orig_path=args.qrels[0], 121 | run_b_orig_path=args.runs[0], 122 | run_a_orig_path=args.runs[1], 123 | run_b_rep_path=args.runs[2], 124 | run_a_rep_path=args.runs[3], 125 | qrel_rpl_path=args.qrels[1]) 126 | 127 | if len(args.runs) == 2: 128 | rpl_eval = RplEvaluator(qrel_orig_path=args.qrels[0], 129 | run_b_orig_path=args.runs[0], 130 | run_a_orig_path=None, 131 | run_b_rep_path=args.runs[1], 132 | run_a_rep_path=None, 133 | qrel_rpl_path=args.qrels[1]) 134 | 135 | rpl_eval.trim() 136 | rpl_eval.evaluate() 137 | 138 | measure_list = args.measure if args.measure is not None else [] 139 | 140 | # ER 141 | if 'er' in measure_list or args.measure is None and len(args.runs) == 4: 142 | print("Effect ratio (ER)") 143 | print('------------------------------------------------------------------') 144 | er = rpl_eval.er() 145 | for measure, value in er.items(): 146 | print_simple_line(measure, 'ER', value) 147 | print() 148 | 149 | # DRI 150 | if 'dri' in measure_list or args.measure is None and len(args.runs) == 4: 151 | print("Delta Relative Improvement (DRI)") 152 | print('------------------------------------------------------------------') 153 | dri = rpl_eval.dri() 154 | for measure, value in dri.items(): 155 | print_simple_line(measure, 'DRI', value) 156 | print() 157 | 158 | # ttest 159 | if 'ttest' in measure_list or args.measure is None: 160 | pvals = rpl_eval.ttest() 161 | print("Two-tailed unpaired t-test (p-value)") 162 | print('------------------------------------------------------------------') 163 | for measure, value in pvals.get('baseline').items(): 164 | value_adv = pvals.get('advanced').get(measure) if pvals.get('advanced') is not None else None 165 | print_base_adv(measure, 'PVAL', value, value_adv) 166 | print() 167 | 168 | 169 | if __name__ == "__main__": 170 | main() 171 | -------------------------------------------------------------------------------- /repro_eval/config.py: -------------------------------------------------------------------------------- 1 | TRIM_THRESH = 1000 # default threshold for trimming the runs 2 | PHI = 0.8 # default parameter for the Rank-Biased Overlap (RBO) 3 | ERR_MSG = 'Please provide adequate run combinations and have them evaluated first.' 
# error message 4 | 5 | # evaluation measures of trec_eval that will be excluded from the reproduction and replication measures 6 | exclude = [ 7 | 'runid', 8 | 'num_q', 9 | 'num_ret', 10 | 'num_rel', 11 | 'num_rel_ret', 12 | 'num_nonrel_judged_ret', 13 | 'relstring' 14 | ] 15 | -------------------------------------------------------------------------------- /repro_eval/measure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/repro_eval/measure/__init__.py -------------------------------------------------------------------------------- /repro_eval/measure/document_order.py: -------------------------------------------------------------------------------- 1 | """Evaluation measures at the level of document orderings.""" 2 | 3 | from repro_eval.config import TRIM_THRESH, PHI 4 | from scipy.stats.stats import kendalltau 5 | from tqdm import tqdm 6 | from repro_eval.measure.external.rbo import rbo 7 | from repro_eval.util import break_ties 8 | 9 | 10 | def _rbo(run, ideal, p, depth): 11 | # Implementation taken from the TREC Health Misinformation Track with modifications 12 | # see also: https://github.com/claclark/Compatibility 13 | run_set = set() 14 | ideal_set = set() 15 | 16 | score = 0.0 17 | normalizer = 0.0 18 | weight = 1.0 19 | for i in range(depth): 20 | if i < len(run): 21 | run_set.add(run[i]) 22 | if i < len(ideal): 23 | ideal_set.add(ideal[i]) 24 | score += weight*len(ideal_set.intersection(run_set))/(i + 1) 25 | normalizer += weight 26 | weight *= p 27 | return score/normalizer 28 | 29 | 30 | def _ktau_union(orig_run, rep_run, trim_thresh=TRIM_THRESH, pbar=False): 31 | """ 32 | Helping function returning a generator to determine Kendall's tau Union (KTU) for all topics. 33 | 34 | @param orig_run: The original run. 35 | @param rep_run: The reproduced/replicated run. 36 | @param trim_thresh: Threshold values for the number of documents to be compared. 37 | @param pbar: Boolean value indicating if progress bar should be printed. 38 | @return: Generator with KTU values. 39 | """ 40 | 41 | generator = tqdm(rep_run.items()) if pbar else rep_run.items() 42 | 43 | for topic, docs in generator: 44 | orig_docs = list(orig_run.get(topic).keys())[:trim_thresh] 45 | rep_docs = list(rep_run.get(topic).keys())[:trim_thresh] 46 | union = list(sorted(set(orig_docs + rep_docs))) 47 | orig_idx = [union.index(doc) for doc in orig_docs] 48 | rep_idx = [union.index(doc) for doc in rep_docs] 49 | yield topic, round(kendalltau(orig_idx, rep_idx).correlation, 14) 50 | 51 | 52 | def ktau_union(orig_run, rep_run, trim_thresh=TRIM_THRESH, pbar=False): 53 | """ 54 | Determines the Kendall's tau Union (KTU) between the original and reproduced document orderings 55 | according to the following paper: 56 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 57 | How to Measure the Reproducibility of System-oriented IR Experiments. 58 | Proceedings of SIGIR, pages 349-358, 2020. 59 | 60 | @param orig_run: The original run. 61 | @param rep_run: The reproduced/replicated run. 62 | @param trim_thresh: Threshold values for the number of documents to be compared. 63 | @param pbar: Boolean value indicating if progress bar should be printed. 64 | @return: Dictionary with KTU values that compare the document orderings of the original and reproduced runs. 
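    Small worked illustration of the union construction used by the helper
    above (the document ids are made up): for orig = ['d1', 'd2', 'd3'] and
    rep = ['d1', 'd3', 'd2'] the sorted union is ['d1', 'd2', 'd3'], the runs
    are mapped to the index lists [0, 1, 2] and [0, 2, 1], and Kendall's tau is
    computed on these index lists (here tau = 1/3).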
65 | """ 66 | 67 | # Safety check for runs that are not added via pytrec_eval 68 | orig_run = break_ties(orig_run) 69 | rep_run = break_ties(rep_run) 70 | 71 | return dict(_ktau_union(orig_run, rep_run, trim_thresh=trim_thresh, pbar=pbar)) 72 | 73 | 74 | def _RBO(orig_run, rep_run, phi, trim_thresh=TRIM_THRESH, pbar=False, misinfo=True): 75 | """ 76 | Helping function returning a generator to determine the Rank-Biased Overlap (RBO) for all topics. 77 | 78 | @param orig_run: The original run. 79 | @param rep_run: The reproduced/replicated run. 80 | @param phi: Parameter for top-heaviness of the RBO. 81 | @param trim_thresh: Threshold values for the number of documents to be compared. 82 | @param pbar: Boolean value indicating if progress bar should be printed. 83 | @param misinfo: Use the RBO implementation that is also used in the TREC Health Misinformation Track. 84 | See also: https://github.com/claclark/Compatibility 85 | @return: Generator with RBO values. 86 | """ 87 | 88 | generator = tqdm(rep_run.items()) if pbar else rep_run.items() 89 | 90 | if misinfo: 91 | for topic, docs in generator: 92 | yield topic, _rbo(list(rep_run.get(topic).keys())[:trim_thresh], 93 | list(orig_run.get(topic).keys())[:trim_thresh], 94 | p=phi, 95 | depth=trim_thresh) 96 | 97 | else: 98 | for topic, docs in generator: 99 | yield topic, rbo(list(rep_run.get(topic).keys())[:trim_thresh], 100 | list(orig_run.get(topic).keys())[:trim_thresh], 101 | p=phi).ext 102 | 103 | 104 | def RBO(orig_run, rep_run, phi=PHI, trim_thresh=TRIM_THRESH, pbar=False, misinfo=True): 105 | """ 106 | Determines the Rank-Biased Overlap (RBO) between the original and reproduced document orderings 107 | according to the following paper: 108 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 109 | How to Measure the Reproducibility of System-oriented IR Experiments. 110 | Proceedings of SIGIR, pages 349-358, 2020. 111 | 112 | @param orig_run: The original run. 113 | @param rep_run: The reproduced/replicated run. 114 | @param phi: Parameter for top-heaviness of the RBO. 115 | @param trim_thresh: Threshold values for the number of documents to be compared. 116 | @param pbar: Boolean value indicating if progress bar should be printed. 117 | @param misinfo: Use the RBO implementation that is also used in the TREC Health Misinformation Track. 118 | See also: https://github.com/claclark/Compatibility 119 | @return: Dictionary with RBO values that compare the document orderings of the original and reproduced runs. 120 | """ 121 | 122 | # Safety check for runs that are not added via pytrec_eval 123 | orig_run = break_ties(orig_run) 124 | rep_run = break_ties(rep_run) 125 | 126 | return dict(_RBO(orig_run, rep_run, phi=phi, trim_thresh=trim_thresh, pbar=pbar, misinfo=misinfo)) 127 | -------------------------------------------------------------------------------- /repro_eval/measure/effectiveness.py: -------------------------------------------------------------------------------- 1 | """Evaluation measures at the level of effectiveness.""" 2 | 3 | import numpy as np 4 | from math import sqrt 5 | from copy import deepcopy 6 | from tqdm import tqdm 7 | from repro_eval.config import exclude 8 | 9 | 10 | def _rmse(orig_score, rep_core, pbar=False): 11 | """ 12 | Helping function returning a generator to determine the Root Mean Square Error (RMSE) for all topics. 13 | 14 | @param orig_score: The original scores. 15 | @param rep_core: The reproduced/replicated scores. 
16 | @param pbar: Boolean value indicating if progress bar should be printed. 17 | @return: Generator with RMSE values. 18 | """ 19 | orig_cp = deepcopy(orig_score) 20 | rep_cp = deepcopy(rep_core) 21 | measures_all = list(list(orig_cp.values())[0].keys()) 22 | topics = orig_cp.keys() 23 | measures_valid = [m for m in measures_all if m not in exclude] 24 | 25 | measures = tqdm(measures_valid) if pbar else measures_valid 26 | 27 | for measure in measures: 28 | orig_measure = np.array([orig_cp.get(topic).get(measure) for topic in topics]) 29 | rpl_measure = np.array([rep_cp.get(topic).get(measure) for topic in topics]) 30 | diff = orig_measure - rpl_measure 31 | yield measure, sqrt(sum(np.square(diff))/len(diff)) 32 | 33 | 34 | def rmse(orig_score, rep_score, pbar=False): 35 | """ 36 | Determines the Root Mean Square Error (RMSE) between the original and reproduced topic scores 37 | according to the following paper: 38 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 39 | How to Measure the Reproducibility of System-oriented IR Experiments. 40 | Proceedings of SIGIR, pages 349-358, 2020. 41 | 42 | @param orig_score: The original scores. 43 | @param rep_core: The reproduced/replicated scores. 44 | @param pbar: Boolean value indicating if progress bar should be printed. 45 | @return: Dictionary with RMSE values that measure the closeness between the original and reproduced topic scores. 46 | """ 47 | return dict(_rmse(orig_score, rep_score, pbar=pbar)) 48 | 49 | 50 | def _maxrmse(orig_score, pbar=False): 51 | """ 52 | Helping function returning a generator to determine the maximum Root Mean Square Error (RMSE) for all topics. 53 | 54 | @param orig_score: The original scores. 55 | @param pbar: Boolean value indicating if progress bar should be printed. 56 | @return: Generator with RMSE values. 57 | """ 58 | orig_cp = deepcopy(orig_score) 59 | measures_all = list(list(orig_cp.values())[0].keys()) 60 | topics = orig_cp.keys() 61 | measures_valid = [m for m in measures_all if m not in exclude] 62 | measures = tqdm(measures_valid) if pbar else measures_valid 63 | 64 | for measure in measures: 65 | orig_measure = np.array([orig_cp.get(topic).get(measure) for topic in topics]) 66 | _max = np.vectorize(lambda x: max(x, 1 - x)) 67 | maxdiff = _max(orig_measure) 68 | yield measure, sqrt(sum(np.square(maxdiff))/len(maxdiff)) 69 | 70 | 71 | def nrmse(orig_score, rep_score, pbar=False): 72 | """ 73 | Determines the normalized Root Mean Square Error (RMSE) between the original and reproduced topic scores. 74 | 75 | @param orig_score: The original scores. 76 | @param rep_core: The reproduced/replicated scores. 77 | @param pbar: Boolean value indicating if progress bar should be printed. 78 | @return: Dictionary with RMSE values that measure the closeness between the original and reproduced topic scores. 
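# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Hypothetical per-topic scores in the format returned by pytrec_eval's
# RelevanceEvaluator.evaluate(); rmse() aggregates the per-topic deviations for
# each measure, nrmse() divides them by the worst-case RMSE of the original scores.
from repro_eval.measure.effectiveness import rmse, nrmse

orig_scores = {'301': {'map': 0.30, 'ndcg': 0.45},
               '302': {'map': 0.20, 'ndcg': 0.40}}
rep_scores = {'301': {'map': 0.28, 'ndcg': 0.44},
              '302': {'map': 0.22, 'ndcg': 0.41}}

print(rmse(orig_scores, rep_scores))   # e.g. {'map': ~0.02, 'ndcg': ~0.01}
print(nrmse(orig_scores, rep_scores))  # same values, normalized per measure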
79 | """ 80 | rmse = dict(_rmse(orig_score, rep_score, pbar=pbar)) 81 | maxrmse = dict(_maxrmse(orig_score, pbar=pbar)) 82 | return {measure: score / maxrmse.get(measure) for measure, score in rmse.items()} 83 | -------------------------------------------------------------------------------- /repro_eval/measure/external/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/repro_eval/measure/external/__init__.py -------------------------------------------------------------------------------- /repro_eval/measure/external/rbo.py: -------------------------------------------------------------------------------- 1 | """Rank-biased overlap, a ragged sorted list similarity measure. 2 | 3 | See http://doi.acm.org/10.1145/1852102.1852106 for details. All functions 4 | directly corresponding to concepts from the paper are named so that they can be 5 | clearly cross-identified. 6 | 7 | The definition of overlap has been modified to account for ties. Without this, 8 | results for lists with tied items were being inflated. The modification itself 9 | is not mentioned in the paper but seems to be reasonable, see function 10 | ``overlap()``. Places in the code which diverge from the spec in the paper 11 | because of this are highlighted with comments. 12 | 13 | The two main functions for performing an RBO analysis are ``rbo()`` and 14 | ``rbo_dict()``; see their respective docstrings for how to use them. 15 | 16 | The following doctest just checks that equivalent specifications of a 17 | problem yield the same result using both functions: 18 | 19 | >>> lst1 = [{"c", "a"}, "b", "d"] 20 | >>> lst2 = ["a", {"c", "b"}, "d"] 21 | >>> ans_rbo = _round(rbo(lst1, lst2, p=.9)) 22 | >>> dct1 = dict(a=1, b=2, c=1, d=3) 23 | >>> dct2 = dict(a=1, b=2, c=2, d=3) 24 | >>> ans_rbo_dict = _round(rbo_dict(dct1, dct2, p=.9, sort_ascending=True)) 25 | >>> ans_rbo == ans_rbo_dict 26 | True 27 | 28 | """ 29 | 30 | from __future__ import division 31 | 32 | import math 33 | from bisect import bisect_left 34 | from collections import namedtuple 35 | 36 | 37 | RBO = namedtuple("RBO", "min res ext") 38 | RBO.__doc__ += ": Result of full RBO analysis" 39 | RBO.min.__doc__ = "Lower bound estimate" 40 | RBO.res.__doc__ = "Residual corresponding to min; min + res is an upper bound estimate" 41 | RBO.ext.__doc__ = "Extrapolated point estimate" 42 | 43 | 44 | def _round(obj): 45 | if isinstance(obj, RBO): 46 | return RBO(_round(obj.min), _round(obj.res), _round(obj.ext)) 47 | else: 48 | return round(obj, 3) 49 | 50 | 51 | def set_at_depth(lst, depth): 52 | ans = set() 53 | for v in lst[:depth]: 54 | if isinstance(v, set): 55 | ans.update(v) 56 | else: 57 | ans.add(v) 58 | return ans 59 | 60 | 61 | def raw_overlap(list1, list2, depth): 62 | """Overlap as defined in the article. 63 | 64 | """ 65 | set1, set2 = set_at_depth(list1, depth), set_at_depth(list2, depth) 66 | return len(set1.intersection(set2)), len(set1), len(set2) 67 | 68 | 69 | def overlap(list1, list2, depth): 70 | """Overlap which accounts for possible ties. 71 | 72 | This isn't mentioned in the paper but should be used in the ``rbo*()`` 73 | functions below, otherwise overlap at a given depth might be > depth which 74 | inflates the result. 
75 | 76 | There are no guidelines in the paper as to what's a good way to calculate 77 | this, but a good guess is agreement scaled by the minimum between the 78 | requested depth and the lengths of the considered lists (overlap shouldn't 79 | be larger than the number of ranks in the shorter list, otherwise results 80 | are conspicuously wrong when the lists are of unequal lengths -- rbo_ext is 81 | not between rbo_min and rbo_min + rbo_res. 82 | 83 | >>> overlap("abcd", "abcd", 3) 84 | 3.0 85 | 86 | >>> overlap("abcd", "abcd", 5) 87 | 4.0 88 | 89 | >>> overlap(["a", {"b", "c"}, "d"], ["a", {"b", "c"}, "d"], 2) 90 | 2.0 91 | 92 | >>> overlap(["a", {"b", "c"}, "d"], ["a", {"b", "c"}, "d"], 3) 93 | 3.0 94 | 95 | """ 96 | return agreement(list1, list2, depth) * min(depth, len(list1), len(list2)) 97 | # NOTE: comment the preceding and uncomment the following line if you want 98 | # to stick to the algorithm as defined by the paper 99 | # return raw_overlap(list1, list2, depth)[0] 100 | 101 | 102 | def agreement(list1, list2, depth): 103 | """Proportion of shared values between two sorted lists at given depth. 104 | 105 | >>> _round(agreement("abcde", "abdcf", 1)) 106 | 1.0 107 | >>> _round(agreement("abcde", "abdcf", 3)) 108 | 0.667 109 | >>> _round(agreement("abcde", "abdcf", 4)) 110 | 1.0 111 | >>> _round(agreement("abcde", "abdcf", 5)) 112 | 0.8 113 | >>> _round(agreement([{1, 2}, 3], [1, {2, 3}], 1)) 114 | 0.667 115 | >>> _round(agreement([{1, 2}, 3], [1, {2, 3}], 2)) 116 | 1.0 117 | 118 | """ 119 | len_intersection, len_set1, len_set2 = raw_overlap(list1, list2, depth) 120 | return 2 * len_intersection / (len_set1 + len_set2) 121 | 122 | 123 | def cumulative_agreement(list1, list2, depth): 124 | return (agreement(list1, list2, d) for d in range(1, depth + 1)) 125 | 126 | 127 | def average_overlap(list1, list2, depth=None): 128 | """Calculate average overlap between ``list1`` and ``list2``. 129 | 130 | >>> _round(average_overlap("abcdefg", "zcavwxy", 1)) 131 | 0.0 132 | >>> _round(average_overlap("abcdefg", "zcavwxy", 2)) 133 | 0.0 134 | >>> _round(average_overlap("abcdefg", "zcavwxy", 3)) 135 | 0.222 136 | >>> _round(average_overlap("abcdefg", "zcavwxy", 4)) 137 | 0.292 138 | >>> _round(average_overlap("abcdefg", "zcavwxy", 5)) 139 | 0.313 140 | >>> _round(average_overlap("abcdefg", "zcavwxy", 6)) 141 | 0.317 142 | >>> _round(average_overlap("abcdefg", "zcavwxy", 7)) 143 | 0.312 144 | 145 | """ 146 | depth = min(len(list1), len(list2)) if depth is None else depth 147 | return sum(cumulative_agreement(list1, list2, depth)) / depth 148 | 149 | 150 | def rbo_at_k(list1, list2, p, depth=None): 151 | # ``p**d`` here instead of ``p**(d - 1)`` because enumerate starts at 152 | # 0 153 | depth = min(len(list1), len(list2)) if depth is None else depth 154 | d_a = enumerate(cumulative_agreement(list1, list2, depth)) 155 | return (1 - p) * sum(p ** d * a for (d, a) in d_a) 156 | 157 | 158 | def rbo_min(list1, list2, p, depth=None): 159 | """Tight lower bound on RBO. 160 | 161 | See equation (11) in paper. 
162 | 163 | >>> _round(rbo_min("abcdefg", "abcdefg", .9)) 164 | 0.767 165 | >>> _round(rbo_min("abcdefgh", "abcdefg", .9)) 166 | 0.767 167 | 168 | """ 169 | depth = min(len(list1), len(list2)) if depth is None else depth 170 | x_k = overlap(list1, list2, depth) 171 | log_term = x_k * math.log(1 - p) 172 | sum_term = sum( 173 | p ** d / d * (overlap(list1, list2, d) - x_k) for d in range(1, depth + 1) 174 | ) 175 | return (1 - p) / p * (sum_term - log_term) 176 | 177 | 178 | def rbo_res(list1, list2, p): 179 | """Upper bound on residual overlap beyond evaluated depth. 180 | 181 | See equation (30) in paper. 182 | 183 | NOTE: The doctests weren't verified against manual computations but seem 184 | plausible. In particular, for identical lists, ``rbo_min()`` and 185 | ``rbo_res()`` should add up to 1, which is the case. 186 | 187 | >>> _round(rbo_res("abcdefg", "abcdefg", .9)) 188 | 0.233 189 | >>> _round(rbo_res("abcdefg", "abcdefghijklmnopqrstuvwxyz", .9)) 190 | 0.239 191 | 192 | """ 193 | S, L = sorted((list1, list2), key=len) 194 | s, l = len(S), len(L) 195 | x_l = overlap(list1, list2, l) 196 | # since overlap(...) can be fractional in the general case of ties and f 197 | # must be an integer --> math.ceil() 198 | f = int(math.ceil(l + s - x_l)) 199 | # upper bound of range() is non-inclusive, therefore + 1 is needed 200 | term1 = s * sum(p ** d / d for d in range(s + 1, f + 1)) 201 | term2 = l * sum(p ** d / d for d in range(l + 1, f + 1)) 202 | term3 = x_l * (math.log(1 / (1 - p)) - sum(p ** d / d for d in range(1, f + 1))) 203 | return p ** s + p ** l - p ** f - (1 - p) / p * (term1 + term2 + term3) 204 | 205 | 206 | def rbo_ext(list1, list2, p): 207 | """RBO point estimate based on extrapolating observed overlap. 208 | 209 | See equation (32) in paper. 210 | 211 | NOTE: The doctests weren't verified against manual computations but seem 212 | plausible. 213 | 214 | >>> _round(rbo_ext("abcdefg", "abcdefg", .9)) 215 | 1.0 216 | >>> _round(rbo_ext("abcdefg", "bacdefg", .9)) 217 | 0.9 218 | 219 | """ 220 | S, L = sorted((list1, list2), key=len) 221 | s, l = len(S), len(L) 222 | x_l = overlap(list1, list2, l) 223 | x_s = overlap(list1, list2, s) 224 | # the paper says overlap(..., d) / d, but it should be replaced by 225 | # agreement(..., d) defined as per equation (28) so that ties are handled 226 | # properly (otherwise values > 1 will be returned) 227 | # sum1 = sum(p**d * overlap(list1, list2, d)[0] / d for d in range(1, l + 1)) 228 | sum1 = sum(p ** d * agreement(list1, list2, d) for d in range(1, l + 1)) 229 | sum2 = sum(p ** d * x_s * (d - s) / s / d for d in range(s + 1, l + 1)) 230 | term1 = (1 - p) / p * (sum1 + sum2) 231 | term2 = p ** l * ((x_l - x_s) / l + x_s / s) 232 | return term1 + term2 233 | 234 | 235 | def rbo(list1, list2, p): 236 | """Complete RBO analysis (lower bound, residual, point estimate). 237 | 238 | ``list`` arguments should be already correctly sorted iterables and each 239 | item should either be an atomic value or a set of values tied for that 240 | rank. ``p`` is the probability of looking for overlap at rank k + 1 after 241 | having examined rank k. 
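# --- Illustrative note (editor's sketch, not part of the original module) ---
# Smaller p concentrates the weight on the top ranks: two lists that agree only
# at rank 1 score higher under p=.5 than under p=.9 (using rbo() from this module).
only_top = rbo(["a", "x", "y", "z"], ["a", "q", "r", "s"], p=.5)
less_top = rbo(["a", "x", "y", "z"], ["a", "q", "r", "s"], p=.9)
print(only_top.ext > less_top.ext)  # True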
242 | 243 | >>> lst1 = [{"c", "a"}, "b", "d"] 244 | >>> lst2 = ["a", {"c", "b"}, "d"] 245 | >>> _round(rbo(lst1, lst2, p=.9)) 246 | RBO(min=0.489, res=0.477, ext=0.967) 247 | 248 | """ 249 | if not 0 <= p <= 1: 250 | raise ValueError("The ``p`` parameter must be between 0 and 1.") 251 | args = (list1, list2, p) 252 | return RBO(rbo_min(*args), rbo_res(*args), rbo_ext(*args)) 253 | 254 | 255 | def sort_dict(dct, *, ascending=False): 256 | """Sort keys in ``dct`` according to their corresponding values. 257 | 258 | Sorts in descending order by default, because the values are 259 | typically scores, i.e. the higher the better. Specify 260 | ``ascending=True`` if the values are ranks, or some sort of score 261 | where lower values are better. 262 | 263 | Ties are handled by creating sets of tied keys at the given position 264 | in the sorted list. 265 | 266 | >>> dct = dict(a=1, b=2, c=1, d=3) 267 | >>> list(sort_dict(dct)) == ['d', 'b', {'a', 'c'}] 268 | True 269 | >>> list(sort_dict(dct, ascending=True)) == [{'a', 'c'}, 'b', 'd'] 270 | True 271 | 272 | """ 273 | scores = [] 274 | items = [] 275 | # items should be unique, scores don't have to 276 | for item, score in dct.items(): 277 | if not ascending: 278 | score *= -1 279 | i = bisect_left(scores, score) 280 | if i == len(scores): 281 | scores.append(score) 282 | items.append(item) 283 | elif scores[i] == score: 284 | existing_item = items[i] 285 | if isinstance(existing_item, set): 286 | existing_item.add(item) 287 | else: 288 | items[i] = {existing_item, item} 289 | else: 290 | scores.insert(i, score) 291 | items.insert(i, item) 292 | return items 293 | 294 | 295 | def rbo_dict(dict1, dict2, p, *, sort_ascending=False): 296 | """Wrapper around ``rbo()`` for dict input. 297 | 298 | Each dict maps items to be sorted to the score according to which 299 | they should be sorted. The RBO analysis is then performed on the 300 | resulting sorted lists. 301 | 302 | The sort is descending by default, because scores are typically the 303 | higher the better, but this can be overridden by specifying 304 | ``sort_ascending=True``. 305 | 306 | >>> dct1 = dict(a=1, b=2, c=1, d=3) 307 | >>> dct2 = dict(a=1, b=2, c=2, d=3) 308 | >>> _round(rbo_dict(dct1, dct2, p=.9, sort_ascending=True)) 309 | RBO(min=0.489, res=0.477, ext=0.967) 310 | 311 | """ 312 | list1, list2 = ( 313 | sort_dict(dict1, ascending=sort_ascending), 314 | sort_dict(dict2, ascending=sort_ascending), 315 | ) 316 | return rbo(list1, list2, p) 317 | 318 | 319 | if __name__ in ("__main__", "__console__"): 320 | import doctest 321 | 322 | doctest.testmod() 323 | -------------------------------------------------------------------------------- /repro_eval/measure/overall_effects.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | from tqdm import tqdm 4 | from repro_eval.config import exclude 5 | 6 | 7 | def diff(topic_score_a, topic_score_b): 8 | """ 9 | Use this function to get a generator with absoulte differences 10 | between the topic scores of the baseline and advanced runs. 11 | 12 | @param topic_score_a: Topic scores of the advanced run. 13 | @param topic_score_b: Topic scores of the baseline run. 14 | @return: Generator with absolute differences between the topics scores. 
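# --- Illustrative usage (editor's sketch, not part of the original module) ---
# diff() compares the measures of one topic between a hypothetical advanced and
# baseline run; entries listed in config.exclude (e.g. 'runid') are skipped.
from repro_eval.measure.overall_effects import diff

adv_topic = {'map': 0.35, 'ndcg': 0.50, 'runid': 'sys-adv'}
base_topic = {'map': 0.30, 'ndcg': 0.45, 'runid': 'sys-base'}

print(dict(diff(adv_topic, base_topic)))  # {'map': ~0.05, 'ndcg': ~0.05}, no 'runid'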
15 | """ 16 | for measure, value in topic_score_a.items(): 17 | if measure not in exclude: 18 | yield measure, value - topic_score_b.get(measure) 19 | 20 | 21 | def topic_diff(run_a, run_b): 22 | """ 23 | Use this function to get a generator with absoulte differences 24 | between the topic scores of the baseline and advanced runs for each measure. 25 | 26 | @param run_a: The advanced run. 27 | @param run_b: The baseline run. 28 | @return: Generator with absolute differences between the topics scores for each measure. 29 | """ 30 | run_a_cp = deepcopy(run_a) 31 | run_b_cp = deepcopy(run_b) 32 | 33 | for topic, measures in run_a_cp.items(): 34 | yield topic, dict(diff(measures, run_b_cp.get(topic))) 35 | 36 | 37 | def _mean_improvement(run_a, run_b): 38 | """ 39 | Helping function returning a generator for determining the mean improvements. 40 | 41 | @param run_a: The advanced run. 42 | @param run_b: The baseline run. 43 | @return: Generator with mean improvements. 44 | """ 45 | measures_all = list(list(run_a.values())[0].keys()) 46 | measures_valid = [m for m in measures_all if m not in exclude] 47 | topics = run_a.keys() 48 | delta = dict(topic_diff(run_a, run_b)) 49 | 50 | for measure in measures_valid: 51 | yield measure, np.array([delta.get(topic).get(measure) for topic in topics]).mean() 52 | 53 | 54 | def mean_improvement(run_a, run_b): 55 | """ 56 | Determines the relative improvement that is used to derive the Delta Relative Improvement (DeltaRI). 57 | 58 | @param run_a: The advanced run. 59 | @param run_b: The baseline run. 60 | @return: Dictionary with mean improvements for each measure. 61 | """ 62 | return dict(_mean_improvement(run_a, run_b)) 63 | 64 | 65 | def _er(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=False): 66 | """ 67 | Helping function returning a generator for determining the Effect Ratio (ER). 68 | 69 | @param orig_score_a: Scores of the original advanced run. 70 | @param orig_score_b: Scores of the original baseline run. 71 | @param rep_score_a: Scores of the reproduced/replicated advanced run. 72 | @param rep_score_b: Scores of the reproduced/replicated baseline run. 73 | @param pbar: Boolean value indicating if progress bar should be printed. 74 | @return: Generator with ER scores. 75 | """ 76 | mi_orig = mean_improvement(orig_score_a, orig_score_b) 77 | mi_rep = mean_improvement(rep_score_a, rep_score_b) 78 | 79 | generator = tqdm(mi_rep.items()) if pbar else mi_rep.items() 80 | 81 | for measure, value in generator: 82 | yield measure, value / mi_orig.get(measure) 83 | 84 | 85 | def ER(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=False): 86 | """ 87 | Determines the Effect Ratio (ER) according to the following paper: 88 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 89 | How to Measure the Reproducibility of System-oriented IR Experiments. 90 | Proceedings of SIGIR, pages 349-358, 2020. 91 | 92 | The ER value is determined by the ratio between the mean improvements 93 | of the original and reproduced/replicated experiments. 94 | 95 | @param orig_score_a: Scores of the original advanced run. 96 | @param orig_score_b: Scores of the original baseline run. 97 | @param rep_score_a: Scores of the reproduced/replicated advanced run. 98 | @param rep_score_b: Scores of the reproduced/replicated baseline run. 99 | @param pbar: Boolean value indicating if progress bar should be printed. 100 | @return: Dictionary containing the ER values for the specified run combination. 
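# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Hypothetical per-topic scores of the original and reproduced baseline/advanced
# runs; an ER close to 1 means the reproduced experiment recovers the original
# improvement of the advanced run over the baseline.
from repro_eval.measure.overall_effects import ER

orig_adv = {'301': {'map': 0.35}, '302': {'map': 0.30}}
orig_base = {'301': {'map': 0.30}, '302': {'map': 0.20}}
rep_adv = {'301': {'map': 0.33}, '302': {'map': 0.29}}
rep_base = {'301': {'map': 0.29}, '302': {'map': 0.21}}

print(ER(orig_adv, orig_base, rep_adv, rep_base))  # {'map': ~0.8}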
101 | """ 102 | return dict(_er(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=pbar)) 103 | 104 | 105 | def _mean_score(scores): 106 | """ 107 | Helping function to determine the mean scores across the topics for each measure. 108 | 109 | @param scores: Run scores. 110 | @return: Generator with mean scores. 111 | """ 112 | measures_all = list(list(scores.values())[0].keys()) 113 | measures_valid = [m for m in measures_all if m not in exclude] 114 | topics = scores.keys() 115 | 116 | for measure in measures_valid: 117 | yield measure, np.array([scores.get(topic).get(measure) for topic in topics]).mean() 118 | 119 | 120 | def mean_score(scores): 121 | """ 122 | Use this function to compute the mean scores across the topics for each measure. 123 | 124 | @param scores: Run scores. 125 | @return: Dictionary containing the mean scores for each measure. 126 | """ 127 | return dict(_mean_score(scores)) 128 | 129 | 130 | def _rel_improve(scores_a, scores_b): 131 | """ 132 | Helping function returning a generator for determining the relative improvements. 133 | 134 | @param scores_a: Scores of the advanced run. 135 | @param scores_b: Scores of the baseline run. 136 | @return: Generator with relative improvements. 137 | """ 138 | mean_scores_a = mean_score(scores_a) 139 | mean_scores_b = mean_score(scores_b) 140 | 141 | for measure, mean in mean_scores_a.items(): 142 | yield measure, (mean - mean_scores_b.get(measure)) / mean_scores_b.get(measure) 143 | 144 | 145 | def rel_improve(scores_a, scores_b): 146 | """ 147 | Determines the relative improvement that is used to derive the Delta Relative Improvement (DeltaRI). 148 | 149 | @param scores_a: Scores of the advanced run. 150 | @param scores_b: Scores of the baseline run. 151 | @return: Dictionary with relative improvements for each measure. 152 | """ 153 | return dict(_rel_improve(scores_a, scores_b)) 154 | 155 | 156 | def _deltaRI(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=False): 157 | """ 158 | Helping function returning a generator for determining the Delta Relative Improvement (DeltaRI). 159 | 160 | @param orig_score_a: Scores of the original advanced run. 161 | @param orig_score_b: Scores of the original baseline run. 162 | @param rep_score_a: Scores of the reproduced/replicated advanced run. 163 | @param rep_score_b: Scores of the reproduced/replicated baseline run. 164 | @param pbar: Boolean value indicating if progress bar should be printed. 165 | @return: Generator with DeltaRI scores. 166 | """ 167 | rel_improve_orig = rel_improve(orig_score_a, orig_score_b) 168 | rel_improve_rep = rel_improve(rep_score_a, rep_score_b) 169 | 170 | generator = tqdm(rel_improve_orig.items()) if pbar else rel_improve_orig.items() 171 | 172 | for measure, ri in generator: 173 | yield measure, ri - rel_improve_rep.get(measure) 174 | 175 | 176 | def deltaRI(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=False): 177 | """ 178 | Determines the Delta Relative Improvement (DeltaRI) according to the following paper: 179 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 180 | How to Measure the Reproducibility of System-oriented IR Experiments. 181 | Proceedings of SIGIR, pages 349-358, 2020. 182 | 183 | The DeltaRI value is determined by the difference between the relative improvements 184 | of the original and reproduced/replicated experiments. 185 | 186 | @param orig_score_a: Scores of the original advanced run. 
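# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Same kind of hypothetical scores as in the ER sketch above; a DeltaRI close
# to 0 means the reproduced experiment shows roughly the same relative
# improvement as the original one.
from repro_eval.measure.overall_effects import deltaRI

orig_adv = {'301': {'map': 0.35}, '302': {'map': 0.30}}
orig_base = {'301': {'map': 0.30}, '302': {'map': 0.20}}
rep_adv = {'301': {'map': 0.33}, '302': {'map': 0.29}}
rep_base = {'301': {'map': 0.29}, '302': {'map': 0.21}}

print(deltaRI(orig_adv, orig_base, rep_adv, rep_base))  # {'map': ~0.06}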
187 | @param orig_score_b: Scores of the original baseline run. 188 | @param rep_score_a: Scores of the reproduced/replicated advanced run. 189 | @param rep_score_b: Scores of the reproduced/replicated baseline run. 190 | @param pbar: Boolean value indicating if progress bar should be printed. 191 | @return: Dictionary containing the DeltaRI values for the specified run combination. 192 | """ 193 | return dict(_deltaRI(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=pbar)) 194 | -------------------------------------------------------------------------------- /repro_eval/measure/statistics.py: -------------------------------------------------------------------------------- 1 | import math 2 | from scipy.stats.stats import ttest_rel, ttest_ind 3 | from tqdm import tqdm 4 | from repro_eval.util import topic_scores 5 | 6 | 7 | def _ttest(orig_score, rep_score, rpd=True, pbar=False): 8 | """ 9 | 10 | @param orig_score: The original scores. 11 | @param rep_score: The reproduced/replicated scores. 12 | @param rpd: Boolean indicating if the evaluated runs are reproduced. 13 | @param pbar: Boolean value indicating if progress bar should be printed. 14 | @return: Generator with p-values. 15 | """ 16 | if rpd: # paired two-tailed t-test 17 | topic_scores_orig = topic_scores(orig_score) 18 | topic_scores_rep = topic_scores(rep_score) 19 | 20 | generator = tqdm(topic_scores_orig.items()) if pbar else topic_scores_orig.items() 21 | 22 | for measure, scores in generator: 23 | yield measure, ttest_rel(scores, topic_scores_rep.get(measure)).pvalue 24 | 25 | else: # else unpaired two-tailed t-test 26 | topic_scores_orig = topic_scores(orig_score) 27 | topic_scores_rep = topic_scores(rep_score) 28 | 29 | generator = tqdm(topic_scores_orig.items()) if pbar else topic_scores_orig.items() 30 | 31 | for measure, scores in generator: 32 | yield measure, ttest_ind(scores, topic_scores_rep.get(measure)).pvalue 33 | 34 | 35 | def ttest(orig_score, rep_score, rpd=True, pbar=False): 36 | """ 37 | 38 | @param orig_score: The original scores. 39 | @param rep_score: The reproduced/replicated scores. 40 | @param rpd: Boolean indicating if the evaluated runs are reproduced. 41 | @param pbar: Boolean value indicating if progress bar should be printed. 42 | @return: Dictionary with p-values that compare the score distributions of the baseline and advanced run. 43 | """ 44 | pvals = dict(_ttest(orig_score, rep_score, rpd=rpd, pbar=pbar)) 45 | nan_list = list(filter(lambda x: math.isnan(x), pvals.values())) 46 | if len(nan_list) == len(pvals): # is every pval is nan? 47 | if orig_score == rep_score: # equal score distributions? 48 | pvals = dict.fromkeys(pvals, 1.0) 49 | 50 | return pvals 51 | -------------------------------------------------------------------------------- /repro_eval/metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import platform 4 | import pkg_resources 5 | import warnings 6 | from collections import defaultdict 7 | from io import BytesIO, TextIOWrapper 8 | import cpuinfo 9 | import pytrec_eval 10 | import git 11 | from ruamel.yaml import YAML 12 | from repro_eval import Evaluator 13 | 14 | META_START = '# ir_metadata.start' 15 | META_END = '# ir_metadata.end' 16 | 17 | class PrimadExperiment: 18 | """ 19 | The PrimadExperiment is used to determine the reproducibility measures 20 | between a reference run and a set of one or more reproduced run files. 
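# --- Illustrative usage (editor's sketch, referring back to ttest() in
# repro_eval/measure/statistics.py above) ---
# With rpd=True a paired two-tailed t-test is computed per measure over the
# per-topic scores; the run scores below are hypothetical.
from repro_eval.measure.statistics import ttest

orig = {'301': {'map': 0.30}, '302': {'map': 0.20}, '303': {'map': 0.25}}
rep = {'301': {'map': 0.29}, '302': {'map': 0.22}, '303': {'map': 0.24}}

print(ttest(orig, rep, rpd=True))  # {'map': <p-value of the paired t-test>}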
21 | Depending on the type of the PRIMAD experiment, several reproducibility 22 | measures can be determined. 23 | 24 | @param ref_base_path: Path to a single run file that corresponds to the 25 | original (or reference) baseline of the experiments. 26 | @param ref_adv_path: Path to a single run file that corresponds to the 27 | original (or reference) baseline of the experiments. 28 | @param primad: String with lower and upper case letters depending on which 29 | PRIMAD components have changed in the experiments, e.g., 30 | "priMad" when only the Method changes due to parameter sweeps. 31 | @param rep_base: List containing paths to run files that reproduce the 32 | original (or reference) baseline run. 33 | @param rpd_qrels: Qrels file that is used to evaluate the reproducibility of 34 | the experiments, i.e., it used to evaluate runs that are 35 | derived from the same test collection. 36 | @param rep_adv: List containing paths to run files that reproduce the 37 | original (or reference) advanced run. 38 | @param rpl_qrels: Qrels file that is used to evaluate the replicability of 39 | the experiments, i.e., it is used to evaluate runs that are 40 | derived from a different test collection. Please note that 41 | "rpd_qrels" has to be provided too. 42 | """ 43 | def __init__(self, **kwargs): 44 | 45 | self.ref_base_path = kwargs.get('ref_base_path', None) 46 | if self.ref_base_path: 47 | self.ref_base_run = MetadataHandler.strip_metadata(self.ref_base_path) 48 | else: 49 | self.ref_base_run = None 50 | 51 | self.ref_adv_path = kwargs.get('ref_adv_path', None) 52 | if self.ref_adv_path: 53 | self.ref_adv_run = MetadataHandler.strip_metadata(self.ref_adv_path) 54 | else: 55 | self.ref_adv_run = None 56 | 57 | self.primad = kwargs.get('primad', None) 58 | self.rep_base = kwargs.get('rep_base', None) 59 | self.rpd_qrels = kwargs.get('rpd_qrels', None) 60 | self.rep_adv = kwargs.get('rep_adv', None) 61 | self.rpl_qrels = kwargs.get('rpl_qrels', None) 62 | 63 | if self.rpl_qrels: 64 | self.rep_eval = Evaluator.RplEvaluator(qrel_orig_path=self.rpd_qrels, 65 | qrel_rpl_path=self.rpl_qrels) 66 | 67 | with open(self.rpd_qrels, 'r') as f_rpd_qrels, open(self.rpl_qrels, 'r') as f_rpl_qrels: 68 | qrels = pytrec_eval.parse_qrel(f_rpd_qrels) 69 | self.rpd_rel_eval = pytrec_eval.RelevanceEvaluator(qrels, pytrec_eval.supported_measures) 70 | qrels = pytrec_eval.parse_qrel(f_rpl_qrels) 71 | self.rpl_rel_eval = pytrec_eval.RelevanceEvaluator(qrels, pytrec_eval.supported_measures) 72 | 73 | 74 | elif self.primad[-1].islower(): # check if data component is the same 75 | self.rep_eval = Evaluator.RpdEvaluator(qrel_orig_path=self.rpd_qrels) 76 | 77 | with open(self.rpd_qrels, 'r') as f_qrels: 78 | qrels = pytrec_eval.parse_qrel(f_qrels) 79 | self.rpd_rel_eval = pytrec_eval.RelevanceEvaluator(qrels, pytrec_eval.supported_measures) 80 | 81 | else: 82 | raise ValueError('Please provide a correct combination of qrels and PRIMAD type.') 83 | 84 | def get_primad_type(self): 85 | """ 86 | This method returns a string that identifies the type of the 87 | PRIMAD experiment. 88 | 89 | @return: String with lower and upper case letters depending on which 90 | PRIMAD components have changed in the experiments, e.g., 91 | "priMad" when only the Method changes due to parameter sweeps. 92 | """ 93 | return self.primad 94 | 95 | def evaluate(self): 96 | """ 97 | This method validates the PRIMAD experiment in accordance with the given 98 | "primad" identifier. Currently, the following experiments are supported. 
99 | - priMad: Parameter sweep 100 | - PRIMAd: Reproducibility evaluation on the same test collection 101 | - PRIMAD: Generalizability evaluation 102 | 103 | @return: Dictionary containing the average retrieval performance and 104 | the reproducibility measures for each run. 105 | """ 106 | 107 | if self.primad == 'priMad': 108 | if self.ref_adv_run is None and self.rep_adv is None: 109 | 110 | evaluations = {} 111 | 112 | self.rep_eval.run_b_orig = self.ref_base_run 113 | self.rep_eval.evaluate() 114 | 115 | for rep_run_path in self.rep_base + [self.ref_base_path]: 116 | 117 | run_evaluations = {} 118 | 119 | rep_run = MetadataHandler.strip_metadata(rep_run_path) 120 | scores = self.rpd_rel_eval.evaluate(rep_run) 121 | 122 | run_evaluations['arp'] = scores 123 | run_evaluations['ktu'] = self.rep_eval.ktau_union(run_b_rep=rep_run).get('baseline') 124 | run_evaluations['rbo'] = self.rep_eval.rbo(run_b_rep=rep_run).get('baseline') 125 | run_evaluations['rmse'] = self.rep_eval.nrmse(run_b_score=scores).get('baseline') 126 | run_evaluations['pval'] = self.rep_eval.ttest(run_b_score=scores).get('baseline') 127 | 128 | run_name = os.path.basename(rep_run_path) 129 | 130 | evaluations[run_name] = run_evaluations 131 | 132 | return evaluations 133 | 134 | if self.primad == 'PRIMAd': 135 | 136 | evaluations = {} 137 | 138 | self.rep_eval.run_b_orig = self.ref_base_run 139 | self.rep_eval.run_a_orig = self.ref_adv_run 140 | self.rep_eval.trim(t=1000) 141 | self.rep_eval.evaluate() 142 | 143 | pairs = self._find_pairs(rep_base=self.rep_base, rep_adv=self.rep_adv) 144 | pairs = pairs + [{'base': self.ref_base_path, 'adv': self.ref_adv_path}] 145 | 146 | for pair in pairs: 147 | 148 | pair_evaluations = {} 149 | 150 | rep_run_base = MetadataHandler.strip_metadata(pair.get('base')) 151 | rep_meta_base = MetadataHandler.read_metadata(pair.get('base')) 152 | rep_run_adv = MetadataHandler.strip_metadata(pair.get('adv')) 153 | rep_meta_adv = MetadataHandler.read_metadata(pair.get('adv')) 154 | 155 | self.rep_eval.trim(t=1000, run=rep_run_base) 156 | self.rep_eval.trim(t=1000, run=rep_run_adv) 157 | scores_base = self.rpd_rel_eval.evaluate(rep_run_base) 158 | scores_adv = self.rpd_rel_eval.evaluate(rep_run_adv) 159 | arp = {'baseline': scores_base, 'advanced': scores_adv} 160 | pair_evaluations['arp'] = arp 161 | pair_evaluations['ktu'] = self.rep_eval.ktau_union(run_b_rep=rep_run_base, run_a_rep=rep_run_adv) 162 | pair_evaluations['rbo'] = self.rep_eval.rbo(run_b_rep=rep_run_base, run_a_rep=rep_run_adv) 163 | pair_evaluations['rmse'] = self.rep_eval.nrmse(run_b_score=scores_base, run_a_score=scores_adv) 164 | pair_evaluations['er'] = self.rep_eval.er(run_b_score=scores_base, run_a_score=scores_adv) 165 | pair_evaluations['dri'] = self.rep_eval.dri(run_b_score=scores_base, run_a_score=scores_adv) 166 | pair_evaluations['pval'] = self.rep_eval.ttest(run_b_score=scores_base, run_a_score=scores_adv) 167 | 168 | if rep_meta_base.get('actor').get('team') == rep_meta_adv.get('actor').get('team'): 169 | expid = rep_meta_base.get('actor').get('team') 170 | else: 171 | expid = '_'.join([rep_meta_base.get('tag'), rep_meta_adv.get('tag')]) 172 | 173 | evaluations[expid] = pair_evaluations 174 | 175 | return evaluations 176 | 177 | if self.primad == 'PRIMAD': 178 | evaluations = {} 179 | 180 | self.rep_eval.run_b_orig = self.ref_base_run 181 | self.rep_eval.run_a_orig = self.ref_adv_run 182 | self.rep_eval.trim(t=1000) 183 | self.rep_eval.evaluate() 184 | 185 | pairs = self._find_pairs(rep_base=self.rep_base, 
rep_adv=self.rep_adv) 186 | pairs = pairs 187 | 188 | for pair in pairs: 189 | 190 | pair_evaluations = {} 191 | 192 | rep_run_base = MetadataHandler.strip_metadata(pair.get('base')) 193 | rep_meta_base = MetadataHandler.read_metadata(pair.get('base')) 194 | rep_run_adv = MetadataHandler.strip_metadata(pair.get('adv')) 195 | rep_meta_adv = MetadataHandler.read_metadata(pair.get('adv')) 196 | 197 | self.rep_eval.trim(t=1000, run=rep_run_base) 198 | self.rep_eval.trim(t=1000, run=rep_run_adv) 199 | scores_base = self.rpl_rel_eval.evaluate(rep_run_base) 200 | scores_adv = self.rpl_rel_eval.evaluate(rep_run_adv) 201 | arp = {'baseline': scores_base, 'advanced': scores_adv} 202 | pair_evaluations['arp'] = arp 203 | pair_evaluations['er'] = self.rep_eval.er(run_b_score=scores_base, run_a_score=scores_adv) 204 | pair_evaluations['dri'] = self.rep_eval.dri(run_b_score=scores_base, run_a_score=scores_adv) 205 | pair_evaluations['pval'] = self.rep_eval.ttest(run_b_score=scores_base, run_a_score=scores_adv) 206 | 207 | expid = '_'.join([rep_meta_base.get('tag'), rep_meta_adv.get('tag')]) 208 | evaluations[expid] = pair_evaluations 209 | 210 | return evaluations 211 | 212 | else: 213 | raise ValueError('The specified type of the PRIMAD experiments is not supported yet.') 214 | 215 | def _find_pairs(self, rep_base, rep_adv): 216 | """ 217 | This method finds pairs between lists of baseline and advanced runs. 218 | A pair is defined by the highest number of matching PRIMAD components. 219 | 220 | @param rep_base: List with baseline runs. 221 | @param rep_adv: List with advanced runs. 222 | 223 | @return: List with dictionaries containing paths to a baseline and an 224 | advanced run. 225 | """ 226 | 227 | pairs = [] 228 | for brp in rep_base: 229 | br = MetadataHandler.read_metadata(run_path=brp) 230 | 231 | arp = None 232 | cnt = 0 233 | 234 | for _arp in rep_adv: 235 | _cnt = 0 236 | ar = MetadataHandler.read_metadata(run_path=_arp) 237 | 238 | for k,v in br.items(): 239 | if v == ar.get(k): 240 | _cnt += 1 241 | 242 | if _cnt > cnt: 243 | cnt = _cnt 244 | arp = _arp 245 | 246 | pairs.append({'base': brp, 'adv': arp}) 247 | 248 | return pairs 249 | 250 | 251 | class MetadataAnalyzer: 252 | """ 253 | The MetadataAnalyzer is used to analyze set of different run files in 254 | reference to a run that has be be provided upon instantiation. The 255 | analyze_directory() method returns a dictionary with PRIMAD identifiers as 256 | keys and lists with the corresponding run paths as values. 257 | 258 | @param run_path: Path to the reference run file. 259 | """ 260 | 261 | def __init__(self, run_path): 262 | 263 | self.reference_run_path = run_path 264 | self.reference_run = MetadataHandler.strip_metadata(run_path) 265 | self.reference_metadata = MetadataHandler.read_metadata(run_path) 266 | 267 | def set_reference(self, run_path): 268 | """ 269 | Use this method to set a new reference run. 270 | 271 | @param run_path: Path to the new reference run file. 272 | """ 273 | 274 | self.reference_run_path = run_path 275 | self.reference_run = MetadataHandler.strip_metadata(run_path) 276 | self.reference_metadata = MetadataHandler.read_metadata(run_path) 277 | 278 | def analyze_directory(self, dir_path): 279 | """ 280 | Use this method to analyze the specified directory in comparison to the 281 | reference run. 282 | 283 | @param dir_path: Path to the directory. 
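# --- Illustrative workflow (editor's sketch, not part of the original module) ---
# Group the annotated runs of a directory by their PRIMAD signature relative to
# a reference run and evaluate, e.g., the parameter-sweep ('priMad') experiments.
# All file paths below are hypothetical.
from repro_eval.metadata import MetadataAnalyzer, PrimadExperiment

analyzer = MetadataAnalyzer('runs/orig_baseline.txt')
experiments = analyzer.analyze_directory('runs/')

experiment = PrimadExperiment(ref_base_path='runs/orig_baseline.txt',
                              primad='priMad',
                              rep_base=experiments.get('priMad', []),
                              rpd_qrels='qrels/robust04.txt')
print(experiment.evaluate())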
284 | """ 285 | 286 | components = ['platform', 'research goal', 'implementation', 'method', 'actor', 'data'] 287 | primad = {} 288 | 289 | files = os.listdir(dir_path) 290 | 291 | for _file in files: 292 | file_path = os.path.join(dir_path, _file) 293 | 294 | if file_path == self.reference_run_path: 295 | continue 296 | 297 | _metadata = MetadataHandler.read_metadata(file_path) 298 | 299 | primad_str = '' 300 | 301 | for component in components: 302 | if self.reference_metadata[component] != _metadata[component]: 303 | primad_str += component[0].upper() 304 | else: 305 | primad_str += component[0] 306 | 307 | primad[file_path] = primad_str 308 | 309 | experiments = defaultdict(list) 310 | for k, v in primad.items(): 311 | experiments[v].append(k) 312 | 313 | return experiments 314 | 315 | @staticmethod 316 | def filter_by_baseline(ref_run, runs): 317 | """ 318 | Use this method to filter a list of runs wrt. to the baseline that is 319 | specified under "research goal/evaluation/baseline" of a given reference run. 320 | 321 | @param ref_run: The reference with the baseline. 322 | @param runs: A list of run paths that is filtered. 323 | """ 324 | 325 | run_tag = MetadataHandler.read_metadata(ref_run).get('tag') 326 | 327 | filtered_list = [] 328 | for run in runs: 329 | _metadata = MetadataHandler.read_metadata(run) 330 | baseline = _metadata.get('research goal').get('evaluation').get('baseline')[0] 331 | if baseline == run_tag: 332 | filtered_list.append(run) 333 | 334 | return filtered_list 335 | 336 | @staticmethod 337 | def filter_by_test_collection(test_collection, runs): 338 | """ 339 | Use this method to filter a list of runs wrt. to the test collection 340 | specified under "data/test_collection". 341 | 342 | @param test_collection: Name of the test collection. 343 | @param runs: A list of run paths that is filtered. 344 | """ 345 | 346 | filtered_list = [] 347 | for run in runs: 348 | _metadata = MetadataHandler.read_metadata(run) 349 | name = _metadata.get('data').get('test collection').get('name') 350 | if test_collection == name: 351 | filtered_list.append(run) 352 | 353 | return filtered_list 354 | 355 | 356 | class MetadataHandler: 357 | """ 358 | Use the MetadataHandler for in- and output operations of annotated run files. 359 | 360 | @param run_path: Path the run file without metadata annotations. It is also 361 | possible to load an already annotated run and modify it with 362 | the MetadataHandler. 363 | @param metadata_path: Path to the YAML file containing the metadata that 364 | should be added to the run file. 365 | """ 366 | def __init__(self, run_path, metadata_path=None): 367 | 368 | self.run_path = run_path 369 | 370 | if metadata_path: 371 | self._metadata = MetadataHandler.read_metadata_template(metadata_path) 372 | else: 373 | self._metadata = MetadataHandler.read_metadata(run_path) 374 | 375 | def get_metadata(self): 376 | """ 377 | Use this method to get the currently set metadata annotations. 378 | 379 | @return: Nested dictionary containing the metadata annotations. 380 | """ 381 | 382 | return self._metadata 383 | 384 | def set_metadata(self, metadata_dict=None, metadata_path=None): 385 | """ 386 | Use this method to set/update the metadata. It can either be provided with a 387 | dictionary of a path to a YAML file 388 | 389 | @param metadata_dict: Nested dictionary containing the metadata annotations. 390 | @param metadata_path: Path to the YAML file with metadata. 
391 | """ 392 | if metadata_path: 393 | self._metadata = MetadataHandler.read_metadata_template(metadata_path) 394 | 395 | if metadata_dict: 396 | self._metadata = metadata_dict 397 | 398 | def dump_metadata(self, dump_path=None, complete_metadata=False, repo_path='.'): 399 | """ 400 | Use this method to dump the current metadata into a YAML file. 401 | The filename is a concatenation of the run tag and the "_annotated" suffix. 402 | 403 | @param dump_path: Path to the directory where the metadata is dumped. 404 | @param complete_metadata: If true, the Platform and Implementation will 405 | be added automatically, if not already provided. 406 | @param repo_path: Path to the git repository of the Implementation that 407 | underlies the run file. This path is needed for the 408 | automatic completion. 409 | """ 410 | 411 | if complete_metadata: 412 | self.complete_metadata(repo_path=repo_path) 413 | 414 | if self._metadata: 415 | 416 | tag = self._metadata['tag'] 417 | f_out_name = '_'.join([tag, 'dump.yaml']) 418 | f_out_path = os.path.join(dump_path, f_out_name) 419 | 420 | with open(f_out_path, 'wb') as f_out: 421 | bytes_io = BytesIO() 422 | yaml = YAML() 423 | yaml.width = 4096 424 | yaml.dump(self._metadata, bytes_io) 425 | f_out.write(bytes_io.getvalue()) 426 | 427 | def write_metadata(self, run_path=None, complete_metadata=False, repo_path='.'): 428 | """ 429 | This method writes the metadata into the run file. 430 | 431 | @param run_path: Path to the annotated run file. 432 | @param complete_metadata: If true, the Platform and Implementation will 433 | be added automatically, if not already provided. 434 | @param repo_path: Path to the git repository of the Implementation that 435 | underlies the run file. This path is needed for the 436 | automatic completion. 437 | """ 438 | if complete_metadata: 439 | self.complete_metadata(repo_path=repo_path) 440 | 441 | bytes_io = BytesIO() 442 | yaml = YAML() 443 | yaml.width = 4096 444 | yaml.dump(self._metadata, bytes_io) 445 | 446 | byte_str = bytes_io.getvalue().decode('UTF-8') 447 | lines = byte_str.split('\n') 448 | 449 | if run_path is None: 450 | f_out_path = '_'.join([self.run_path, 'annotated']) 451 | else: 452 | f_out_path = '_'.join([run_path]) 453 | 454 | with open(f_out_path, 'w') as f_out: 455 | 456 | f_out.write(''.join([META_START, '\n'])) 457 | for line in lines[:-1]: 458 | f_out.write(' '.join(['#', line, '\n'])) 459 | f_out.write(''.join([META_END, '\n'])) 460 | 461 | with open(self.run_path, 'r') as f_in: 462 | for run_line in f_in.readlines(): 463 | f_out.write(run_line) 464 | 465 | def complete_metadata(self, repo_path='.'): 466 | """ 467 | This method automatically adds metadata about the Platform and 468 | the Implementation component. 469 | 470 | @param repo_path: Path to the git repository of the Implementation that 471 | underlies the run file. If not specified this method 472 | assumes that the program is executed from the root 473 | directory of the git repository. 
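# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Annotate a hypothetical run file with metadata from a YAML template, let
# repro_eval complete the Platform and Implementation components, and read the
# annotations back in.
from repro_eval.metadata import MetadataHandler

handler = MetadataHandler(run_path='runs/baseline.txt',
                          metadata_path='metadata_template.yaml')
handler.write_metadata(run_path='runs/baseline_annotated.txt',
                       complete_metadata=True, repo_path='.')

metadata = MetadataHandler.read_metadata('runs/baseline_annotated.txt')
run = MetadataHandler.strip_metadata('runs/baseline_annotated.txt')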
474 | """ 475 | if self._metadata.get('platform') is None: 476 | platform_dict = { 477 | 'hardware': { 478 | 'cpu': self._get_cpu(), 479 | 'ram': self._get_ram(), 480 | }, 481 | 'operating system': self._get_os(), 482 | 'software': self._get_libs(), 483 | } 484 | 485 | self._metadata['platform'] = platform_dict 486 | 487 | if self._metadata.get('implementation') is None: 488 | self._metadata['implementation'] = self._get_src(repo_path=repo_path) 489 | 490 | @staticmethod 491 | def strip_metadata(annotated_run): 492 | ''' 493 | Strips off the metadata and returns a dict-version of the run that is parsed with pytrec_eval. 494 | 495 | @param annotated_run: Path to the annotated run file. 496 | 497 | @return: defaultdict that can be used with pytrec_eval or repro_eval. 498 | ''' 499 | 500 | with TextIOWrapper(buffer=BytesIO(), encoding='utf-8', line_buffering=True) as text_io_wrapper: 501 | with open(annotated_run, 'r') as f_in: 502 | lines = f_in.readlines() 503 | for line in lines: 504 | if line[0] != '#': 505 | text_io_wrapper.write(line) 506 | text_io_wrapper.seek(0,0) 507 | run = pytrec_eval.parse_run(text_io_wrapper) 508 | 509 | return run 510 | 511 | @staticmethod 512 | def read_metadata(run_path): 513 | ''' 514 | Reads the metadata out of an annotated run and returns a dict containing the metadata. 515 | 516 | @param run_path: Path to the run file. 517 | 518 | @return: Dictionary containing the metadata information of the annotated 519 | run file. 520 | ''' 521 | 522 | _metadata = None 523 | 524 | with open(run_path, 'r') as f_in: 525 | lines = f_in.readlines() 526 | if lines[0].strip('\n') == META_START: 527 | metadata_str = '' 528 | yaml=YAML(typ='safe') 529 | 530 | for line in lines[1:]: 531 | if line.strip('\n') != META_END: 532 | metadata_str += line.strip('#') 533 | else: 534 | break 535 | _metadata = yaml.load(metadata_str) 536 | 537 | return _metadata 538 | 539 | @staticmethod 540 | def read_metadata_template(metadata_path): 541 | """ 542 | This method reads in a YAML file containing the metadata. 543 | 544 | @param template_path: Path to the metadata YAML file. 545 | 546 | @return: Nested dictionary containing the metadata. 547 | """ 548 | 549 | with open(metadata_path, 'r') as f_in: 550 | yaml = YAML(typ='safe') 551 | return yaml.load(f_in) 552 | 553 | def _get_cpu(self): 554 | """ 555 | Reads out metadata information about the CPU including the model's name 556 | the architectures, the operation mode and the number of available cores. 557 | """ 558 | 559 | cpu = cpuinfo.get_cpu_info() 560 | return { 561 | 'model': cpu['brand_raw'], 562 | 'architecture': platform.machine(), 563 | 'operation mode': '-'.join([str(cpu['bits']), 'bit']), 564 | 'number of cores': cpu['count'], 565 | } 566 | 567 | def _get_os(self): 568 | """ 569 | Reads out metadata information about the operating system including 570 | the platform (e.g. Linux), the kernel release version, 571 | and the distribution's name. 572 | """ 573 | 574 | try: 575 | with open("/etc/os-release") as f_in: 576 | os_info = {} 577 | for line in f_in: 578 | k,v = line.rstrip().split('=') 579 | os_info[k] = v.strip('"') 580 | 581 | distribution = os_info['PRETTY_NAME'] 582 | 583 | except: 584 | warnings.warn('/etc/os-release not found. 
Using the available information of the platform package instead.') 585 | distribution = platform.version() 586 | 587 | return { 588 | 'platform': platform.system(), 589 | 'kernel': platform.release(), 590 | 'distribution': distribution, 591 | } 592 | 593 | def _get_ram(self): 594 | """ 595 | Reads out the available RAM and returns the size in GB. 596 | """ 597 | 598 | memory_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') 599 | memory_gb = memory_bytes/(1024.0 ** 3) 600 | return ' '.join([str(round(memory_gb, 2)),'GB']) 601 | 602 | def _get_libs(self): 603 | """ 604 | Reads out all installed Python packages of the active environment. 605 | """ 606 | 607 | installed_packages = [d.project_name for d in pkg_resources.working_set] 608 | return {'libraries': {'python': installed_packages}} 609 | 610 | def _get_src(self, repo_path='.'): 611 | """ 612 | Reads out information from the specified repository. 613 | 614 | @param repo_path: Path to the git repository of the Implementation that 615 | underlies the run file. If not specified this method 616 | assumes that the program is executed from the root 617 | directory of the git repository. 618 | """ 619 | 620 | extensions_path = pkg_resources.resource_filename(__name__, 'resources/extensions.json') 621 | 622 | repo = git.Repo(repo_path) 623 | 624 | with open(extensions_path, 'r') as input_file: 625 | extensions = json.load(input_file) 626 | 627 | languages = set() 628 | 629 | for _, _, files in os.walk('.'): 630 | for name in files: 631 | _, file_extension = os.path.splitext(name) 632 | language = extensions.get(file_extension[1:]) 633 | if language: 634 | languages.add(language) 635 | 636 | return { 637 | 'repository': repo.remote().url, 638 | 'commit': str(repo.head.commit), 639 | 'lang': list(languages), 640 | } 641 | -------------------------------------------------------------------------------- /repro_eval/resources/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": "Groff", 3 | "2": "Groff", 4 | "3": "Groff", 5 | "4": "Groff", 6 | "5": "Groff", 7 | "6": "Groff", 8 | "7": "Groff", 9 | "8": "Groff", 10 | "9": "Groff", 11 | "abap": "ABAP", 12 | "asc": "Public Key", 13 | "ash": "AGS Script", 14 | "ampl": "AMPL", 15 | "mod": "XML", 16 | "g4": "ANTLR", 17 | "apib": "API Blueprint", 18 | "apl": "APL", 19 | "dyalog": "APL", 20 | "asp": "ASP", 21 | "asax": "ASP", 22 | "ascx": "ASP", 23 | "ashx": "ASP", 24 | "asmx": "ASP", 25 | "aspx": "ASP", 26 | "axd": "ASP", 27 | "dats": "ATS", 28 | "hats": "ATS", 29 | "sats": "ATS", 30 | "as": "ActionScript", 31 | "adb": "Ada", 32 | "ada": "Ada", 33 | "ads": "Ada", 34 | "agda": "Agda", 35 | "als": "Alloy", 36 | "apacheconf": "ApacheConf", 37 | "vhost": "Nginx", 38 | "cls": "Visual Basic", 39 | "applescript": "AppleScript", 40 | "scpt": "AppleScript", 41 | "arc": "Arc", 42 | "ino": "Arduino", 43 | "asciidoc": "AsciiDoc", 44 | "adoc": "AsciiDoc", 45 | "aj": "AspectJ", 46 | "asm": "Assembly", 47 | "a51": "Assembly", 48 | "inc": "SourcePawn", 49 | "nasm": "Assembly", 50 | "aug": "Augeas", 51 | "ahk": "AutoHotkey", 52 | "ahkl": "AutoHotkey", 53 | "au3": "AutoIt", 54 | "awk": "Awk", 55 | "auk": "Awk", 56 | "gawk": "Awk", 57 | "mawk": "Awk", 58 | "nawk": "Awk", 59 | "bat": "Batchfile", 60 | "cmd": "Batchfile", 61 | "befunge": "Befunge", 62 | "bison": "Bison", 63 | "bb": "BlitzBasic", 64 | "decls": "BlitzBasic", 65 | "bmx": "BlitzMax", 66 | "bsv": "Bluespec", 67 | "boo": "Boo", 68 | "b": "Limbo", 69 | "bf": "HyPhy", 70 | "brs": 
"Brightscript", 71 | "bro": "Bro", 72 | "c": "C", 73 | "cats": "C", 74 | "h": "Objective-C", 75 | "idc": "C", 76 | "w": "C", 77 | "cs": "Smalltalk", 78 | "cake": "CoffeeScript", 79 | "cshtml": "C#", 80 | "csx": "C#", 81 | "cpp": "C++", 82 | "c++": "C++", 83 | "cc": "C++", 84 | "cp": "Component Pascal", 85 | "cxx": "C++", 86 | "h++": "C++", 87 | "hh": "Hack", 88 | "hpp": "C++", 89 | "hxx": "C++", 90 | "inl": "C++", 91 | "ipp": "C++", 92 | "tcc": "C++", 93 | "tpp": "C++", 94 | "c-objdump": "C-ObjDump", 95 | "chs": "C2hs Haskell", 96 | "clp": "CLIPS", 97 | "cmake": "CMake", 98 | "cmake.in": "CMake", 99 | "cob": "COBOL", 100 | "cbl": "COBOL", 101 | "ccp": "COBOL", 102 | "cobol": "COBOL", 103 | "cpy": "COBOL", 104 | "css": "CSS", 105 | "csv": "CSV", 106 | "capnp": "Cap'n Proto", 107 | "mss": "CartoCSS", 108 | "ceylon": "Ceylon", 109 | "chpl": "Chapel", 110 | "ch": "xBase", 111 | "ck": "ChucK", 112 | "cirru": "Cirru", 113 | "clw": "Clarion", 114 | "icl": "Clean", 115 | "dcl": "Clean", 116 | "click": "Click", 117 | "clj": "Clojure", 118 | "boot": "Clojure", 119 | "cl2": "Clojure", 120 | "cljc": "Clojure", 121 | "cljs": "Clojure", 122 | "cljs.hl": "Clojure", 123 | "cljscm": "Clojure", 124 | "cljx": "Clojure", 125 | "hic": "Clojure", 126 | "coffee": "CoffeeScript", 127 | "_coffee": "CoffeeScript", 128 | "cjsx": "CoffeeScript", 129 | "cson": "CoffeeScript", 130 | "iced": "CoffeeScript", 131 | "cfm": "ColdFusion", 132 | "cfml": "ColdFusion", 133 | "cfc": "ColdFusion CFC", 134 | "lisp": "NewLisp", 135 | "asd": "Common Lisp", 136 | "cl": "OpenCL", 137 | "l": "PicoLisp", 138 | "lsp": "NewLisp", 139 | "ny": "Common Lisp", 140 | "podsl": "Common Lisp", 141 | "sexp": "Common Lisp", 142 | "cps": "Component Pascal", 143 | "coq": "Coq", 144 | "v": "Verilog", 145 | "cppobjdump": "Cpp-ObjDump", 146 | "c++-objdump": "Cpp-ObjDump", 147 | "c++objdump": "Cpp-ObjDump", 148 | "cpp-objdump": "Cpp-ObjDump", 149 | "cxx-objdump": "Cpp-ObjDump", 150 | "creole": "Creole", 151 | "cr": "Crystal", 152 | "feature": "Cucumber", 153 | "cu": "Cuda", 154 | "cuh": "Cuda", 155 | "cy": "Cycript", 156 | "pyx": "Cython", 157 | "pxd": "Cython", 158 | "pxi": "Cython", 159 | "d": "Makefile", 160 | "di": "D", 161 | "d-objdump": "D-ObjDump", 162 | "com": "DIGITAL Command Language", 163 | "dm": "DM", 164 | "zone": "DNS Zone", 165 | "arpa": "DNS Zone", 166 | "darcspatch": "Darcs Patch", 167 | "dpatch": "Darcs Patch", 168 | "dart": "Dart", 169 | "diff": "Diff", 170 | "patch": "Diff", 171 | "dockerfile": "Dockerfile", 172 | "djs": "Dogescript", 173 | "dylan": "Dylan", 174 | "dyl": "Dylan", 175 | "intr": "Dylan", 176 | "lid": "Dylan", 177 | "E": "E", 178 | "ecl": "ECLiPSe", 179 | "eclxml": "ECL", 180 | "sch": "KiCad", 181 | "brd": "KiCad", 182 | "epj": "Ecere Projects", 183 | "e": "Eiffel", 184 | "ex": "Elixir", 185 | "exs": "Elixir", 186 | "elm": "Elm", 187 | "el": "Emacs Lisp", 188 | "emacs": "Emacs Lisp", 189 | "emacs.desktop": "Emacs Lisp", 190 | "em": "EmberScript", 191 | "emberscript": "EmberScript", 192 | "erl": "Erlang", 193 | "es": "JavaScript", 194 | "escript": "Erlang", 195 | "hrl": "Erlang", 196 | "xrl": "Erlang", 197 | "yrl": "Erlang", 198 | "fs": "GLSL", 199 | "fsi": "F#", 200 | "fsx": "F#", 201 | "fx": "HLSL", 202 | "flux": "FLUX", 203 | "f90": "FORTRAN", 204 | "f": "Forth", 205 | "f03": "FORTRAN", 206 | "f08": "FORTRAN", 207 | "f77": "FORTRAN", 208 | "f95": "FORTRAN", 209 | "for": "Forth", 210 | "fpp": "FORTRAN", 211 | "factor": "Factor", 212 | "fy": "Fancy", 213 | "fancypack": "Fancy", 214 | "fan": "Fantom", 215 | "eam.fs": 
"Formatted", 216 | "fth": "Forth", 217 | "4th": "Forth", 218 | "forth": "Forth", 219 | "fr": "Text", 220 | "frt": "Forth", 221 | "ftl": "FreeMarker", 222 | "g": "GAP", 223 | "gco": "G-code", 224 | "gcode": "G-code", 225 | "gms": "GAMS", 226 | "gap": "GAP", 227 | "gd": "GDScript", 228 | "gi": "GAP", 229 | "tst": "Scilab", 230 | "s": "GAS", 231 | "ms": "MAXScript", 232 | "glsl": "GLSL", 233 | "fp": "GLSL", 234 | "frag": "JavaScript", 235 | "frg": "GLSL", 236 | "fsh": "GLSL", 237 | "fshader": "GLSL", 238 | "geo": "GLSL", 239 | "geom": "GLSL", 240 | "glslv": "GLSL", 241 | "gshader": "GLSL", 242 | "shader": "GLSL", 243 | "vert": "GLSL", 244 | "vrx": "GLSL", 245 | "vsh": "GLSL", 246 | "vshader": "GLSL", 247 | "gml": "XML", 248 | "kid": "Genshi", 249 | "ebuild": "Gentoo Ebuild", 250 | "eclass": "Gentoo Eclass", 251 | "po": "Gettext Catalog", 252 | "pot": "Gettext Catalog", 253 | "glf": "Glyph", 254 | "gp": "Gnuplot", 255 | "gnu": "Gnuplot", 256 | "gnuplot": "Gnuplot", 257 | "plot": "Gnuplot", 258 | "plt": "Gnuplot", 259 | "go": "Go", 260 | "golo": "Golo", 261 | "gs": "JavaScript", 262 | "gst": "Gosu", 263 | "gsx": "Gosu", 264 | "vark": "Gosu", 265 | "grace": "Grace", 266 | "gradle": "Gradle", 267 | "gf": "Grammatical Framework", 268 | "graphql": "GraphQL", 269 | "dot": "Graphviz (DOT)", 270 | "gv": "Graphviz (DOT)", 271 | "man": "Groff", 272 | "1in": "Groff", 273 | "1m": "Groff", 274 | "1x": "Groff", 275 | "3in": "Groff", 276 | "3m": "Groff", 277 | "3qt": "Groff", 278 | "3x": "Groff", 279 | "me": "Groff", 280 | "n": "Nemerle", 281 | "rno": "Groff", 282 | "roff": "Groff", 283 | "groovy": "Groovy", 284 | "grt": "Groovy", 285 | "gtpl": "Groovy", 286 | "gvy": "Groovy", 287 | "gsp": "Groovy Server Pages", 288 | "hcl": "HCL", 289 | "tf": "HCL", 290 | "hlsl": "HLSL", 291 | "fxh": "HLSL", 292 | "hlsli": "HLSL", 293 | "html": "HTML", 294 | "htm": "HTML", 295 | "html.hl": "HTML", 296 | "st": "Smalltalk", 297 | "xht": "HTML", 298 | "xhtml": "HTML", 299 | "mustache": "HTML+Django", 300 | "jinja": "HTML+Django", 301 | "eex": "HTML+EEX", 302 | "erb": "HTML+ERB", 303 | "erb.deface": "HTML+ERB", 304 | "phtml": "HTML+PHP", 305 | "http": "HTTP", 306 | "php": "PHP", 307 | "haml": "Haml", 308 | "haml.deface": "Haml", 309 | "handlebars": "Handlebars", 310 | "hbs": "Handlebars", 311 | "hb": "Harbour", 312 | "hs": "Haskell", 313 | "hsc": "Haskell", 314 | "hx": "Haxe", 315 | "hxsl": "Haxe", 316 | "hy": "Hy", 317 | "pro": "QMake", 318 | "dlm": "IDL", 319 | "ipf": "IGOR Pro", 320 | "ini": "INI", 321 | "cfg": "INI", 322 | "prefs": "INI", 323 | "properties": "INI", 324 | "irclog": "IRC log", 325 | "weechatlog": "IRC log", 326 | "idr": "Idris", 327 | "lidr": "Idris", 328 | "ni": "Inform 7", 329 | "i7x": "Inform 7", 330 | "iss": "Inno Setup", 331 | "io": "Io", 332 | "ik": "Ioke", 333 | "thy": "Isabelle", 334 | "ijs": "J", 335 | "flex": "JFlex", 336 | "jflex": "JFlex", 337 | "json": "JSON", 338 | "geojson": "JSON", 339 | "lock": "JSON", 340 | "topojson": "JSON", 341 | "json5": "JSON5", 342 | "jsonld": "JSONLD", 343 | "jq": "JSONiq", 344 | "jsx": "JSX", 345 | "jade": "Jade", 346 | "j": "Objective-J", 347 | "java": "Java", 348 | "jsp": "Java Server Pages", 349 | "js": "JavaScript", 350 | "_js": "JavaScript", 351 | "bones": "JavaScript", 352 | "es6": "JavaScript", 353 | "jake": "JavaScript", 354 | "jsb": "JavaScript", 355 | "jscad": "JavaScript", 356 | "jsfl": "JavaScript", 357 | "jsm": "JavaScript", 358 | "jss": "JavaScript", 359 | "njs": "JavaScript", 360 | "pac": "JavaScript", 361 | "sjs": "JavaScript", 362 | "ssjs": 
"JavaScript", 363 | "sublime-build": "JavaScript", 364 | "sublime-commands": "JavaScript", 365 | "sublime-completions": "JavaScript", 366 | "sublime-keymap": "JavaScript", 367 | "sublime-macro": "JavaScript", 368 | "sublime-menu": "JavaScript", 369 | "sublime-mousemap": "JavaScript", 370 | "sublime-project": "JavaScript", 371 | "sublime-settings": "JavaScript", 372 | "sublime-theme": "JavaScript", 373 | "sublime-workspace": "JavaScript", 374 | "sublime_metrics": "JavaScript", 375 | "sublime_session": "JavaScript", 376 | "xsjs": "JavaScript", 377 | "xsjslib": "JavaScript", 378 | "jl": "Julia", 379 | "ipynb": "Jupyter Notebook", 380 | "krl": "KRL", 381 | "kicad_pcb": "KiCad", 382 | "kit": "Kit", 383 | "kt": "Kotlin", 384 | "ktm": "Kotlin", 385 | "kts": "Kotlin", 386 | "lfe": "LFE", 387 | "ll": "LLVM", 388 | "lol": "LOLCODE", 389 | "lsl": "LSL", 390 | "lslp": "LSL", 391 | "lvproj": "LabVIEW", 392 | "lasso": "Lasso", 393 | "las": "Lasso", 394 | "lasso8": "Lasso", 395 | "lasso9": "Lasso", 396 | "ldml": "Lasso", 397 | "latte": "Latte", 398 | "lean": "Lean", 399 | "hlean": "Lean", 400 | "less": "Less", 401 | "lex": "Lex", 402 | "ly": "LilyPond", 403 | "ily": "LilyPond", 404 | "m": "Objective-C", 405 | "ld": "Linker Script", 406 | "lds": "Linker Script", 407 | "liquid": "Liquid", 408 | "lagda": "Literate Agda", 409 | "litcoffee": "Literate CoffeeScript", 410 | "lhs": "Literate Haskell", 411 | "ls": "LoomScript", 412 | "_ls": "LiveScript", 413 | "xm": "Logos", 414 | "x": "Logos", 415 | "xi": "Logos", 416 | "lgt": "Logtalk", 417 | "logtalk": "Logtalk", 418 | "lookml": "LookML", 419 | "lua": "Lua", 420 | "fcgi": "Shell", 421 | "nse": "Lua", 422 | "pd_lua": "Lua", 423 | "rbxs": "Lua", 424 | "wlua": "Lua", 425 | "mumps": "M", 426 | "m4": "M4Sugar", 427 | "mcr": "MAXScript", 428 | "mtml": "MTML", 429 | "muf": "MUF", 430 | "mak": "Makefile", 431 | "mk": "Makefile", 432 | "mkfile": "Makefile", 433 | "mako": "Mako", 434 | "mao": "Mako", 435 | "md": "Markdown", 436 | "markdown": "Markdown", 437 | "mkd": "Markdown", 438 | "mkdn": "Markdown", 439 | "mkdown": "Markdown", 440 | "ron": "Markdown", 441 | "mask": "Mask", 442 | "mathematica": "Mathematica", 443 | "cdf": "Mathematica", 444 | "ma": "Mathematica", 445 | "mt": "Mathematica", 446 | "nb": "Text", 447 | "nbp": "Mathematica", 448 | "wl": "Mathematica", 449 | "wlt": "Mathematica", 450 | "matlab": "Matlab", 451 | "maxpat": "Max", 452 | "maxhelp": "Max", 453 | "maxproj": "Max", 454 | "mxt": "Max", 455 | "pat": "Max", 456 | "mediawiki": "MediaWiki", 457 | "wiki": "MediaWiki", 458 | "moo": "Moocode", 459 | "metal": "Metal", 460 | "minid": "MiniD", 461 | "druby": "Mirah", 462 | "duby": "Mirah", 463 | "mir": "Mirah", 464 | "mirah": "Mirah", 465 | "mo": "Modelica", 466 | "mms": "Module Management System", 467 | "mmk": "Module Management System", 468 | "monkey": "Monkey", 469 | "moon": "MoonScript", 470 | "myt": "Myghty", 471 | "ncl": "Text", 472 | "nl": "NewLisp", 473 | "nsi": "NSIS", 474 | "nsh": "NSIS", 475 | "axs": "NetLinx", 476 | "axi": "NetLinx", 477 | "axs.erb": "NetLinx+ERB", 478 | "axi.erb": "NetLinx+ERB", 479 | "nlogo": "NetLogo", 480 | "nginxconf": "Nginx", 481 | "nim": "Nimrod", 482 | "nimrod": "Nimrod", 483 | "ninja": "Ninja", 484 | "nit": "Nit", 485 | "nix": "Nix", 486 | "nu": "Nu", 487 | "numpy": "NumPy", 488 | "numpyw": "NumPy", 489 | "numsc": "NumPy", 490 | "ml": "OCaml", 491 | "eliom": "OCaml", 492 | "eliomi": "OCaml", 493 | "ml4": "OCaml", 494 | "mli": "OCaml", 495 | "mll": "OCaml", 496 | "mly": "OCaml", 497 | "objdump": "ObjDump", 498 | "mm": 
"XML", 499 | "sj": "Objective-J", 500 | "omgrofl": "Omgrofl", 501 | "opa": "Opa", 502 | "opal": "Opal", 503 | "opencl": "OpenCL", 504 | "p": "OpenEdge ABL", 505 | "scad": "OpenSCAD", 506 | "org": "Org", 507 | "ox": "Ox", 508 | "oxh": "Ox", 509 | "oxo": "Ox", 510 | "oxygene": "Oxygene", 511 | "oz": "Oz", 512 | "pwn": "PAWN", 513 | "aw": "PHP", 514 | "ctp": "PHP", 515 | "php3": "PHP", 516 | "php4": "PHP", 517 | "php5": "PHP", 518 | "phps": "PHP", 519 | "phpt": "PHP", 520 | "pls": "PLSQL", 521 | "pck": "PLSQL", 522 | "pkb": "PLSQL", 523 | "pks": "PLSQL", 524 | "plb": "PLSQL", 525 | "plsql": "PLSQL", 526 | "sql": "SQLPL", 527 | "pov": "POV-Ray SDL", 528 | "pan": "Pan", 529 | "psc": "Papyrus", 530 | "parrot": "Parrot", 531 | "pasm": "Parrot Assembly", 532 | "pir": "Parrot Internal Representation", 533 | "pas": "Pascal", 534 | "dfm": "Pascal", 535 | "dpr": "Pascal", 536 | "lpr": "Pascal", 537 | "pp": "Puppet", 538 | "pl": "Prolog", 539 | "al": "Perl", 540 | "cgi": "Shell", 541 | "perl": "Perl", 542 | "ph": "Perl", 543 | "plx": "Perl", 544 | "pm": "Perl6", 545 | "pod": "Pod", 546 | "psgi": "Perl", 547 | "t": "Turing", 548 | "6pl": "Perl6", 549 | "6pm": "Perl6", 550 | "nqp": "Perl6", 551 | "p6": "Perl6", 552 | "p6l": "Perl6", 553 | "p6m": "Perl6", 554 | "pl6": "Perl6", 555 | "pm6": "Perl6", 556 | "pkl": "Pickle", 557 | "pig": "PigLatin", 558 | "pike": "Pike", 559 | "pmod": "Pike", 560 | "pogo": "PogoScript", 561 | "pony": "Pony", 562 | "ps": "PostScript", 563 | "eps": "PostScript", 564 | "ps1": "PowerShell", 565 | "psd1": "PowerShell", 566 | "psm1": "PowerShell", 567 | "pde": "Processing", 568 | "prolog": "Prolog", 569 | "yap": "Prolog", 570 | "spin": "Propeller Spin", 571 | "proto": "Protocol Buffer", 572 | "pub": "Public Key", 573 | "pd": "Pure Data", 574 | "pb": "PureBasic", 575 | "pbi": "PureBasic", 576 | "purs": "PureScript", 577 | "py": "Python", 578 | "bzl": "Python", 579 | "gyp": "Python", 580 | "lmi": "Python", 581 | "pyde": "Python", 582 | "pyp": "Python", 583 | "pyt": "Python", 584 | "pyw": "Python", 585 | "rpy": "Ren'Py", 586 | "tac": "Python", 587 | "wsgi": "Python", 588 | "xpy": "Python", 589 | "pytb": "Python traceback", 590 | "qml": "QML", 591 | "qbs": "QML", 592 | "pri": "QMake", 593 | "r": "Rebol", 594 | "rd": "R", 595 | "rsx": "R", 596 | "raml": "RAML", 597 | "rdoc": "RDoc", 598 | "rbbas": "REALbasic", 599 | "rbfrm": "REALbasic", 600 | "rbmnu": "REALbasic", 601 | "rbres": "REALbasic", 602 | "rbtbar": "REALbasic", 603 | "rbuistate": "REALbasic", 604 | "rhtml": "RHTML", 605 | "rmd": "RMarkdown", 606 | "rkt": "Racket", 607 | "rktd": "Racket", 608 | "rktl": "Racket", 609 | "scrbl": "Racket", 610 | "rl": "Ragel in Ruby Host", 611 | "raw": "Raw token data", 612 | "reb": "Rebol", 613 | "r2": "Rebol", 614 | "r3": "Rebol", 615 | "rebol": "Rebol", 616 | "red": "Red", 617 | "reds": "Red", 618 | "cw": "Redcode", 619 | "rs": "Rust", 620 | "rsh": "RenderScript", 621 | "robot": "RobotFramework", 622 | "rg": "Rouge", 623 | "rb": "Ruby", 624 | "builder": "Ruby", 625 | "gemspec": "Ruby", 626 | "god": "Ruby", 627 | "irbrc": "Ruby", 628 | "jbuilder": "Ruby", 629 | "mspec": "Ruby", 630 | "pluginspec": "XML", 631 | "podspec": "Ruby", 632 | "rabl": "Ruby", 633 | "rake": "Ruby", 634 | "rbuild": "Ruby", 635 | "rbw": "Ruby", 636 | "rbx": "Ruby", 637 | "ru": "Ruby", 638 | "ruby": "Ruby", 639 | "thor": "Ruby", 640 | "watchr": "Ruby", 641 | "rs.in": "Rust", 642 | "sas": "SAS", 643 | "scss": "SCSS", 644 | "smt2": "SMT", 645 | "smt": "SMT", 646 | "sparql": "SPARQL", 647 | "rq": "SPARQL", 648 | "sqf": "SQF", 
649 | "hqf": "SQF", 650 | "cql": "SQL", 651 | "ddl": "SQL", 652 | "prc": "SQL", 653 | "tab": "SQL", 654 | "udf": "SQL", 655 | "viw": "SQL", 656 | "db2": "SQLPL", 657 | "ston": "STON", 658 | "svg": "SVG", 659 | "sage": "Sage", 660 | "sagews": "Sage", 661 | "sls": "Scheme", 662 | "sass": "Sass", 663 | "scala": "Scala", 664 | "sbt": "Scala", 665 | "sc": "SuperCollider", 666 | "scaml": "Scaml", 667 | "scm": "Scheme", 668 | "sld": "Scheme", 669 | "sps": "Scheme", 670 | "ss": "Scheme", 671 | "sci": "Scilab", 672 | "sce": "Scilab", 673 | "self": "Self", 674 | "sh": "Shell", 675 | "bash": "Shell", 676 | "bats": "Shell", 677 | "command": "Shell", 678 | "ksh": "Shell", 679 | "sh.in": "Shell", 680 | "tmux": "Shell", 681 | "tool": "Shell", 682 | "zsh": "Shell", 683 | "sh-session": "ShellSession", 684 | "shen": "Shen", 685 | "sl": "Slash", 686 | "slim": "Slim", 687 | "smali": "Smali", 688 | "tpl": "Smarty", 689 | "sp": "SourcePawn", 690 | "sma": "SourcePawn", 691 | "nut": "Squirrel", 692 | "stan": "Stan", 693 | "ML": "Standard ML", 694 | "fun": "Standard ML", 695 | "sig": "Standard ML", 696 | "sml": "Standard ML", 697 | "do": "Stata", 698 | "ado": "Stata", 699 | "doh": "Stata", 700 | "ihlp": "Stata", 701 | "mata": "Stata", 702 | "matah": "Stata", 703 | "sthlp": "Stata", 704 | "styl": "Stylus", 705 | "scd": "SuperCollider", 706 | "swift": "Swift", 707 | "sv": "SystemVerilog", 708 | "svh": "SystemVerilog", 709 | "vh": "SystemVerilog", 710 | "toml": "TOML", 711 | "txl": "TXL", 712 | "tcl": "Tcl", 713 | "adp": "Tcl", 714 | "tm": "Tcl", 715 | "tcsh": "Tcsh", 716 | "csh": "Tcsh", 717 | "tex": "TeX", 718 | "aux": "TeX", 719 | "bbx": "TeX", 720 | "bib": "TeX", 721 | "cbx": "TeX", 722 | "dtx": "TeX", 723 | "ins": "TeX", 724 | "lbx": "TeX", 725 | "ltx": "TeX", 726 | "mkii": "TeX", 727 | "mkiv": "TeX", 728 | "mkvi": "TeX", 729 | "sty": "TeX", 730 | "toc": "TeX", 731 | "tea": "Tea", 732 | "txt": "Text", 733 | "no": "Text", 734 | "textile": "Textile", 735 | "thrift": "Thrift", 736 | "tu": "Turing", 737 | "ttl": "Turtle", 738 | "twig": "Twig", 739 | "ts": "XML", 740 | "tsx": "XML", 741 | "upc": "Unified Parallel C", 742 | "anim": "Unity3D Asset", 743 | "asset": "Unity3D Asset", 744 | "mat": "Unity3D Asset", 745 | "meta": "Unity3D Asset", 746 | "prefab": "Unity3D Asset", 747 | "unity": "Unity3D Asset", 748 | "uno": "Uno", 749 | "uc": "UnrealScript", 750 | "ur": "UrWeb", 751 | "urs": "UrWeb", 752 | "vcl": "VCL", 753 | "vhdl": "VHDL", 754 | "vhd": "VHDL", 755 | "vhf": "VHDL", 756 | "vhi": "VHDL", 757 | "vho": "VHDL", 758 | "vhs": "VHDL", 759 | "vht": "VHDL", 760 | "vhw": "VHDL", 761 | "vala": "Vala", 762 | "vapi": "Vala", 763 | "veo": "Verilog", 764 | "vim": "VimL", 765 | "vb": "Visual Basic", 766 | "bas": "Visual Basic", 767 | "frm": "Visual Basic", 768 | "frx": "Visual Basic", 769 | "vba": "Visual Basic", 770 | "vbhtml": "Visual Basic", 771 | "vbs": "Visual Basic", 772 | "volt": "Volt", 773 | "vue": "Vue", 774 | "owl": "Web Ontology Language", 775 | "webidl": "WebIDL", 776 | "x10": "X10", 777 | "xc": "XC", 778 | "xml": "XML", 779 | "ant": "XML", 780 | "axml": "XML", 781 | "ccxml": "XML", 782 | "clixml": "XML", 783 | "cproject": "XML", 784 | "csl": "XML", 785 | "csproj": "XML", 786 | "ct": "XML", 787 | "dita": "XML", 788 | "ditamap": "XML", 789 | "ditaval": "XML", 790 | "dll.config": "XML", 791 | "dotsettings": "XML", 792 | "filters": "XML", 793 | "fsproj": "XML", 794 | "fxml": "XML", 795 | "glade": "XML", 796 | "grxml": "XML", 797 | "iml": "XML", 798 | "ivy": "XML", 799 | "jelly": "XML", 800 | "jsproj": "XML", 801 | 
"kml": "XML", 802 | "launch": "XML", 803 | "mdpolicy": "XML", 804 | "mxml": "XML", 805 | "nproj": "XML", 806 | "nuspec": "XML", 807 | "odd": "XML", 808 | "osm": "XML", 809 | "plist": "XML", 810 | "props": "XML", 811 | "ps1xml": "XML", 812 | "psc1": "XML", 813 | "pt": "XML", 814 | "rdf": "XML", 815 | "rss": "XML", 816 | "scxml": "XML", 817 | "srdf": "XML", 818 | "storyboard": "XML", 819 | "stTheme": "XML", 820 | "sublime-snippet": "XML", 821 | "targets": "XML", 822 | "tmCommand": "XML", 823 | "tml": "XML", 824 | "tmLanguage": "XML", 825 | "tmPreferences": "XML", 826 | "tmSnippet": "XML", 827 | "tmTheme": "XML", 828 | "ui": "XML", 829 | "urdf": "XML", 830 | "ux": "XML", 831 | "vbproj": "XML", 832 | "vcxproj": "XML", 833 | "vssettings": "XML", 834 | "vxml": "XML", 835 | "wsdl": "XML", 836 | "wsf": "XML", 837 | "wxi": "XML", 838 | "wxl": "XML", 839 | "wxs": "XML", 840 | "x3d": "XML", 841 | "xacro": "XML", 842 | "xaml": "XML", 843 | "xib": "XML", 844 | "xlf": "XML", 845 | "xliff": "XML", 846 | "xmi": "XML", 847 | "xml.dist": "XML", 848 | "xproj": "XML", 849 | "xsd": "XML", 850 | "xul": "XML", 851 | "zcml": "XML", 852 | "xsp-config": "XPages", 853 | "xsp.metadata": "XPages", 854 | "xpl": "XProc", 855 | "xproc": "XProc", 856 | "xquery": "XQuery", 857 | "xq": "XQuery", 858 | "xql": "XQuery", 859 | "xqm": "XQuery", 860 | "xqy": "XQuery", 861 | "xs": "XS", 862 | "xslt": "XSLT", 863 | "xsl": "XSLT", 864 | "xojo_code": "Xojo", 865 | "xojo_menu": "Xojo", 866 | "xojo_report": "Xojo", 867 | "xojo_script": "Xojo", 868 | "xojo_toolbar": "Xojo", 869 | "xojo_window": "Xojo", 870 | "xtend": "Xtend", 871 | "yml": "YAML", 872 | "reek": "YAML", 873 | "rviz": "YAML", 874 | "sublime-syntax": "YAML", 875 | "syntax": "YAML", 876 | "yaml": "YAML", 877 | "yaml-tmlanguage": "YAML", 878 | "yang": "YANG", 879 | "y": "Yacc", 880 | "yacc": "Yacc", 881 | "yy": "Yacc", 882 | "zep": "Zephir", 883 | "zimpl": "Zimpl", 884 | "zmpl": "Zimpl", 885 | "zpl": "Zimpl", 886 | "desktop": "desktop", 887 | "desktop.in": "desktop", 888 | "ec": "eC", 889 | "eh": "eC", 890 | "edn": "edn", 891 | "fish": "fish", 892 | "mu": "mupad", 893 | "nc": "nesC", 894 | "ooc": "ooc", 895 | "rst": "reStructuredText", 896 | "rest": "reStructuredText", 897 | "rest.txt": "reStructuredText", 898 | "rst.txt": "reStructuredText", 899 | "wisp": "wisp", 900 | "prg": "xBase", 901 | "prw": "xBase" 902 | } -------------------------------------------------------------------------------- /repro_eval/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/repro_eval/test/__init__.py -------------------------------------------------------------------------------- /repro_eval/test/test_empty_rpd.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator 3 | from repro_eval.config import ERR_MSG 4 | 5 | rpd_eval = RpdEvaluator(qrel_orig_path=None, 6 | run_b_orig_path=None, 7 | run_a_orig_path=None, 8 | run_b_rep_path=None, 9 | run_a_rep_path=None) 10 | 11 | 12 | def test_ktu(capfd): 13 | assert None is rpd_eval.ktau_union() 14 | out, err = capfd.readouterr() 15 | assert out == ''.join([ERR_MSG, '\n']) 16 | 17 | 18 | def test_rbo(capfd): 19 | assert None is rpd_eval.rbo() 20 | out, err = capfd.readouterr() 21 | assert out == ''.join([ERR_MSG, '\n']) 22 | 23 | 24 | def test_rmse(capfd): 25 | assert None is rpd_eval.rmse() 26 | out, err = 
capfd.readouterr() 27 | assert out == ''.join([ERR_MSG, '\n']) 28 | 29 | 30 | def test_er(capfd): 31 | assert None is rpd_eval.er() 32 | out, err = capfd.readouterr() 33 | assert out == ''.join([ERR_MSG, '\n']) 34 | 35 | 36 | def test_dri(capfd): 37 | assert None is rpd_eval.dri() 38 | out, err = capfd.readouterr() 39 | assert out == ''.join([ERR_MSG, '\n']) 40 | 41 | 42 | def test_ttest(capfd): 43 | assert None is rpd_eval.ttest() 44 | out, err = capfd.readouterr() 45 | assert out == ''.join([ERR_MSG, '\n']) 46 | -------------------------------------------------------------------------------- /repro_eval/test/test_empty_rpl.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RplEvaluator 3 | from repro_eval.config import ERR_MSG 4 | 5 | rpd_eval = RplEvaluator(qrel_orig_path=None, 6 | run_b_orig_path=None, 7 | run_a_orig_path=None, 8 | run_b_rep_path=None, 9 | run_a_rep_path=None, 10 | qrel_rpl_path=None) 11 | 12 | 13 | def test_er(capfd): 14 | assert None is rpd_eval.er() 15 | out, err = capfd.readouterr() 16 | assert out == ''.join([ERR_MSG, '\n']) 17 | 18 | 19 | def test_dri(capfd): 20 | assert None is rpd_eval.dri() 21 | out, err = capfd.readouterr() 22 | assert out == ''.join([ERR_MSG, '\n']) 23 | 24 | 25 | def test_ttest(capfd): 26 | assert None is rpd_eval.ttest() 27 | out, err = capfd.readouterr() 28 | assert out == ''.join([ERR_MSG, '\n']) 29 | -------------------------------------------------------------------------------- /repro_eval/test/test_kwargs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pytrec_eval 3 | from repro_eval.Evaluator import RpdEvaluator 4 | 5 | 6 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 7 | run_b_orig_path='./example/orig_b.txt', 8 | run_a_orig_path='./example/orig_a.txt', 9 | run_b_rep_path='./example/rpd_b.txt', 10 | run_a_rep_path='./example/rpd_a.txt') 11 | 12 | rpd_eval.trim() 13 | rpd_eval.evaluate() 14 | 15 | ktu = rpd_eval.ktau_union() 16 | ktu_base = ktu.get('baseline') 17 | ktu_adv = ktu.get('advanced') 18 | 19 | rbo = rpd_eval.rbo() 20 | rbo_base = rbo.get('baseline') 21 | rbo_adv = rbo.get('advanced') 22 | 23 | 24 | def test_path_ktu(): 25 | _ktu = rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 26 | assert 'baseline' in _ktu.keys() 27 | assert ktu_base == _ktu.get('baseline') 28 | assert 'advanced' in _ktu.keys() 29 | assert ktu_adv == _ktu.get('advanced') 30 | 31 | 32 | def test_path_rbo(): 33 | _rbo = rpd_eval.rbo(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 34 | assert 'baseline' in _rbo.keys() 35 | assert rbo_base == _rbo.get('baseline') 36 | assert 'advanced' in _rbo.keys() 37 | assert rbo_adv == _rbo.get('advanced') 38 | 39 | 40 | def test_run_ktu(): 41 | with open('./example/rpd_b.txt') as _base_file, open('./example/rpd_a.txt') as _adv_file: 42 | _base_run = pytrec_eval.parse_run(_base_file) 43 | _adv_run = pytrec_eval.parse_run(_adv_file) 44 | _ktu = rpd_eval.ktau_union(run_b_rep=_base_run, run_a_rep=_adv_run) 45 | assert 'baseline' in _ktu.keys() 46 | assert ktu_base == _ktu.get('baseline') 47 | assert 'advanced' in _ktu.keys() 48 | assert ktu_adv == _ktu.get('advanced') 49 | 50 | 51 | def test_run_rbo(): 52 | with open('./example/rpd_b.txt') as _base_file, open('./example/rpd_a.txt') as _adv_file: 53 | _base_run = pytrec_eval.parse_run(_base_file) 54 | _adv_run = 
pytrec_eval.parse_run(_adv_file) 55 | _rbo = rpd_eval.rbo(run_b_rep=_base_run, run_a_rep=_adv_run) 56 | assert 'baseline' in _rbo.keys() 57 | assert rbo_base == _rbo.get('baseline') 58 | assert 'advanced' in _rbo.keys() 59 | assert rbo_adv == _rbo.get('advanced') 60 | -------------------------------------------------------------------------------- /repro_eval/test/test_path_param.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator, RplEvaluator 3 | import numpy as np 4 | 5 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 6 | run_b_orig_path='./example/orig_b.txt', 7 | run_a_orig_path='./example/orig_a.txt', 8 | run_b_rep_path='./example/rpd_b.txt', 9 | run_a_rep_path='./example/rpd_a.txt') 10 | 11 | rpd_eval.trim() 12 | rpd_eval.evaluate() 13 | 14 | 15 | def test_ktu_path_param(): 16 | ktu = rpd_eval.ktau_union() 17 | assert 'baseline' in ktu.keys() 18 | assert 'advanced' in ktu.keys() 19 | 20 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 21 | run_b_orig_path='./example/orig_b.txt', 22 | run_a_orig_path='./example/orig_a.txt') 23 | _rpd_eval.trim() 24 | _rpd_eval.evaluate() 25 | 26 | _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt') 27 | assert 'baseline' in _ktu.keys() 28 | assert ktu.get('baseline') == _ktu.get('baseline') 29 | 30 | _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 31 | assert 'advanced' in _ktu.keys() 32 | assert ktu.get('advanced') == _ktu.get('advanced') 33 | 34 | 35 | def test_rbo_path_param(): 36 | rbo = rpd_eval.rbo() 37 | assert 'baseline' in rbo.keys() 38 | assert 'advanced' in rbo.keys() 39 | 40 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 41 | run_b_orig_path='./example/orig_b.txt', 42 | run_a_orig_path='./example/orig_a.txt') 43 | _rpd_eval.trim() 44 | _rpd_eval.evaluate() 45 | 46 | _rbo = _rpd_eval.rbo(run_b_path='./example/rpd_b.txt') 47 | assert 'baseline' in _rbo.keys() 48 | assert rbo.get('baseline') == _rbo.get('baseline') 49 | 50 | _rbo = _rpd_eval.rbo(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 51 | assert 'advanced' in _rbo.keys() 52 | assert rbo.get('advanced') == _rbo.get('advanced') 53 | 54 | 55 | def test_rmse_path_param(): 56 | rmse = rpd_eval.rmse() 57 | assert 'baseline' in rmse.keys() 58 | assert 'advanced' in rmse.keys() 59 | 60 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 61 | run_b_orig_path='./example/orig_b.txt', 62 | run_a_orig_path='./example/orig_a.txt') 63 | _rpd_eval.trim() 64 | _rpd_eval.evaluate() 65 | 66 | _rmse = _rpd_eval.rmse(run_b_path='./example/rpd_b.txt') 67 | assert 'baseline' in _rmse.keys() 68 | assert rmse.get('baseline') == _rmse.get('baseline') 69 | 70 | _rmse = _rpd_eval.rmse(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 71 | assert 'advanced' in _rmse.keys() 72 | assert rmse.get('advanced') == _rmse.get('advanced') 73 | 74 | 75 | def test_rpd_ttest_path_param(): 76 | pval = rpd_eval.ttest() 77 | assert 'baseline' in pval.keys() 78 | assert 'advanced' in pval.keys() 79 | 80 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 81 | run_b_orig_path='./example/orig_b.txt', 82 | run_a_orig_path='./example/orig_a.txt') 83 | _rpd_eval.trim() 84 | _rpd_eval.evaluate() 85 | 86 | _pval = _rpd_eval.ttest(run_b_path='./example/rpd_b.txt') 87 | assert 'baseline' in _pval.keys() 88 | # 
pick a few samples here since nan comparisons cause problems in combination with assert 89 | assert pval.get('baseline').get('ndcg') == _pval.get('baseline').get('ndcg') 90 | assert pval.get('baseline').get('P_10') == _pval.get('baseline').get('P_10') 91 | assert pval.get('baseline').get('map') == _pval.get('baseline').get('map') 92 | 93 | _pval = _rpd_eval.ttest(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 94 | assert 'advanced' in _pval.keys() 95 | # pick a few samples here since nan comparisons cause problems in combination with assert 96 | assert pval.get('advanced').get('ndcg') == _pval.get('advanced').get('ndcg') 97 | assert pval.get('advanced').get('P_10') == _pval.get('advanced').get('P_10') 98 | assert pval.get('advanced').get('map') == _pval.get('advanced').get('map') 99 | 100 | 101 | def test_rpl_ttest_path_param(): 102 | rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 103 | run_b_orig_path='./example/orig_b.txt', 104 | run_a_orig_path='./example/orig_a.txt', 105 | run_b_rep_path='./example/rpl_b.txt', 106 | run_a_rep_path='./example/rpl_a.txt', 107 | qrel_rpl_path='./example/data/qrels/core18.txt') 108 | 109 | rpl_eval.trim() 110 | rpl_eval.evaluate() 111 | 112 | pval = rpl_eval.ttest() 113 | assert 'baseline' in pval.keys() 114 | assert 'advanced' in pval.keys() 115 | 116 | _rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 117 | run_b_orig_path='./example/orig_b.txt', 118 | run_a_orig_path='./example/orig_a.txt', 119 | qrel_rpl_path='./example/data/qrels/core18.txt') 120 | _rpl_eval.trim() 121 | _rpl_eval.evaluate() 122 | 123 | _pval = _rpl_eval.ttest(run_b_path='./example/rpl_b.txt') 124 | assert 'baseline' in _pval.keys() 125 | # pick a few samples here since nan comparisons cause problems in combination with assert 126 | assert pval.get('baseline').get('ndcg') == _pval.get('baseline').get('ndcg') 127 | assert pval.get('baseline').get('P_10') == _pval.get('baseline').get('P_10') 128 | assert pval.get('baseline').get('map') == _pval.get('baseline').get('map') 129 | 130 | _pval = _rpl_eval.ttest(run_b_path='./example/rpl_b.txt', run_a_path='./example/rpl_a.txt') 131 | assert 'advanced' in _pval.keys() 132 | # pick a few samples here since nan comparisons cause problems in combination with assert 133 | assert pval.get('advanced').get('ndcg') == _pval.get('advanced').get('ndcg') 134 | assert pval.get('advanced').get('P_10') == _pval.get('advanced').get('P_10') 135 | assert pval.get('advanced').get('map') == _pval.get('advanced').get('map') 136 | 137 | 138 | def test_rpd_er_path_param(): 139 | er = rpd_eval.er() 140 | 141 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 142 | run_b_orig_path='./example/orig_b.txt', 143 | run_a_orig_path='./example/orig_a.txt') 144 | _rpd_eval.trim() 145 | _rpd_eval.evaluate() 146 | 147 | _er = _rpd_eval.er(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 148 | 149 | # pick a few samples here since nan comparisons cause problems in combination with assert 150 | assert er.get('ndcg') == _er.get('ndcg') 151 | assert er.get('P_10') == _er.get('P_10') 152 | assert er.get('map') == _er.get('map') 153 | 154 | 155 | def test_rpl_er_path_param(): 156 | rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 157 | run_b_orig_path='./example/orig_b.txt', 158 | run_a_orig_path='./example/orig_a.txt', 159 | run_b_rep_path='./example/rpl_b.txt', 160 | run_a_rep_path='./example/rpl_a.txt', 161 | 
qrel_rpl_path='./example/data/qrels/core18.txt') 162 | 163 | rpl_eval.trim() 164 | rpl_eval.evaluate() 165 | 166 | er = rpl_eval.er() 167 | 168 | _rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 169 | run_b_orig_path='./example/orig_b.txt', 170 | run_a_orig_path='./example/orig_a.txt', 171 | qrel_rpl_path='./example/data/qrels/core18.txt') 172 | _rpl_eval.trim() 173 | _rpl_eval.evaluate() 174 | 175 | _er = _rpl_eval.er(run_b_path='./example/rpl_b.txt', run_a_path='./example/rpl_a.txt') 176 | 177 | # pick a few samples here since nan comparisons cause problems in combination with assert 178 | assert er.get('ndcg') == _er.get('ndcg') 179 | assert er.get('P_10') == _er.get('P_10') 180 | assert er.get('map') == _er.get('map') 181 | 182 | 183 | def test_rpd_dri_path_param(): 184 | dri = rpd_eval.dri() 185 | 186 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 187 | run_b_orig_path='./example/orig_b.txt', 188 | run_a_orig_path='./example/orig_a.txt') 189 | _rpd_eval.trim() 190 | _rpd_eval.evaluate() 191 | 192 | _dri = _rpd_eval.dri(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 193 | 194 | # pick a few samples here since nan comparisons cause problems in combination with assert 195 | assert dri.get('ndcg') == _dri.get('ndcg') 196 | assert dri.get('P_10') == _dri.get('P_10') 197 | assert dri.get('map') == _dri.get('map') 198 | 199 | 200 | def test_rpl_dri_path_param(): 201 | rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 202 | run_b_orig_path='./example/orig_b.txt', 203 | run_a_orig_path='./example/orig_a.txt', 204 | run_b_rep_path='./example/rpl_b.txt', 205 | run_a_rep_path='./example/rpl_a.txt', 206 | qrel_rpl_path='./example/data/qrels/core18.txt') 207 | 208 | rpl_eval.trim() 209 | rpl_eval.evaluate() 210 | 211 | dri = rpl_eval.dri() 212 | 213 | _rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 214 | run_b_orig_path='./example/orig_b.txt', 215 | run_a_orig_path='./example/orig_a.txt', 216 | qrel_rpl_path='./example/data/qrels/core18.txt') 217 | _rpl_eval.trim() 218 | _rpl_eval.evaluate() 219 | 220 | _dri = _rpl_eval.dri(run_b_path='./example/rpl_b.txt', run_a_path='./example/rpl_a.txt') 221 | 222 | # pick a few samples here since nan comparisons cause problems in combination with assert 223 | assert dri.get('ndcg') == _dri.get('ndcg') 224 | assert dri.get('P_10') == _dri.get('P_10') 225 | assert dri.get('map') == _dri.get('map') 226 | -------------------------------------------------------------------------------- /repro_eval/test/test_rbo.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator 3 | 4 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 5 | run_b_orig_path='./example/orig_b.txt', 6 | run_a_orig_path='./example/orig_a.txt', 7 | run_b_rep_path='./example/rpd_b.txt', 8 | run_a_rep_path='./example/rpd_a.txt') 9 | 10 | rpd_eval.trim() 11 | rpd_eval.evaluate() 12 | 13 | 14 | def test_rbo(): 15 | # compare rbo implementations by the 10th decimal 16 | 17 | rbo = rpd_eval.rbo() 18 | rbo_slow = rpd_eval.rbo(misinfo=False) 19 | 20 | for k, v in rbo.get('baseline').items(): 21 | rbo['baseline'][k] = round(v, 10) 22 | for k, v in rbo.get('advanced').items(): 23 | rbo['advanced'][k] = round(v, 10) 24 | for k, v in rbo_slow.get('baseline').items(): 25 | rbo_slow['baseline'][k] = round(v, 10) 26 | for k, v in rbo_slow.get('advanced').items(): 27 | 
rbo_slow['advanced'][k] = round(v, 10) 28 | 29 | assert rbo.get('baseline') == rbo_slow.get('baseline') 30 | assert rbo.get('advanced') == rbo_slow.get('advanced') 31 | 32 | 33 | -------------------------------------------------------------------------------- /repro_eval/test/test_rpd.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator 3 | 4 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 5 | run_b_orig_path='./example/orig_b.txt', 6 | run_a_orig_path='./example/orig_a.txt', 7 | run_b_rep_path='./example/rpd_b.txt', 8 | run_a_rep_path='./example/rpd_a.txt') 9 | 10 | rpd_eval.trim() 11 | rpd_eval.evaluate() 12 | 13 | 14 | def test_ktu(): 15 | ktu = rpd_eval.ktau_union() 16 | assert 'baseline' in ktu.keys() 17 | assert 'advanced' in ktu.keys() 18 | 19 | 20 | def test_rbo(): 21 | rbo = rpd_eval.rbo() 22 | assert 'baseline' in rbo.keys() 23 | assert 'advanced' in rbo.keys() 24 | 25 | 26 | def test_rmse(): 27 | rmse = rpd_eval.rmse() 28 | assert 'baseline' in rmse.keys() 29 | assert 'advanced' in rmse.keys() 30 | 31 | 32 | def test_nrmse(): 33 | nrmse = rpd_eval.nrmse() 34 | assert 'baseline' in nrmse.keys() 35 | assert 'advanced' in nrmse.keys() 36 | 37 | 38 | def test_er(): 39 | er = rpd_eval.er() 40 | assert 'map' in er.keys() 41 | assert 'recip_rank' in er.keys() 42 | assert 'P_10' in er.keys() 43 | 44 | 45 | def test_dri(): 46 | dri = rpd_eval.dri() 47 | assert 'map' in dri.keys() 48 | assert 'recip_rank' in dri.keys() 49 | assert 'P_10' in dri.keys() 50 | 51 | 52 | def test_ttest(): 53 | ttest = rpd_eval.ttest() 54 | assert 'baseline' in ttest.keys() 55 | assert 'advanced' in ttest.keys() -------------------------------------------------------------------------------- /repro_eval/test/test_rpl.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RplEvaluator 3 | 4 | rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 5 | run_b_orig_path='./example/orig_b.txt', 6 | run_a_orig_path='./example/orig_a.txt', 7 | run_b_rep_path='./example/rpl_b.txt', 8 | run_a_rep_path='./example/rpl_a.txt', 9 | qrel_rpl_path='./example/data/qrels/core18.txt') 10 | 11 | rpl_eval.trim() 12 | rpl_eval.evaluate() 13 | 14 | 15 | def test_er(): 16 | er = rpl_eval.er() 17 | assert 'map' in er.keys() 18 | assert 'recip_rank' in er.keys() 19 | assert 'P_10' in er.keys() 20 | 21 | 22 | def test_dri(): 23 | dri = rpl_eval.dri() 24 | assert 'map' in dri.keys() 25 | assert 'recip_rank' in dri.keys() 26 | assert 'P_10' in dri.keys() 27 | 28 | 29 | def test_ttest(): 30 | ttest = rpl_eval.ttest() 31 | assert 'baseline' in ttest.keys() 32 | assert 'advanced' in ttest.keys() -------------------------------------------------------------------------------- /repro_eval/test/test_ttest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator 3 | 4 | 5 | def test_ttest_with_identical_score_distributions(): 6 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 7 | run_b_orig_path='./example/orig_b.txt', 8 | run_a_orig_path='./example/orig_a.txt', 9 | run_b_rep_path='./example/orig_b.txt', 10 | run_a_rep_path='./example/orig_a.txt') 11 | 12 | rpd_eval.trim() 13 | rpd_eval.evaluate() 14 | 15 | ttest = rpd_eval.ttest() 16 | 17 | pvals = list(filter(lambda x: x == 1.0, 
ttest.get('baseline').values())) 18 | assert len(pvals) == len(ttest.get('baseline').keys()) 19 | 20 | pvals = list(filter(lambda x: x == 1.0, ttest.get('advanced').values())) 21 | assert len(pvals) == len(ttest.get('advanced').keys()) 22 | -------------------------------------------------------------------------------- /repro_eval/util.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import OrderedDict 3 | import numpy as np 4 | from repro_eval.config import TRIM_THRESH, exclude 5 | 6 | 7 | def trim(run, thresh=TRIM_THRESH): 8 | """ 9 | Use this function to trim a run to a length of a document length specified by thresh. 10 | 11 | @param run: The run to be trimmed. 12 | @param thresh: The threshold value of the run length. 13 | """ 14 | for topic, docs in run.items(): 15 | run[topic] = dict(list(run[topic].items())[:thresh]) 16 | 17 | 18 | def arp(topic_scores): 19 | """ 20 | This function computes the Average Retrieval Performance (ARP) according to the following paper: 21 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 22 | How to Measure the Reproducibility of System-oriented IR Experiments. 23 | Proceedings of SIGIR, pages 349-358, 2020. 24 | 25 | The ARP score is defined by the mean across the different topic scores of a run. 26 | 27 | @param topic_scores: Topic scores of an evaluated run. 28 | @return: The ARP score. 29 | """ 30 | return np.array(list(topic_scores.values())).mean() 31 | 32 | 33 | def _arp_scores(run): 34 | """ 35 | Helping function returning a generator for determining the Average Retrieval Performance (ARP) scores. 36 | 37 | @param run: The run to be evaluated. 38 | @return: Generator with ARP scores for each trec_eval evaluation measure. 39 | """ 40 | measures_all = list(list(run.values())[0].keys()) 41 | measures_valid = [m for m in measures_all if m not in exclude] 42 | topics = run.keys() 43 | 44 | for measure in measures_valid: 45 | yield measure, np.array(list([run.get(topic).get(measure) for topic in topics])).mean() 46 | 47 | 48 | def arp_scores(run): 49 | """ 50 | This function computes the Average Retrieval Performance (ARP) scores according to the following paper: 51 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 52 | How to Measure the Reproducibility of System-oriented IR Experiments. 53 | Proceedings of SIGIR, pages 349-358, 2020. 54 | 55 | The ARP score is defined by the mean across the different topic scores of a run. 56 | For all measures outputted by trec_eval, the ARP scores will be determined. 57 | 58 | @param run: The run to be evaluated. 59 | @return: Dictionary containing the ARP scores for every measure outputted by trec_eval. 60 | """ 61 | return dict(_arp_scores(run)) 62 | 63 | 64 | def _topic_scores(run_scores): 65 | """ 66 | Helping function returning a generator for determining the topic scores for each measure. 67 | 68 | @param run_scores: The run scores of the previously evaluated run. 69 | @return: Generator with topic scores for each trec_eval evaluation measure. 
70 | """ 71 | measures_all = list(list(run_scores.values())[0].keys()) 72 | measures_valid = [m for m in measures_all if m not in exclude] 73 | topics = run_scores.keys() 74 | 75 | for measure in measures_valid: 76 | yield measure, [run_scores.get(topic).get(measure) for topic in topics] 77 | 78 | 79 | def topic_scores(run_scores): 80 | """ 81 | Use this function for a dictionary that contains the topic scores for each measure outputted by trec_eval. 82 | 83 | @param run_scores: The run scores of the previously evaluated run. 84 | @return: Dictionary containing the topic scores for every measure outputted by trec_eval. 85 | """ 86 | return dict(_topic_scores(run_scores)) 87 | 88 | 89 | def print_base_adv(measure_topic, repro_measure, base_value, adv_value=None): 90 | """ 91 | Pretty print output in trec_eval inspired style. Use this for printing baseline and/or advanced results. 92 | 93 | @param measure_topic: The topic number. 94 | @param repro_measure: Name of the reproduction/replication measure. 95 | @param base_value: Value of the evaluated baseline run. 96 | @param adv_value: Value of the evaluated advanced run. 97 | """ 98 | if adv_value: 99 | fill = ('{:3s}' if base_value < 0 else '{:4s}') 100 | print(('{:25s}{:8s}{:8s}{:.4f}' + fill + '{:8s}{:.4f}').format(measure_topic, repro_measure, 101 | 'BASE', base_value, ' ', 'ADV', adv_value)) 102 | else: 103 | print('{:25s}{:8s}{:8s}{:.4f}'.format(measure_topic, repro_measure, 'BASE', base_value)) 104 | 105 | 106 | def print_simple_line(measure, repro_measure, value): 107 | """ 108 | Use this for printing lines with trec_eval and reproduction/replication measures. 109 | Pretty print output in trec_eval inspired style. 110 | @param measure: Name of the trec_eval measure. 111 | @param repro_measure: Name of the reproduction/replication measure. 112 | @param value: Value of the evaluated run. 113 | @return: 114 | """ 115 | print('{:25s}{:8s}{:.4f}'.format(measure, repro_measure, value)) 116 | 117 | 118 | def break_ties(run): 119 | """ 120 | Use this function to break score ties like it is implemented in trec_eval. 121 | Documents with the same score will be sorted in reverse alphabetical order. 122 | :param run: Run with score ties. Nested dictionary structure (cf. 
pytrec_eval) 123 | :return: Reordered run 124 | """ 125 | for topic, ranking in run.items(): 126 | docid_score_tuple = list(ranking.items()) 127 | reordered_ranking = [] 128 | for k, v in itertools.groupby(docid_score_tuple, lambda item: item[1]): 129 | reordered_ranking.extend(sorted(v, reverse=True)) 130 | run[topic] = OrderedDict(reordered_ranking) 131 | return run 132 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | HERE = pathlib.Path(__file__).parent 5 | README = (HERE / "README.md").read_text() 6 | 7 | setup(name='repro_eval', 8 | version='0.4.0', 9 | description='A tool to quantify the replicability and reproducibility of system-oriented IR experiments.', 10 | long_description=README, 11 | long_description_content_type="text/markdown", 12 | url='http://github.com/irgroup/repro_eval', 13 | author='Timo Breuer', 14 | author_email='timo.breuer@th-koeln.de', 15 | license='MIT', 16 | packages=['repro_eval', 17 | 'repro_eval.measure', 18 | 'repro_eval.measure.external'], 19 | install_requires=[ 20 | 'pytrec_eval', 21 | 'numpy', 22 | 'scipy', 23 | 'tqdm', 24 | 'ruamel.yaml', 25 | 'GitPython', 26 | 'py-cpuinfo' 27 | ], 28 | include_package_data=True, 29 | package_data={'': ['resources/*.json']}, 30 | zip_safe=False) 31 | --------------------------------------------------------------------------------