├── .github └── workflows │ ├── manual-dispatch.yml │ └── push.yml ├── .gitignore ├── LICENSE ├── README.md ├── example ├── data │ ├── plots │ │ ├── rpd_a_arp.pdf │ │ ├── rpd_a_ktu.pdf │ │ ├── rpd_a_rmse.pdf │ │ ├── rpd_b_arp.pdf │ │ ├── rpd_b_ktu.pdf │ │ ├── rpd_b_rmse.pdf │ │ ├── rpd_dri_vs_er.pdf │ │ ├── rpd_er.pdf │ │ ├── rpl_dri_vs_er.pdf │ │ └── rpl_er.pdf │ └── qrels │ │ ├── core17.txt │ │ ├── core18.txt │ │ ├── robust04.txt │ │ └── robust05.txt ├── demo.ipynb ├── get_data.sh ├── intro.ipynb ├── requirements.txt ├── rpd_arp.py ├── rpd_dri_vs_er.py ├── rpd_er.py ├── rpd_eval.py ├── rpd_ktu.py ├── rpd_rmse.py ├── rpl_dri_vs_er.py ├── rpl_er.py └── rpl_eval.py ├── repro_eval ├── Evaluator.py ├── __init__.py ├── __main__.py ├── config.py ├── measure │ ├── __init__.py │ ├── document_order.py │ ├── effectiveness.py │ ├── external │ │ ├── __init__.py │ │ └── rbo.py │ ├── overall_effects.py │ └── statistics.py ├── metadata.py ├── resources │ └── extensions.json ├── test │ ├── __init__.py │ ├── test_empty_rpd.py │ ├── test_empty_rpl.py │ ├── test_kwargs.py │ ├── test_path_param.py │ ├── test_rbo.py │ ├── test_rpd.py │ ├── test_rpl.py │ └── test_ttest.py └── util.py └── setup.py /.github/workflows/manual-dispatch.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install pytest pytrec_eval numpy scipy tqdm 23 | - name: Test with pytest 24 | run: | 25 | pytest 26 | -------------------------------------------------------------------------------- /.github/workflows/push.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | wget -P example/ https://gist.githubusercontent.com/breuert/95d01401a1ea767ca83592beeb8d7785/raw/9073fc0dd3cd118655f9a7e4f74116dd8da09df0/orig_b.txt https://gist.githubusercontent.com/breuert/c71c2c6bad1cda7ed121bb7d0f64e471/raw/d54404d960a0decddda1f19711d4cffc71614ecf/orig_a.txt https://gist.githubusercontent.com/breuert/5973d391a4bc38643264366299e2c3de/raw/d9d5eeeecffc9861113a1eeb044fc225da4e0f00/rpd_b.txt https://gist.githubusercontent.com/breuert/8e9bfb7aac30fa044da23fdd95174b92/raw/e1f1d1a84d9a8834a4d25f772ee409ada42b5eaa/rpd_a.txt https://gist.githubusercontent.com/breuert/a39373be8ec0e0b15844dcfe9f26f8cc/raw/ad2ea6db8ff1bec3a3ca6d488c3bbcc13ca1b05b/rpl_b.txt https://gist.githubusercontent.com/breuert/14d5eef9a1d51e337a4c3cd44f5212a3/raw/50f2f21c5902ed13f7550d0e588fada7015089ab/rpl_a.txt 22 | python -m pip install --upgrade pip 23 | python -m pip install pytest pytrec_eval numpy scipy tqdm 24 | - name: Test with pytest 25 | run: | 26 | pytest 27 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | build/ 4 | dist/ 5 | repro_eval.egg-info/ 6 | playground.py 7 | example/data/runs/ 8 | example/orig_a.txt 9 | example/orig_b.txt 10 | example/rpd_a.txt 11 | example/rpd_b.txt 12 | example/rpl_a.txt 13 | example/rpl_b.txt 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Timo Breuer, Nicola Ferro, Maria Maistro, Philipp Schaer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /example/data/plots/rpd_a_arp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_a_arp.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_a_ktu.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_a_ktu.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_a_rmse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_a_rmse.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_b_arp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_b_arp.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_b_ktu.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_b_ktu.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_b_rmse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_b_rmse.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_dri_vs_er.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_dri_vs_er.pdf -------------------------------------------------------------------------------- /example/data/plots/rpd_er.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpd_er.pdf -------------------------------------------------------------------------------- /example/data/plots/rpl_dri_vs_er.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpl_dri_vs_er.pdf -------------------------------------------------------------------------------- /example/data/plots/rpl_er.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/example/data/plots/rpl_er.pdf -------------------------------------------------------------------------------- /example/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 
3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.3" 21 | }, 22 | "colab": { 23 | "name": "intro.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "0dy3GpaAVtDJ" 32 | }, 33 | "source": [ 34 | "# An Introduction to `repro_eval`" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "2vP09TvRbZnI" 41 | }, 42 | "source": [ 43 | "This notebook introduces the functionalities of `repro_eval`. We provide sample data that has to be downloaded in advance, but it is also possible to upload your runs and evaluate the reproducibilty of your experiments with this notebook." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "id": "7HvZTRuDb0FC" 50 | }, 51 | "source": [ 52 | "#### Install `repro_eval` via PyPI" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "metadata": { 58 | "id": "X1Odv7-WVt4o" 59 | }, 60 | "source": [ 61 | "!pip install repro_eval==0.1" 62 | ], 63 | "execution_count": null, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "id": "wUkSqYmgb4yD" 70 | }, 71 | "source": [ 72 | "#### Download the sample data and extract it\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "Sw2nFqDZWRyP" 79 | }, 80 | "source": [ 81 | "!wget https://www.dropbox.com/s/ncu49e91mosidei/data.tar.gz\n", 82 | "!tar -xzvf ./data.tar.gz " 83 | ], 84 | "execution_count": null, 85 | "outputs": [] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "id": "-SN1XavoVtDL" 91 | }, 92 | "source": [ 93 | "### Imports\n", 94 | "\n", 95 | "Once installed, the Evaluator classes for the evaluation of reproducibility and replicability can be imported. In this notebook, we also include other Python packages that are not necessarily required when using `repro_eval` for your experiments." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "kCVSY0rGVtDM" 102 | }, 103 | "source": [ 104 | "from repro_eval.Evaluator import RpdEvaluator, RplEvaluator\n", 105 | "from repro_eval.util import arp, arp_scores, print_base_adv, print_simple_line, trim\n", 106 | "\n", 107 | "import pytrec_eval\n", 108 | "import pandas as pd\n", 109 | "from matplotlib import pyplot as plt\n", 110 | "import seaborn as sns\n", 111 | "sns.set()\n", 112 | "sns.set_style('whitegrid')\n", 113 | "colors = sns.color_palette()" 114 | ], 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "rRZlhiToVtDS" 122 | }, 123 | "source": [ 124 | "### Path definition\n", 125 | "You can modify these paths and adapt them to your experiments. The entire notebook should be usable with your experiments when they comply with the given evaluation scenario. First, we need two kind of runs - a baseline run and an advanced run (that outperforms the baseline run). Second, for the evaluation of replicability, the replicated runs should be derived from another target collection. The dictionaries `runs_rpd` and `runs_rpl` contain runs with different parametrizations, but it should also be possible to include just one version for both the baseline and advanced run." 
126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "qyFqKV8KVtDT" 132 | }, 133 | "source": [ 134 | "QREL = './data/qrels/core17.txt'\n", 135 | "QREL_RPL = './data/qrels/core18.txt'\n", 136 | "ORIG_B = './data/runs/orig/input.WCrobust04'\n", 137 | "ORIG_A = './data/runs/orig/input.WCrobust0405'\n", 138 | "RPD_B = './data/runs/rpd/14/irc_task1_WCrobust04_001'\n", 139 | "RPD_A = './data/runs/rpd/14/irc_task1_WCrobust0405_001'\n", 140 | "RPL_B = './data/runs/rpl/14/irc_task2_WCrobust04_001'\n", 141 | "RPL_A = './data/runs/rpl/14/irc_task2_WCrobust0405_001'\n", 142 | "MEASURE = 'ndcg'\n", 143 | "\n", 144 | "runs_rpd = {\n", 145 | " 'rpd_wcr04_tf_1':\n", 146 | " {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'},\n", 147 | " 'rpd_wcr0405_tf_1':\n", 148 | " {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'},\n", 149 | " 'rpd_wcr04_tf_2':\n", 150 | " {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'},\n", 151 | " 'rpd_wcr0405_tf_2':\n", 152 | " {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'},\n", 153 | " 'rpd_wcr04_tf_3':\n", 154 | " {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'},\n", 155 | " 'rpd_wcr0405_tf_3':\n", 156 | " {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'},\n", 157 | " 'rpd_wcr04_tf_4':\n", 158 | " {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'},\n", 159 | " 'rpd_wcr0405_tf_4':\n", 160 | " {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'},\n", 161 | " 'rpd_wcr04_tf_5':\n", 162 | " {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'},\n", 163 | " 'rpd_wcr0405_tf_5':\n", 164 | " {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'}\n", 165 | "}\n", 166 | "\n", 167 | "runs_rpl = {\n", 168 | " 'rpl_wcr04_tf_1':\n", 169 | " {'path': './data/runs/rpl/45/irc_task2_WCrobust04_001'},\n", 170 | " 'rpl_wcr0405_tf_1':\n", 171 | " {'path': './data/runs/rpl/45/irc_task2_WCrobust0405_001'},\n", 172 | " 'rpl_wcr04_tf_2':\n", 173 | " {'path': './data/runs/rpl/46/irc_task2_WCrobust04_001'},\n", 174 | " 'rpl_wcr0405_tf_2':\n", 175 | " {'path': './data/runs/rpl/46/irc_task2_WCrobust0405_001'},\n", 176 | " 'rpl_wcr04_tf_3':\n", 177 | " {'path': './data/runs/rpl/47/irc_task2_WCrobust04_001'},\n", 178 | " 'rpl_wcr0405_tf_3':\n", 179 | " {'path': './data/runs/rpl/47/irc_task2_WCrobust0405_001'},\n", 180 | " 'rpl_wcr04_tf_4':\n", 181 | " {'path': './data/runs/rpl/48/irc_task2_WCrobust04_001'},\n", 182 | " 'rpl_wcr0405_tf_4':\n", 183 | " {'path': './data/runs/rpl/48/irc_task2_WCrobust0405_001'},\n", 184 | " 'rpl_wcr04_tf_5':\n", 185 | " {'path': './data/runs/rpl/49/irc_task2_WCrobust04_001'},\n", 186 | " 'rpl_wcr0405_tf_5':\n", 187 | " {'path': './data/runs/rpl/49/irc_task2_WCrobust0405_001'}\n", 188 | "}" 189 | ], 190 | "execution_count": null, 191 | "outputs": [] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "id": "I5YPFEL5VtDa" 197 | }, 198 | "source": [ 199 | "Define a helping function for plotting the average retrieval performance (ARP) later in the notebook." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "metadata": { 205 | "id": "8-AdKB2-VtDb" 206 | }, 207 | "source": [ 208 | "def average_retrieval_performance(baseline_scores, reproduced_scores: dict, measures: list, xlabel: str, ylabel: str):\n", 209 | " reproduced_scores_arp = [arp_scores(topic_scores) for idx, topic_scores in reproduced_scores.items()]\n", 210 | " baseline_scores_arp = arp_scores(baseline_scores)\n", 211 | " index = list(reproduced_scores.keys())\n", 212 | " df_content = {}\n", 213 | " for measure in measures:\n", 214 | " df_content[measure] = [scores.get(measure) for scores in reproduced_scores_arp]\n", 215 | " df = pd.DataFrame(df_content, index=index)\n", 216 | "\n", 217 | " plt.figure()\n", 218 | " ax = df.plot.bar(rot=0, figsize=(10, 6))\n", 219 | " for num, measure in enumerate(measures):\n", 220 | " orig_val = baseline_scores_arp.get(measure)\n", 221 | " ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color=colors[num])\n", 222 | " ax.annotate(' ', (num, orig_val), color=colors[num])\n", 223 | " ax.set_ylim(0.0, 1.0)\n", 224 | "\n", 225 | " legend_content = [measure + ' (orig)' for measure in measures] + [measure + ' (rpl)' for measure in measures]\n", 226 | " ax.legend(legend_content, loc='center left', bbox_to_anchor=(1, 0.5))\n", 227 | "\n", 228 | " ax.set_xlabel(xlabel)\n", 229 | " ax.set_ylabel(ylabel)\n", 230 | " plt.show()" 231 | ], 232 | "execution_count": null, 233 | "outputs": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "id": "9Vn-UoX3VtDg" 239 | }, 240 | "source": [ 241 | "### Evaluating Reproducibility\n", 242 | "The following code snippet instantiates a reproducibility evaluator `RpdEvaluator` and determines Kendall's tau Union (KTU), the Rank-biased Overlap (RBO), the Root-Mean-Square-Error (RMSE), the Effect Ratio (ER), the Delta Relative Improvement (DRI) and the p-values of the paired t-test. Please be aware that it takes some time to compute the RBO. We've included a progress bar to give you some feedback."
243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "metadata": { 248 | "id": "dFybABY7VtDh" 249 | }, 250 | "source": [ 251 | "rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 252 | " run_b_orig_path=ORIG_B,\n", 253 | " run_a_orig_path=ORIG_A,\n", 254 | " run_b_rep_path=RPD_B,\n", 255 | " run_a_rep_path=RPD_A)\n", 256 | "\n", 257 | "rpd_eval.trim()\n", 258 | "rpd_eval.evaluate()\n", 259 | "\n", 260 | "# KTU\n", 261 | "ktau = rpd_eval.ktau_union()\n", 262 | "print(\"Kendall's tau Union (KTU)\")\n", 263 | "print('------------------------------------------------------------------')\n", 264 | "for topic, value in ktau.get('baseline').items():\n", 265 | " print_base_adv(topic, 'KTU', value, ktau.get('advanced').get(topic))\n", 266 | "print_base_adv('ARP', 'KTU', arp(ktau.get('baseline')), arp(ktau.get('advanced')))\n", 267 | "\n", 268 | "# RBO\n", 269 | "rbo = rpd_eval.rbo(print_feedback=True)\n", 270 | "print(\"Rank-biased Overlap (RBO)\")\n", 271 | "print('------------------------------------------------------------------')\n", 272 | "for topic, value in rbo.get('baseline').items():\n", 273 | " print_base_adv(topic, 'RBO', value, rbo.get('advanced').get(topic))\n", 274 | "print_base_adv('ARP', 'RBO', arp(rbo.get('baseline')), arp(rbo.get('advanced')))\n", 275 | "\n", 276 | "# RMSE\n", 277 | "rmse = rpd_eval.rmse()\n", 278 | "print(\"Root mean square error (RMSE)\")\n", 279 | "print('------------------------------------------------------------------')\n", 280 | "for measure, value in rmse.get('baseline').items():\n", 281 | " print_base_adv(measure, 'RMSE', value, rmse.get('advanced').get(measure))\n", 282 | "\n", 283 | "# ER\n", 284 | "print(\"Effect ratio (ER)\")\n", 285 | "print('------------------------------------------------------------------')\n", 286 | "er = rpd_eval.er()\n", 287 | "for measure, value in er.items():\n", 288 | " print_simple_line(measure, 'ER', value)\n", 289 | "\n", 290 | "# DRI\n", 291 | "print(\"Delta Relative Improvement (DRI)\")\n", 292 | "print('------------------------------------------------------------------')\n", 293 | "dri = rpd_eval.dri()\n", 294 | "for measure, value in dri.items():\n", 295 | " print_simple_line(measure, 'DRI', value)\n", 296 | "\n", 297 | "# ttest\n", 298 | "pvals = rpd_eval.ttest()\n", 299 | "print(\"Two-tailed paired t-test (p-value)\")\n", 300 | "print('------------------------------------------------------------------')\n", 301 | "for measure, value in pvals.get('baseline').items():\n", 302 | " print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure))" 303 | ], 304 | "execution_count": null, 305 | "outputs": [] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": { 310 | "id": "9acgVo-CVtDm" 311 | }, 312 | "source": [ 313 | "### Comparing the Average Retrieval Performance (ARP) of different parametrizations \n", 314 | "The following code snippet determines the ARP scores and compares them via a bar plot." 
315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "id": "YtASw_fMVtDn" 321 | }, 322 | "source": [ 323 | "rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 324 | " run_b_orig_path=ORIG_B,\n", 325 | " run_a_orig_path=ORIG_A,\n", 326 | " run_b_rep_path=None,\n", 327 | " run_a_rep_path=None)\n", 328 | "\n", 329 | "rpd_eval.trim()\n", 330 | "rpd_eval.evaluate()\n", 331 | "\n", 332 | "for run_name, info in runs_rpd.items():\n", 333 | " with open(info.get('path')) as run_file:\n", 334 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 335 | " trim(info['run'])\n", 336 | " info['scores'] = rpd_eval.evaluate(info['run'])\n", 337 | "\n", 338 | "average_retrieval_performance(rpd_eval.run_b_orig_score,\n", 339 | " {\n", 340 | " 'tf_1': runs_rpd.get('rpd_wcr04_tf_1').get('scores'),\n", 341 | " 'tf_2': runs_rpd.get('rpd_wcr04_tf_2').get('scores'),\n", 342 | " 'tf_3': runs_rpd.get('rpd_wcr04_tf_3').get('scores'),\n", 343 | " 'tf_4': runs_rpd.get('rpd_wcr04_tf_4').get('scores'),\n", 344 | " 'tf_5': runs_rpd.get('rpd_wcr04_tf_5').get('scores'),\n", 345 | " },\n", 346 | " measures=['P_10', 'ndcg', 'bpref', 'map'],\n", 347 | " xlabel='Reproduced run (wcr04)',\n", 348 | " ylabel='Score')\n", 349 | "\n", 350 | "average_retrieval_performance(rpd_eval.run_a_orig_score,\n", 351 | " {\n", 352 | " 'tf_1': runs_rpd.get('rpd_wcr0405_tf_1').get('scores'),\n", 353 | " 'tf_2': runs_rpd.get('rpd_wcr0405_tf_2').get('scores'),\n", 354 | " 'tf_3': runs_rpd.get('rpd_wcr0405_tf_3').get('scores'),\n", 355 | " 'tf_4': runs_rpd.get('rpd_wcr0405_tf_4').get('scores'),\n", 356 | " 'tf_5': runs_rpd.get('rpd_wcr0405_tf_5').get('scores'),\n", 357 | " },\n", 358 | " measures=['P_10', 'ndcg', 'bpref', 'map'],\n", 359 | " xlabel='Reproduced run (wcr0405)',\n", 360 | " ylabel='Score')" 361 | ], 362 | "execution_count": null, 363 | "outputs": [] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "id": "AQVD43jBVtDs" 369 | }, 370 | "source": [ 371 | "### Kendall's tau Union (KTU) across different cut-offs\n", 372 | "The following code snippet compares the ordering of documents for the reproduced runs across different cut-off ranks." 
373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "metadata": { 378 | "id": "k8COriFZVtDt" 379 | }, 380 | "source": [ 381 | "cutoffs = [1000, 100, 50, 20, 10, 5]\n", 382 | "\n", 383 | "# BASELINE\n", 384 | "for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]):\n", 385 | " rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 386 | " run_b_orig_path=ORIG_B,\n", 387 | " run_a_orig_path=ORIG_A,\n", 388 | " run_b_rep_path=None,\n", 389 | " run_a_rep_path=None)\n", 390 | "\n", 391 | " rpd_eval.trim()\n", 392 | " rpd_eval.evaluate()\n", 393 | "\n", 394 | " with open(info.get('path')) as run_file:\n", 395 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 396 | " for cutoff in cutoffs:\n", 397 | " rpd_eval.trim(cutoff)\n", 398 | " rpd_eval.trim(cutoff, info['run'])\n", 399 | " info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline'])\n", 400 | "\n", 401 | "df_content = {}\n", 402 | "for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]):\n", 403 | " df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]]\n", 404 | "\n", 405 | "plt.figure()\n", 406 | "ax = pd.DataFrame(data=df_content, index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='o-', figsize=(10, 6))\n", 407 | "ax.set_xlabel('Cut-off values')\n", 408 | "ax.set_ylabel(r\"Kendall's $\\tau$\")\n", 409 | "plt.show()\n", 410 | "\n", 411 | "# ADVANCED\n", 412 | "for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]):\n", 413 | " rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 414 | " run_b_orig_path=ORIG_B,\n", 415 | " run_a_orig_path=ORIG_A,\n", 416 | " run_b_rep_path=None,\n", 417 | " run_a_rep_path=None)\n", 418 | "\n", 419 | " rpd_eval.trim()\n", 420 | " rpd_eval.evaluate()\n", 421 | "\n", 422 | " with open(info.get('path')) as run_file:\n", 423 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 424 | " for cutoff in cutoffs:\n", 425 | " rpd_eval.trim(cutoff)\n", 426 | " rpd_eval.trim(cutoff, info['run'])\n", 427 | " # scores = rpl_eval.evaluate(info['run'])\n", 428 | " info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline'])\n", 429 | "\n", 430 | "df_content = {}\n", 431 | "for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]):\n", 432 | " df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]]\n", 433 | "\n", 434 | "plt.figure()\n", 435 | "ax = pd.DataFrame(data=df_content, index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='o-', figsize=(10, 6))\n", 436 | "ax.set_xlabel('Cut-off values')\n", 437 | "ax.set_ylabel(r\"Kendall's $\\tau$\")\n", 438 | "plt.show()" 439 | ], 440 | "execution_count": null, 441 | "outputs": [] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": { 446 | "id": "C_Prc5aqVtDz" 447 | }, 448 | "source": [ 449 | "## Root-Mean-Square-Error (RMSE) across different cut-offs\n", 450 | "The following code snippet compares the reproduced runs at the level of effectiveness by determining the RMSE across different cut-off ranks." 
451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "metadata": { 456 | "id": "f2hsEkeGVtD0" 457 | }, 458 | "source": [ 459 | "rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 460 | " run_b_orig_path=ORIG_B,\n", 461 | " run_a_orig_path=ORIG_A,\n", 462 | " run_b_rep_path=None,\n", 463 | " run_a_rep_path=None)\n", 464 | "\n", 465 | "rpd_eval.trim()\n", 466 | "rpd_eval.evaluate()\n", 467 | "\n", 468 | "for run_name, info in runs_rpd.items():\n", 469 | " with open(info.get('path')) as run_file:\n", 470 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 471 | " trim(info['run'])\n", 472 | " info['scores'] = rpd_eval.evaluate(info['run'])\n", 473 | " info['rmse'] = rpd_eval.rmse(run_b_score=info['scores'])\n", 474 | "\n", 475 | "\n", 476 | "baseline_runs = ['rpd_wcr04_tf_1', 'rpd_wcr04_tf_2', 'rpd_wcr04_tf_3', 'rpd_wcr04_tf_4', 'rpd_wcr04_tf_5']\n", 477 | "advanced_runs = ['rpd_wcr0405_tf_1', 'rpd_wcr0405_tf_2', 'rpd_wcr0405_tf_3', 'rpd_wcr0405_tf_4', 'rpd_wcr0405_tf_5']\n", 478 | "cutoffs = ['5', '10', '15', '20', '30', '100', '200', '500', '1000']\n", 479 | "\n", 480 | "df_content = {}\n", 481 | "for run_name in baseline_runs:\n", 482 | " df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co] for co in cutoffs]\n", 483 | "\n", 484 | "df = pd.DataFrame(df_content, index=cutoffs)\n", 485 | "plt.figure()\n", 486 | "ax = df.plot.line(style='o-', figsize=(10, 6))\n", 487 | "ax.set_xlabel('Cut-off values')\n", 488 | "ax.set_ylabel('RMSE')\n", 489 | "plt.show()\n", 490 | "\n", 491 | "df_content = {}\n", 492 | "for run_name in advanced_runs:\n", 493 | " df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co] for co in cutoffs]\n", 494 | "\n", 495 | "df = pd.DataFrame(df_content, index=cutoffs)\n", 496 | "plt.figure()\n", 497 | "ax = df.plot.line(style='o-', figsize=(10, 6))\n", 498 | "ax.set_xlabel('Cut-off values')\n", 499 | "ax.set_ylabel('RMSE')\n", 500 | "plt.show()" 501 | ], 502 | "execution_count": null, 503 | "outputs": [] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "id": "MKj4uBanVtD5" 509 | }, 510 | "source": [ 511 | "## Exploring the space of reproducibility at the level of overall effects\n", 512 | "The following code snippet plots the Delta Relative Improvement (DRI) against the Effect Ratio (ER). Having runs with different parametrizations at hand, we can compare them in the cartesian plane. As a rule of thumb, we can say the closer a point to (ER 1, DRI 0), the better the reproduction." 
513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "metadata": { 518 | "id": "scmfh0ZfVtD5" 519 | }, 520 | "source": [ 521 | "rpd_eval = RpdEvaluator(qrel_orig_path=QREL,\n", 522 | " run_b_orig_path=ORIG_B,\n", 523 | " run_a_orig_path=ORIG_A,\n", 524 | " run_b_rep_path=None,\n", 525 | " run_a_rep_path=None)\n", 526 | "\n", 527 | "rpd_eval.trim()\n", 528 | "rpd_eval.evaluate()\n", 529 | "\n", 530 | "for run_name, info in runs_rpd.items():\n", 531 | " with open(info.get('path')) as run_file:\n", 532 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 533 | " trim(info['run'])\n", 534 | " info['scores'] = rpd_eval.evaluate(info['run'])\n", 535 | "\n", 536 | "dri_er = {\n", 537 | " 'wcr_tf_1': {\n", 538 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_1']['scores'], runs_rpd['rpd_wcr0405_tf_1']['scores']),\n", 539 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_1']['scores'], runs_rpd['rpd_wcr0405_tf_1']['scores'])\n", 540 | " },\n", 541 | " 'wcr_tf_2': {\n", 542 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_2']['scores'], runs_rpd['rpd_wcr0405_tf_2']['scores']),\n", 543 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_2']['scores'], runs_rpd['rpd_wcr0405_tf_2']['scores'])\n", 544 | " },\n", 545 | " 'wcr_tf_3': {\n", 546 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_3']['scores'], runs_rpd['rpd_wcr0405_tf_3']['scores']),\n", 547 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_3']['scores'], runs_rpd['rpd_wcr0405_tf_3']['scores'])\n", 548 | " },\n", 549 | " 'wcr_tf_4': {\n", 550 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_4']['scores'], runs_rpd['rpd_wcr0405_tf_4']['scores']),\n", 551 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_4']['scores'], runs_rpd['rpd_wcr0405_tf_4']['scores'])\n", 552 | " },\n", 553 | " 'wcr_tf_5': {\n", 554 | " 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_5']['scores'], runs_rpd['rpd_wcr0405_tf_5']['scores']),\n", 555 | " 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_5']['scores'], runs_rpd['rpd_wcr0405_tf_5']['scores'])\n", 556 | " },\n", 557 | "\n", 558 | "}\n", 559 | "\n", 560 | "measures = ['P_10', 'map', 'ndcg']\n", 561 | "marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')]\n", 562 | "\n", 563 | "fig, ax1 = plt.subplots(figsize=(10, 10))\n", 564 | "ax1.set_xlabel('Effect Ratio (ER)')\n", 565 | "ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)')\n", 566 | "\n", 567 | "for measure, mk in zip(measures, marker_color):\n", 568 | " ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()],\n", 569 | " [dri_er[r]['dri'][measure] for r in dri_er.keys()],\n", 570 | " marker=mk[0], color=mk[1], linestyle='None', label=measure)\n", 571 | "\n", 572 | "ax1.tick_params(axis='y', labelcolor='k')\n", 573 | "fig.tight_layout()\n", 574 | "plt.axhline(0, color='grey')\n", 575 | "plt.axvline(1, color='grey')\n", 576 | "plt.legend()\n", 577 | "plt.title('Reproducibility')\n", 578 | "\n", 579 | "for m in measures:\n", 580 | " for r in dri_er.keys():\n", 581 | " plt.text(x = dri_er[r]['er'][m], \n", 582 | " y = dri_er[r]['dri'][m],\n", 583 | " s = r) \n", 584 | "\n", 585 | "plt.show()" 586 | ], 587 | "execution_count": null, 588 | "outputs": [] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": { 593 | "id": "YGck2XO0VtD9" 594 | }, 595 | "source": [ 596 | "## Evaluating Replicability\n", 597 | "The following code snippet instantiates a replicability evaluator `RplEvaluator` and determines the Effect Ratio (ER), the Delta Relative Improvement (DRI) and the p-values of the unpaired t-test." 
598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "metadata": { 603 | "id": "hz8ExjJOVtD-" 604 | }, 605 | "source": [ 606 | "rpl_eval = RplEvaluator(qrel_orig_path=QREL,\n", 607 | " run_b_orig_path=ORIG_B,\n", 608 | " run_a_orig_path=ORIG_A,\n", 609 | " run_b_rep_path=RPL_B,\n", 610 | " run_a_rep_path=RPL_A,\n", 611 | " qrel_rpl_path=QREL_RPL)\n", 612 | "\n", 613 | "rpl_eval.trim()\n", 614 | "rpl_eval.evaluate()\n", 615 | "\n", 616 | "# ER\n", 617 | "print(\"Effect ratio (ER)\")\n", 618 | "print('------------------------------------------------------------------')\n", 619 | "er = rpl_eval.er()\n", 620 | "for measure, value in er.items():\n", 621 | " print_simple_line(measure, 'ER', value)\n", 622 | "\n", 623 | "# DRI\n", 624 | "print(\"Delta Relative Improvement (DRI)\")\n", 625 | "print('------------------------------------------------------------------')\n", 626 | "dri = rpl_eval.dri()\n", 627 | "for measure, value in dri.items():\n", 628 | " print_simple_line(measure, 'DRI', value)\n", 629 | "\n", 630 | "# ttest\n", 631 | "pvals = rpl_eval.ttest()\n", 632 | "print(\"Two-tailed unpaired t-test (p-value)\")\n", 633 | "print('------------------------------------------------------------------')\n", 634 | "for measure, value in pvals.get('baseline').items():\n", 635 | " print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure))" 636 | ], 637 | "execution_count": null, 638 | "outputs": [] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "metadata": { 643 | "id": "YU4QlxI0VtEC" 644 | }, 645 | "source": [ 646 | "## Exploring the space of replicability at the level of overall effects\n", 647 | "The following code snippet plots the Delta Relative Improvement (DRI) against the Effect Ratio (ER). Having runs with different parametrizations at hand, we can compare them in the cartesian plane. As a rule of thumb, we can say the closer a point to (ER 1, DRI 0), the better the replication." 
648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "metadata": { 653 | "id": "GFvR5NAIVtED" 654 | }, 655 | "source": [ 656 | "rpl_eval = RplEvaluator(qrel_orig_path=QREL,\n", 657 | " run_b_orig_path=ORIG_B,\n", 658 | " run_a_orig_path=ORIG_A,\n", 659 | " run_b_rep_path=None,\n", 660 | " run_a_rep_path=None,\n", 661 | " qrel_rpl_path=QREL_RPL)\n", 662 | "\n", 663 | "rpl_eval.trim()\n", 664 | "rpl_eval.evaluate()\n", 665 | "\n", 666 | "for run_name, info in runs_rpl.items():\n", 667 | " with open(info.get('path')) as run_file:\n", 668 | " info['run'] = pytrec_eval.parse_run(run_file)\n", 669 | " trim(info['run'])\n", 670 | " info['scores'] = rpl_eval.evaluate(info['run'])\n", 671 | "\n", 672 | "dri_er = {\n", 673 | " 'wcr_tf_1': {\n", 674 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_1']['scores'], runs_rpl['rpl_wcr0405_tf_1']['scores']),\n", 675 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_1']['scores'], runs_rpl['rpl_wcr0405_tf_1']['scores'])\n", 676 | " },\n", 677 | " 'wcr_tf_2': {\n", 678 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_2']['scores'], runs_rpl['rpl_wcr0405_tf_2']['scores']),\n", 679 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_2']['scores'], runs_rpl['rpl_wcr0405_tf_2']['scores'])\n", 680 | " },\n", 681 | " 'wcr_tf_3': {\n", 682 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_3']['scores'], runs_rpl['rpl_wcr0405_tf_3']['scores']),\n", 683 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_3']['scores'], runs_rpl['rpl_wcr0405_tf_3']['scores'])\n", 684 | " },\n", 685 | " 'wcr_tf_4': {\n", 686 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_4']['scores'], runs_rpl['rpl_wcr0405_tf_4']['scores']),\n", 687 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_4']['scores'], runs_rpl['rpl_wcr0405_tf_4']['scores'])\n", 688 | " },\n", 689 | " 'wcr_tf_5': {\n", 690 | " 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_5']['scores'], runs_rpl['rpl_wcr0405_tf_5']['scores']),\n", 691 | " 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_5']['scores'], runs_rpl['rpl_wcr0405_tf_5']['scores'])\n", 692 | " },\n", 693 | "\n", 694 | "}\n", 695 | "\n", 696 | "measures = ['P_10', 'map', 'ndcg']\n", 697 | "marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')]\n", 698 | "\n", 699 | "fig, ax1 = plt.subplots(figsize=(10, 10))\n", 700 | "ax1.set_xlabel('Effect Ratio (ER)')\n", 701 | "ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)')\n", 702 | "\n", 703 | "for measure, mk in zip(measures, marker_color):\n", 704 | " ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()],\n", 705 | " [dri_er[r]['dri'][measure] for r in dri_er.keys()],\n", 706 | " marker=mk[0], color=mk[1], linestyle='None', label=measure)\n", 707 | "\n", 708 | "ax1.tick_params(axis='y', labelcolor='k')\n", 709 | "fig.tight_layout()\n", 710 | "plt.axhline(0, color='grey')\n", 711 | "plt.axvline(1, color='grey')\n", 712 | "plt.legend()\n", 713 | "plt.title('Replicability')\n", 714 | "\n", 715 | "for m in measures:\n", 716 | " for r in dri_er.keys():\n", 717 | " plt.text(x = dri_er[r]['er'][m], \n", 718 | " y = dri_er[r]['dri'][m],\n", 719 | " s = r) \n", 720 | "\n", 721 | "plt.show()" 722 | ], 723 | "execution_count": null, 724 | "outputs": [] 725 | } 726 | ] 727 | } -------------------------------------------------------------------------------- /example/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://www.dropbox.com/s/p1wwqqka1n3el6b/runs.tar.gz 4 | tar -xzvf runs.tar.gz -C ./example/data/ 5 | -------------------------------------------------------------------------------- 
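A minimal sketch of how the example scripts in this directory are typically invoked, assuming the sample runs are fetched with get_data.sh from the repository root and the scripts are then run from inside example/ (the run and qrel paths in the scripts, e.g. ./data/runs/orig/input.WCrobust04 and ./data/qrels/core17.txt, are relative to that directory). These commands are illustrative only and not part of the repository:

    pip install repro_eval                   # the notebook above pins repro_eval==0.1
    pip install -r example/requirements.txt  # pytrec_eval, numpy, scipy, pandas, matplotlib, seaborn
    bash example/get_data.sh                 # downloads runs.tar.gz and extracts it into example/data/
    cd example
    python rpd_eval.py                       # reproducibility measures: KTU, RBO, RMSE, ER, DRI, paired t-test
    python rpl_eval.py                       # replicability counterpart: ER, DRI, unpaired t-test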
/example/requirements.txt: -------------------------------------------------------------------------------- 1 | pytrec_eval 2 | numpy 3 | scipy 4 | pandas 5 | matplotlib 6 | seaborn -------------------------------------------------------------------------------- /example/rpd_arp.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.Evaluator import RpdEvaluator 3 | from repro_eval.util import arp, arp_scores 4 | from repro_eval.util import trim 5 | import pandas as pd 6 | from matplotlib import pyplot as plt 7 | import seaborn as sns 8 | sns.set() 9 | sns.set_style('whitegrid') 10 | # palette = sns.color_palette("GnBu_d") 11 | # sns.set_palette(palette) 12 | colors = sns.color_palette() 13 | 14 | ORIG_B = './data/runs/orig/input.WCrobust04' 15 | ORIG_A = './data/runs/orig/input.WCrobust0405' 16 | QREL = 'data/qrels/core17.txt' 17 | 18 | runs_rpd = { 19 | 'rpd_wcr04_tf_1': 20 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 21 | 'rpd_wcr0405_tf_1': 22 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 23 | 'rpd_wcr04_tf_2': 24 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 25 | 'rpd_wcr0405_tf_2': 26 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 27 | 'rpd_wcr04_tf_3': 28 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 29 | 'rpd_wcr0405_tf_3': 30 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 31 | 'rpd_wcr04_tf_4': 32 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 33 | 'rpd_wcr0405_tf_4': 34 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 35 | 'rpd_wcr04_tf_5': 36 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 37 | 'rpd_wcr0405_tf_5': 38 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 39 | } 40 | 41 | 42 | 43 | def average_retrieval_performance(baseline_scores, reproduced_scores: dict, measures: list, xlabel: str, ylabel: str, outfile: str): 44 | reproduced_scores_arp = [arp_scores(topic_scores) for idx, topic_scores in reproduced_scores.items()] 45 | baseline_scores_arp = arp_scores(baseline_scores) 46 | index = list(reproduced_scores.keys()) 47 | df_content = {} 48 | for measure in measures: 49 | df_content[measure] = [scores.get(measure) for scores in reproduced_scores_arp] 50 | df = pd.DataFrame(df_content, index=index) 51 | 52 | ax = df.plot.bar(rot=0) 53 | for num, measure in enumerate(measures): 54 | orig_val = baseline_scores_arp.get(measure) 55 | ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color=colors[num]) 56 | ax.annotate(' ', (num, orig_val), color=colors[num]) 57 | ax.set_ylim(0.0, 1.0) 58 | 59 | legend_content = [measure + ' (orig)' for measure in measures] + [measure + ' (rpl)' for measure in measures] 60 | ax.legend(legend_content, loc='lower left') 61 | 62 | ax.set_xlabel(xlabel) 63 | ax.set_ylabel(ylabel) 64 | ax.get_figure().savefig(outfile, format='pdf', bbox_inches='tight') 65 | plt.show() 66 | 67 | 68 | def main(): 69 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 70 | run_b_orig_path=ORIG_B, 71 | run_a_orig_path=ORIG_A, 72 | run_b_rep_path=None, 73 | run_a_rep_path=None) 74 | 75 | rpd_eval.trim() 76 | rpd_eval.evaluate() 77 | 78 | for run_name, info in runs_rpd.items(): 79 | with open(info.get('path')) as run_file: 80 | info['run'] = pytrec_eval.parse_run(run_file) 81 | trim(info['run']) 82 | info['scores'] = rpd_eval.evaluate(info['run']) 83 | 84 | average_retrieval_performance(rpd_eval.run_b_orig_score, 85 | { 86 | 'tf_1': runs_rpd.get('rpd_wcr04_tf_1').get('scores'), 
87 | 'tf_2': runs_rpd.get('rpd_wcr04_tf_2').get('scores'), 88 | 'tf_3': runs_rpd.get('rpd_wcr04_tf_3').get('scores'), 89 | 'tf_4': runs_rpd.get('rpd_wcr04_tf_4').get('scores'), 90 | 'tf_5': runs_rpd.get('rpd_wcr04_tf_5').get('scores'), 91 | }, 92 | measures=['P_10', 'ndcg', 'bpref', 'map'], 93 | xlabel='Reproduced run (wcr04)', 94 | ylabel='Score', 95 | outfile='data/plots/rpd_b_arp.pdf') 96 | 97 | average_retrieval_performance(rpd_eval.run_a_orig_score, 98 | { 99 | 'tf_1': runs_rpd.get('rpd_wcr0405_tf_1').get('scores'), 100 | 'tf_2': runs_rpd.get('rpd_wcr0405_tf_2').get('scores'), 101 | 'tf_3': runs_rpd.get('rpd_wcr0405_tf_3').get('scores'), 102 | 'tf_4': runs_rpd.get('rpd_wcr0405_tf_4').get('scores'), 103 | 'tf_5': runs_rpd.get('rpd_wcr0405_tf_5').get('scores'), 104 | }, 105 | measures=['P_10', 'ndcg', 'bpref', 'map'], 106 | xlabel='Reproduced run (wcr0405)', 107 | ylabel='Score', 108 | outfile='data/plots/rpd_a_arp.pdf') 109 | 110 | 111 | if __name__ == "__main__": 112 | main() -------------------------------------------------------------------------------- /example/rpd_dri_vs_er.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RpdEvaluator 2 | from repro_eval.util import trim 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | sns.set(style="darkgrid") 6 | 7 | import pytrec_eval 8 | 9 | QREL = './data/qrels/core17.txt' 10 | ORIG_B = './data/runs/orig/input.WCrobust04' 11 | ORIG_A = './data/runs/orig/input.WCrobust0405' 12 | 13 | 14 | runs_rpd = { 15 | 'rpd_wcr04_tf_1': 16 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 17 | 'rpd_wcr0405_tf_1': 18 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 19 | 'rpd_wcr04_tf_2': 20 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 21 | 'rpd_wcr0405_tf_2': 22 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 23 | 'rpd_wcr04_tf_3': 24 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 25 | 'rpd_wcr0405_tf_3': 26 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 27 | 'rpd_wcr04_tf_4': 28 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 29 | 'rpd_wcr0405_tf_4': 30 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 31 | 'rpd_wcr04_tf_5': 32 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 33 | 'rpd_wcr0405_tf_5': 34 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 35 | } 36 | 37 | 38 | def main(): 39 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 40 | run_b_orig_path=ORIG_B, 41 | run_a_orig_path=ORIG_A, 42 | run_b_rep_path=None, 43 | run_a_rep_path=None) 44 | 45 | rpd_eval.trim() 46 | rpd_eval.evaluate() 47 | 48 | for run_name, info in runs_rpd.items(): 49 | with open(info.get('path')) as run_file: 50 | info['run'] = pytrec_eval.parse_run(run_file) 51 | trim(info['run']) 52 | info['scores'] = rpd_eval.evaluate(info['run']) 53 | 54 | dri_er = { 55 | 'wcr_tf_1': { 56 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_1']['scores'], runs_rpd['rpd_wcr0405_tf_1']['scores']), 57 | 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_1']['scores'], runs_rpd['rpd_wcr0405_tf_1']['scores']) 58 | }, 59 | 'wcr_tf_2': { 60 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_2']['scores'], runs_rpd['rpd_wcr0405_tf_2']['scores']), 61 | 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_2']['scores'], runs_rpd['rpd_wcr0405_tf_2']['scores']) 62 | }, 63 | 'wcr_tf_3': { 64 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_3']['scores'], runs_rpd['rpd_wcr0405_tf_3']['scores']), 65 | 'dri': 
rpd_eval.dri(runs_rpd['rpd_wcr04_tf_3']['scores'], runs_rpd['rpd_wcr0405_tf_3']['scores']) 66 | }, 67 | 'wcr_tf_4': { 68 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_4']['scores'], runs_rpd['rpd_wcr0405_tf_4']['scores']), 69 | 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_4']['scores'], runs_rpd['rpd_wcr0405_tf_4']['scores']) 70 | }, 71 | 'wcr_tf_5': { 72 | 'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_5']['scores'], runs_rpd['rpd_wcr0405_tf_5']['scores']), 73 | 'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_5']['scores'], runs_rpd['rpd_wcr0405_tf_5']['scores']) 74 | }, 75 | 76 | } 77 | 78 | measures = ['P_10', 'map', 'ndcg'] 79 | marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')] 80 | 81 | fig, ax1 = plt.subplots() 82 | ax1.set_xlabel('Effect Ratio (ER)') 83 | ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)') 84 | 85 | for measure, mk in zip(measures, marker_color): 86 | ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()], 87 | [dri_er[r]['dri'][measure] for r in dri_er.keys()], 88 | marker=mk[0], color=mk[1], linestyle='None', label=measure) 89 | 90 | ax1.tick_params(axis='y', labelcolor='k') 91 | fig.tight_layout() 92 | plt.axhline(0, color='grey') 93 | plt.axvline(1, color='grey') 94 | plt.legend() 95 | plt.title('Reproducibility') 96 | plt.savefig('data/plots/rpd_dri_vs_er.pdf', format='pdf', bbox_inches='tight') 97 | plt.show() 98 | 99 | 100 | if __name__ == "__main__": 101 | main() 102 | -------------------------------------------------------------------------------- /example/rpd_er.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.Evaluator import RpdEvaluator 3 | from repro_eval.util import trim 4 | import pandas as pd 5 | from matplotlib import pyplot as plt 6 | import seaborn as sns 7 | sns.set() 8 | sns.set_style('whitegrid') 9 | palette = sns.color_palette("GnBu_d") 10 | sns.set_palette(palette) 11 | colors = sns.color_palette() 12 | 13 | ORIG_B = './data/runs/orig/input.WCrobust04' 14 | ORIG_A = './data/runs/orig/input.WCrobust0405' 15 | QREL = 'data/qrels/core17.txt' 16 | 17 | runs_rpd = { 18 | 'rpd_wcr04_tf_1': 19 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 20 | 'rpd_wcr0405_tf_1': 21 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 22 | 'rpd_wcr04_tf_2': 23 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 24 | 'rpd_wcr0405_tf_2': 25 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 26 | 'rpd_wcr04_tf_3': 27 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 28 | 'rpd_wcr0405_tf_3': 29 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 30 | 'rpd_wcr04_tf_4': 31 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 32 | 'rpd_wcr0405_tf_4': 33 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 34 | 'rpd_wcr04_tf_5': 35 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 36 | 'rpd_wcr0405_tf_5': 37 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 38 | } 39 | 40 | 41 | def main(): 42 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 43 | run_b_orig_path=ORIG_B, 44 | run_a_orig_path=ORIG_A, 45 | run_b_rep_path=None, 46 | run_a_rep_path=None) 47 | 48 | rpd_eval.trim() 49 | rpd_eval.evaluate() 50 | 51 | for run_name, info in runs_rpd.items(): 52 | with open(info.get('path')) as run_file: 53 | info['run'] = pytrec_eval.parse_run(run_file) 54 | trim(info['run']) 55 | info['scores'] = rpd_eval.evaluate(info['run']) 56 | 57 | pairs = [('rpd_wcr04_tf_1', 'rpd_wcr0405_tf_1'), 58 | ('rpd_wcr04_tf_2', 'rpd_wcr0405_tf_2'), 
59 | ('rpd_wcr04_tf_3', 'rpd_wcr0405_tf_3'), 60 | ('rpd_wcr04_tf_4', 'rpd_wcr0405_tf_4'), 61 | ('rpd_wcr04_tf_5', 'rpd_wcr0405_tf_5')] 62 | 63 | df_content = { 64 | 'P_10': [rpd_eval.er(run_b_score=runs_rpd[pair[0]]['scores'], run_a_score=runs_rpd[pair[1]]['scores'])['P_10'] for pair in pairs], 65 | 'ndcg': [rpd_eval.er(run_b_score=runs_rpd[pair[0]]['scores'], run_a_score=runs_rpd[pair[1]]['scores'])['ndcg'] for pair in pairs], 66 | 'map': [rpd_eval.er(run_b_score=runs_rpd[pair[0]]['scores'], run_a_score=runs_rpd[pair[1]]['scores'])['map'] for pair in pairs], 67 | } 68 | 69 | df = pd.DataFrame(df_content, index=['tf_1', 'tf_2', 'tf_3', 'tf_4', 'tf_5']) 70 | orig_val = 1 71 | ax = df.plot.bar(rot=0) 72 | ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color='black') 73 | ax.annotate(' ', (3, orig_val), color='black') 74 | ax.set_xlabel("Reproduced Run") 75 | ax.set_ylabel("Effect Ratio (ER)") 76 | ax.get_figure().savefig('data/plots/rpd_er.pdf', format='pdf', bbox_inches='tight') 77 | plt.show() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /example/rpd_eval.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RpdEvaluator 2 | from repro_eval.util import arp 3 | from repro_eval.util import print_base_adv, print_simple_line 4 | 5 | QREL = './data/qrels/core17.txt' 6 | ORIG_B = './data/runs/orig/input.WCrobust04' 7 | ORIG_A = './data/runs/orig/input.WCrobust0405' 8 | RPD_B = './data/runs/rpd/14/irc_task1_WCrobust04_001' 9 | RPD_A = './data/runs/rpd/14/irc_task1_WCrobust0405_001' 10 | MEASURE = 'ndcg' 11 | 12 | 13 | def main(): 14 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 15 | run_b_orig_path=ORIG_B, 16 | run_a_orig_path=ORIG_A, 17 | run_b_rep_path=RPD_B, 18 | run_a_rep_path=RPD_A) 19 | 20 | rpd_eval.trim() 21 | rpd_eval.evaluate() 22 | 23 | # KTU 24 | ktau = rpd_eval.ktau_union() 25 | print("Kendall's tau Union (KTU)") 26 | print('------------------------------------------------------------------') 27 | for topic, value in ktau.get('baseline').items(): 28 | print_base_adv(topic, 'KTU', value, ktau.get('advanced').get(topic)) 29 | print_base_adv('ARP', 'KTU', arp(ktau.get('baseline')), arp(ktau.get('advanced'))) 30 | 31 | # RBO 32 | rbo = rpd_eval.rbo() 33 | print("Rank-biased Overlap (RBO)") 34 | print('------------------------------------------------------------------') 35 | for topic, value in rbo.get('baseline').items(): 36 | print_base_adv(topic, 'RBO', value, rbo.get('advanced').get(topic)) 37 | print_base_adv('ARP', 'RBO', arp(rbo.get('baseline')), arp(rbo.get('advanced'))) 38 | 39 | # RMSE 40 | rmse = rpd_eval.rmse() 41 | print("Root mean square error (RMSE)") 42 | print('------------------------------------------------------------------') 43 | for measure, value in rmse.get('baseline').items(): 44 | print_base_adv(measure, 'RMSE', value, rmse.get('advanced').get(measure)) 45 | 46 | # ER 47 | print("Effect ratio (ER)") 48 | print('------------------------------------------------------------------') 49 | er = rpd_eval.er() 50 | for measure, value in er.items(): 51 | print_simple_line(measure, 'ER', value) 52 | 53 | # DRI 54 | print("Delta Relative Improvement (DRI)") 55 | print('------------------------------------------------------------------') 56 | dri = rpd_eval.dri() 57 | for measure, value in dri.items(): 58 | print_simple_line(measure, 'DRI', value) 59 | 60 | # ttest 61 | pvals = rpd_eval.ttest() 62 | 
print("Two-tailed paired t-test (p-value)") 63 | print('------------------------------------------------------------------') 64 | for measure, value in pvals.get('baseline').items(): 65 | print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure)) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /example/rpd_ktu.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RpdEvaluator 2 | from repro_eval.util import print_base_adv, print_simple_line, trim, arp 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | sns.set(style="darkgrid") 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | import pytrec_eval 9 | 10 | QREL = './data/qrels/core17.txt' 11 | ORIG_B = './data/runs/orig/input.WCrobust04' 12 | ORIG_A = './data/runs/orig/input.WCrobust0405' 13 | RPD_B = './data/runs/rpd/14/irc_task1_WCrobust04_001' 14 | RPD_A = './data/runs/rpd/14/irc_task1_WCrobust0405_001' 15 | MEASURE = 'ndcg' 16 | 17 | 18 | runs_rpd = { 19 | 'rpd_wcr04_tf_1': 20 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 21 | 'rpd_wcr0405_tf_1': 22 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 23 | 'rpd_wcr04_tf_2': 24 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 25 | 'rpd_wcr0405_tf_2': 26 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 27 | 'rpd_wcr04_tf_3': 28 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 29 | 'rpd_wcr0405_tf_3': 30 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 31 | 'rpd_wcr04_tf_4': 32 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 33 | 'rpd_wcr0405_tf_4': 34 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 35 | 'rpd_wcr04_tf_5': 36 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 37 | 'rpd_wcr0405_tf_5': 38 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 39 | } 40 | 41 | 42 | def main(): 43 | cutoffs = [1000, 100, 50, 20, 10, 5] 44 | 45 | # BASELINE 46 | for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]): 47 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 48 | run_b_orig_path=ORIG_B, 49 | run_a_orig_path=ORIG_A, 50 | run_b_rep_path=None, 51 | run_a_rep_path=None) 52 | 53 | rpd_eval.trim() 54 | rpd_eval.evaluate() 55 | 56 | with open(info.get('path')) as run_file: 57 | info['run'] = pytrec_eval.parse_run(run_file) 58 | for cutoff in cutoffs: 59 | rpd_eval.trim(cutoff) 60 | rpd_eval.trim(cutoff, info['run']) 61 | info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline']) 62 | 63 | df_content = {} 64 | for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]): 65 | df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]] 66 | 67 | ax = pd.DataFrame(data=df_content, index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='-*') 68 | ax.set_xlabel('Cut-off values') 69 | ax.set_ylabel(r"Kendall's $\tau$") 70 | ax.get_figure().savefig('data/plots/rpd_b_ktu.pdf', format='pdf', bbox_inches='tight') 71 | plt.show() 72 | 73 | # ADVANCED 74 | for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]): 75 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 76 | run_b_orig_path=ORIG_B, 77 | run_a_orig_path=ORIG_A, 78 | run_b_rep_path=None, 79 | run_a_rep_path=None) 80 | 81 | rpd_eval.trim() 82 | rpd_eval.evaluate() 83 | 84 | with open(info.get('path')) as run_file: 85 | 
info['run'] = pytrec_eval.parse_run(run_file) 86 | for cutoff in cutoffs: 87 | rpd_eval.trim(cutoff) 88 | rpd_eval.trim(cutoff, info['run']) 89 | # scores = rpl_eval.evaluate(info['run']) 90 | info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline']) 91 | 92 | df_content = {} 93 | for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]): 94 | df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]] 95 | 96 | ax = pd.DataFrame(data=df_content, index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='-*') 97 | ax.set_xlabel('Cut-off values') 98 | ax.set_ylabel(r"Kendall's $\tau$") 99 | ax.get_figure().savefig('data/plots/rpd_a_ktu.pdf', format='pdf', bbox_inches='tight') 100 | plt.show() 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /example/rpd_rmse.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.Evaluator import RpdEvaluator 3 | from repro_eval.util import arp, arp_scores 4 | from repro_eval.util import trim 5 | import pandas as pd 6 | from matplotlib import pyplot as plt 7 | import seaborn as sns 8 | sns.set() 9 | sns.set_style('white') 10 | palette = sns.color_palette("GnBu_d") 11 | sns.set_palette(palette) 12 | colors = sns.color_palette() 13 | 14 | ORIG_B = './data/runs/orig/input.WCrobust04' 15 | ORIG_A = './data/runs/orig/input.WCrobust0405' 16 | QREL = 'data/qrels/core17.txt' 17 | 18 | 19 | runs_rpd = { 20 | 'rpd_wcr04_tf_1': 21 | {'path': './data/runs/rpd/45/irc_task1_WCrobust04_001'}, 22 | 'rpd_wcr0405_tf_1': 23 | {'path': './data/runs/rpd/45/irc_task1_WCrobust0405_001'}, 24 | 'rpd_wcr04_tf_2': 25 | {'path': './data/runs/rpd/46/irc_task1_WCrobust04_001'}, 26 | 'rpd_wcr0405_tf_2': 27 | {'path': './data/runs/rpd/46/irc_task1_WCrobust0405_001'}, 28 | 'rpd_wcr04_tf_3': 29 | {'path': './data/runs/rpd/47/irc_task1_WCrobust04_001'}, 30 | 'rpd_wcr0405_tf_3': 31 | {'path': './data/runs/rpd/47/irc_task1_WCrobust0405_001'}, 32 | 'rpd_wcr04_tf_4': 33 | {'path': './data/runs/rpd/48/irc_task1_WCrobust04_001'}, 34 | 'rpd_wcr0405_tf_4': 35 | {'path': './data/runs/rpd/48/irc_task1_WCrobust0405_001'}, 36 | 'rpd_wcr04_tf_5': 37 | {'path': './data/runs/rpd/49/irc_task1_WCrobust04_001'}, 38 | 'rpd_wcr0405_tf_5': 39 | {'path': './data/runs/rpd/49/irc_task1_WCrobust0405_001'} 40 | } 41 | 42 | 43 | def average_retrieval_performance(baseline_scores, reproduced_scores: dict, measures: list, xlabel: str, ylabel: str, outfile: str): 44 | reproduced_scores_arp = [arp_scores(topic_scores) for idx, topic_scores in reproduced_scores.items()] 45 | baseline_scores_arp = arp_scores(baseline_scores) 46 | index = list(reproduced_scores.keys()) 47 | df_content = {} 48 | for measure in measures: 49 | df_content[measure] = [scores.get(measure) for scores in reproduced_scores_arp] 50 | df = pd.DataFrame(df_content, index=index) 51 | 52 | ax = df.plot.bar(rot=0) 53 | for num, measure in enumerate(measures): 54 | orig_val = baseline_scores_arp.get(measure) 55 | ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color=colors[num]) 56 | ax.annotate(' ', (num, orig_val), color=colors[num]) 57 | ax.set_ylim(0.0, 1.0) 58 | 59 | legend_content = [measure + ' (orig)' for measure in measures] + [measure + ' (rpd)' for measure in measures] 60 | ax.legend(legend_content, loc='lower left') 61 | 62 | ax.set_xlabel(xlabel) 63 | ax.set_ylabel(ylabel) 64 | 
ax.get_figure().savefig(outfile, format='pdf', bbox_inches='tight') 65 | plt.show() 66 | 67 | 68 | def main(): 69 | rpd_eval = RpdEvaluator(qrel_orig_path=QREL, 70 | run_b_orig_path=ORIG_B, 71 | run_a_orig_path=ORIG_A, 72 | run_b_rep_path=None, 73 | run_a_rep_path=None) 74 | 75 | rpd_eval.trim() 76 | rpd_eval.evaluate() 77 | 78 | for run_name, info in runs_rpd.items(): 79 | with open(info.get('path')) as run_file: 80 | info['run'] = pytrec_eval.parse_run(run_file) 81 | trim(info['run']) 82 | info['scores'] = rpd_eval.evaluate(info['run']) 83 | info['rmse'] = rpd_eval.rmse(run_b_score=info['scores']) 84 | 85 | 86 | baseline_runs = ['rpd_wcr04_tf_1', 'rpd_wcr04_tf_2', 'rpd_wcr04_tf_3', 'rpd_wcr04_tf_4', 'rpd_wcr04_tf_5'] 87 | advanced_runs = ['rpd_wcr0405_tf_1', 'rpd_wcr0405_tf_2', 'rpd_wcr0405_tf_3', 'rpd_wcr0405_tf_4', 'rpd_wcr0405_tf_5'] 88 | cutoffs = ['5', '10', '15', '20', '30', '100', '200', '500', '1000'] 89 | 90 | df_content = {} 91 | for run_name in baseline_runs: 92 | df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co] for co in cutoffs] 93 | 94 | df = pd.DataFrame(df_content, index=cutoffs) 95 | ax = df.plot.line(style='o-') 96 | ax.set_xlabel('Cut-off values') 97 | ax.set_ylabel('RMSE') 98 | ax.get_figure().savefig('data/plots/rpd_b_rmse.pdf', format='pdf', bbox_inches='tight') 99 | plt.show() 100 | 101 | df_content = {} 102 | for run_name in advanced_runs: 103 | df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co] for co in cutoffs] 104 | 105 | df = pd.DataFrame(df_content, index=cutoffs) 106 | ax = df.plot.line(style='o-') 107 | ax.set_xlabel('Cut-off values') 108 | ax.set_ylabel('RMSE') 109 | ax.get_figure().savefig('data/plots/rpd_a_rmse.pdf', format='pdf', bbox_inches='tight') 110 | plt.show() 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /example/rpl_dri_vs_er.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RplEvaluator 2 | from repro_eval.util import trim 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | sns.set(style="darkgrid") 6 | 7 | import pytrec_eval 8 | 9 | QREL = './data/qrels/core17.txt' 10 | QREL_RPL = './data/qrels/core18.txt' 11 | ORIG_B = './data/runs/orig/input.WCrobust04' 12 | ORIG_A = './data/runs/orig/input.WCrobust0405' 13 | 14 | 15 | runs_rpl = { 16 | 'rpl_wcr04_tf_1': 17 | {'path': './data/runs/rpl/45/irc_task2_WCrobust04_001'}, 18 | 'rpl_wcr0405_tf_1': 19 | {'path': './data/runs/rpl/45/irc_task2_WCrobust0405_001'}, 20 | 'rpl_wcr04_tf_2': 21 | {'path': './data/runs/rpl/46/irc_task2_WCrobust04_001'}, 22 | 'rpl_wcr0405_tf_2': 23 | {'path': './data/runs/rpl/46/irc_task2_WCrobust0405_001'}, 24 | 'rpl_wcr04_tf_3': 25 | {'path': './data/runs/rpl/47/irc_task2_WCrobust04_001'}, 26 | 'rpl_wcr0405_tf_3': 27 | {'path': './data/runs/rpl/47/irc_task2_WCrobust0405_001'}, 28 | 'rpl_wcr04_tf_4': 29 | {'path': './data/runs/rpl/48/irc_task2_WCrobust04_001'}, 30 | 'rpl_wcr0405_tf_4': 31 | {'path': './data/runs/rpl/48/irc_task2_WCrobust0405_001'}, 32 | 'rpl_wcr04_tf_5': 33 | {'path': './data/runs/rpl/49/irc_task2_WCrobust04_001'}, 34 | 'rpl_wcr0405_tf_5': 35 | {'path': './data/runs/rpl/49/irc_task2_WCrobust0405_001'} 36 | } 37 | 38 | 39 | def main(): 40 | rpl_eval = RplEvaluator(qrel_orig_path=QREL, 41 | run_b_orig_path=ORIG_B, 42 | run_a_orig_path=ORIG_A, 43 | run_b_rep_path=None, 44 | run_a_rep_path=None, 45 | 
qrel_rpd_path=QREL_RPL) 46 | 47 | rpl_eval.trim() 48 | rpl_eval.evaluate() 49 | 50 | for run_name, info in runs_rpl.items(): 51 | with open(info.get('path')) as run_file: 52 | info['run'] = pytrec_eval.parse_run(run_file) 53 | trim(info['run']) 54 | info['scores'] = rpl_eval.evaluate(info['run']) 55 | 56 | dri_er = { 57 | 'wcr_tf_1': { 58 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_1']['scores'], runs_rpl['rpl_wcr0405_tf_1']['scores']), 59 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_1']['scores'], runs_rpl['rpl_wcr0405_tf_1']['scores']) 60 | }, 61 | 'wcr_tf_2': { 62 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_2']['scores'], runs_rpl['rpl_wcr0405_tf_2']['scores']), 63 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_2']['scores'], runs_rpl['rpl_wcr0405_tf_2']['scores']) 64 | }, 65 | 'wcr_tf_3': { 66 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_3']['scores'], runs_rpl['rpl_wcr0405_tf_3']['scores']), 67 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_3']['scores'], runs_rpl['rpl_wcr0405_tf_3']['scores']) 68 | }, 69 | 'wcr_tf_4': { 70 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_4']['scores'], runs_rpl['rpl_wcr0405_tf_4']['scores']), 71 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_4']['scores'], runs_rpl['rpl_wcr0405_tf_4']['scores']) 72 | }, 73 | 'wcr_tf_5': { 74 | 'er': rpl_eval.er(runs_rpl['rpl_wcr04_tf_5']['scores'], runs_rpl['rpl_wcr0405_tf_5']['scores']), 75 | 'dri': rpl_eval.dri(runs_rpl['rpl_wcr04_tf_5']['scores'], runs_rpl['rpl_wcr0405_tf_5']['scores']) 76 | }, 77 | 78 | } 79 | 80 | measures = ['P_10', 'map', 'ndcg'] 81 | marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')] 82 | 83 | fig, ax1 = plt.subplots() 84 | ax1.set_xlabel('Effect Ratio (ER)') 85 | ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)') 86 | 87 | for measure, mk in zip(measures, marker_color): 88 | ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()], 89 | [dri_er[r]['dri'][measure] for r in dri_er.keys()], 90 | marker=mk[0], color=mk[1], linestyle='None', label=measure) 91 | 92 | ax1.tick_params(axis='y', labelcolor='k') 93 | fig.tight_layout() 94 | plt.axhline(0, color='grey') 95 | plt.axvline(1, color='grey') 96 | plt.legend() 97 | plt.title('Replicability') 98 | plt.savefig('data/plots/rpl_dri_vs_er.pdf', format='pdf', bbox_inches='tight') 99 | plt.show() 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /example/rpl_er.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.Evaluator import RplEvaluator 3 | from repro_eval.util import trim 4 | import pandas as pd 5 | from matplotlib import pyplot as plt 6 | import seaborn as sns 7 | sns.set() 8 | sns.set_style('whitegrid') 9 | palette = sns.color_palette("GnBu_d") 10 | sns.set_palette(palette) 11 | colors = sns.color_palette() 12 | 13 | ORIG_B = './data/runs/orig/input.WCrobust04' 14 | ORIG_A = './data/runs/orig/input.WCrobust0405' 15 | QREL = 'data/qrels/core17.txt' 16 | QREL_RPL = 'data/qrels/core18.txt' 17 | 18 | runs_rpl = { 19 | 'rpl_wcr04_tf_1': 20 | {'path': './data/runs/rpl/45/irc_task2_WCrobust04_001'}, 21 | 'rpl_wcr0405_tf_1': 22 | {'path': './data/runs/rpl/45/irc_task2_WCrobust0405_001'}, 23 | 'rpl_wcr04_tf_2': 24 | {'path': './data/runs/rpl/46/irc_task2_WCrobust04_001'}, 25 | 'rpl_wcr0405_tf_2': 26 | {'path': './data/runs/rpl/46/irc_task2_WCrobust0405_001'}, 27 | 'rpl_wcr04_tf_3': 28 | {'path': './data/runs/rpl/47/irc_task2_WCrobust04_001'}, 29 | 'rpl_wcr0405_tf_3': 30 | {'path': 
'./data/runs/rpl/47/irc_task2_WCrobust0405_001'}, 31 | 'rpl_wcr04_tf_4': 32 | {'path': './data/runs/rpl/48/irc_task2_WCrobust04_001'}, 33 | 'rpl_wcr0405_tf_4': 34 | {'path': './data/runs/rpl/48/irc_task2_WCrobust0405_001'}, 35 | 'rpl_wcr04_tf_5': 36 | {'path': './data/runs/rpl/49/irc_task2_WCrobust04_001'}, 37 | 'rpl_wcr0405_tf_5': 38 | {'path': './data/runs/rpl/49/irc_task2_WCrobust0405_001'} 39 | } 40 | 41 | 42 | def main(): 43 | rpl_eval = RplEvaluator(qrel_orig_path=QREL, 44 | run_b_orig_path=ORIG_B, 45 | run_a_orig_path=ORIG_A, 46 | run_b_rep_path=None, 47 | run_a_rep_path=None, 48 | qrel_rpd_path=QREL_RPL) 49 | 50 | rpl_eval.trim() 51 | rpl_eval.evaluate() 52 | 53 | for run_name, info in runs_rpl.items(): 54 | with open(info.get('path')) as run_file: 55 | info['run'] = pytrec_eval.parse_run(run_file) 56 | trim(info['run']) 57 | info['scores'] = rpl_eval.evaluate(info['run']) 58 | 59 | pairs = [('rpl_wcr04_tf_1', 'rpl_wcr0405_tf_1'), 60 | ('rpl_wcr04_tf_2', 'rpl_wcr0405_tf_2'), 61 | ('rpl_wcr04_tf_3', 'rpl_wcr0405_tf_3'), 62 | ('rpl_wcr04_tf_4', 'rpl_wcr0405_tf_4'), 63 | ('rpl_wcr04_tf_5', 'rpl_wcr0405_tf_5')] 64 | 65 | df_content = { 66 | 'P_10': [rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'], run_a_score=runs_rpl[pair[1]]['scores'])['P_10'] for pair in pairs], 67 | 'ndcg': [rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'], run_a_score=runs_rpl[pair[1]]['scores'])['ndcg'] for pair in pairs], 68 | 'map': [rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'], run_a_score=runs_rpl[pair[1]]['scores'])['map'] for pair in pairs], 69 | } 70 | 71 | df = pd.DataFrame(df_content, index=['tf_1', 'tf_2', 'tf_3', 'tf_4', 'tf_5']) 72 | orig_val = 1 73 | ax = df.plot.bar(rot=0) 74 | ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color='black') 75 | ax.annotate(' ', (3, orig_val), color='black') 76 | ax.set_xlabel("Replicated Run") 77 | ax.set_ylabel("Effect Ratio (ER)") 78 | ax.get_figure().savefig('data/plots/rpl_er.pdf', format='pdf', bbox_inches='tight') 79 | plt.show() 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /example/rpl_eval.py: -------------------------------------------------------------------------------- 1 | from repro_eval.Evaluator import RplEvaluator 2 | from repro_eval.util import print_base_adv, print_simple_line 3 | 4 | QREL = './data/qrels/core17.txt' 5 | QREL_RPL = './data/qrels/core18.txt' 6 | ORIG_B = './data/runs/orig/input.WCrobust04' 7 | ORIG_A = './data/runs/orig/input.WCrobust0405' 8 | RPL_B = './data/runs/rpl/14/irc_task2_WCrobust04_001' 9 | RPL_A = './data/runs/rpl/14/irc_task2_WCrobust0405_001' 10 | MEASURE = 'ndcg' 11 | 12 | 13 | def main(): 14 | rpl_eval = RplEvaluator(qrel_orig_path=QREL, 15 | run_b_orig_path=ORIG_B, 16 | run_a_orig_path=ORIG_A, 17 | run_b_rep_path=RPL_B, 18 | run_a_rep_path=RPL_A, 19 | qrel_rpd_path=QREL_RPL) 20 | 21 | rpl_eval.trim() 22 | rpl_eval.evaluate() 23 | 24 | # ER 25 | print("Effect ratio (ER)") 26 | print('------------------------------------------------------------------') 27 | er = rpl_eval.er() 28 | for measure, value in er.items(): 29 | print_simple_line(measure, 'ER', value) 30 | 31 | # DRI 32 | print("Delta Relative Improvement (DRI)") 33 | print('------------------------------------------------------------------') 34 | dri = rpl_eval.dri() 35 | for measure, value in dri.items(): 36 | print_simple_line(measure, 'DRI', value) 37 | 38 | # ttest 39 | pvals = rpl_eval.ttest() 40 | print("Two-tailed unpaired t-test (p-value)") 41 
| print('------------------------------------------------------------------') 42 | for measure, value in pvals.get('baseline').items(): 43 | print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure)) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /repro_eval/Evaluator.py: -------------------------------------------------------------------------------- 1 | import pytrec_eval 2 | from repro_eval.util import trim, break_ties 3 | from repro_eval.measure.statistics import ttest 4 | from repro_eval.measure.overall_effects import ER, deltaRI 5 | from repro_eval.measure.document_order import ktau_union as ktu, RBO 6 | from repro_eval.measure.effectiveness import rmse as RMSE, nrmse as nRMSE 7 | from repro_eval.config import ERR_MSG 8 | 9 | 10 | class Evaluator(object): 11 | """ 12 | An abstract evaluator that holds the original baseline and advanced run as well as 13 | the reproduced/replicated baseline and advanced run. 14 | """ 15 | 16 | def __init__(self, **kwargs): 17 | self.qrel_orig_path = kwargs.get('qrel_orig_path', None) 18 | self.run_b_orig_path = kwargs.get('run_b_orig_path', None) 19 | self.run_a_orig_path = kwargs.get('run_a_orig_path', None) 20 | self.run_b_rep_path = kwargs.get('run_b_rep_path', None) 21 | self.run_a_rep_path = kwargs.get('run_a_rep_path', None) 22 | self.run_b_orig = None 23 | self.run_a_orig = None 24 | self.run_b_rep = None 25 | self.run_a_rep = None 26 | self.run_b_orig_score = None 27 | self.run_a_orig_score = None 28 | self.run_b_rep_score = None 29 | self.run_a_rep_score = None 30 | 31 | if self.qrel_orig_path: 32 | with open(self.qrel_orig_path, 'r') as f_qrel: 33 | qrel_orig = pytrec_eval.parse_qrel(f_qrel) 34 | self.rel_eval = pytrec_eval.RelevanceEvaluator(qrel_orig, pytrec_eval.supported_measures) 35 | 36 | if self.run_b_orig_path: 37 | with open(self.run_b_orig_path, 'r') as f_run: 38 | self.run_b_orig = pytrec_eval.parse_run(f_run) 39 | self.run_b_orig = {t: self.run_b_orig[t] for t in sorted(self.run_b_orig)} 40 | 41 | if self.run_a_orig_path: 42 | with open(self.run_a_orig_path, 'r') as f_run: 43 | self.run_a_orig = pytrec_eval.parse_run(f_run) 44 | self.run_a_orig = {t: self.run_a_orig[t] for t in sorted(self.run_a_orig)} 45 | 46 | if self.run_b_rep_path: 47 | with open(self.run_b_rep_path, 'r') as f_run: 48 | self.run_b_rep = pytrec_eval.parse_run(f_run) 49 | self.run_b_rep = {t: self.run_b_rep[t] for t in sorted(self.run_b_rep)} 50 | 51 | if self.run_a_rep_path: 52 | with open(self.run_a_rep_path, 'r') as f_run: 53 | self.run_a_rep = pytrec_eval.parse_run(f_run) 54 | self.run_a_rep = {t: self.run_a_rep[t] for t in sorted(self.run_a_rep)} 55 | 56 | def trim(self, t=None, run=None): 57 | """ 58 | Trims all runs of the Evaluator to the length specified by the threshold value t. 59 | 60 | @param t: Threshold parameter or number of top-k documents to be considered. 61 | @param run: If run is not None, only the provided run will be trimmed. 
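        A minimal usage sketch (the file paths below are placeholders, not files
        shipped with the package):

            rpd_eval = RpdEvaluator(qrel_orig_path='qrels.txt',
                                    run_b_orig_path='orig_b.txt',
                                    run_a_orig_path='orig_a.txt',
                                    run_b_rep_path='rpd_b.txt',
                                    run_a_rep_path='rpd_a.txt')
            rpd_eval.trim()       # trim all runs to the default threshold
            rpd_eval.trim(10)     # keep only the top-10 documents of every run
            # trim only an externally parsed run:
            # rpd_eval.trim(10, run=pytrec_eval.parse_run(open('other_run.txt')))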
62 | """ 63 | if run: 64 | run = break_ties(run) 65 | if t: 66 | trim(run, thresh=t) 67 | else: 68 | trim(run) 69 | return 70 | 71 | if self.run_b_orig: 72 | self.run_b_orig = break_ties(self.run_b_orig) 73 | if t: 74 | trim(self.run_b_orig, thresh=t) 75 | else: 76 | trim(self.run_b_orig) 77 | 78 | if self.run_a_orig: 79 | self.run_a_orig = break_ties(self.run_a_orig) 80 | if t: 81 | trim(self.run_a_orig, thresh=t) 82 | else: 83 | trim(self.run_a_orig) 84 | 85 | if self.run_b_rep: 86 | self.run_b_rep = break_ties(self.run_b_rep) 87 | if t: 88 | trim(self.run_b_rep, thresh=t) 89 | else: 90 | trim(self.run_b_rep) 91 | 92 | if self.run_a_rep: 93 | self.run_a_rep = break_ties(self.run_a_rep) 94 | if t: 95 | trim(self.run_a_rep, thresh=t) 96 | else: 97 | trim(self.run_a_rep) 98 | 99 | def evaluate(self, run=None): 100 | """ 101 | Evaluates the original baseline and advanced run if available. 102 | 103 | @param run: Reproduced or replicated run that will be evaluated. 104 | """ 105 | if self.run_b_orig: 106 | self.run_b_orig = break_ties(self.run_b_orig) 107 | self.run_b_orig_score = self.rel_eval.evaluate(self.run_b_orig) 108 | if self.run_a_orig: 109 | self.run_a_orig = break_ties(self.run_a_orig) 110 | self.run_a_orig_score = self.rel_eval.evaluate(self.run_a_orig) 111 | 112 | def er(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 113 | """ 114 | Determines the Effect Ratio (ER) according to the following paper: 115 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 116 | How to Measure the Reproducibility of System-oriented IR Experiments. 117 | Proceedings of SIGIR, pages 349-358, 2020. 118 | 119 | The ER value is determined by the ratio between the mean improvements 120 | of the original and reproduced/replicated experiments. 121 | 122 | @param run_b_score: Scores of the baseline run, 123 | if not provided the scores of the RpdEvaluator object will be used instead. 124 | @param run_a_score: Scores of the advanced run, 125 | if not provided the scores of the RpdEvaluator object will be used instead. 126 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 127 | @return: Dictionary containing the ER values for the specified run combination. 
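        In pseudocode, the "ratio between the mean improvements" described above
        corresponds to the following (per evaluation measure m, averaged over
        topics t; a paraphrase of the definition, not the exact library code):

            mean_delta_rep[m]  = mean over t of (rep_score_a[t][m]  - rep_score_b[t][m])
            mean_delta_orig[m] = mean over t of (orig_score_a[t][m] - orig_score_b[t][m])
            ER[m] = mean_delta_rep[m] / mean_delta_orig[m]

        After evaluate() has been called, a typical call is er = rpd_eval.er(),
        which returns a dictionary mapping measure names to ER values, as in the
        example scripts.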
128 | """ 129 | if print_feedback: 130 | print('Determining Effect Ratio (ER)') 131 | 132 | if self.run_b_orig_score and self.run_a_orig_score and run_b_path and run_a_path: 133 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 134 | run_b_rep = pytrec_eval.parse_run(b_run) 135 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 136 | run_b_rep_score = self.rel_eval_rpl.evaluate(run_b_rep) if hasattr(self, 'rel_eval_rpl') else self.rel_eval.evaluate(run_b_rep) 137 | run_a_rep = pytrec_eval.parse_run(a_run) 138 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 139 | run_a_rep_score = self.rel_eval_rpl.evaluate(run_a_rep) if hasattr(self, 'rel_eval_rpl') else self.rel_eval.evaluate(run_a_rep) 140 | return ER(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 141 | rep_score_b=run_b_rep_score, rep_score_a=run_a_rep_score, pbar=print_feedback) 142 | 143 | if self.run_b_orig_score and self.run_a_orig_score and run_b_score and run_a_score: 144 | return ER(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 145 | rep_score_b=run_b_score, rep_score_a=run_a_score, pbar=print_feedback) 146 | 147 | if self.run_b_orig_score and self.run_a_orig_score and self.run_b_rep_score and self.run_a_rep_score: 148 | return ER(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 149 | rep_score_b=self.run_b_rep_score, rep_score_a=self.run_a_rep_score, pbar=print_feedback) 150 | else: 151 | print(ERR_MSG) 152 | 153 | def dri(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 154 | """ 155 | Determines the Delta Relative Improvement (DeltaRI) according to the following paper: 156 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 157 | How to Measure the Reproducibility of System-oriented IR Experiments. 158 | Proceedings of SIGIR, pages 349-358, 2020. 159 | 160 | The DeltaRI value is determined by the difference between the relative improvements 161 | of the original and reproduced/replicated experiments. 162 | 163 | @param run_b_score: Scores of the baseline run, 164 | if not provided the scores of the RpdEvaluator object will be used instead. 165 | @param run_a_score: Scores of the advanced run, 166 | if not provided the scores of the RpdEvaluator object will be used instead. 167 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 168 | @return: Dictionary containing the DRI values for the specified run combination. 
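        In pseudocode, the "difference between the relative improvements"
        described above corresponds to the following (per measure m, using mean
        topic scores; a paraphrase of the paper's definition, not the exact
        library code):

            RI_orig[m] = (mean(orig_score_a[m]) - mean(orig_score_b[m])) / mean(orig_score_b[m])
            RI_rep[m]  = (mean(rep_score_a[m])  - mean(rep_score_b[m]))  / mean(rep_score_b[m])
            DRI[m]     = RI_orig[m] - RI_rep[m]

        A DRI close to zero (together with an ER close to one) indicates that the
        relative improvement of the advanced over the baseline run was preserved,
        cf. the rpd_dri_vs_er.py and rpl_dri_vs_er.py example plots.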
169 | """ 170 | if print_feedback: 171 | print('Determining Delta Relative Improvement (DRI)') 172 | 173 | if self.run_b_orig_score and self.run_a_orig_score and run_b_path and run_a_path: 174 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 175 | run_b_rep = pytrec_eval.parse_run(b_run) 176 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 177 | run_b_rep_score = self.rel_eval_rpl.evaluate(run_b_rep) if hasattr(self, 'rel_eval_rpl') else self.rel_eval.evaluate(run_b_rep) 178 | run_a_rep = pytrec_eval.parse_run(a_run) 179 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 180 | run_a_rep_score = self.rel_eval_rpl.evaluate(run_a_rep) if hasattr(self, 'rel_eval_rpl') else self.rel_eval.evaluate(run_a_rep) 181 | return deltaRI(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 182 | rep_score_b=run_b_rep_score, rep_score_a=run_a_rep_score, pbar=print_feedback) 183 | 184 | if self.run_b_orig_score and self.run_a_orig_score and run_b_score and run_a_score: 185 | return deltaRI(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 186 | rep_score_b=run_b_score, rep_score_a=run_a_score, pbar=print_feedback) 187 | 188 | if self.run_b_orig_score and self.run_a_orig_score and self.run_b_rep_score and self.run_a_rep_score: 189 | return deltaRI(orig_score_b=self.run_b_orig_score, orig_score_a=self.run_a_orig_score, 190 | rep_score_b=self.run_b_rep_score, rep_score_a=self.run_a_rep_score, pbar=print_feedback) 191 | else: 192 | print(ERR_MSG) 193 | 194 | def _ttest(self, rpd=True, run_b_score=None, run_a_score=None, print_feedback=False): 195 | """ 196 | Conducts either a paired (reproducibility) or unpaired (replicability) two-sided t-test according to the following paper: 197 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 198 | How to Measure the Reproducibility of System-oriented IR Experiments. 199 | Proceedings of SIGIR, pages 349-358, 2020. 200 | 201 | @param rpd: Boolean indicating if the evaluated runs are reproduced. 202 | @param run_b_score: Scores of the baseline run, 203 | if not provided the scores of the RpdEvaluator object will be used instead. 204 | @param run_a_score: Scores of the advanced run, 205 | if not provided the scores of the RpdEvaluator object will be used instead. 206 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 207 | @return: Dictionary with p-values that compare the score distributions of the baseline and advanced run. 
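        The returned dictionary has the same shape as the one printed by
        __main__.py and the example scripts, e.g. (p-values are illustrative):

            {'baseline': {'map': 0.87, 'ndcg': 0.52, ...},
             'advanced': {'map': 0.91, 'ndcg': 0.47, ...}}

        The 'advanced' entry is only present when scores of an advanced run are
        available.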
208 | """ 209 | if self.run_b_orig_score and (self.run_b_rep_score or run_b_score): 210 | if run_b_score and run_a_score: 211 | if print_feedback: 212 | print('Determining p-values of t-test for baseline and advanced run.') 213 | return {'baseline': ttest(self.run_b_orig_score, run_b_score, rpd=rpd, pbar=print_feedback), 214 | 'advanced': ttest(self.run_a_orig_score, run_a_score, rpd=rpd, pbar=print_feedback)} 215 | if run_b_score: 216 | if print_feedback: 217 | print('Determining p-values of t-test for baseline run.') 218 | return {'baseline': ttest(self.run_b_orig_score, run_b_score, rpd=rpd, pbar=print_feedback)} 219 | if self.run_a_orig_score and self.run_a_rep_score: 220 | if print_feedback: 221 | print('Determining p-values of t-test for baseline and advanced run.') 222 | return {'baseline': ttest(self.run_b_orig_score, self.run_b_rep_score, rpd=rpd, pbar=print_feedback), 223 | 'advanced': ttest(self.run_a_orig_score, self.run_a_rep_score, rpd=rpd, pbar=print_feedback)} 224 | else: 225 | if print_feedback: 226 | print('Determining p-values of t-test for baseline run.') 227 | return {'baseline': ttest(self.run_b_orig_score, self.run_b_rep_score, rpd=rpd, pbar=print_feedback)} 228 | else: 229 | print(ERR_MSG) 230 | 231 | 232 | class RpdEvaluator(Evaluator): 233 | """ 234 | The Reproducibility Evaluator is used for quantifying the different levels of reproduction for runs that were 235 | derived from the same test collection used in the original experiment. 236 | """ 237 | 238 | def evaluate(self, run=None): 239 | """ 240 | Evaluates the scores of the original and reproduced baseline and advanced runs. 241 | If a (reproduced) run is provided only this one will be evaluated and a dictionary with the corresponding 242 | scores is returned. 243 | @param run: A reproduced run. If not specified, the original and reproduced runs of the the RpdEvaluator will 244 | be used instead. 245 | @return: If run is specified, a dictionary with the corresponding scores is returned. 246 | """ 247 | if run: 248 | return self.rel_eval.evaluate(run) 249 | 250 | super(RpdEvaluator, self).evaluate() 251 | if self.run_b_rep: 252 | self.run_b_rep = break_ties(self.run_b_rep) 253 | self.run_b_rep_score = self.rel_eval.evaluate(self.run_b_rep) 254 | if self.run_a_rep: 255 | self.run_a_rep = break_ties(self.run_a_rep) 256 | self.run_a_rep_score = self.rel_eval.evaluate(self.run_a_rep) 257 | 258 | def ktau_union(self, run_b_rep=None, run_a_rep=None, run_b_path=None, run_a_path=None, print_feedback=False): 259 | """ 260 | Determines Kendall's tau Union (KTU) between the original and reproduced document orderings 261 | according to the following paper: 262 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 263 | How to Measure the Reproducibility of System-oriented IR Experiments. 264 | Proceedings of SIGIR, pages 349-358, 2020. 265 | 266 | @param run_b_rep: Scores of the baseline run, 267 | if not provided the scores of the RpdEvaluator object will be used instead. 268 | @param run_a_rep: Scores of the advanced run, 269 | if not provided the scores of the RpdEvaluator object will be used instead. 270 | @param run_b_path: Path to another reproduced baseline run, 271 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 272 | @param run_a_path: Path to another reproduced advanced run, 273 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 
274 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 275 | @return: Dictionary with KTU values that compare the document orderings of the original and reproduced runs. 276 | """ 277 | if self.run_b_orig and run_b_path: 278 | if self.run_a_orig and run_a_path: 279 | if print_feedback: 280 | print("Determining Kendall's tau Union (KTU) for baseline and advanced run.") 281 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 282 | run_b_rep = pytrec_eval.parse_run(b_run) 283 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 284 | run_a_rep = pytrec_eval.parse_run(a_run) 285 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 286 | return {'baseline': ktu(self.run_b_orig, run_b_rep, pbar=print_feedback), 287 | 'advanced': ktu(self.run_a_orig, run_a_rep, pbar=print_feedback)} 288 | else: 289 | if print_feedback: 290 | print("Determining Kendall's tau Union (KTU) for baseline run.") 291 | with open(run_b_path, 'r') as b_run: 292 | run_b_rep = pytrec_eval.parse_run(b_run) 293 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 294 | return {'baseline': ktu(self.run_b_orig, run_b_rep, pbar=print_feedback)} 295 | 296 | if self.run_b_orig and run_b_rep: 297 | if self.run_a_orig and run_a_rep: 298 | if print_feedback: 299 | print("Determining Kendall's tau Union (KTU) for baseline and advanced run.") 300 | return {'baseline': ktu(self.run_b_orig, run_b_rep, pbar=print_feedback), 301 | 'advanced': ktu(self.run_a_orig, run_a_rep, pbar=print_feedback)} 302 | else: 303 | if print_feedback: 304 | print("Determining Kendall's tau Union (KTU) for baseline run.") 305 | return {'baseline': ktu(self.run_b_orig, run_b_rep, pbar=print_feedback)} 306 | 307 | if self.run_b_orig and self.run_b_rep: 308 | if self.run_a_orig and self.run_a_rep: 309 | if print_feedback: 310 | print("Determining Kendall's tau Union (KTU) for baseline and advanced run.") 311 | return {'baseline': ktu(self.run_b_orig, self.run_b_rep, pbar=print_feedback), 312 | 'advanced': ktu(self.run_a_orig, self.run_a_rep, pbar=print_feedback)} 313 | else: 314 | if print_feedback: 315 | print("Determining Kendall's tau Union (KTU) for baseline run.") 316 | return {'baseline': ktu(self.run_b_orig, self.run_b_rep, pbar=print_feedback)} 317 | else: 318 | print(ERR_MSG) 319 | 320 | def rbo(self, run_b_rep=None, run_a_rep=None, run_b_path=None, run_a_path=None, print_feedback=False, misinfo=True): 321 | """ 322 | Determines the Rank-Biased Overlap (RBO) between the original and reproduced document orderings 323 | according to the following paper: 324 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 325 | How to Measure the Reproducibility of System-oriented IR Experiments. 326 | Proceedings of SIGIR, pages 349-358, 2020. 327 | 328 | @param run_b_rep: Scores of the baseline run, 329 | if not provided the scores of the RpdEvaluator object will be used instead. 330 | @param run_a_rep: Scores of the advanced run, 331 | if not provided the scores of the RpdEvaluator object will be used instead. 332 | @param run_b_path: Path to another reproduced baseline run, 333 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 334 | @param run_a_path: Path to another reproduced advanced run, 335 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 336 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 
337 | @param misinfo: Use the RBO implementation that is also used in the TREC Health Misinformation Track. 338 | See also: https://github.com/claclark/Compatibility 339 | @return: Dictionary with RBO values that compare the document orderings of the original and reproduced runs. 340 | """ 341 | if self.run_b_orig and run_b_path: 342 | if self.run_a_orig and run_a_path: 343 | if print_feedback: 344 | print("Determining Rank-biased Overlap (RBO) for baseline and advanced run.") 345 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 346 | run_b_rep = pytrec_eval.parse_run(b_run) 347 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 348 | run_a_rep = pytrec_eval.parse_run(a_run) 349 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 350 | return {'baseline': RBO(self.run_b_orig, run_b_rep, pbar=print_feedback, misinfo=misinfo), 351 | 'advanced': RBO(self.run_a_orig, run_a_rep, pbar=print_feedback, misinfo=misinfo)} 352 | else: 353 | if print_feedback: 354 | print("Determining Rank-biased Overlap (RBO) for baseline run.") 355 | with open(run_b_path, 'r') as b_run: 356 | run_b_rep = pytrec_eval.parse_run(b_run) 357 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 358 | return {'baseline': RBO(self.run_b_orig, run_b_rep, pbar=print_feedback, misinfo=misinfo)} 359 | 360 | if self.run_b_orig and run_b_rep: 361 | if self.run_a_orig and run_a_rep: 362 | if print_feedback: 363 | print("Determining Rank-biased Overlap (RBO) for baseline and advanced run.") 364 | return {'baseline': RBO(self.run_b_orig, run_b_rep, pbar=print_feedback, misinfo=misinfo), 365 | 'advanced': RBO(self.run_a_orig, run_a_rep, pbar=print_feedback, misinfo=misinfo)} 366 | else: 367 | if print_feedback: 368 | print("Determining Rank-biased Overlap (RBO) for baseline run.") 369 | return {'baseline': RBO(self.run_b_orig, run_b_rep, pbar=print_feedback, misinfo=misinfo)} 370 | if self.run_b_orig and self.run_b_rep: 371 | if self.run_a_orig and self.run_a_rep: 372 | if print_feedback: 373 | print("Determining Rank-biased Overlap (RBO) for baseline and advanced run.") 374 | return {'baseline': RBO(self.run_b_orig, self.run_b_rep, pbar=print_feedback, misinfo=misinfo), 375 | 'advanced': RBO(self.run_a_orig, self.run_a_rep, pbar=print_feedback, misinfo=misinfo)} 376 | else: 377 | if print_feedback: 378 | print("Determining Rank-biased Overlap (RBO) for baseline run.") 379 | return {'baseline': RBO(self.run_b_orig, self.run_b_rep, pbar=print_feedback, misinfo=misinfo)} 380 | else: 381 | print(ERR_MSG) 382 | 383 | def rmse(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 384 | """ 385 | Determines the Root Mean Square Error (RMSE) according to the following paper: 386 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 387 | How to Measure the Reproducibility of System-oriented IR Experiments. 388 | Proceedings of SIGIR, pages 349-358, 2020. 389 | 390 | @param run_b_score: Scores of the baseline run, 391 | if not provided the scores of the RpdEvaluator object will be used instead. 392 | @param run_a_score: Scores of the advanced run, 393 | if not provided the scores of the RpdEvaluator object will be used instead. 394 | @param run_b_path: Path to another reproduced baseline run, 395 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 
396 | @param run_a_path: Path to another reproduced advanced run, 397 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 398 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 399 | @return: Dictionary with RMSE values that measure the closeness 400 | between the topics scores of the original and reproduced runs. 401 | """ 402 | if self.run_b_orig and run_b_path: 403 | if self.run_a_orig and run_a_path: 404 | if print_feedback: 405 | print("Determining Root Mean Square Error (RMSE) for baseline and advanced run.") 406 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 407 | run_b_rep = pytrec_eval.parse_run(b_run) 408 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 409 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 410 | run_a_rep = pytrec_eval.parse_run(a_run) 411 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 412 | run_a_rep_score = self.rel_eval.evaluate(run_a_rep) 413 | return {'baseline': RMSE(self.run_b_orig_score, run_b_rep_score, pbar=print_feedback), 414 | 'advanced': RMSE(self.run_a_orig_score, run_a_rep_score, pbar=print_feedback)} 415 | else: 416 | if print_feedback: 417 | print("Determining Root Mean Square Error (RMSE) for baseline run.") 418 | with open(run_b_path, 'r') as b_run: 419 | run_b_rep = pytrec_eval.parse_run(b_run) 420 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 421 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 422 | return {'baseline': RMSE(self.run_b_orig_score, run_b_rep_score, pbar=print_feedback)} 423 | 424 | if self.run_b_orig_score and run_b_score: 425 | if self.run_a_orig_score and run_a_score: 426 | if print_feedback: 427 | print("Determining Root Mean Square Error (RMSE) for baseline and advanced run.") 428 | return {'baseline': RMSE(self.run_b_orig_score, run_b_score, pbar=print_feedback), 429 | 'advanced': RMSE(self.run_a_orig_score, run_a_score, pbar=print_feedback)} 430 | else: 431 | if print_feedback: 432 | print("Determining Root Mean Square Error (RMSE) for baseline run.") 433 | return {'baseline': RMSE(self.run_b_orig_score, run_b_score, pbar=print_feedback)} 434 | if self.run_b_orig_score and self.run_b_rep_score: 435 | if self.run_a_orig_score and self.run_a_rep_score: 436 | if print_feedback: 437 | print("Determining Root Mean Square Error (RMSE) for baseline and advanced run.") 438 | return {'baseline': RMSE(self.run_b_orig_score, self.run_b_rep_score, pbar=print_feedback), 439 | 'advanced': RMSE(self.run_a_orig_score, self.run_a_rep_score, pbar=print_feedback)} 440 | else: 441 | if print_feedback: 442 | print("Determining Root Mean Square Error (RMSE) for baseline run.") 443 | return {'baseline': RMSE(self.run_b_orig_score, self.run_b_rep_score, pbar=print_feedback)} 444 | else: 445 | print(ERR_MSG) 446 | 447 | def nrmse(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 448 | """ 449 | Determines the normalized Root Mean Square Error (RMSE). 450 | 451 | @param run_b_score: Scores of the baseline run, 452 | if not provided the scores of the RpdEvaluator object will be used instead. 453 | @param run_a_score: Scores of the advanced run, 454 | if not provided the scores of the RpdEvaluator object will be used instead. 455 | @param run_b_path: Path to another reproduced baseline run, 456 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 
457 | @param run_a_path: Path to another reproduced advanced run, 458 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 459 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 460 | @return: Dictionary with nRMSE values that measure the closeness 461 | between the topics scores of the original and reproduced runs. 462 | """ 463 | if self.run_b_orig and run_b_path: 464 | if self.run_a_orig and run_a_path: 465 | if print_feedback: 466 | print("Determining normalized Root Mean Square Error (RMSE) for baseline and advanced run.") 467 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 468 | run_b_rep = pytrec_eval.parse_run(b_run) 469 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 470 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 471 | run_a_rep = pytrec_eval.parse_run(a_run) 472 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 473 | run_a_rep_score = self.rel_eval.evaluate(run_a_rep) 474 | return {'baseline': nRMSE(self.run_b_orig_score, run_b_rep_score, pbar=print_feedback), 475 | 'advanced': nRMSE(self.run_a_orig_score, run_a_rep_score, pbar=print_feedback)} 476 | else: 477 | if print_feedback: 478 | print("Determining normalized Root Mean Square Error (RMSE) for baseline run.") 479 | with open(run_b_path, 'r') as b_run: 480 | run_b_rep = pytrec_eval.parse_run(b_run) 481 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 482 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 483 | return {'baseline': nRMSE(self.run_b_orig_score, run_b_rep_score, pbar=print_feedback)} 484 | 485 | if self.run_b_orig_score and run_b_score: 486 | if self.run_a_orig_score and run_a_score: 487 | if print_feedback: 488 | print("Determining normalized Root Mean Square Error (RMSE) for baseline and advanced run.") 489 | return {'baseline': nRMSE(self.run_b_orig_score, run_b_score, pbar=print_feedback), 490 | 'advanced': nRMSE(self.run_a_orig_score, run_a_score, pbar=print_feedback)} 491 | else: 492 | if print_feedback: 493 | print("Determining normalized Root Mean Square Error (RMSE) for baseline run.") 494 | return {'baseline': nRMSE(self.run_b_orig_score, run_b_score, pbar=print_feedback)} 495 | if self.run_b_orig_score and self.run_b_rep_score: 496 | if self.run_a_orig_score and self.run_a_rep_score: 497 | if print_feedback: 498 | print("Determining Root Mean Square Error (RMSE) for baseline and advanced run.") 499 | return {'baseline': nRMSE(self.run_b_orig_score, self.run_b_rep_score, pbar=print_feedback), 500 | 'advanced': nRMSE(self.run_a_orig_score, self.run_a_rep_score, pbar=print_feedback)} 501 | else: 502 | if print_feedback: 503 | print("Determining normalized Root Mean Square Error (RMSE) for baseline run.") 504 | return {'baseline': nRMSE(self.run_b_orig_score, self.run_b_rep_score, pbar=print_feedback)} 505 | else: 506 | print(ERR_MSG) 507 | 508 | def ttest(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 509 | """ 510 | Conducts a paired two-tailed t-test for reproduced runs that were derived from the same test collection 511 | as in the original experiment. 512 | 513 | @param run_b_score: Scores of the baseline run, 514 | if not provided the scores of the RpdEvaluator object will be used instead. 515 | @param run_a_score: Scores of the advanced run, 516 | if not provided the scores of the RpdEvaluator object will be used instead. 
517 | @param run_b_path: Path to another reproduced baseline run, 518 | if not provided the reproduced baseline run of the RpdEvaluator object will be used instead. 519 | @param run_a_path: Path to another reproduced advanced run, 520 | if not provided the reproduced advanced run of the RpdEvaluator object will be used instead. 521 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 522 | @return: Dictionary with p-values that compare the score distributions of the baseline and advanced run. 523 | """ 524 | if run_b_path: 525 | if run_a_path: 526 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 527 | run_b_rep = pytrec_eval.parse_run(b_run) 528 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 529 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 530 | run_a_rep = pytrec_eval.parse_run(a_run) 531 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 532 | run_a_rep_score = self.rel_eval.evaluate(run_a_rep) 533 | return self._ttest(run_b_score=run_b_rep_score, run_a_score=run_a_rep_score, print_feedback=print_feedback) 534 | else: 535 | with open(run_b_path, 'r') as b_run: 536 | run_b_rep = pytrec_eval.parse_run(b_run) 537 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 538 | run_b_rep_score = self.rel_eval.evaluate(run_b_rep) 539 | return self._ttest(run_b_score=run_b_rep_score, run_a_score=None, print_feedback=print_feedback) 540 | 541 | return self._ttest(run_b_score=run_b_score, run_a_score=run_a_score, print_feedback=print_feedback) 542 | 543 | 544 | class RplEvaluator(Evaluator): 545 | """ 546 | The Replicability Evaluator is used for quantifying the different levels of replication for runs that were 547 | derived from a test collection not used in the original experiment. 548 | """ 549 | def __init__(self, **kwargs): 550 | super(RplEvaluator, self).__init__(**kwargs) 551 | self.qrel_rpl_path = kwargs.get('qrel_rpl_path', None) 552 | 553 | if self.qrel_rpl_path: 554 | with open(self.qrel_rpl_path, 'r') as f_qrel: 555 | qrel_rpl = pytrec_eval.parse_qrel(f_qrel) 556 | self.rel_eval_rpl = pytrec_eval.RelevanceEvaluator(qrel_rpl, pytrec_eval.supported_measures) 557 | 558 | def evaluate(self, run=None): 559 | """ 560 | Evaluates the scores of the original and replicated baseline and advanced runs. 561 | If a (replicated) run is provided only this one will be evaluated and a dictionary with the corresponding 562 | scores is returned. 563 | @param run: A replicated run. If not specified, the original and replicated runs of the the RplEvaluator will 564 | be used instead. 565 | @return: If run is specified, a dictionary with the corresponding scores is returned. 566 | """ 567 | if run: 568 | return self.rel_eval_rpl.evaluate(run) 569 | 570 | super(RplEvaluator, self).evaluate() 571 | if self.run_b_rep: 572 | self.run_b_rep = break_ties(self.run_b_rep) 573 | self.run_b_rep_score = self.rel_eval_rpl.evaluate(self.run_b_rep) 574 | if self.run_a_rep: 575 | self.run_a_rep = break_ties(self.run_a_rep) 576 | self.run_a_rep_score = self.rel_eval_rpl.evaluate(self.run_a_rep) 577 | 578 | def ttest(self, run_b_score=None, run_a_score=None, run_b_path=None, run_a_path=None, print_feedback=False): 579 | """ 580 | Conducts an un-paired two-tailed t-test for replicated runs that were derived from a test collection 581 | not used in the original experiment. 582 | 583 | @param run_b_score: Scores of the baseline run, 584 | if not provided the scores of the RpdEvaluator object will be used instead. 
585 | @param run_a_score: Scores of the advanced run, 586 | if not provided the scores of the RpdEvaluator object will be used instead. 587 | @param run_b_path: Path to another replicated baseline run, 588 | if not provided the replicated baseline run of the RplEvaluator object will be used instead. 589 | @param run_a_path: Path to another replicated advanced run, 590 | if not provided the replicated advanced run of the RplEvaluator object will be used instead. 591 | @param print_feedback: Boolean value indicating if feedback on progress should be printed. 592 | @return: Dictionary with p-values that compare the score distributions of the baseline and advanced run. 593 | """ 594 | if run_b_path: 595 | if run_a_path: 596 | with open(run_b_path, 'r') as b_run, open(run_a_path, 'r') as a_run: 597 | run_b_rep = pytrec_eval.parse_run(b_run) 598 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 599 | run_b_rep_score = self.rel_eval_rpl.evaluate(run_b_rep) 600 | run_a_rep = pytrec_eval.parse_run(a_run) 601 | run_a_rep = {t: run_a_rep[t] for t in sorted(run_a_rep)} 602 | run_a_rep_score = self.rel_eval_rpl.evaluate(run_a_rep) 603 | return self._ttest(rpd=False, run_b_score=run_b_rep_score, run_a_score=run_a_rep_score, print_feedback=print_feedback) 604 | else: 605 | with open(run_b_path, 'r') as b_run: 606 | run_b_rep = pytrec_eval.parse_run(b_run) 607 | run_b_rep = {t: run_b_rep[t] for t in sorted(run_b_rep)} 608 | run_b_rep_score = self.rel_eval_rpl.evaluate(run_b_rep) 609 | return self._ttest(rpd=False, run_b_score=run_b_rep_score, run_a_score=None, print_feedback=print_feedback) 610 | 611 | return self._ttest(rpd=False, run_b_score=run_b_score, run_a_score=run_a_score, print_feedback=print_feedback) 612 | -------------------------------------------------------------------------------- /repro_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/repro_eval/__init__.py -------------------------------------------------------------------------------- /repro_eval/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Use repro_eval from the command line with e.g. 3 | 4 | python -m repro_eval -t rpd -q qrel_orig -r orig_b rpd_b 5 | 6 | python -m repro_eval -t rpd -q qrel_orig -r orig_b orig_a rpd_b rpd_a 7 | 8 | python -m repro_eval -t rpd -m rmse -q qrel_orig -r orig_b rpd_b 9 | 10 | python -m repro_eval -t rpl -q qrel_orig qrel_rpl -r orig_b rpl_b 11 | 12 | python -m repro_eval -t rpl -q qrel_orig qrel_rpl -r orig_b orig_a rpl_b rpl_a 13 | 14 | after having installed the Python package. 15 | For other more specific examples also have a look at the README file. 16 | Depending on the provided parameters and input run files, 17 | evaluation measures will be printed. 
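Individual measures can be selected explicitly with the -m flag, which accepts
one or more of ktu, rbo, rmse, er, dri and ttest (cf. the argument parsing
below), for instance

python -m repro_eval -t rpd -m ktu rbo ttest -q qrel_orig -r orig_b orig_a rpd_b rpd_a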
18 | """ 19 | 20 | import argparse 21 | from repro_eval.Evaluator import RpdEvaluator, RplEvaluator 22 | from repro_eval.util import print_simple_line, print_base_adv 23 | from repro_eval.util import arp 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('-t', '--type') 30 | parser.add_argument('-m', '--measure', nargs='+') 31 | parser.add_argument('-q', '--qrels', nargs='+') 32 | parser.add_argument('-r', '--runs', nargs='+') 33 | 34 | args = parser.parse_args() 35 | 36 | if args.type in ['rpd', 'reproducibility']: 37 | if len(args.runs) == 4: 38 | rpd_eval = RpdEvaluator(qrel_orig_path=args.qrels[0], 39 | run_b_orig_path=args.runs[0], 40 | run_a_orig_path=args.runs[1], 41 | run_b_rep_path=args.runs[2], 42 | run_a_rep_path=args.runs[3]) 43 | 44 | if len(args.runs) == 2: 45 | rpd_eval = RpdEvaluator(qrel_orig_path=args.qrels[0], 46 | run_b_orig_path=args.runs[0], 47 | run_a_orig_path=None, 48 | run_b_rep_path=args.runs[1], 49 | run_a_rep_path=None) 50 | 51 | rpd_eval.trim() 52 | rpd_eval.evaluate() 53 | 54 | measure_list = args.measure if args.measure is not None else [] 55 | 56 | # KTU 57 | if 'ktu' in measure_list or args.measure is None: 58 | ktu = rpd_eval.ktau_union() 59 | print("Kendall's tau Union (KTU)") 60 | print('------------------------------------------------------------------') 61 | for topic, value in ktu.get('baseline').items(): 62 | value_adv = ktu.get('advanced').get(topic) if ktu.get('advanced') is not None else None 63 | print_base_adv(topic, 'KTU', value, value_adv) 64 | value_adv = arp(ktu.get('advanced')) if ktu.get('advanced') is not None else None 65 | print_base_adv('ARP', 'KTU', arp(ktu.get('baseline')), value_adv) 66 | print() 67 | 68 | # RBO 69 | if 'rbo' in measure_list or args.measure is None: 70 | rbo = rpd_eval.rbo() 71 | print("Rank-biased Overlap (RBO)") 72 | print('------------------------------------------------------------------') 73 | for topic, value in rbo.get('baseline').items(): 74 | value_adv = rbo.get('advanced').get(topic) if rbo.get('advanced') is not None else None 75 | print_base_adv(topic, 'RBO', value, value_adv) 76 | value_adv = arp(rbo.get('advanced')) if rbo.get('advanced') is not None else None 77 | print_base_adv('ARP', 'RBO', arp(rbo.get('baseline')), value_adv) 78 | print() 79 | 80 | # RMSE 81 | if 'rmse' in measure_list or args.measure is None: 82 | rmse = rpd_eval.rmse() 83 | print("Root mean square error (RMSE)") 84 | print('------------------------------------------------------------------') 85 | for measure, value in rmse.get('baseline').items(): 86 | value_adv = rmse.get('advanced').get(measure) if rmse.get('advanced') is not None else None 87 | print_base_adv(measure, 'RMSE', value, value_adv) 88 | print() 89 | 90 | # ER 91 | if 'er' in measure_list or args.measure is None and len(args.runs) == 4: 92 | print("Effect ratio (ER)") 93 | print('------------------------------------------------------------------') 94 | er = rpd_eval.er() 95 | for measure, value in er.items(): 96 | print_simple_line(measure, 'ER', value) 97 | print() 98 | 99 | # DRI 100 | if 'dri' in measure_list or args.measure is None and len(args.runs) == 4: 101 | print("Delta Relative Improvement (DRI)") 102 | print('------------------------------------------------------------------') 103 | dri = rpd_eval.dri() 104 | for measure, value in dri.items(): 105 | print_simple_line(measure, 'DRI', value) 106 | print() 107 | 108 | # ttest 109 | if 'ttest' in measure_list or args.measure is None: 110 | pvals = 
rpd_eval.ttest() 111 | print("Two-tailed paired t-test (p-value)") 112 | print('------------------------------------------------------------------') 113 | for measure, value in pvals.get('baseline').items(): 114 | value_adv = pvals.get('advanced').get(measure) if pvals.get('advanced') is not None else None 115 | print_base_adv(measure, 'PVAL', value, value_adv) 116 | print() 117 | 118 | if args.type in ['rpl', 'replicability']: 119 | if len(args.runs) == 4: 120 | rpl_eval = RplEvaluator(qrel_orig_path=args.qrels[0], 121 | run_b_orig_path=args.runs[0], 122 | run_a_orig_path=args.runs[1], 123 | run_b_rep_path=args.runs[2], 124 | run_a_rep_path=args.runs[3], 125 | qrel_rpl_path=args.qrels[1]) 126 | 127 | if len(args.runs) == 2: 128 | rpl_eval = RplEvaluator(qrel_orig_path=args.qrels[0], 129 | run_b_orig_path=args.runs[0], 130 | run_a_orig_path=None, 131 | run_b_rep_path=args.runs[1], 132 | run_a_rep_path=None, 133 | qrel_rpl_path=args.qrels[1]) 134 | 135 | rpl_eval.trim() 136 | rpl_eval.evaluate() 137 | 138 | measure_list = args.measure if args.measure is not None else [] 139 | 140 | # ER 141 | if 'er' in measure_list or args.measure is None and len(args.runs) == 4: 142 | print("Effect ratio (ER)") 143 | print('------------------------------------------------------------------') 144 | er = rpl_eval.er() 145 | for measure, value in er.items(): 146 | print_simple_line(measure, 'ER', value) 147 | print() 148 | 149 | # DRI 150 | if 'dri' in measure_list or args.measure is None and len(args.runs) == 4: 151 | print("Delta Relative Improvement (DRI)") 152 | print('------------------------------------------------------------------') 153 | dri = rpl_eval.dri() 154 | for measure, value in dri.items(): 155 | print_simple_line(measure, 'DRI', value) 156 | print() 157 | 158 | # ttest 159 | if 'ttest' in measure_list or args.measure is None: 160 | pvals = rpl_eval.ttest() 161 | print("Two-tailed unpaired t-test (p-value)") 162 | print('------------------------------------------------------------------') 163 | for measure, value in pvals.get('baseline').items(): 164 | value_adv = pvals.get('advanced').get(measure) if pvals.get('advanced') is not None else None 165 | print_base_adv(measure, 'PVAL', value, value_adv) 166 | print() 167 | 168 | 169 | if __name__ == "__main__": 170 | main() 171 | -------------------------------------------------------------------------------- /repro_eval/config.py: -------------------------------------------------------------------------------- 1 | TRIM_THRESH = 1000 # default threshold for trimming the runs 2 | PHI = 0.8 # default parameter for the Rank-Biased Overlap (RBO) 3 | ERR_MSG = 'Please provide adequate run combinations and have them evaluated first.' 
# error message 4 | 5 | # evaluation measures of trec_eval that will be excluded from the reproduction and replication measures 6 | exclude = [ 7 | 'runid', 8 | 'num_q', 9 | 'num_ret', 10 | 'num_rel', 11 | 'num_rel_ret', 12 | 'num_nonrel_judged_ret', 13 | 'relstring' 14 | ] 15 | -------------------------------------------------------------------------------- /repro_eval/measure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/repro_eval/measure/__init__.py -------------------------------------------------------------------------------- /repro_eval/measure/document_order.py: -------------------------------------------------------------------------------- 1 | """Evaluation measures at the level of document orderings.""" 2 | 3 | from repro_eval.config import TRIM_THRESH, PHI 4 | from scipy.stats.stats import kendalltau 5 | from tqdm import tqdm 6 | from repro_eval.measure.external.rbo import rbo 7 | from repro_eval.util import break_ties 8 | 9 | 10 | def _rbo(run, ideal, p, depth): 11 | # Implementation taken from the TREC Health Misinformation Track with modifications 12 | # see also: https://github.com/claclark/Compatibility 13 | run_set = set() 14 | ideal_set = set() 15 | 16 | score = 0.0 17 | normalizer = 0.0 18 | weight = 1.0 19 | for i in range(depth): 20 | if i < len(run): 21 | run_set.add(run[i]) 22 | if i < len(ideal): 23 | ideal_set.add(ideal[i]) 24 | score += weight*len(ideal_set.intersection(run_set))/(i + 1) 25 | normalizer += weight 26 | weight *= p 27 | return score/normalizer 28 | 29 | 30 | def _ktau_union(orig_run, rep_run, trim_thresh=TRIM_THRESH, pbar=False): 31 | """ 32 | Helping function returning a generator to determine Kendall's tau Union (KTU) for all topics. 33 | 34 | @param orig_run: The original run. 35 | @param rep_run: The reproduced/replicated run. 36 | @param trim_thresh: Threshold values for the number of documents to be compared. 37 | @param pbar: Boolean value indicating if progress bar should be printed. 38 | @return: Generator with KTU values. 39 | """ 40 | 41 | generator = tqdm(rep_run.items()) if pbar else rep_run.items() 42 | 43 | for topic, docs in generator: 44 | orig_docs = list(orig_run.get(topic).keys())[:trim_thresh] 45 | rep_docs = list(rep_run.get(topic).keys())[:trim_thresh] 46 | union = list(sorted(set(orig_docs + rep_docs))) 47 | orig_idx = [union.index(doc) for doc in orig_docs] 48 | rep_idx = [union.index(doc) for doc in rep_docs] 49 | yield topic, round(kendalltau(orig_idx, rep_idx).correlation, 14) 50 | 51 | 52 | def ktau_union(orig_run, rep_run, trim_thresh=TRIM_THRESH, pbar=False): 53 | """ 54 | Determines the Kendall's tau Union (KTU) between the original and reproduced document orderings 55 | according to the following paper: 56 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 57 | How to Measure the Reproducibility of System-oriented IR Experiments. 58 | Proceedings of SIGIR, pages 349-358, 2020. 59 | 60 | @param orig_run: The original run. 61 | @param rep_run: The reproduced/replicated run. 62 | @param trim_thresh: Threshold values for the number of documents to be compared. 63 | @param pbar: Boolean value indicating if progress bar should be printed. 64 | @return: Dictionary with KTU values that compare the document orderings of the original and reproduced runs. 
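    Small worked illustration of the union construction used by the helper
    above (the document ids are made up): for orig = ['d1', 'd2', 'd3'] and
    rep = ['d1', 'd3', 'd2'] the sorted union is ['d1', 'd2', 'd3'], the runs
    are mapped to the index lists [0, 1, 2] and [0, 2, 1], and Kendall's tau is
    computed on these index lists (here tau = 1/3).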
65 | """ 66 | 67 | # Safety check for runs that are not added via pytrec_eval 68 | orig_run = break_ties(orig_run) 69 | rep_run = break_ties(rep_run) 70 | 71 | return dict(_ktau_union(orig_run, rep_run, trim_thresh=trim_thresh, pbar=pbar)) 72 | 73 | 74 | def _RBO(orig_run, rep_run, phi, trim_thresh=TRIM_THRESH, pbar=False, misinfo=True): 75 | """ 76 | Helping function returning a generator to determine the Rank-Biased Overlap (RBO) for all topics. 77 | 78 | @param orig_run: The original run. 79 | @param rep_run: The reproduced/replicated run. 80 | @param phi: Parameter for top-heaviness of the RBO. 81 | @param trim_thresh: Threshold values for the number of documents to be compared. 82 | @param pbar: Boolean value indicating if progress bar should be printed. 83 | @param misinfo: Use the RBO implementation that is also used in the TREC Health Misinformation Track. 84 | See also: https://github.com/claclark/Compatibility 85 | @return: Generator with RBO values. 86 | """ 87 | 88 | generator = tqdm(rep_run.items()) if pbar else rep_run.items() 89 | 90 | if misinfo: 91 | for topic, docs in generator: 92 | yield topic, _rbo(list(rep_run.get(topic).keys())[:trim_thresh], 93 | list(orig_run.get(topic).keys())[:trim_thresh], 94 | p=phi, 95 | depth=trim_thresh) 96 | 97 | else: 98 | for topic, docs in generator: 99 | yield topic, rbo(list(rep_run.get(topic).keys())[:trim_thresh], 100 | list(orig_run.get(topic).keys())[:trim_thresh], 101 | p=phi).ext 102 | 103 | 104 | def RBO(orig_run, rep_run, phi=PHI, trim_thresh=TRIM_THRESH, pbar=False, misinfo=True): 105 | """ 106 | Determines the Rank-Biased Overlap (RBO) between the original and reproduced document orderings 107 | according to the following paper: 108 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 109 | How to Measure the Reproducibility of System-oriented IR Experiments. 110 | Proceedings of SIGIR, pages 349-358, 2020. 111 | 112 | @param orig_run: The original run. 113 | @param rep_run: The reproduced/replicated run. 114 | @param phi: Parameter for top-heaviness of the RBO. 115 | @param trim_thresh: Threshold values for the number of documents to be compared. 116 | @param pbar: Boolean value indicating if progress bar should be printed. 117 | @param misinfo: Use the RBO implementation that is also used in the TREC Health Misinformation Track. 118 | See also: https://github.com/claclark/Compatibility 119 | @return: Dictionary with RBO values that compare the document orderings of the original and reproduced runs. 120 | """ 121 | 122 | # Safety check for runs that are not added via pytrec_eval 123 | orig_run = break_ties(orig_run) 124 | rep_run = break_ties(rep_run) 125 | 126 | return dict(_RBO(orig_run, rep_run, phi=phi, trim_thresh=trim_thresh, pbar=pbar, misinfo=misinfo)) 127 | -------------------------------------------------------------------------------- /repro_eval/measure/effectiveness.py: -------------------------------------------------------------------------------- 1 | """Evaluation measures at the level of effectiveness.""" 2 | 3 | import numpy as np 4 | from math import sqrt 5 | from copy import deepcopy 6 | from tqdm import tqdm 7 | from repro_eval.config import exclude 8 | 9 | 10 | def _rmse(orig_score, rep_core, pbar=False): 11 | """ 12 | Helping function returning a generator to determine the Root Mean Square Error (RMSE) for all topics. 13 | 14 | @param orig_score: The original scores. 15 | @param rep_core: The reproduced/replicated scores. 
16 | @param pbar: Boolean value indicating if progress bar should be printed. 17 | @return: Generator with RMSE values. 18 | """ 19 | orig_cp = deepcopy(orig_score) 20 | rep_cp = deepcopy(rep_core) 21 | measures_all = list(list(orig_cp.values())[0].keys()) 22 | topics = orig_cp.keys() 23 | measures_valid = [m for m in measures_all if m not in exclude] 24 | 25 | measures = tqdm(measures_valid) if pbar else measures_valid 26 | 27 | for measure in measures: 28 | orig_measure = np.array([orig_cp.get(topic).get(measure) for topic in topics]) 29 | rpl_measure = np.array([rep_cp.get(topic).get(measure) for topic in topics]) 30 | diff = orig_measure - rpl_measure 31 | yield measure, sqrt(sum(np.square(diff))/len(diff)) 32 | 33 | 34 | def rmse(orig_score, rep_score, pbar=False): 35 | """ 36 | Determines the Root Mean Square Error (RMSE) between the original and reproduced topic scores 37 | according to the following paper: 38 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 39 | How to Measure the Reproducibility of System-oriented IR Experiments. 40 | Proceedings of SIGIR, pages 349-358, 2020. 41 | 42 | @param orig_score: The original scores. 43 | @param rep_core: The reproduced/replicated scores. 44 | @param pbar: Boolean value indicating if progress bar should be printed. 45 | @return: Dictionary with RMSE values that measure the closeness between the original and reproduced topic scores. 46 | """ 47 | return dict(_rmse(orig_score, rep_score, pbar=pbar)) 48 | 49 | 50 | def _maxrmse(orig_score, pbar=False): 51 | """ 52 | Helping function returning a generator to determine the maximum Root Mean Square Error (RMSE) for all topics. 53 | 54 | @param orig_score: The original scores. 55 | @param pbar: Boolean value indicating if progress bar should be printed. 56 | @return: Generator with RMSE values. 57 | """ 58 | orig_cp = deepcopy(orig_score) 59 | measures_all = list(list(orig_cp.values())[0].keys()) 60 | topics = orig_cp.keys() 61 | measures_valid = [m for m in measures_all if m not in exclude] 62 | measures = tqdm(measures_valid) if pbar else measures_valid 63 | 64 | for measure in measures: 65 | orig_measure = np.array([orig_cp.get(topic).get(measure) for topic in topics]) 66 | _max = np.vectorize(lambda x: max(x, 1 - x)) 67 | maxdiff = _max(orig_measure) 68 | yield measure, sqrt(sum(np.square(maxdiff))/len(maxdiff)) 69 | 70 | 71 | def nrmse(orig_score, rep_score, pbar=False): 72 | """ 73 | Determines the normalized Root Mean Square Error (RMSE) between the original and reproduced topic scores. 74 | 75 | @param orig_score: The original scores. 76 | @param rep_core: The reproduced/replicated scores. 77 | @param pbar: Boolean value indicating if progress bar should be printed. 78 | @return: Dictionary with RMSE values that measure the closeness between the original and reproduced topic scores. 
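# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Hypothetical per-topic scores in the format returned by pytrec_eval's
# RelevanceEvaluator.evaluate(); rmse() aggregates the per-topic deviations for
# each measure, nrmse() divides them by the worst-case RMSE of the original scores.
from repro_eval.measure.effectiveness import rmse, nrmse

orig_scores = {'301': {'map': 0.30, 'ndcg': 0.45},
               '302': {'map': 0.20, 'ndcg': 0.40}}
rep_scores = {'301': {'map': 0.28, 'ndcg': 0.44},
              '302': {'map': 0.22, 'ndcg': 0.41}}

print(rmse(orig_scores, rep_scores))   # e.g. {'map': ~0.02, 'ndcg': ~0.01}
print(nrmse(orig_scores, rep_scores))  # same values, normalized per measure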
79 | """ 80 | rmse = dict(_rmse(orig_score, rep_score, pbar=pbar)) 81 | maxrmse = dict(_maxrmse(orig_score, pbar=pbar)) 82 | return {measure: score / maxrmse.get(measure) for measure, score in rmse.items()} 83 | -------------------------------------------------------------------------------- /repro_eval/measure/external/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/repro_eval/measure/external/__init__.py -------------------------------------------------------------------------------- /repro_eval/measure/external/rbo.py: -------------------------------------------------------------------------------- 1 | """Rank-biased overlap, a ragged sorted list similarity measure. 2 | 3 | See http://doi.acm.org/10.1145/1852102.1852106 for details. All functions 4 | directly corresponding to concepts from the paper are named so that they can be 5 | clearly cross-identified. 6 | 7 | The definition of overlap has been modified to account for ties. Without this, 8 | results for lists with tied items were being inflated. The modification itself 9 | is not mentioned in the paper but seems to be reasonable, see function 10 | ``overlap()``. Places in the code which diverge from the spec in the paper 11 | because of this are highlighted with comments. 12 | 13 | The two main functions for performing an RBO analysis are ``rbo()`` and 14 | ``rbo_dict()``; see their respective docstrings for how to use them. 15 | 16 | The following doctest just checks that equivalent specifications of a 17 | problem yield the same result using both functions: 18 | 19 | >>> lst1 = [{"c", "a"}, "b", "d"] 20 | >>> lst2 = ["a", {"c", "b"}, "d"] 21 | >>> ans_rbo = _round(rbo(lst1, lst2, p=.9)) 22 | >>> dct1 = dict(a=1, b=2, c=1, d=3) 23 | >>> dct2 = dict(a=1, b=2, c=2, d=3) 24 | >>> ans_rbo_dict = _round(rbo_dict(dct1, dct2, p=.9, sort_ascending=True)) 25 | >>> ans_rbo == ans_rbo_dict 26 | True 27 | 28 | """ 29 | 30 | from __future__ import division 31 | 32 | import math 33 | from bisect import bisect_left 34 | from collections import namedtuple 35 | 36 | 37 | RBO = namedtuple("RBO", "min res ext") 38 | RBO.__doc__ += ": Result of full RBO analysis" 39 | RBO.min.__doc__ = "Lower bound estimate" 40 | RBO.res.__doc__ = "Residual corresponding to min; min + res is an upper bound estimate" 41 | RBO.ext.__doc__ = "Extrapolated point estimate" 42 | 43 | 44 | def _round(obj): 45 | if isinstance(obj, RBO): 46 | return RBO(_round(obj.min), _round(obj.res), _round(obj.ext)) 47 | else: 48 | return round(obj, 3) 49 | 50 | 51 | def set_at_depth(lst, depth): 52 | ans = set() 53 | for v in lst[:depth]: 54 | if isinstance(v, set): 55 | ans.update(v) 56 | else: 57 | ans.add(v) 58 | return ans 59 | 60 | 61 | def raw_overlap(list1, list2, depth): 62 | """Overlap as defined in the article. 63 | 64 | """ 65 | set1, set2 = set_at_depth(list1, depth), set_at_depth(list2, depth) 66 | return len(set1.intersection(set2)), len(set1), len(set2) 67 | 68 | 69 | def overlap(list1, list2, depth): 70 | """Overlap which accounts for possible ties. 71 | 72 | This isn't mentioned in the paper but should be used in the ``rbo*()`` 73 | functions below, otherwise overlap at a given depth might be > depth which 74 | inflates the result. 
75 | 76 | There are no guidelines in the paper as to what's a good way to calculate 77 | this, but a good guess is agreement scaled by the minimum between the 78 | requested depth and the lengths of the considered lists (overlap shouldn't 79 | be larger than the number of ranks in the shorter list, otherwise results 80 | are conspicuously wrong when the lists are of unequal lengths -- rbo_ext is 81 | not between rbo_min and rbo_min + rbo_res. 82 | 83 | >>> overlap("abcd", "abcd", 3) 84 | 3.0 85 | 86 | >>> overlap("abcd", "abcd", 5) 87 | 4.0 88 | 89 | >>> overlap(["a", {"b", "c"}, "d"], ["a", {"b", "c"}, "d"], 2) 90 | 2.0 91 | 92 | >>> overlap(["a", {"b", "c"}, "d"], ["a", {"b", "c"}, "d"], 3) 93 | 3.0 94 | 95 | """ 96 | return agreement(list1, list2, depth) * min(depth, len(list1), len(list2)) 97 | # NOTE: comment the preceding and uncomment the following line if you want 98 | # to stick to the algorithm as defined by the paper 99 | # return raw_overlap(list1, list2, depth)[0] 100 | 101 | 102 | def agreement(list1, list2, depth): 103 | """Proportion of shared values between two sorted lists at given depth. 104 | 105 | >>> _round(agreement("abcde", "abdcf", 1)) 106 | 1.0 107 | >>> _round(agreement("abcde", "abdcf", 3)) 108 | 0.667 109 | >>> _round(agreement("abcde", "abdcf", 4)) 110 | 1.0 111 | >>> _round(agreement("abcde", "abdcf", 5)) 112 | 0.8 113 | >>> _round(agreement([{1, 2}, 3], [1, {2, 3}], 1)) 114 | 0.667 115 | >>> _round(agreement([{1, 2}, 3], [1, {2, 3}], 2)) 116 | 1.0 117 | 118 | """ 119 | len_intersection, len_set1, len_set2 = raw_overlap(list1, list2, depth) 120 | return 2 * len_intersection / (len_set1 + len_set2) 121 | 122 | 123 | def cumulative_agreement(list1, list2, depth): 124 | return (agreement(list1, list2, d) for d in range(1, depth + 1)) 125 | 126 | 127 | def average_overlap(list1, list2, depth=None): 128 | """Calculate average overlap between ``list1`` and ``list2``. 129 | 130 | >>> _round(average_overlap("abcdefg", "zcavwxy", 1)) 131 | 0.0 132 | >>> _round(average_overlap("abcdefg", "zcavwxy", 2)) 133 | 0.0 134 | >>> _round(average_overlap("abcdefg", "zcavwxy", 3)) 135 | 0.222 136 | >>> _round(average_overlap("abcdefg", "zcavwxy", 4)) 137 | 0.292 138 | >>> _round(average_overlap("abcdefg", "zcavwxy", 5)) 139 | 0.313 140 | >>> _round(average_overlap("abcdefg", "zcavwxy", 6)) 141 | 0.317 142 | >>> _round(average_overlap("abcdefg", "zcavwxy", 7)) 143 | 0.312 144 | 145 | """ 146 | depth = min(len(list1), len(list2)) if depth is None else depth 147 | return sum(cumulative_agreement(list1, list2, depth)) / depth 148 | 149 | 150 | def rbo_at_k(list1, list2, p, depth=None): 151 | # ``p**d`` here instead of ``p**(d - 1)`` because enumerate starts at 152 | # 0 153 | depth = min(len(list1), len(list2)) if depth is None else depth 154 | d_a = enumerate(cumulative_agreement(list1, list2, depth)) 155 | return (1 - p) * sum(p ** d * a for (d, a) in d_a) 156 | 157 | 158 | def rbo_min(list1, list2, p, depth=None): 159 | """Tight lower bound on RBO. 160 | 161 | See equation (11) in paper. 
162 | 163 | >>> _round(rbo_min("abcdefg", "abcdefg", .9)) 164 | 0.767 165 | >>> _round(rbo_min("abcdefgh", "abcdefg", .9)) 166 | 0.767 167 | 168 | """ 169 | depth = min(len(list1), len(list2)) if depth is None else depth 170 | x_k = overlap(list1, list2, depth) 171 | log_term = x_k * math.log(1 - p) 172 | sum_term = sum( 173 | p ** d / d * (overlap(list1, list2, d) - x_k) for d in range(1, depth + 1) 174 | ) 175 | return (1 - p) / p * (sum_term - log_term) 176 | 177 | 178 | def rbo_res(list1, list2, p): 179 | """Upper bound on residual overlap beyond evaluated depth. 180 | 181 | See equation (30) in paper. 182 | 183 | NOTE: The doctests weren't verified against manual computations but seem 184 | plausible. In particular, for identical lists, ``rbo_min()`` and 185 | ``rbo_res()`` should add up to 1, which is the case. 186 | 187 | >>> _round(rbo_res("abcdefg", "abcdefg", .9)) 188 | 0.233 189 | >>> _round(rbo_res("abcdefg", "abcdefghijklmnopqrstuvwxyz", .9)) 190 | 0.239 191 | 192 | """ 193 | S, L = sorted((list1, list2), key=len) 194 | s, l = len(S), len(L) 195 | x_l = overlap(list1, list2, l) 196 | # since overlap(...) can be fractional in the general case of ties and f 197 | # must be an integer --> math.ceil() 198 | f = int(math.ceil(l + s - x_l)) 199 | # upper bound of range() is non-inclusive, therefore + 1 is needed 200 | term1 = s * sum(p ** d / d for d in range(s + 1, f + 1)) 201 | term2 = l * sum(p ** d / d for d in range(l + 1, f + 1)) 202 | term3 = x_l * (math.log(1 / (1 - p)) - sum(p ** d / d for d in range(1, f + 1))) 203 | return p ** s + p ** l - p ** f - (1 - p) / p * (term1 + term2 + term3) 204 | 205 | 206 | def rbo_ext(list1, list2, p): 207 | """RBO point estimate based on extrapolating observed overlap. 208 | 209 | See equation (32) in paper. 210 | 211 | NOTE: The doctests weren't verified against manual computations but seem 212 | plausible. 213 | 214 | >>> _round(rbo_ext("abcdefg", "abcdefg", .9)) 215 | 1.0 216 | >>> _round(rbo_ext("abcdefg", "bacdefg", .9)) 217 | 0.9 218 | 219 | """ 220 | S, L = sorted((list1, list2), key=len) 221 | s, l = len(S), len(L) 222 | x_l = overlap(list1, list2, l) 223 | x_s = overlap(list1, list2, s) 224 | # the paper says overlap(..., d) / d, but it should be replaced by 225 | # agreement(..., d) defined as per equation (28) so that ties are handled 226 | # properly (otherwise values > 1 will be returned) 227 | # sum1 = sum(p**d * overlap(list1, list2, d)[0] / d for d in range(1, l + 1)) 228 | sum1 = sum(p ** d * agreement(list1, list2, d) for d in range(1, l + 1)) 229 | sum2 = sum(p ** d * x_s * (d - s) / s / d for d in range(s + 1, l + 1)) 230 | term1 = (1 - p) / p * (sum1 + sum2) 231 | term2 = p ** l * ((x_l - x_s) / l + x_s / s) 232 | return term1 + term2 233 | 234 | 235 | def rbo(list1, list2, p): 236 | """Complete RBO analysis (lower bound, residual, point estimate). 237 | 238 | ``list`` arguments should be already correctly sorted iterables and each 239 | item should either be an atomic value or a set of values tied for that 240 | rank. ``p`` is the probability of looking for overlap at rank k + 1 after 241 | having examined rank k. 
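# --- Illustrative note (editor's sketch, not part of the original module) ---
# Smaller p concentrates the weight on the top ranks: two lists that agree only
# at rank 1 score higher under p=.5 than under p=.9 (using rbo() from this module).
only_top = rbo(["a", "x", "y", "z"], ["a", "q", "r", "s"], p=.5)
less_top = rbo(["a", "x", "y", "z"], ["a", "q", "r", "s"], p=.9)
print(only_top.ext > less_top.ext)  # True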
242 | 243 | >>> lst1 = [{"c", "a"}, "b", "d"] 244 | >>> lst2 = ["a", {"c", "b"}, "d"] 245 | >>> _round(rbo(lst1, lst2, p=.9)) 246 | RBO(min=0.489, res=0.477, ext=0.967) 247 | 248 | """ 249 | if not 0 <= p <= 1: 250 | raise ValueError("The ``p`` parameter must be between 0 and 1.") 251 | args = (list1, list2, p) 252 | return RBO(rbo_min(*args), rbo_res(*args), rbo_ext(*args)) 253 | 254 | 255 | def sort_dict(dct, *, ascending=False): 256 | """Sort keys in ``dct`` according to their corresponding values. 257 | 258 | Sorts in descending order by default, because the values are 259 | typically scores, i.e. the higher the better. Specify 260 | ``ascending=True`` if the values are ranks, or some sort of score 261 | where lower values are better. 262 | 263 | Ties are handled by creating sets of tied keys at the given position 264 | in the sorted list. 265 | 266 | >>> dct = dict(a=1, b=2, c=1, d=3) 267 | >>> list(sort_dict(dct)) == ['d', 'b', {'a', 'c'}] 268 | True 269 | >>> list(sort_dict(dct, ascending=True)) == [{'a', 'c'}, 'b', 'd'] 270 | True 271 | 272 | """ 273 | scores = [] 274 | items = [] 275 | # items should be unique, scores don't have to 276 | for item, score in dct.items(): 277 | if not ascending: 278 | score *= -1 279 | i = bisect_left(scores, score) 280 | if i == len(scores): 281 | scores.append(score) 282 | items.append(item) 283 | elif scores[i] == score: 284 | existing_item = items[i] 285 | if isinstance(existing_item, set): 286 | existing_item.add(item) 287 | else: 288 | items[i] = {existing_item, item} 289 | else: 290 | scores.insert(i, score) 291 | items.insert(i, item) 292 | return items 293 | 294 | 295 | def rbo_dict(dict1, dict2, p, *, sort_ascending=False): 296 | """Wrapper around ``rbo()`` for dict input. 297 | 298 | Each dict maps items to be sorted to the score according to which 299 | they should be sorted. The RBO analysis is then performed on the 300 | resulting sorted lists. 301 | 302 | The sort is descending by default, because scores are typically the 303 | higher the better, but this can be overridden by specifying 304 | ``sort_ascending=True``. 305 | 306 | >>> dct1 = dict(a=1, b=2, c=1, d=3) 307 | >>> dct2 = dict(a=1, b=2, c=2, d=3) 308 | >>> _round(rbo_dict(dct1, dct2, p=.9, sort_ascending=True)) 309 | RBO(min=0.489, res=0.477, ext=0.967) 310 | 311 | """ 312 | list1, list2 = ( 313 | sort_dict(dict1, ascending=sort_ascending), 314 | sort_dict(dict2, ascending=sort_ascending), 315 | ) 316 | return rbo(list1, list2, p) 317 | 318 | 319 | if __name__ in ("__main__", "__console__"): 320 | import doctest 321 | 322 | doctest.testmod() 323 | -------------------------------------------------------------------------------- /repro_eval/measure/overall_effects.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | from tqdm import tqdm 4 | from repro_eval.config import exclude 5 | 6 | 7 | def diff(topic_score_a, topic_score_b): 8 | """ 9 | Use this function to get a generator with absoulte differences 10 | between the topic scores of the baseline and advanced runs. 11 | 12 | @param topic_score_a: Topic scores of the advanced run. 13 | @param topic_score_b: Topic scores of the baseline run. 14 | @return: Generator with absolute differences between the topics scores. 
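# --- Illustrative usage (editor's sketch, not part of the original module) ---
# diff() compares the measures of one topic between a hypothetical advanced and
# baseline run; entries listed in config.exclude (e.g. 'runid') are skipped.
from repro_eval.measure.overall_effects import diff

adv_topic = {'map': 0.35, 'ndcg': 0.50, 'runid': 'sys-adv'}
base_topic = {'map': 0.30, 'ndcg': 0.45, 'runid': 'sys-base'}

print(dict(diff(adv_topic, base_topic)))  # {'map': ~0.05, 'ndcg': ~0.05}, no 'runid'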
15 | """ 16 | for measure, value in topic_score_a.items(): 17 | if measure not in exclude: 18 | yield measure, value - topic_score_b.get(measure) 19 | 20 | 21 | def topic_diff(run_a, run_b): 22 | """ 23 | Use this function to get a generator with absoulte differences 24 | between the topic scores of the baseline and advanced runs for each measure. 25 | 26 | @param run_a: The advanced run. 27 | @param run_b: The baseline run. 28 | @return: Generator with absolute differences between the topics scores for each measure. 29 | """ 30 | run_a_cp = deepcopy(run_a) 31 | run_b_cp = deepcopy(run_b) 32 | 33 | for topic, measures in run_a_cp.items(): 34 | yield topic, dict(diff(measures, run_b_cp.get(topic))) 35 | 36 | 37 | def _mean_improvement(run_a, run_b): 38 | """ 39 | Helping function returning a generator for determining the mean improvements. 40 | 41 | @param run_a: The advanced run. 42 | @param run_b: The baseline run. 43 | @return: Generator with mean improvements. 44 | """ 45 | measures_all = list(list(run_a.values())[0].keys()) 46 | measures_valid = [m for m in measures_all if m not in exclude] 47 | topics = run_a.keys() 48 | delta = dict(topic_diff(run_a, run_b)) 49 | 50 | for measure in measures_valid: 51 | yield measure, np.array([delta.get(topic).get(measure) for topic in topics]).mean() 52 | 53 | 54 | def mean_improvement(run_a, run_b): 55 | """ 56 | Determines the relative improvement that is used to derive the Delta Relative Improvement (DeltaRI). 57 | 58 | @param run_a: The advanced run. 59 | @param run_b: The baseline run. 60 | @return: Dictionary with mean improvements for each measure. 61 | """ 62 | return dict(_mean_improvement(run_a, run_b)) 63 | 64 | 65 | def _er(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=False): 66 | """ 67 | Helping function returning a generator for determining the Effect Ratio (ER). 68 | 69 | @param orig_score_a: Scores of the original advanced run. 70 | @param orig_score_b: Scores of the original baseline run. 71 | @param rep_score_a: Scores of the reproduced/replicated advanced run. 72 | @param rep_score_b: Scores of the reproduced/replicated baseline run. 73 | @param pbar: Boolean value indicating if progress bar should be printed. 74 | @return: Generator with ER scores. 75 | """ 76 | mi_orig = mean_improvement(orig_score_a, orig_score_b) 77 | mi_rep = mean_improvement(rep_score_a, rep_score_b) 78 | 79 | generator = tqdm(mi_rep.items()) if pbar else mi_rep.items() 80 | 81 | for measure, value in generator: 82 | yield measure, value / mi_orig.get(measure) 83 | 84 | 85 | def ER(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=False): 86 | """ 87 | Determines the Effect Ratio (ER) according to the following paper: 88 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 89 | How to Measure the Reproducibility of System-oriented IR Experiments. 90 | Proceedings of SIGIR, pages 349-358, 2020. 91 | 92 | The ER value is determined by the ratio between the mean improvements 93 | of the original and reproduced/replicated experiments. 94 | 95 | @param orig_score_a: Scores of the original advanced run. 96 | @param orig_score_b: Scores of the original baseline run. 97 | @param rep_score_a: Scores of the reproduced/replicated advanced run. 98 | @param rep_score_b: Scores of the reproduced/replicated baseline run. 99 | @param pbar: Boolean value indicating if progress bar should be printed. 100 | @return: Dictionary containing the ER values for the specified run combination. 
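# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Hypothetical per-topic scores of the original and reproduced baseline/advanced
# runs; an ER close to 1 means the reproduced experiment recovers the original
# improvement of the advanced run over the baseline.
from repro_eval.measure.overall_effects import ER

orig_adv = {'301': {'map': 0.35}, '302': {'map': 0.30}}
orig_base = {'301': {'map': 0.30}, '302': {'map': 0.20}}
rep_adv = {'301': {'map': 0.33}, '302': {'map': 0.29}}
rep_base = {'301': {'map': 0.29}, '302': {'map': 0.21}}

print(ER(orig_adv, orig_base, rep_adv, rep_base))  # {'map': ~0.8}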
101 | """ 102 | return dict(_er(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=pbar)) 103 | 104 | 105 | def _mean_score(scores): 106 | """ 107 | Helping function to determine the mean scores across the topics for each measure. 108 | 109 | @param scores: Run scores. 110 | @return: Generator with mean scores. 111 | """ 112 | measures_all = list(list(scores.values())[0].keys()) 113 | measures_valid = [m for m in measures_all if m not in exclude] 114 | topics = scores.keys() 115 | 116 | for measure in measures_valid: 117 | yield measure, np.array([scores.get(topic).get(measure) for topic in topics]).mean() 118 | 119 | 120 | def mean_score(scores): 121 | """ 122 | Use this function to compute the mean scores across the topics for each measure. 123 | 124 | @param scores: Run scores. 125 | @return: Dictionary containing the mean scores for each measure. 126 | """ 127 | return dict(_mean_score(scores)) 128 | 129 | 130 | def _rel_improve(scores_a, scores_b): 131 | """ 132 | Helping function returning a generator for determining the relative improvements. 133 | 134 | @param scores_a: Scores of the advanced run. 135 | @param scores_b: Scores of the baseline run. 136 | @return: Generator with relative improvements. 137 | """ 138 | mean_scores_a = mean_score(scores_a) 139 | mean_scores_b = mean_score(scores_b) 140 | 141 | for measure, mean in mean_scores_a.items(): 142 | yield measure, (mean - mean_scores_b.get(measure)) / mean_scores_b.get(measure) 143 | 144 | 145 | def rel_improve(scores_a, scores_b): 146 | """ 147 | Determines the relative improvement that is used to derive the Delta Relative Improvement (DeltaRI). 148 | 149 | @param scores_a: Scores of the advanced run. 150 | @param scores_b: Scores of the baseline run. 151 | @return: Dictionary with relative improvements for each measure. 152 | """ 153 | return dict(_rel_improve(scores_a, scores_b)) 154 | 155 | 156 | def _deltaRI(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=False): 157 | """ 158 | Helping function returning a generator for determining the Delta Relative Improvement (DeltaRI). 159 | 160 | @param orig_score_a: Scores of the original advanced run. 161 | @param orig_score_b: Scores of the original baseline run. 162 | @param rep_score_a: Scores of the reproduced/replicated advanced run. 163 | @param rep_score_b: Scores of the reproduced/replicated baseline run. 164 | @param pbar: Boolean value indicating if progress bar should be printed. 165 | @return: Generator with DeltaRI scores. 166 | """ 167 | rel_improve_orig = rel_improve(orig_score_a, orig_score_b) 168 | rel_improve_rep = rel_improve(rep_score_a, rep_score_b) 169 | 170 | generator = tqdm(rel_improve_orig.items()) if pbar else rel_improve_orig.items() 171 | 172 | for measure, ri in generator: 173 | yield measure, ri - rel_improve_rep.get(measure) 174 | 175 | 176 | def deltaRI(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=False): 177 | """ 178 | Determines the Delta Relative Improvement (DeltaRI) according to the following paper: 179 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 180 | How to Measure the Reproducibility of System-oriented IR Experiments. 181 | Proceedings of SIGIR, pages 349-358, 2020. 182 | 183 | The DeltaRI value is determined by the difference between the relative improvements 184 | of the original and reproduced/replicated experiments. 185 | 186 | @param orig_score_a: Scores of the original advanced run. 
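# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Same kind of hypothetical scores as in the ER sketch above; a DeltaRI close
# to 0 means the reproduced experiment shows roughly the same relative
# improvement as the original one.
from repro_eval.measure.overall_effects import deltaRI

orig_adv = {'301': {'map': 0.35}, '302': {'map': 0.30}}
orig_base = {'301': {'map': 0.30}, '302': {'map': 0.20}}
rep_adv = {'301': {'map': 0.33}, '302': {'map': 0.29}}
rep_base = {'301': {'map': 0.29}, '302': {'map': 0.21}}

print(deltaRI(orig_adv, orig_base, rep_adv, rep_base))  # {'map': ~0.06}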
187 | @param orig_score_b: Scores of the original baseline run. 188 | @param rep_score_a: Scores of the reproduced/replicated advanced run. 189 | @param rep_score_b: Scores of the reproduced/replicated baseline run. 190 | @param pbar: Boolean value indicating if progress bar should be printed. 191 | @return: Dictionary containing the DeltaRI values for the specified run combination. 192 | """ 193 | return dict(_deltaRI(orig_score_a, orig_score_b, rep_score_a, rep_score_b, pbar=pbar)) 194 | -------------------------------------------------------------------------------- /repro_eval/measure/statistics.py: -------------------------------------------------------------------------------- 1 | import math 2 | from scipy.stats.stats import ttest_rel, ttest_ind 3 | from tqdm import tqdm 4 | from repro_eval.util import topic_scores 5 | 6 | 7 | def _ttest(orig_score, rep_score, rpd=True, pbar=False): 8 | """ 9 | 10 | @param orig_score: The original scores. 11 | @param rep_score: The reproduced/replicated scores. 12 | @param rpd: Boolean indicating if the evaluated runs are reproduced. 13 | @param pbar: Boolean value indicating if progress bar should be printed. 14 | @return: Generator with p-values. 15 | """ 16 | if rpd: # paired two-tailed t-test 17 | topic_scores_orig = topic_scores(orig_score) 18 | topic_scores_rep = topic_scores(rep_score) 19 | 20 | generator = tqdm(topic_scores_orig.items()) if pbar else topic_scores_orig.items() 21 | 22 | for measure, scores in generator: 23 | yield measure, ttest_rel(scores, topic_scores_rep.get(measure)).pvalue 24 | 25 | else: # else unpaired two-tailed t-test 26 | topic_scores_orig = topic_scores(orig_score) 27 | topic_scores_rep = topic_scores(rep_score) 28 | 29 | generator = tqdm(topic_scores_orig.items()) if pbar else topic_scores_orig.items() 30 | 31 | for measure, scores in generator: 32 | yield measure, ttest_ind(scores, topic_scores_rep.get(measure)).pvalue 33 | 34 | 35 | def ttest(orig_score, rep_score, rpd=True, pbar=False): 36 | """ 37 | 38 | @param orig_score: The original scores. 39 | @param rep_score: The reproduced/replicated scores. 40 | @param rpd: Boolean indicating if the evaluated runs are reproduced. 41 | @param pbar: Boolean value indicating if progress bar should be printed. 42 | @return: Dictionary with p-values that compare the score distributions of the baseline and advanced run. 43 | """ 44 | pvals = dict(_ttest(orig_score, rep_score, rpd=rpd, pbar=pbar)) 45 | nan_list = list(filter(lambda x: math.isnan(x), pvals.values())) 46 | if len(nan_list) == len(pvals): # is every pval is nan? 47 | if orig_score == rep_score: # equal score distributions? 48 | pvals = dict.fromkeys(pvals, 1.0) 49 | 50 | return pvals 51 | -------------------------------------------------------------------------------- /repro_eval/metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import platform 4 | import pkg_resources 5 | import warnings 6 | from collections import defaultdict 7 | from io import BytesIO, TextIOWrapper 8 | import cpuinfo 9 | import pytrec_eval 10 | import git 11 | from ruamel.yaml import YAML 12 | from repro_eval import Evaluator 13 | 14 | META_START = '# ir_metadata.start' 15 | META_END = '# ir_metadata.end' 16 | 17 | class PrimadExperiment: 18 | """ 19 | The PrimadExperiment is used to determine the reproducibility measures 20 | between a reference run and a set of one or more reproduced run files. 
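# --- Illustrative usage (editor's sketch, referring back to ttest() in
# repro_eval/measure/statistics.py above) ---
# With rpd=True a paired two-tailed t-test is computed per measure over the
# per-topic scores; the run scores below are hypothetical.
from repro_eval.measure.statistics import ttest

orig = {'301': {'map': 0.30}, '302': {'map': 0.20}, '303': {'map': 0.25}}
rep = {'301': {'map': 0.29}, '302': {'map': 0.22}, '303': {'map': 0.24}}

print(ttest(orig, rep, rpd=True))  # {'map': <p-value of the paired t-test>}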
21 | Depending on the type of the PRIMAD experiment, several reproducibility 22 | measures can be determined. 23 | 24 | @param ref_base_path: Path to a single run file that corresponds to the 25 | original (or reference) baseline of the experiments. 26 | @param ref_adv_path: Path to a single run file that corresponds to the 27 | original (or reference) baseline of the experiments. 28 | @param primad: String with lower and upper case letters depending on which 29 | PRIMAD components have changed in the experiments, e.g., 30 | "priMad" when only the Method changes due to parameter sweeps. 31 | @param rep_base: List containing paths to run files that reproduce the 32 | original (or reference) baseline run. 33 | @param rpd_qrels: Qrels file that is used to evaluate the reproducibility of 34 | the experiments, i.e., it used to evaluate runs that are 35 | derived from the same test collection. 36 | @param rep_adv: List containing paths to run files that reproduce the 37 | original (or reference) advanced run. 38 | @param rpl_qrels: Qrels file that is used to evaluate the replicability of 39 | the experiments, i.e., it is used to evaluate runs that are 40 | derived from a different test collection. Please note that 41 | "rpd_qrels" has to be provided too. 42 | """ 43 | def __init__(self, **kwargs): 44 | 45 | self.ref_base_path = kwargs.get('ref_base_path', None) 46 | if self.ref_base_path: 47 | self.ref_base_run = MetadataHandler.strip_metadata(self.ref_base_path) 48 | else: 49 | self.ref_base_run = None 50 | 51 | self.ref_adv_path = kwargs.get('ref_adv_path', None) 52 | if self.ref_adv_path: 53 | self.ref_adv_run = MetadataHandler.strip_metadata(self.ref_adv_path) 54 | else: 55 | self.ref_adv_run = None 56 | 57 | self.primad = kwargs.get('primad', None) 58 | self.rep_base = kwargs.get('rep_base', None) 59 | self.rpd_qrels = kwargs.get('rpd_qrels', None) 60 | self.rep_adv = kwargs.get('rep_adv', None) 61 | self.rpl_qrels = kwargs.get('rpl_qrels', None) 62 | 63 | if self.rpl_qrels: 64 | self.rep_eval = Evaluator.RplEvaluator(qrel_orig_path=self.rpd_qrels, 65 | qrel_rpl_path=self.rpl_qrels) 66 | 67 | with open(self.rpd_qrels, 'r') as f_rpd_qrels, open(self.rpl_qrels, 'r') as f_rpl_qrels: 68 | qrels = pytrec_eval.parse_qrel(f_rpd_qrels) 69 | self.rpd_rel_eval = pytrec_eval.RelevanceEvaluator(qrels, pytrec_eval.supported_measures) 70 | qrels = pytrec_eval.parse_qrel(f_rpl_qrels) 71 | self.rpl_rel_eval = pytrec_eval.RelevanceEvaluator(qrels, pytrec_eval.supported_measures) 72 | 73 | 74 | elif self.primad[-1].islower(): # check if data component is the same 75 | self.rep_eval = Evaluator.RpdEvaluator(qrel_orig_path=self.rpd_qrels) 76 | 77 | with open(self.rpd_qrels, 'r') as f_qrels: 78 | qrels = pytrec_eval.parse_qrel(f_qrels) 79 | self.rpd_rel_eval = pytrec_eval.RelevanceEvaluator(qrels, pytrec_eval.supported_measures) 80 | 81 | else: 82 | raise ValueError('Please provide a correct combination of qrels and PRIMAD type.') 83 | 84 | def get_primad_type(self): 85 | """ 86 | This method returns a string that identifies the type of the 87 | PRIMAD experiment. 88 | 89 | @return: String with lower and upper case letters depending on which 90 | PRIMAD components have changed in the experiments, e.g., 91 | "priMad" when only the Method changes due to parameter sweeps. 92 | """ 93 | return self.primad 94 | 95 | def evaluate(self): 96 | """ 97 | This method validates the PRIMAD experiment in accordance with the given 98 | "primad" identifier. Currently, the following experiments are supported. 
99 | - priMad: Parameter sweep 100 | - PRIMAd: Reproducibility evaluation on the same test collection 101 | - PRIMAD: Generalizability evaluation 102 | 103 | @return: Dictionary containing the average retrieval performance and 104 | the reproducibility measures for each run. 105 | """ 106 | 107 | if self.primad == 'priMad': 108 | if self.ref_adv_run is None and self.rep_adv is None: 109 | 110 | evaluations = {} 111 | 112 | self.rep_eval.run_b_orig = self.ref_base_run 113 | self.rep_eval.evaluate() 114 | 115 | for rep_run_path in self.rep_base + [self.ref_base_path]: 116 | 117 | run_evaluations = {} 118 | 119 | rep_run = MetadataHandler.strip_metadata(rep_run_path) 120 | scores = self.rpd_rel_eval.evaluate(rep_run) 121 | 122 | run_evaluations['arp'] = scores 123 | run_evaluations['ktu'] = self.rep_eval.ktau_union(run_b_rep=rep_run).get('baseline') 124 | run_evaluations['rbo'] = self.rep_eval.rbo(run_b_rep=rep_run).get('baseline') 125 | run_evaluations['rmse'] = self.rep_eval.nrmse(run_b_score=scores).get('baseline') 126 | run_evaluations['pval'] = self.rep_eval.ttest(run_b_score=scores).get('baseline') 127 | 128 | run_name = os.path.basename(rep_run_path) 129 | 130 | evaluations[run_name] = run_evaluations 131 | 132 | return evaluations 133 | 134 | if self.primad == 'PRIMAd': 135 | 136 | evaluations = {} 137 | 138 | self.rep_eval.run_b_orig = self.ref_base_run 139 | self.rep_eval.run_a_orig = self.ref_adv_run 140 | self.rep_eval.trim(t=1000) 141 | self.rep_eval.evaluate() 142 | 143 | pairs = self._find_pairs(rep_base=self.rep_base, rep_adv=self.rep_adv) 144 | pairs = pairs + [{'base': self.ref_base_path, 'adv': self.ref_adv_path}] 145 | 146 | for pair in pairs: 147 | 148 | pair_evaluations = {} 149 | 150 | rep_run_base = MetadataHandler.strip_metadata(pair.get('base')) 151 | rep_meta_base = MetadataHandler.read_metadata(pair.get('base')) 152 | rep_run_adv = MetadataHandler.strip_metadata(pair.get('adv')) 153 | rep_meta_adv = MetadataHandler.read_metadata(pair.get('adv')) 154 | 155 | self.rep_eval.trim(t=1000, run=rep_run_base) 156 | self.rep_eval.trim(t=1000, run=rep_run_adv) 157 | scores_base = self.rpd_rel_eval.evaluate(rep_run_base) 158 | scores_adv = self.rpd_rel_eval.evaluate(rep_run_adv) 159 | arp = {'baseline': scores_base, 'advanced': scores_adv} 160 | pair_evaluations['arp'] = arp 161 | pair_evaluations['ktu'] = self.rep_eval.ktau_union(run_b_rep=rep_run_base, run_a_rep=rep_run_adv) 162 | pair_evaluations['rbo'] = self.rep_eval.rbo(run_b_rep=rep_run_base, run_a_rep=rep_run_adv) 163 | pair_evaluations['rmse'] = self.rep_eval.nrmse(run_b_score=scores_base, run_a_score=scores_adv) 164 | pair_evaluations['er'] = self.rep_eval.er(run_b_score=scores_base, run_a_score=scores_adv) 165 | pair_evaluations['dri'] = self.rep_eval.dri(run_b_score=scores_base, run_a_score=scores_adv) 166 | pair_evaluations['pval'] = self.rep_eval.ttest(run_b_score=scores_base, run_a_score=scores_adv) 167 | 168 | if rep_meta_base.get('actor').get('team') == rep_meta_adv.get('actor').get('team'): 169 | expid = rep_meta_base.get('actor').get('team') 170 | else: 171 | expid = '_'.join([rep_meta_base.get('tag'), rep_meta_adv.get('tag')]) 172 | 173 | evaluations[expid] = pair_evaluations 174 | 175 | return evaluations 176 | 177 | if self.primad == 'PRIMAD': 178 | evaluations = {} 179 | 180 | self.rep_eval.run_b_orig = self.ref_base_run 181 | self.rep_eval.run_a_orig = self.ref_adv_run 182 | self.rep_eval.trim(t=1000) 183 | self.rep_eval.evaluate() 184 | 185 | pairs = self._find_pairs(rep_base=self.rep_base, 
rep_adv=self.rep_adv) 186 | pairs = pairs 187 | 188 | for pair in pairs: 189 | 190 | pair_evaluations = {} 191 | 192 | rep_run_base = MetadataHandler.strip_metadata(pair.get('base')) 193 | rep_meta_base = MetadataHandler.read_metadata(pair.get('base')) 194 | rep_run_adv = MetadataHandler.strip_metadata(pair.get('adv')) 195 | rep_meta_adv = MetadataHandler.read_metadata(pair.get('adv')) 196 | 197 | self.rep_eval.trim(t=1000, run=rep_run_base) 198 | self.rep_eval.trim(t=1000, run=rep_run_adv) 199 | scores_base = self.rpl_rel_eval.evaluate(rep_run_base) 200 | scores_adv = self.rpl_rel_eval.evaluate(rep_run_adv) 201 | arp = {'baseline': scores_base, 'advanced': scores_adv} 202 | pair_evaluations['arp'] = arp 203 | pair_evaluations['er'] = self.rep_eval.er(run_b_score=scores_base, run_a_score=scores_adv) 204 | pair_evaluations['dri'] = self.rep_eval.dri(run_b_score=scores_base, run_a_score=scores_adv) 205 | pair_evaluations['pval'] = self.rep_eval.ttest(run_b_score=scores_base, run_a_score=scores_adv) 206 | 207 | expid = '_'.join([rep_meta_base.get('tag'), rep_meta_adv.get('tag')]) 208 | evaluations[expid] = pair_evaluations 209 | 210 | return evaluations 211 | 212 | else: 213 | raise ValueError('The specified type of the PRIMAD experiments is not supported yet.') 214 | 215 | def _find_pairs(self, rep_base, rep_adv): 216 | """ 217 | This method finds pairs between lists of baseline and advanced runs. 218 | A pair is defined by the highest number of matching PRIMAD components. 219 | 220 | @param rep_base: List with baseline runs. 221 | @param rep_adv: List with advanced runs. 222 | 223 | @return: List with dictionaries containing paths to a baseline and an 224 | advanced run. 225 | """ 226 | 227 | pairs = [] 228 | for brp in rep_base: 229 | br = MetadataHandler.read_metadata(run_path=brp) 230 | 231 | arp = None 232 | cnt = 0 233 | 234 | for _arp in rep_adv: 235 | _cnt = 0 236 | ar = MetadataHandler.read_metadata(run_path=_arp) 237 | 238 | for k,v in br.items(): 239 | if v == ar.get(k): 240 | _cnt += 1 241 | 242 | if _cnt > cnt: 243 | cnt = _cnt 244 | arp = _arp 245 | 246 | pairs.append({'base': brp, 'adv': arp}) 247 | 248 | return pairs 249 | 250 | 251 | class MetadataAnalyzer: 252 | """ 253 | The MetadataAnalyzer is used to analyze set of different run files in 254 | reference to a run that has be be provided upon instantiation. The 255 | analyze_directory() method returns a dictionary with PRIMAD identifiers as 256 | keys and lists with the corresponding run paths as values. 257 | 258 | @param run_path: Path to the reference run file. 259 | """ 260 | 261 | def __init__(self, run_path): 262 | 263 | self.reference_run_path = run_path 264 | self.reference_run = MetadataHandler.strip_metadata(run_path) 265 | self.reference_metadata = MetadataHandler.read_metadata(run_path) 266 | 267 | def set_reference(self, run_path): 268 | """ 269 | Use this method to set a new reference run. 270 | 271 | @param run_path: Path to the new reference run file. 272 | """ 273 | 274 | self.reference_run_path = run_path 275 | self.reference_run = MetadataHandler.strip_metadata(run_path) 276 | self.reference_metadata = MetadataHandler.read_metadata(run_path) 277 | 278 | def analyze_directory(self, dir_path): 279 | """ 280 | Use this method to analyze the specified directory in comparison to the 281 | reference run. 282 | 283 | @param dir_path: Path to the directory. 
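# --- Illustrative workflow (editor's sketch, not part of the original module) ---
# Group the annotated runs of a directory by their PRIMAD signature relative to
# a reference run and evaluate, e.g., the parameter-sweep ('priMad') experiments.
# All file paths below are hypothetical.
from repro_eval.metadata import MetadataAnalyzer, PrimadExperiment

analyzer = MetadataAnalyzer('runs/orig_baseline.txt')
experiments = analyzer.analyze_directory('runs/')

experiment = PrimadExperiment(ref_base_path='runs/orig_baseline.txt',
                              primad='priMad',
                              rep_base=experiments.get('priMad', []),
                              rpd_qrels='qrels/robust04.txt')
print(experiment.evaluate())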
284 | """ 285 | 286 | components = ['platform', 'research goal', 'implementation', 'method', 'actor', 'data'] 287 | primad = {} 288 | 289 | files = os.listdir(dir_path) 290 | 291 | for _file in files: 292 | file_path = os.path.join(dir_path, _file) 293 | 294 | if file_path == self.reference_run_path: 295 | continue 296 | 297 | _metadata = MetadataHandler.read_metadata(file_path) 298 | 299 | primad_str = '' 300 | 301 | for component in components: 302 | if self.reference_metadata[component] != _metadata[component]: 303 | primad_str += component[0].upper() 304 | else: 305 | primad_str += component[0] 306 | 307 | primad[file_path] = primad_str 308 | 309 | experiments = defaultdict(list) 310 | for k, v in primad.items(): 311 | experiments[v].append(k) 312 | 313 | return experiments 314 | 315 | @staticmethod 316 | def filter_by_baseline(ref_run, runs): 317 | """ 318 | Use this method to filter a list of runs wrt. to the baseline that is 319 | specified under "research goal/evaluation/baseline" of a given reference run. 320 | 321 | @param ref_run: The reference with the baseline. 322 | @param runs: A list of run paths that is filtered. 323 | """ 324 | 325 | run_tag = MetadataHandler.read_metadata(ref_run).get('tag') 326 | 327 | filtered_list = [] 328 | for run in runs: 329 | _metadata = MetadataHandler.read_metadata(run) 330 | baseline = _metadata.get('research goal').get('evaluation').get('baseline')[0] 331 | if baseline == run_tag: 332 | filtered_list.append(run) 333 | 334 | return filtered_list 335 | 336 | @staticmethod 337 | def filter_by_test_collection(test_collection, runs): 338 | """ 339 | Use this method to filter a list of runs wrt. to the test collection 340 | specified under "data/test_collection". 341 | 342 | @param test_collection: Name of the test collection. 343 | @param runs: A list of run paths that is filtered. 344 | """ 345 | 346 | filtered_list = [] 347 | for run in runs: 348 | _metadata = MetadataHandler.read_metadata(run) 349 | name = _metadata.get('data').get('test collection').get('name') 350 | if test_collection == name: 351 | filtered_list.append(run) 352 | 353 | return filtered_list 354 | 355 | 356 | class MetadataHandler: 357 | """ 358 | Use the MetadataHandler for in- and output operations of annotated run files. 359 | 360 | @param run_path: Path the run file without metadata annotations. It is also 361 | possible to load an already annotated run and modify it with 362 | the MetadataHandler. 363 | @param metadata_path: Path to the YAML file containing the metadata that 364 | should be added to the run file. 365 | """ 366 | def __init__(self, run_path, metadata_path=None): 367 | 368 | self.run_path = run_path 369 | 370 | if metadata_path: 371 | self._metadata = MetadataHandler.read_metadata_template(metadata_path) 372 | else: 373 | self._metadata = MetadataHandler.read_metadata(run_path) 374 | 375 | def get_metadata(self): 376 | """ 377 | Use this method to get the currently set metadata annotations. 378 | 379 | @return: Nested dictionary containing the metadata annotations. 380 | """ 381 | 382 | return self._metadata 383 | 384 | def set_metadata(self, metadata_dict=None, metadata_path=None): 385 | """ 386 | Use this method to set/update the metadata. It can either be provided with a 387 | dictionary of a path to a YAML file 388 | 389 | @param metadata_dict: Nested dictionary containing the metadata annotations. 390 | @param metadata_path: Path to the YAML file with metadata. 
391 | """ 392 | if metadata_path: 393 | self._metadata = MetadataHandler.read_metadata_template(metadata_path) 394 | 395 | if metadata_dict: 396 | self._metadata = metadata_dict 397 | 398 | def dump_metadata(self, dump_path=None, complete_metadata=False, repo_path='.'): 399 | """ 400 | Use this method to dump the current metadata into a YAML file. 401 | The filename is a concatenation of the run tag and the "_annotated" suffix. 402 | 403 | @param dump_path: Path to the directory where the metadata is dumped. 404 | @param complete_metadata: If true, the Platform and Implementation will 405 | be added automatically, if not already provided. 406 | @param repo_path: Path to the git repository of the Implementation that 407 | underlies the run file. This path is needed for the 408 | automatic completion. 409 | """ 410 | 411 | if complete_metadata: 412 | self.complete_metadata(repo_path=repo_path) 413 | 414 | if self._metadata: 415 | 416 | tag = self._metadata['tag'] 417 | f_out_name = '_'.join([tag, 'dump.yaml']) 418 | f_out_path = os.path.join(dump_path, f_out_name) 419 | 420 | with open(f_out_path, 'wb') as f_out: 421 | bytes_io = BytesIO() 422 | yaml = YAML() 423 | yaml.width = 4096 424 | yaml.dump(self._metadata, bytes_io) 425 | f_out.write(bytes_io.getvalue()) 426 | 427 | def write_metadata(self, run_path=None, complete_metadata=False, repo_path='.'): 428 | """ 429 | This method writes the metadata into the run file. 430 | 431 | @param run_path: Path to the annotated run file. 432 | @param complete_metadata: If true, the Platform and Implementation will 433 | be added automatically, if not already provided. 434 | @param repo_path: Path to the git repository of the Implementation that 435 | underlies the run file. This path is needed for the 436 | automatic completion. 437 | """ 438 | if complete_metadata: 439 | self.complete_metadata(repo_path=repo_path) 440 | 441 | bytes_io = BytesIO() 442 | yaml = YAML() 443 | yaml.width = 4096 444 | yaml.dump(self._metadata, bytes_io) 445 | 446 | byte_str = bytes_io.getvalue().decode('UTF-8') 447 | lines = byte_str.split('\n') 448 | 449 | if run_path is None: 450 | f_out_path = '_'.join([self.run_path, 'annotated']) 451 | else: 452 | f_out_path = '_'.join([run_path]) 453 | 454 | with open(f_out_path, 'w') as f_out: 455 | 456 | f_out.write(''.join([META_START, '\n'])) 457 | for line in lines[:-1]: 458 | f_out.write(' '.join(['#', line, '\n'])) 459 | f_out.write(''.join([META_END, '\n'])) 460 | 461 | with open(self.run_path, 'r') as f_in: 462 | for run_line in f_in.readlines(): 463 | f_out.write(run_line) 464 | 465 | def complete_metadata(self, repo_path='.'): 466 | """ 467 | This method automatically adds metadata about the Platform and 468 | the Implementation component. 469 | 470 | @param repo_path: Path to the git repository of the Implementation that 471 | underlies the run file. If not specified this method 472 | assumes that the program is executed from the root 473 | directory of the git repository. 
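# --- Illustrative usage (editor's sketch, not part of the original module) ---
# Annotate a hypothetical run file with metadata from a YAML template, let
# repro_eval complete the Platform and Implementation components, and read the
# annotations back in.
from repro_eval.metadata import MetadataHandler

handler = MetadataHandler(run_path='runs/baseline.txt',
                          metadata_path='metadata_template.yaml')
handler.write_metadata(run_path='runs/baseline_annotated.txt',
                       complete_metadata=True, repo_path='.')

metadata = MetadataHandler.read_metadata('runs/baseline_annotated.txt')
run = MetadataHandler.strip_metadata('runs/baseline_annotated.txt')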
474 | """ 475 | if self._metadata.get('platform') is None: 476 | platform_dict = { 477 | 'hardware': { 478 | 'cpu': self._get_cpu(), 479 | 'ram': self._get_ram(), 480 | }, 481 | 'operating system': self._get_os(), 482 | 'software': self._get_libs(), 483 | } 484 | 485 | self._metadata['platform'] = platform_dict 486 | 487 | if self._metadata.get('implementation') is None: 488 | self._metadata['implementation'] = self._get_src(repo_path=repo_path) 489 | 490 | @staticmethod 491 | def strip_metadata(annotated_run): 492 | ''' 493 | Strips off the metadata and returns a dict-version of the run that is parsed with pytrec_eval. 494 | 495 | @param annotated_run: Path to the annotated run file. 496 | 497 | @return: defaultdict that can be used with pytrec_eval or repro_eval. 498 | ''' 499 | 500 | with TextIOWrapper(buffer=BytesIO(), encoding='utf-8', line_buffering=True) as text_io_wrapper: 501 | with open(annotated_run, 'r') as f_in: 502 | lines = f_in.readlines() 503 | for line in lines: 504 | if line[0] != '#': 505 | text_io_wrapper.write(line) 506 | text_io_wrapper.seek(0,0) 507 | run = pytrec_eval.parse_run(text_io_wrapper) 508 | 509 | return run 510 | 511 | @staticmethod 512 | def read_metadata(run_path): 513 | ''' 514 | Reads the metadata out of an annotated run and returns a dict containing the metadata. 515 | 516 | @param run_path: Path to the run file. 517 | 518 | @return: Dictionary containing the metadata information of the annotated 519 | run file. 520 | ''' 521 | 522 | _metadata = None 523 | 524 | with open(run_path, 'r') as f_in: 525 | lines = f_in.readlines() 526 | if lines[0].strip('\n') == META_START: 527 | metadata_str = '' 528 | yaml=YAML(typ='safe') 529 | 530 | for line in lines[1:]: 531 | if line.strip('\n') != META_END: 532 | metadata_str += line.strip('#') 533 | else: 534 | break 535 | _metadata = yaml.load(metadata_str) 536 | 537 | return _metadata 538 | 539 | @staticmethod 540 | def read_metadata_template(metadata_path): 541 | """ 542 | This method reads in a YAML file containing the metadata. 543 | 544 | @param template_path: Path to the metadata YAML file. 545 | 546 | @return: Nested dictionary containing the metadata. 547 | """ 548 | 549 | with open(metadata_path, 'r') as f_in: 550 | yaml = YAML(typ='safe') 551 | return yaml.load(f_in) 552 | 553 | def _get_cpu(self): 554 | """ 555 | Reads out metadata information about the CPU including the model's name 556 | the architectures, the operation mode and the number of available cores. 557 | """ 558 | 559 | cpu = cpuinfo.get_cpu_info() 560 | return { 561 | 'model': cpu['brand_raw'], 562 | 'architecture': platform.machine(), 563 | 'operation mode': '-'.join([str(cpu['bits']), 'bit']), 564 | 'number of cores': cpu['count'], 565 | } 566 | 567 | def _get_os(self): 568 | """ 569 | Reads out metadata information about the operating system including 570 | the platform (e.g. Linux), the kernel release version, 571 | and the distribution's name. 572 | """ 573 | 574 | try: 575 | with open("/etc/os-release") as f_in: 576 | os_info = {} 577 | for line in f_in: 578 | k,v = line.rstrip().split('=') 579 | os_info[k] = v.strip('"') 580 | 581 | distribution = os_info['PRETTY_NAME'] 582 | 583 | except: 584 | warnings.warn('/etc/os-release not found. 
Using the available information of the platform package instead.') 585 | distribution = platform.version() 586 | 587 | return { 588 | 'platform': platform.system(), 589 | 'kernel': platform.release(), 590 | 'distribution': distribution, 591 | } 592 | 593 | def _get_ram(self): 594 | """ 595 | Reads out the available RAM and returns the size in GB. 596 | """ 597 | 598 | memory_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') 599 | memory_gb = memory_bytes/(1024.0 ** 3) 600 | return ' '.join([str(round(memory_gb, 2)),'GB']) 601 | 602 | def _get_libs(self): 603 | """ 604 | Reads out all installed Python packages of the active environment. 605 | """ 606 | 607 | installed_packages = [d.project_name for d in pkg_resources.working_set] 608 | return {'libraries': {'python': installed_packages}} 609 | 610 | def _get_src(self, repo_path='.'): 611 | """ 612 | Reads out information from the specified repository. 613 | 614 | @param repo_path: Path to the git repository of the Implementation that 615 | underlies the run file. If not specified this method 616 | assumes that the program is executed from the root 617 | directory of the git repository. 618 | """ 619 | 620 | extensions_path = pkg_resources.resource_filename(__name__, 'resources/extensions.json') 621 | 622 | repo = git.Repo(repo_path) 623 | 624 | with open(extensions_path, 'r') as input_file: 625 | extensions = json.load(input_file) 626 | 627 | languages = set() 628 | 629 | for _, _, files in os.walk('.'): 630 | for name in files: 631 | _, file_extension = os.path.splitext(name) 632 | language = extensions.get(file_extension[1:]) 633 | if language: 634 | languages.add(language) 635 | 636 | return { 637 | 'repository': repo.remote().url, 638 | 'commit': str(repo.head.commit), 639 | 'lang': list(languages), 640 | } 641 | -------------------------------------------------------------------------------- /repro_eval/resources/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": "Groff", 3 | "2": "Groff", 4 | "3": "Groff", 5 | "4": "Groff", 6 | "5": "Groff", 7 | "6": "Groff", 8 | "7": "Groff", 9 | "8": "Groff", 10 | "9": "Groff", 11 | "abap": "ABAP", 12 | "asc": "Public Key", 13 | "ash": "AGS Script", 14 | "ampl": "AMPL", 15 | "mod": "XML", 16 | "g4": "ANTLR", 17 | "apib": "API Blueprint", 18 | "apl": "APL", 19 | "dyalog": "APL", 20 | "asp": "ASP", 21 | "asax": "ASP", 22 | "ascx": "ASP", 23 | "ashx": "ASP", 24 | "asmx": "ASP", 25 | "aspx": "ASP", 26 | "axd": "ASP", 27 | "dats": "ATS", 28 | "hats": "ATS", 29 | "sats": "ATS", 30 | "as": "ActionScript", 31 | "adb": "Ada", 32 | "ada": "Ada", 33 | "ads": "Ada", 34 | "agda": "Agda", 35 | "als": "Alloy", 36 | "apacheconf": "ApacheConf", 37 | "vhost": "Nginx", 38 | "cls": "Visual Basic", 39 | "applescript": "AppleScript", 40 | "scpt": "AppleScript", 41 | "arc": "Arc", 42 | "ino": "Arduino", 43 | "asciidoc": "AsciiDoc", 44 | "adoc": "AsciiDoc", 45 | "aj": "AspectJ", 46 | "asm": "Assembly", 47 | "a51": "Assembly", 48 | "inc": "SourcePawn", 49 | "nasm": "Assembly", 50 | "aug": "Augeas", 51 | "ahk": "AutoHotkey", 52 | "ahkl": "AutoHotkey", 53 | "au3": "AutoIt", 54 | "awk": "Awk", 55 | "auk": "Awk", 56 | "gawk": "Awk", 57 | "mawk": "Awk", 58 | "nawk": "Awk", 59 | "bat": "Batchfile", 60 | "cmd": "Batchfile", 61 | "befunge": "Befunge", 62 | "bison": "Bison", 63 | "bb": "BlitzBasic", 64 | "decls": "BlitzBasic", 65 | "bmx": "BlitzMax", 66 | "bsv": "Bluespec", 67 | "boo": "Boo", 68 | "b": "Limbo", 69 | "bf": "HyPhy", 70 | "brs": 
"Brightscript", 71 | "bro": "Bro", 72 | "c": "C", 73 | "cats": "C", 74 | "h": "Objective-C", 75 | "idc": "C", 76 | "w": "C", 77 | "cs": "Smalltalk", 78 | "cake": "CoffeeScript", 79 | "cshtml": "C#", 80 | "csx": "C#", 81 | "cpp": "C++", 82 | "c++": "C++", 83 | "cc": "C++", 84 | "cp": "Component Pascal", 85 | "cxx": "C++", 86 | "h++": "C++", 87 | "hh": "Hack", 88 | "hpp": "C++", 89 | "hxx": "C++", 90 | "inl": "C++", 91 | "ipp": "C++", 92 | "tcc": "C++", 93 | "tpp": "C++", 94 | "c-objdump": "C-ObjDump", 95 | "chs": "C2hs Haskell", 96 | "clp": "CLIPS", 97 | "cmake": "CMake", 98 | "cmake.in": "CMake", 99 | "cob": "COBOL", 100 | "cbl": "COBOL", 101 | "ccp": "COBOL", 102 | "cobol": "COBOL", 103 | "cpy": "COBOL", 104 | "css": "CSS", 105 | "csv": "CSV", 106 | "capnp": "Cap'n Proto", 107 | "mss": "CartoCSS", 108 | "ceylon": "Ceylon", 109 | "chpl": "Chapel", 110 | "ch": "xBase", 111 | "ck": "ChucK", 112 | "cirru": "Cirru", 113 | "clw": "Clarion", 114 | "icl": "Clean", 115 | "dcl": "Clean", 116 | "click": "Click", 117 | "clj": "Clojure", 118 | "boot": "Clojure", 119 | "cl2": "Clojure", 120 | "cljc": "Clojure", 121 | "cljs": "Clojure", 122 | "cljs.hl": "Clojure", 123 | "cljscm": "Clojure", 124 | "cljx": "Clojure", 125 | "hic": "Clojure", 126 | "coffee": "CoffeeScript", 127 | "_coffee": "CoffeeScript", 128 | "cjsx": "CoffeeScript", 129 | "cson": "CoffeeScript", 130 | "iced": "CoffeeScript", 131 | "cfm": "ColdFusion", 132 | "cfml": "ColdFusion", 133 | "cfc": "ColdFusion CFC", 134 | "lisp": "NewLisp", 135 | "asd": "Common Lisp", 136 | "cl": "OpenCL", 137 | "l": "PicoLisp", 138 | "lsp": "NewLisp", 139 | "ny": "Common Lisp", 140 | "podsl": "Common Lisp", 141 | "sexp": "Common Lisp", 142 | "cps": "Component Pascal", 143 | "coq": "Coq", 144 | "v": "Verilog", 145 | "cppobjdump": "Cpp-ObjDump", 146 | "c++-objdump": "Cpp-ObjDump", 147 | "c++objdump": "Cpp-ObjDump", 148 | "cpp-objdump": "Cpp-ObjDump", 149 | "cxx-objdump": "Cpp-ObjDump", 150 | "creole": "Creole", 151 | "cr": "Crystal", 152 | "feature": "Cucumber", 153 | "cu": "Cuda", 154 | "cuh": "Cuda", 155 | "cy": "Cycript", 156 | "pyx": "Cython", 157 | "pxd": "Cython", 158 | "pxi": "Cython", 159 | "d": "Makefile", 160 | "di": "D", 161 | "d-objdump": "D-ObjDump", 162 | "com": "DIGITAL Command Language", 163 | "dm": "DM", 164 | "zone": "DNS Zone", 165 | "arpa": "DNS Zone", 166 | "darcspatch": "Darcs Patch", 167 | "dpatch": "Darcs Patch", 168 | "dart": "Dart", 169 | "diff": "Diff", 170 | "patch": "Diff", 171 | "dockerfile": "Dockerfile", 172 | "djs": "Dogescript", 173 | "dylan": "Dylan", 174 | "dyl": "Dylan", 175 | "intr": "Dylan", 176 | "lid": "Dylan", 177 | "E": "E", 178 | "ecl": "ECLiPSe", 179 | "eclxml": "ECL", 180 | "sch": "KiCad", 181 | "brd": "KiCad", 182 | "epj": "Ecere Projects", 183 | "e": "Eiffel", 184 | "ex": "Elixir", 185 | "exs": "Elixir", 186 | "elm": "Elm", 187 | "el": "Emacs Lisp", 188 | "emacs": "Emacs Lisp", 189 | "emacs.desktop": "Emacs Lisp", 190 | "em": "EmberScript", 191 | "emberscript": "EmberScript", 192 | "erl": "Erlang", 193 | "es": "JavaScript", 194 | "escript": "Erlang", 195 | "hrl": "Erlang", 196 | "xrl": "Erlang", 197 | "yrl": "Erlang", 198 | "fs": "GLSL", 199 | "fsi": "F#", 200 | "fsx": "F#", 201 | "fx": "HLSL", 202 | "flux": "FLUX", 203 | "f90": "FORTRAN", 204 | "f": "Forth", 205 | "f03": "FORTRAN", 206 | "f08": "FORTRAN", 207 | "f77": "FORTRAN", 208 | "f95": "FORTRAN", 209 | "for": "Forth", 210 | "fpp": "FORTRAN", 211 | "factor": "Factor", 212 | "fy": "Fancy", 213 | "fancypack": "Fancy", 214 | "fan": "Fantom", 215 | "eam.fs": 
"Formatted", 216 | "fth": "Forth", 217 | "4th": "Forth", 218 | "forth": "Forth", 219 | "fr": "Text", 220 | "frt": "Forth", 221 | "ftl": "FreeMarker", 222 | "g": "GAP", 223 | "gco": "G-code", 224 | "gcode": "G-code", 225 | "gms": "GAMS", 226 | "gap": "GAP", 227 | "gd": "GDScript", 228 | "gi": "GAP", 229 | "tst": "Scilab", 230 | "s": "GAS", 231 | "ms": "MAXScript", 232 | "glsl": "GLSL", 233 | "fp": "GLSL", 234 | "frag": "JavaScript", 235 | "frg": "GLSL", 236 | "fsh": "GLSL", 237 | "fshader": "GLSL", 238 | "geo": "GLSL", 239 | "geom": "GLSL", 240 | "glslv": "GLSL", 241 | "gshader": "GLSL", 242 | "shader": "GLSL", 243 | "vert": "GLSL", 244 | "vrx": "GLSL", 245 | "vsh": "GLSL", 246 | "vshader": "GLSL", 247 | "gml": "XML", 248 | "kid": "Genshi", 249 | "ebuild": "Gentoo Ebuild", 250 | "eclass": "Gentoo Eclass", 251 | "po": "Gettext Catalog", 252 | "pot": "Gettext Catalog", 253 | "glf": "Glyph", 254 | "gp": "Gnuplot", 255 | "gnu": "Gnuplot", 256 | "gnuplot": "Gnuplot", 257 | "plot": "Gnuplot", 258 | "plt": "Gnuplot", 259 | "go": "Go", 260 | "golo": "Golo", 261 | "gs": "JavaScript", 262 | "gst": "Gosu", 263 | "gsx": "Gosu", 264 | "vark": "Gosu", 265 | "grace": "Grace", 266 | "gradle": "Gradle", 267 | "gf": "Grammatical Framework", 268 | "graphql": "GraphQL", 269 | "dot": "Graphviz (DOT)", 270 | "gv": "Graphviz (DOT)", 271 | "man": "Groff", 272 | "1in": "Groff", 273 | "1m": "Groff", 274 | "1x": "Groff", 275 | "3in": "Groff", 276 | "3m": "Groff", 277 | "3qt": "Groff", 278 | "3x": "Groff", 279 | "me": "Groff", 280 | "n": "Nemerle", 281 | "rno": "Groff", 282 | "roff": "Groff", 283 | "groovy": "Groovy", 284 | "grt": "Groovy", 285 | "gtpl": "Groovy", 286 | "gvy": "Groovy", 287 | "gsp": "Groovy Server Pages", 288 | "hcl": "HCL", 289 | "tf": "HCL", 290 | "hlsl": "HLSL", 291 | "fxh": "HLSL", 292 | "hlsli": "HLSL", 293 | "html": "HTML", 294 | "htm": "HTML", 295 | "html.hl": "HTML", 296 | "st": "Smalltalk", 297 | "xht": "HTML", 298 | "xhtml": "HTML", 299 | "mustache": "HTML+Django", 300 | "jinja": "HTML+Django", 301 | "eex": "HTML+EEX", 302 | "erb": "HTML+ERB", 303 | "erb.deface": "HTML+ERB", 304 | "phtml": "HTML+PHP", 305 | "http": "HTTP", 306 | "php": "PHP", 307 | "haml": "Haml", 308 | "haml.deface": "Haml", 309 | "handlebars": "Handlebars", 310 | "hbs": "Handlebars", 311 | "hb": "Harbour", 312 | "hs": "Haskell", 313 | "hsc": "Haskell", 314 | "hx": "Haxe", 315 | "hxsl": "Haxe", 316 | "hy": "Hy", 317 | "pro": "QMake", 318 | "dlm": "IDL", 319 | "ipf": "IGOR Pro", 320 | "ini": "INI", 321 | "cfg": "INI", 322 | "prefs": "INI", 323 | "properties": "INI", 324 | "irclog": "IRC log", 325 | "weechatlog": "IRC log", 326 | "idr": "Idris", 327 | "lidr": "Idris", 328 | "ni": "Inform 7", 329 | "i7x": "Inform 7", 330 | "iss": "Inno Setup", 331 | "io": "Io", 332 | "ik": "Ioke", 333 | "thy": "Isabelle", 334 | "ijs": "J", 335 | "flex": "JFlex", 336 | "jflex": "JFlex", 337 | "json": "JSON", 338 | "geojson": "JSON", 339 | "lock": "JSON", 340 | "topojson": "JSON", 341 | "json5": "JSON5", 342 | "jsonld": "JSONLD", 343 | "jq": "JSONiq", 344 | "jsx": "JSX", 345 | "jade": "Jade", 346 | "j": "Objective-J", 347 | "java": "Java", 348 | "jsp": "Java Server Pages", 349 | "js": "JavaScript", 350 | "_js": "JavaScript", 351 | "bones": "JavaScript", 352 | "es6": "JavaScript", 353 | "jake": "JavaScript", 354 | "jsb": "JavaScript", 355 | "jscad": "JavaScript", 356 | "jsfl": "JavaScript", 357 | "jsm": "JavaScript", 358 | "jss": "JavaScript", 359 | "njs": "JavaScript", 360 | "pac": "JavaScript", 361 | "sjs": "JavaScript", 362 | "ssjs": 
"JavaScript", 363 | "sublime-build": "JavaScript", 364 | "sublime-commands": "JavaScript", 365 | "sublime-completions": "JavaScript", 366 | "sublime-keymap": "JavaScript", 367 | "sublime-macro": "JavaScript", 368 | "sublime-menu": "JavaScript", 369 | "sublime-mousemap": "JavaScript", 370 | "sublime-project": "JavaScript", 371 | "sublime-settings": "JavaScript", 372 | "sublime-theme": "JavaScript", 373 | "sublime-workspace": "JavaScript", 374 | "sublime_metrics": "JavaScript", 375 | "sublime_session": "JavaScript", 376 | "xsjs": "JavaScript", 377 | "xsjslib": "JavaScript", 378 | "jl": "Julia", 379 | "ipynb": "Jupyter Notebook", 380 | "krl": "KRL", 381 | "kicad_pcb": "KiCad", 382 | "kit": "Kit", 383 | "kt": "Kotlin", 384 | "ktm": "Kotlin", 385 | "kts": "Kotlin", 386 | "lfe": "LFE", 387 | "ll": "LLVM", 388 | "lol": "LOLCODE", 389 | "lsl": "LSL", 390 | "lslp": "LSL", 391 | "lvproj": "LabVIEW", 392 | "lasso": "Lasso", 393 | "las": "Lasso", 394 | "lasso8": "Lasso", 395 | "lasso9": "Lasso", 396 | "ldml": "Lasso", 397 | "latte": "Latte", 398 | "lean": "Lean", 399 | "hlean": "Lean", 400 | "less": "Less", 401 | "lex": "Lex", 402 | "ly": "LilyPond", 403 | "ily": "LilyPond", 404 | "m": "Objective-C", 405 | "ld": "Linker Script", 406 | "lds": "Linker Script", 407 | "liquid": "Liquid", 408 | "lagda": "Literate Agda", 409 | "litcoffee": "Literate CoffeeScript", 410 | "lhs": "Literate Haskell", 411 | "ls": "LoomScript", 412 | "_ls": "LiveScript", 413 | "xm": "Logos", 414 | "x": "Logos", 415 | "xi": "Logos", 416 | "lgt": "Logtalk", 417 | "logtalk": "Logtalk", 418 | "lookml": "LookML", 419 | "lua": "Lua", 420 | "fcgi": "Shell", 421 | "nse": "Lua", 422 | "pd_lua": "Lua", 423 | "rbxs": "Lua", 424 | "wlua": "Lua", 425 | "mumps": "M", 426 | "m4": "M4Sugar", 427 | "mcr": "MAXScript", 428 | "mtml": "MTML", 429 | "muf": "MUF", 430 | "mak": "Makefile", 431 | "mk": "Makefile", 432 | "mkfile": "Makefile", 433 | "mako": "Mako", 434 | "mao": "Mako", 435 | "md": "Markdown", 436 | "markdown": "Markdown", 437 | "mkd": "Markdown", 438 | "mkdn": "Markdown", 439 | "mkdown": "Markdown", 440 | "ron": "Markdown", 441 | "mask": "Mask", 442 | "mathematica": "Mathematica", 443 | "cdf": "Mathematica", 444 | "ma": "Mathematica", 445 | "mt": "Mathematica", 446 | "nb": "Text", 447 | "nbp": "Mathematica", 448 | "wl": "Mathematica", 449 | "wlt": "Mathematica", 450 | "matlab": "Matlab", 451 | "maxpat": "Max", 452 | "maxhelp": "Max", 453 | "maxproj": "Max", 454 | "mxt": "Max", 455 | "pat": "Max", 456 | "mediawiki": "MediaWiki", 457 | "wiki": "MediaWiki", 458 | "moo": "Moocode", 459 | "metal": "Metal", 460 | "minid": "MiniD", 461 | "druby": "Mirah", 462 | "duby": "Mirah", 463 | "mir": "Mirah", 464 | "mirah": "Mirah", 465 | "mo": "Modelica", 466 | "mms": "Module Management System", 467 | "mmk": "Module Management System", 468 | "monkey": "Monkey", 469 | "moon": "MoonScript", 470 | "myt": "Myghty", 471 | "ncl": "Text", 472 | "nl": "NewLisp", 473 | "nsi": "NSIS", 474 | "nsh": "NSIS", 475 | "axs": "NetLinx", 476 | "axi": "NetLinx", 477 | "axs.erb": "NetLinx+ERB", 478 | "axi.erb": "NetLinx+ERB", 479 | "nlogo": "NetLogo", 480 | "nginxconf": "Nginx", 481 | "nim": "Nimrod", 482 | "nimrod": "Nimrod", 483 | "ninja": "Ninja", 484 | "nit": "Nit", 485 | "nix": "Nix", 486 | "nu": "Nu", 487 | "numpy": "NumPy", 488 | "numpyw": "NumPy", 489 | "numsc": "NumPy", 490 | "ml": "OCaml", 491 | "eliom": "OCaml", 492 | "eliomi": "OCaml", 493 | "ml4": "OCaml", 494 | "mli": "OCaml", 495 | "mll": "OCaml", 496 | "mly": "OCaml", 497 | "objdump": "ObjDump", 498 | "mm": 
"XML", 499 | "sj": "Objective-J", 500 | "omgrofl": "Omgrofl", 501 | "opa": "Opa", 502 | "opal": "Opal", 503 | "opencl": "OpenCL", 504 | "p": "OpenEdge ABL", 505 | "scad": "OpenSCAD", 506 | "org": "Org", 507 | "ox": "Ox", 508 | "oxh": "Ox", 509 | "oxo": "Ox", 510 | "oxygene": "Oxygene", 511 | "oz": "Oz", 512 | "pwn": "PAWN", 513 | "aw": "PHP", 514 | "ctp": "PHP", 515 | "php3": "PHP", 516 | "php4": "PHP", 517 | "php5": "PHP", 518 | "phps": "PHP", 519 | "phpt": "PHP", 520 | "pls": "PLSQL", 521 | "pck": "PLSQL", 522 | "pkb": "PLSQL", 523 | "pks": "PLSQL", 524 | "plb": "PLSQL", 525 | "plsql": "PLSQL", 526 | "sql": "SQLPL", 527 | "pov": "POV-Ray SDL", 528 | "pan": "Pan", 529 | "psc": "Papyrus", 530 | "parrot": "Parrot", 531 | "pasm": "Parrot Assembly", 532 | "pir": "Parrot Internal Representation", 533 | "pas": "Pascal", 534 | "dfm": "Pascal", 535 | "dpr": "Pascal", 536 | "lpr": "Pascal", 537 | "pp": "Puppet", 538 | "pl": "Prolog", 539 | "al": "Perl", 540 | "cgi": "Shell", 541 | "perl": "Perl", 542 | "ph": "Perl", 543 | "plx": "Perl", 544 | "pm": "Perl6", 545 | "pod": "Pod", 546 | "psgi": "Perl", 547 | "t": "Turing", 548 | "6pl": "Perl6", 549 | "6pm": "Perl6", 550 | "nqp": "Perl6", 551 | "p6": "Perl6", 552 | "p6l": "Perl6", 553 | "p6m": "Perl6", 554 | "pl6": "Perl6", 555 | "pm6": "Perl6", 556 | "pkl": "Pickle", 557 | "pig": "PigLatin", 558 | "pike": "Pike", 559 | "pmod": "Pike", 560 | "pogo": "PogoScript", 561 | "pony": "Pony", 562 | "ps": "PostScript", 563 | "eps": "PostScript", 564 | "ps1": "PowerShell", 565 | "psd1": "PowerShell", 566 | "psm1": "PowerShell", 567 | "pde": "Processing", 568 | "prolog": "Prolog", 569 | "yap": "Prolog", 570 | "spin": "Propeller Spin", 571 | "proto": "Protocol Buffer", 572 | "pub": "Public Key", 573 | "pd": "Pure Data", 574 | "pb": "PureBasic", 575 | "pbi": "PureBasic", 576 | "purs": "PureScript", 577 | "py": "Python", 578 | "bzl": "Python", 579 | "gyp": "Python", 580 | "lmi": "Python", 581 | "pyde": "Python", 582 | "pyp": "Python", 583 | "pyt": "Python", 584 | "pyw": "Python", 585 | "rpy": "Ren'Py", 586 | "tac": "Python", 587 | "wsgi": "Python", 588 | "xpy": "Python", 589 | "pytb": "Python traceback", 590 | "qml": "QML", 591 | "qbs": "QML", 592 | "pri": "QMake", 593 | "r": "Rebol", 594 | "rd": "R", 595 | "rsx": "R", 596 | "raml": "RAML", 597 | "rdoc": "RDoc", 598 | "rbbas": "REALbasic", 599 | "rbfrm": "REALbasic", 600 | "rbmnu": "REALbasic", 601 | "rbres": "REALbasic", 602 | "rbtbar": "REALbasic", 603 | "rbuistate": "REALbasic", 604 | "rhtml": "RHTML", 605 | "rmd": "RMarkdown", 606 | "rkt": "Racket", 607 | "rktd": "Racket", 608 | "rktl": "Racket", 609 | "scrbl": "Racket", 610 | "rl": "Ragel in Ruby Host", 611 | "raw": "Raw token data", 612 | "reb": "Rebol", 613 | "r2": "Rebol", 614 | "r3": "Rebol", 615 | "rebol": "Rebol", 616 | "red": "Red", 617 | "reds": "Red", 618 | "cw": "Redcode", 619 | "rs": "Rust", 620 | "rsh": "RenderScript", 621 | "robot": "RobotFramework", 622 | "rg": "Rouge", 623 | "rb": "Ruby", 624 | "builder": "Ruby", 625 | "gemspec": "Ruby", 626 | "god": "Ruby", 627 | "irbrc": "Ruby", 628 | "jbuilder": "Ruby", 629 | "mspec": "Ruby", 630 | "pluginspec": "XML", 631 | "podspec": "Ruby", 632 | "rabl": "Ruby", 633 | "rake": "Ruby", 634 | "rbuild": "Ruby", 635 | "rbw": "Ruby", 636 | "rbx": "Ruby", 637 | "ru": "Ruby", 638 | "ruby": "Ruby", 639 | "thor": "Ruby", 640 | "watchr": "Ruby", 641 | "rs.in": "Rust", 642 | "sas": "SAS", 643 | "scss": "SCSS", 644 | "smt2": "SMT", 645 | "smt": "SMT", 646 | "sparql": "SPARQL", 647 | "rq": "SPARQL", 648 | "sqf": "SQF", 
649 | "hqf": "SQF", 650 | "cql": "SQL", 651 | "ddl": "SQL", 652 | "prc": "SQL", 653 | "tab": "SQL", 654 | "udf": "SQL", 655 | "viw": "SQL", 656 | "db2": "SQLPL", 657 | "ston": "STON", 658 | "svg": "SVG", 659 | "sage": "Sage", 660 | "sagews": "Sage", 661 | "sls": "Scheme", 662 | "sass": "Sass", 663 | "scala": "Scala", 664 | "sbt": "Scala", 665 | "sc": "SuperCollider", 666 | "scaml": "Scaml", 667 | "scm": "Scheme", 668 | "sld": "Scheme", 669 | "sps": "Scheme", 670 | "ss": "Scheme", 671 | "sci": "Scilab", 672 | "sce": "Scilab", 673 | "self": "Self", 674 | "sh": "Shell", 675 | "bash": "Shell", 676 | "bats": "Shell", 677 | "command": "Shell", 678 | "ksh": "Shell", 679 | "sh.in": "Shell", 680 | "tmux": "Shell", 681 | "tool": "Shell", 682 | "zsh": "Shell", 683 | "sh-session": "ShellSession", 684 | "shen": "Shen", 685 | "sl": "Slash", 686 | "slim": "Slim", 687 | "smali": "Smali", 688 | "tpl": "Smarty", 689 | "sp": "SourcePawn", 690 | "sma": "SourcePawn", 691 | "nut": "Squirrel", 692 | "stan": "Stan", 693 | "ML": "Standard ML", 694 | "fun": "Standard ML", 695 | "sig": "Standard ML", 696 | "sml": "Standard ML", 697 | "do": "Stata", 698 | "ado": "Stata", 699 | "doh": "Stata", 700 | "ihlp": "Stata", 701 | "mata": "Stata", 702 | "matah": "Stata", 703 | "sthlp": "Stata", 704 | "styl": "Stylus", 705 | "scd": "SuperCollider", 706 | "swift": "Swift", 707 | "sv": "SystemVerilog", 708 | "svh": "SystemVerilog", 709 | "vh": "SystemVerilog", 710 | "toml": "TOML", 711 | "txl": "TXL", 712 | "tcl": "Tcl", 713 | "adp": "Tcl", 714 | "tm": "Tcl", 715 | "tcsh": "Tcsh", 716 | "csh": "Tcsh", 717 | "tex": "TeX", 718 | "aux": "TeX", 719 | "bbx": "TeX", 720 | "bib": "TeX", 721 | "cbx": "TeX", 722 | "dtx": "TeX", 723 | "ins": "TeX", 724 | "lbx": "TeX", 725 | "ltx": "TeX", 726 | "mkii": "TeX", 727 | "mkiv": "TeX", 728 | "mkvi": "TeX", 729 | "sty": "TeX", 730 | "toc": "TeX", 731 | "tea": "Tea", 732 | "txt": "Text", 733 | "no": "Text", 734 | "textile": "Textile", 735 | "thrift": "Thrift", 736 | "tu": "Turing", 737 | "ttl": "Turtle", 738 | "twig": "Twig", 739 | "ts": "XML", 740 | "tsx": "XML", 741 | "upc": "Unified Parallel C", 742 | "anim": "Unity3D Asset", 743 | "asset": "Unity3D Asset", 744 | "mat": "Unity3D Asset", 745 | "meta": "Unity3D Asset", 746 | "prefab": "Unity3D Asset", 747 | "unity": "Unity3D Asset", 748 | "uno": "Uno", 749 | "uc": "UnrealScript", 750 | "ur": "UrWeb", 751 | "urs": "UrWeb", 752 | "vcl": "VCL", 753 | "vhdl": "VHDL", 754 | "vhd": "VHDL", 755 | "vhf": "VHDL", 756 | "vhi": "VHDL", 757 | "vho": "VHDL", 758 | "vhs": "VHDL", 759 | "vht": "VHDL", 760 | "vhw": "VHDL", 761 | "vala": "Vala", 762 | "vapi": "Vala", 763 | "veo": "Verilog", 764 | "vim": "VimL", 765 | "vb": "Visual Basic", 766 | "bas": "Visual Basic", 767 | "frm": "Visual Basic", 768 | "frx": "Visual Basic", 769 | "vba": "Visual Basic", 770 | "vbhtml": "Visual Basic", 771 | "vbs": "Visual Basic", 772 | "volt": "Volt", 773 | "vue": "Vue", 774 | "owl": "Web Ontology Language", 775 | "webidl": "WebIDL", 776 | "x10": "X10", 777 | "xc": "XC", 778 | "xml": "XML", 779 | "ant": "XML", 780 | "axml": "XML", 781 | "ccxml": "XML", 782 | "clixml": "XML", 783 | "cproject": "XML", 784 | "csl": "XML", 785 | "csproj": "XML", 786 | "ct": "XML", 787 | "dita": "XML", 788 | "ditamap": "XML", 789 | "ditaval": "XML", 790 | "dll.config": "XML", 791 | "dotsettings": "XML", 792 | "filters": "XML", 793 | "fsproj": "XML", 794 | "fxml": "XML", 795 | "glade": "XML", 796 | "grxml": "XML", 797 | "iml": "XML", 798 | "ivy": "XML", 799 | "jelly": "XML", 800 | "jsproj": "XML", 801 | 
"kml": "XML", 802 | "launch": "XML", 803 | "mdpolicy": "XML", 804 | "mxml": "XML", 805 | "nproj": "XML", 806 | "nuspec": "XML", 807 | "odd": "XML", 808 | "osm": "XML", 809 | "plist": "XML", 810 | "props": "XML", 811 | "ps1xml": "XML", 812 | "psc1": "XML", 813 | "pt": "XML", 814 | "rdf": "XML", 815 | "rss": "XML", 816 | "scxml": "XML", 817 | "srdf": "XML", 818 | "storyboard": "XML", 819 | "stTheme": "XML", 820 | "sublime-snippet": "XML", 821 | "targets": "XML", 822 | "tmCommand": "XML", 823 | "tml": "XML", 824 | "tmLanguage": "XML", 825 | "tmPreferences": "XML", 826 | "tmSnippet": "XML", 827 | "tmTheme": "XML", 828 | "ui": "XML", 829 | "urdf": "XML", 830 | "ux": "XML", 831 | "vbproj": "XML", 832 | "vcxproj": "XML", 833 | "vssettings": "XML", 834 | "vxml": "XML", 835 | "wsdl": "XML", 836 | "wsf": "XML", 837 | "wxi": "XML", 838 | "wxl": "XML", 839 | "wxs": "XML", 840 | "x3d": "XML", 841 | "xacro": "XML", 842 | "xaml": "XML", 843 | "xib": "XML", 844 | "xlf": "XML", 845 | "xliff": "XML", 846 | "xmi": "XML", 847 | "xml.dist": "XML", 848 | "xproj": "XML", 849 | "xsd": "XML", 850 | "xul": "XML", 851 | "zcml": "XML", 852 | "xsp-config": "XPages", 853 | "xsp.metadata": "XPages", 854 | "xpl": "XProc", 855 | "xproc": "XProc", 856 | "xquery": "XQuery", 857 | "xq": "XQuery", 858 | "xql": "XQuery", 859 | "xqm": "XQuery", 860 | "xqy": "XQuery", 861 | "xs": "XS", 862 | "xslt": "XSLT", 863 | "xsl": "XSLT", 864 | "xojo_code": "Xojo", 865 | "xojo_menu": "Xojo", 866 | "xojo_report": "Xojo", 867 | "xojo_script": "Xojo", 868 | "xojo_toolbar": "Xojo", 869 | "xojo_window": "Xojo", 870 | "xtend": "Xtend", 871 | "yml": "YAML", 872 | "reek": "YAML", 873 | "rviz": "YAML", 874 | "sublime-syntax": "YAML", 875 | "syntax": "YAML", 876 | "yaml": "YAML", 877 | "yaml-tmlanguage": "YAML", 878 | "yang": "YANG", 879 | "y": "Yacc", 880 | "yacc": "Yacc", 881 | "yy": "Yacc", 882 | "zep": "Zephir", 883 | "zimpl": "Zimpl", 884 | "zmpl": "Zimpl", 885 | "zpl": "Zimpl", 886 | "desktop": "desktop", 887 | "desktop.in": "desktop", 888 | "ec": "eC", 889 | "eh": "eC", 890 | "edn": "edn", 891 | "fish": "fish", 892 | "mu": "mupad", 893 | "nc": "nesC", 894 | "ooc": "ooc", 895 | "rst": "reStructuredText", 896 | "rest": "reStructuredText", 897 | "rest.txt": "reStructuredText", 898 | "rst.txt": "reStructuredText", 899 | "wisp": "wisp", 900 | "prg": "xBase", 901 | "prw": "xBase" 902 | } -------------------------------------------------------------------------------- /repro_eval/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irgroup/repro_eval/c0d7784f617d001908844003f4ec520c85534e6a/repro_eval/test/__init__.py -------------------------------------------------------------------------------- /repro_eval/test/test_empty_rpd.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator 3 | from repro_eval.config import ERR_MSG 4 | 5 | rpd_eval = RpdEvaluator(qrel_orig_path=None, 6 | run_b_orig_path=None, 7 | run_a_orig_path=None, 8 | run_b_rep_path=None, 9 | run_a_rep_path=None) 10 | 11 | 12 | def test_ktu(capfd): 13 | assert None is rpd_eval.ktau_union() 14 | out, err = capfd.readouterr() 15 | assert out == ''.join([ERR_MSG, '\n']) 16 | 17 | 18 | def test_rbo(capfd): 19 | assert None is rpd_eval.rbo() 20 | out, err = capfd.readouterr() 21 | assert out == ''.join([ERR_MSG, '\n']) 22 | 23 | 24 | def test_rmse(capfd): 25 | assert None is rpd_eval.rmse() 26 | out, err = 
capfd.readouterr() 27 | assert out == ''.join([ERR_MSG, '\n']) 28 | 29 | 30 | def test_er(capfd): 31 | assert None is rpd_eval.er() 32 | out, err = capfd.readouterr() 33 | assert out == ''.join([ERR_MSG, '\n']) 34 | 35 | 36 | def test_dri(capfd): 37 | assert None is rpd_eval.dri() 38 | out, err = capfd.readouterr() 39 | assert out == ''.join([ERR_MSG, '\n']) 40 | 41 | 42 | def test_ttest(capfd): 43 | assert None is rpd_eval.ttest() 44 | out, err = capfd.readouterr() 45 | assert out == ''.join([ERR_MSG, '\n']) 46 | -------------------------------------------------------------------------------- /repro_eval/test/test_empty_rpl.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RplEvaluator 3 | from repro_eval.config import ERR_MSG 4 | 5 | rpd_eval = RplEvaluator(qrel_orig_path=None, 6 | run_b_orig_path=None, 7 | run_a_orig_path=None, 8 | run_b_rep_path=None, 9 | run_a_rep_path=None, 10 | qrel_rpl_path=None) 11 | 12 | 13 | def test_er(capfd): 14 | assert None is rpd_eval.er() 15 | out, err = capfd.readouterr() 16 | assert out == ''.join([ERR_MSG, '\n']) 17 | 18 | 19 | def test_dri(capfd): 20 | assert None is rpd_eval.dri() 21 | out, err = capfd.readouterr() 22 | assert out == ''.join([ERR_MSG, '\n']) 23 | 24 | 25 | def test_ttest(capfd): 26 | assert None is rpd_eval.ttest() 27 | out, err = capfd.readouterr() 28 | assert out == ''.join([ERR_MSG, '\n']) 29 | -------------------------------------------------------------------------------- /repro_eval/test/test_kwargs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pytrec_eval 3 | from repro_eval.Evaluator import RpdEvaluator 4 | 5 | 6 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 7 | run_b_orig_path='./example/orig_b.txt', 8 | run_a_orig_path='./example/orig_a.txt', 9 | run_b_rep_path='./example/rpd_b.txt', 10 | run_a_rep_path='./example/rpd_a.txt') 11 | 12 | rpd_eval.trim() 13 | rpd_eval.evaluate() 14 | 15 | ktu = rpd_eval.ktau_union() 16 | ktu_base = ktu.get('baseline') 17 | ktu_adv = ktu.get('advanced') 18 | 19 | rbo = rpd_eval.rbo() 20 | rbo_base = rbo.get('baseline') 21 | rbo_adv = rbo.get('advanced') 22 | 23 | 24 | def test_path_ktu(): 25 | _ktu = rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 26 | assert 'baseline' in _ktu.keys() 27 | assert ktu_base == _ktu.get('baseline') 28 | assert 'advanced' in _ktu.keys() 29 | assert ktu_adv == _ktu.get('advanced') 30 | 31 | 32 | def test_path_rbo(): 33 | _rbo = rpd_eval.rbo(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 34 | assert 'baseline' in _rbo.keys() 35 | assert rbo_base == _rbo.get('baseline') 36 | assert 'advanced' in _rbo.keys() 37 | assert rbo_adv == _rbo.get('advanced') 38 | 39 | 40 | def test_run_ktu(): 41 | with open('./example/rpd_b.txt') as _base_file, open('./example/rpd_a.txt') as _adv_file: 42 | _base_run = pytrec_eval.parse_run(_base_file) 43 | _adv_run = pytrec_eval.parse_run(_adv_file) 44 | _ktu = rpd_eval.ktau_union(run_b_rep=_base_run, run_a_rep=_adv_run) 45 | assert 'baseline' in _ktu.keys() 46 | assert ktu_base == _ktu.get('baseline') 47 | assert 'advanced' in _ktu.keys() 48 | assert ktu_adv == _ktu.get('advanced') 49 | 50 | 51 | def test_run_rbo(): 52 | with open('./example/rpd_b.txt') as _base_file, open('./example/rpd_a.txt') as _adv_file: 53 | _base_run = pytrec_eval.parse_run(_base_file) 54 | _adv_run = 
pytrec_eval.parse_run(_adv_file) 55 | _rbo = rpd_eval.rbo(run_b_rep=_base_run, run_a_rep=_adv_run) 56 | assert 'baseline' in _rbo.keys() 57 | assert rbo_base == _rbo.get('baseline') 58 | assert 'advanced' in _rbo.keys() 59 | assert rbo_adv == _rbo.get('advanced') 60 | -------------------------------------------------------------------------------- /repro_eval/test/test_path_param.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator, RplEvaluator 3 | import numpy as np 4 | 5 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 6 | run_b_orig_path='./example/orig_b.txt', 7 | run_a_orig_path='./example/orig_a.txt', 8 | run_b_rep_path='./example/rpd_b.txt', 9 | run_a_rep_path='./example/rpd_a.txt') 10 | 11 | rpd_eval.trim() 12 | rpd_eval.evaluate() 13 | 14 | 15 | def test_ktu_path_param(): 16 | ktu = rpd_eval.ktau_union() 17 | assert 'baseline' in ktu.keys() 18 | assert 'advanced' in ktu.keys() 19 | 20 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 21 | run_b_orig_path='./example/orig_b.txt', 22 | run_a_orig_path='./example/orig_a.txt') 23 | _rpd_eval.trim() 24 | _rpd_eval.evaluate() 25 | 26 | _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt') 27 | assert 'baseline' in _ktu.keys() 28 | assert ktu.get('baseline') == _ktu.get('baseline') 29 | 30 | _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 31 | assert 'advanced' in _ktu.keys() 32 | assert ktu.get('advanced') == _ktu.get('advanced') 33 | 34 | 35 | def test_rbo_path_param(): 36 | rbo = rpd_eval.rbo() 37 | assert 'baseline' in rbo.keys() 38 | assert 'advanced' in rbo.keys() 39 | 40 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 41 | run_b_orig_path='./example/orig_b.txt', 42 | run_a_orig_path='./example/orig_a.txt') 43 | _rpd_eval.trim() 44 | _rpd_eval.evaluate() 45 | 46 | _rbo = _rpd_eval.rbo(run_b_path='./example/rpd_b.txt') 47 | assert 'baseline' in _rbo.keys() 48 | assert rbo.get('baseline') == _rbo.get('baseline') 49 | 50 | _rbo = _rpd_eval.rbo(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 51 | assert 'advanced' in _rbo.keys() 52 | assert rbo.get('advanced') == _rbo.get('advanced') 53 | 54 | 55 | def test_rmse_path_param(): 56 | rmse = rpd_eval.rmse() 57 | assert 'baseline' in rmse.keys() 58 | assert 'advanced' in rmse.keys() 59 | 60 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 61 | run_b_orig_path='./example/orig_b.txt', 62 | run_a_orig_path='./example/orig_a.txt') 63 | _rpd_eval.trim() 64 | _rpd_eval.evaluate() 65 | 66 | _rmse = _rpd_eval.rmse(run_b_path='./example/rpd_b.txt') 67 | assert 'baseline' in _rmse.keys() 68 | assert rmse.get('baseline') == _rmse.get('baseline') 69 | 70 | _rmse = _rpd_eval.rmse(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 71 | assert 'advanced' in _rmse.keys() 72 | assert rmse.get('advanced') == _rmse.get('advanced') 73 | 74 | 75 | def test_rpd_ttest_path_param(): 76 | pval = rpd_eval.ttest() 77 | assert 'baseline' in pval.keys() 78 | assert 'advanced' in pval.keys() 79 | 80 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 81 | run_b_orig_path='./example/orig_b.txt', 82 | run_a_orig_path='./example/orig_a.txt') 83 | _rpd_eval.trim() 84 | _rpd_eval.evaluate() 85 | 86 | _pval = _rpd_eval.ttest(run_b_path='./example/rpd_b.txt') 87 | assert 'baseline' in _pval.keys() 88 | # 
pick a few samples here since nan comparisons cause problems in combination with assert 89 | assert pval.get('baseline').get('ndcg') == _pval.get('baseline').get('ndcg') 90 | assert pval.get('baseline').get('P_10') == _pval.get('baseline').get('P_10') 91 | assert pval.get('baseline').get('map') == _pval.get('baseline').get('map') 92 | 93 | _pval = _rpd_eval.ttest(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 94 | assert 'advanced' in _pval.keys() 95 | # pick a few samples here since nan comparisons cause problems in combination with assert 96 | assert pval.get('advanced').get('ndcg') == _pval.get('advanced').get('ndcg') 97 | assert pval.get('advanced').get('P_10') == _pval.get('advanced').get('P_10') 98 | assert pval.get('advanced').get('map') == _pval.get('advanced').get('map') 99 | 100 | 101 | def test_rpl_ttest_path_param(): 102 | rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 103 | run_b_orig_path='./example/orig_b.txt', 104 | run_a_orig_path='./example/orig_a.txt', 105 | run_b_rep_path='./example/rpl_b.txt', 106 | run_a_rep_path='./example/rpl_a.txt', 107 | qrel_rpl_path='./example/data/qrels/core18.txt') 108 | 109 | rpl_eval.trim() 110 | rpl_eval.evaluate() 111 | 112 | pval = rpl_eval.ttest() 113 | assert 'baseline' in pval.keys() 114 | assert 'advanced' in pval.keys() 115 | 116 | _rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 117 | run_b_orig_path='./example/orig_b.txt', 118 | run_a_orig_path='./example/orig_a.txt', 119 | qrel_rpl_path='./example/data/qrels/core18.txt') 120 | _rpl_eval.trim() 121 | _rpl_eval.evaluate() 122 | 123 | _pval = _rpl_eval.ttest(run_b_path='./example/rpl_b.txt') 124 | assert 'baseline' in _pval.keys() 125 | # pick a few samples here since nan comparisons cause problems in combination with assert 126 | assert pval.get('baseline').get('ndcg') == _pval.get('baseline').get('ndcg') 127 | assert pval.get('baseline').get('P_10') == _pval.get('baseline').get('P_10') 128 | assert pval.get('baseline').get('map') == _pval.get('baseline').get('map') 129 | 130 | _pval = _rpl_eval.ttest(run_b_path='./example/rpl_b.txt', run_a_path='./example/rpl_a.txt') 131 | assert 'advanced' in _pval.keys() 132 | # pick a few samples here since nan comparisons cause problems in combination with assert 133 | assert pval.get('advanced').get('ndcg') == _pval.get('advanced').get('ndcg') 134 | assert pval.get('advanced').get('P_10') == _pval.get('advanced').get('P_10') 135 | assert pval.get('advanced').get('map') == _pval.get('advanced').get('map') 136 | 137 | 138 | def test_rpd_er_path_param(): 139 | er = rpd_eval.er() 140 | 141 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 142 | run_b_orig_path='./example/orig_b.txt', 143 | run_a_orig_path='./example/orig_a.txt') 144 | _rpd_eval.trim() 145 | _rpd_eval.evaluate() 146 | 147 | _er = _rpd_eval.er(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 148 | 149 | # pick a few samples here since nan comparisons cause problems in combination with assert 150 | assert er.get('ndcg') == _er.get('ndcg') 151 | assert er.get('P_10') == _er.get('P_10') 152 | assert er.get('map') == _er.get('map') 153 | 154 | 155 | def test_rpl_er_path_param(): 156 | rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 157 | run_b_orig_path='./example/orig_b.txt', 158 | run_a_orig_path='./example/orig_a.txt', 159 | run_b_rep_path='./example/rpl_b.txt', 160 | run_a_rep_path='./example/rpl_a.txt', 161 | 
qrel_rpl_path='./example/data/qrels/core18.txt') 162 | 163 | rpl_eval.trim() 164 | rpl_eval.evaluate() 165 | 166 | er = rpl_eval.er() 167 | 168 | _rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 169 | run_b_orig_path='./example/orig_b.txt', 170 | run_a_orig_path='./example/orig_a.txt', 171 | qrel_rpl_path='./example/data/qrels/core18.txt') 172 | _rpl_eval.trim() 173 | _rpl_eval.evaluate() 174 | 175 | _er = _rpl_eval.er(run_b_path='./example/rpl_b.txt', run_a_path='./example/rpl_a.txt') 176 | 177 | # pick a few samples here since nan comparisons cause problems in combination with assert 178 | assert er.get('ndcg') == _er.get('ndcg') 179 | assert er.get('P_10') == _er.get('P_10') 180 | assert er.get('map') == _er.get('map') 181 | 182 | 183 | def test_rpd_dri_path_param(): 184 | dri = rpd_eval.dri() 185 | 186 | _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 187 | run_b_orig_path='./example/orig_b.txt', 188 | run_a_orig_path='./example/orig_a.txt') 189 | _rpd_eval.trim() 190 | _rpd_eval.evaluate() 191 | 192 | _dri = _rpd_eval.dri(run_b_path='./example/rpd_b.txt', run_a_path='./example/rpd_a.txt') 193 | 194 | # pick a few samples here since nan comparisons cause problems in combination with assert 195 | assert dri.get('ndcg') == _dri.get('ndcg') 196 | assert dri.get('P_10') == _dri.get('P_10') 197 | assert dri.get('map') == _dri.get('map') 198 | 199 | 200 | def test_rpl_dri_path_param(): 201 | rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 202 | run_b_orig_path='./example/orig_b.txt', 203 | run_a_orig_path='./example/orig_a.txt', 204 | run_b_rep_path='./example/rpl_b.txt', 205 | run_a_rep_path='./example/rpl_a.txt', 206 | qrel_rpl_path='./example/data/qrels/core18.txt') 207 | 208 | rpl_eval.trim() 209 | rpl_eval.evaluate() 210 | 211 | dri = rpl_eval.dri() 212 | 213 | _rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 214 | run_b_orig_path='./example/orig_b.txt', 215 | run_a_orig_path='./example/orig_a.txt', 216 | qrel_rpl_path='./example/data/qrels/core18.txt') 217 | _rpl_eval.trim() 218 | _rpl_eval.evaluate() 219 | 220 | _dri = _rpl_eval.dri(run_b_path='./example/rpl_b.txt', run_a_path='./example/rpl_a.txt') 221 | 222 | # pick a few samples here since nan comparisons cause problems in combination with assert 223 | assert dri.get('ndcg') == _dri.get('ndcg') 224 | assert dri.get('P_10') == _dri.get('P_10') 225 | assert dri.get('map') == _dri.get('map') 226 | -------------------------------------------------------------------------------- /repro_eval/test/test_rbo.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator 3 | 4 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 5 | run_b_orig_path='./example/orig_b.txt', 6 | run_a_orig_path='./example/orig_a.txt', 7 | run_b_rep_path='./example/rpd_b.txt', 8 | run_a_rep_path='./example/rpd_a.txt') 9 | 10 | rpd_eval.trim() 11 | rpd_eval.evaluate() 12 | 13 | 14 | def test_rbo(): 15 | # compare rbo implementations by the 10th decimal 16 | 17 | rbo = rpd_eval.rbo() 18 | rbo_slow = rpd_eval.rbo(misinfo=False) 19 | 20 | for k, v in rbo.get('baseline').items(): 21 | rbo['baseline'][k] = round(v, 10) 22 | for k, v in rbo.get('advanced').items(): 23 | rbo['advanced'][k] = round(v, 10) 24 | for k, v in rbo_slow.get('baseline').items(): 25 | rbo_slow['baseline'][k] = round(v, 10) 26 | for k, v in rbo_slow.get('advanced').items(): 27 | 
rbo_slow['advanced'][k] = round(v, 10) 28 | 29 | assert rbo.get('baseline') == rbo_slow.get('baseline') 30 | assert rbo.get('advanced') == rbo_slow.get('advanced') 31 | 32 | 33 | -------------------------------------------------------------------------------- /repro_eval/test/test_rpd.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator 3 | 4 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 5 | run_b_orig_path='./example/orig_b.txt', 6 | run_a_orig_path='./example/orig_a.txt', 7 | run_b_rep_path='./example/rpd_b.txt', 8 | run_a_rep_path='./example/rpd_a.txt') 9 | 10 | rpd_eval.trim() 11 | rpd_eval.evaluate() 12 | 13 | 14 | def test_ktu(): 15 | ktu = rpd_eval.ktau_union() 16 | assert 'baseline' in ktu.keys() 17 | assert 'advanced' in ktu.keys() 18 | 19 | 20 | def test_rbo(): 21 | rbo = rpd_eval.rbo() 22 | assert 'baseline' in rbo.keys() 23 | assert 'advanced' in rbo.keys() 24 | 25 | 26 | def test_rmse(): 27 | rmse = rpd_eval.rmse() 28 | assert 'baseline' in rmse.keys() 29 | assert 'advanced' in rmse.keys() 30 | 31 | 32 | def test_nrmse(): 33 | nrmse = rpd_eval.nrmse() 34 | assert 'baseline' in nrmse.keys() 35 | assert 'advanced' in nrmse.keys() 36 | 37 | 38 | def test_er(): 39 | er = rpd_eval.er() 40 | assert 'map' in er.keys() 41 | assert 'recip_rank' in er.keys() 42 | assert 'P_10' in er.keys() 43 | 44 | 45 | def test_dri(): 46 | dri = rpd_eval.dri() 47 | assert 'map' in dri.keys() 48 | assert 'recip_rank' in dri.keys() 49 | assert 'P_10' in dri.keys() 50 | 51 | 52 | def test_ttest(): 53 | ttest = rpd_eval.ttest() 54 | assert 'baseline' in ttest.keys() 55 | assert 'advanced' in ttest.keys() -------------------------------------------------------------------------------- /repro_eval/test/test_rpl.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RplEvaluator 3 | 4 | rpl_eval = RplEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 5 | run_b_orig_path='./example/orig_b.txt', 6 | run_a_orig_path='./example/orig_a.txt', 7 | run_b_rep_path='./example/rpl_b.txt', 8 | run_a_rep_path='./example/rpl_a.txt', 9 | qrel_rpl_path='./example/data/qrels/core18.txt') 10 | 11 | rpl_eval.trim() 12 | rpl_eval.evaluate() 13 | 14 | 15 | def test_er(): 16 | er = rpl_eval.er() 17 | assert 'map' in er.keys() 18 | assert 'recip_rank' in er.keys() 19 | assert 'P_10' in er.keys() 20 | 21 | 22 | def test_dri(): 23 | dri = rpl_eval.dri() 24 | assert 'map' in dri.keys() 25 | assert 'recip_rank' in dri.keys() 26 | assert 'P_10' in dri.keys() 27 | 28 | 29 | def test_ttest(): 30 | ttest = rpl_eval.ttest() 31 | assert 'baseline' in ttest.keys() 32 | assert 'advanced' in ttest.keys() -------------------------------------------------------------------------------- /repro_eval/test/test_ttest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from repro_eval.Evaluator import RpdEvaluator 3 | 4 | 5 | def test_ttest_with_identical_score_distributions(): 6 | rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt', 7 | run_b_orig_path='./example/orig_b.txt', 8 | run_a_orig_path='./example/orig_a.txt', 9 | run_b_rep_path='./example/orig_b.txt', 10 | run_a_rep_path='./example/orig_a.txt') 11 | 12 | rpd_eval.trim() 13 | rpd_eval.evaluate() 14 | 15 | ttest = rpd_eval.ttest() 16 | 17 | pvals = list(filter(lambda x: x == 1.0, 
ttest.get('baseline').values())) 18 | assert len(pvals) == len(ttest.get('baseline').keys()) 19 | 20 | pvals = list(filter(lambda x: x == 1.0, ttest.get('advanced').values())) 21 | assert len(pvals) == len(ttest.get('advanced').keys()) 22 | -------------------------------------------------------------------------------- /repro_eval/util.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import OrderedDict 3 | import numpy as np 4 | from repro_eval.config import TRIM_THRESH, exclude 5 | 6 | 7 | def trim(run, thresh=TRIM_THRESH): 8 | """ 9 | Use this function to trim a run to a length of a document length specified by thresh. 10 | 11 | @param run: The run to be trimmed. 12 | @param thresh: The threshold value of the run length. 13 | """ 14 | for topic, docs in run.items(): 15 | run[topic] = dict(list(run[topic].items())[:thresh]) 16 | 17 | 18 | def arp(topic_scores): 19 | """ 20 | This function computes the Average Retrieval Performance (ARP) according to the following paper: 21 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 22 | How to Measure the Reproducibility of System-oriented IR Experiments. 23 | Proceedings of SIGIR, pages 349-358, 2020. 24 | 25 | The ARP score is defined by the mean across the different topic scores of a run. 26 | 27 | @param topic_scores: Topic scores of an evaluated run. 28 | @return: The ARP score. 29 | """ 30 | return np.array(list(topic_scores.values())).mean() 31 | 32 | 33 | def _arp_scores(run): 34 | """ 35 | Helping function returning a generator for determining the Average Retrieval Performance (ARP) scores. 36 | 37 | @param run: The run to be evaluated. 38 | @return: Generator with ARP scores for each trec_eval evaluation measure. 39 | """ 40 | measures_all = list(list(run.values())[0].keys()) 41 | measures_valid = [m for m in measures_all if m not in exclude] 42 | topics = run.keys() 43 | 44 | for measure in measures_valid: 45 | yield measure, np.array(list([run.get(topic).get(measure) for topic in topics])).mean() 46 | 47 | 48 | def arp_scores(run): 49 | """ 50 | This function computes the Average Retrieval Performance (ARP) scores according to the following paper: 51 | Timo Breuer, Nicola Ferro, Norbert Fuhr, Maria Maistro, Tetsuya Sakai, Philipp Schaer, Ian Soboroff. 52 | How to Measure the Reproducibility of System-oriented IR Experiments. 53 | Proceedings of SIGIR, pages 349-358, 2020. 54 | 55 | The ARP score is defined by the mean across the different topic scores of a run. 56 | For all measures outputted by trec_eval, the ARP scores will be determined. 57 | 58 | @param run: The run to be evaluated. 59 | @return: Dictionary containing the ARP scores for every measure outputted by trec_eval. 60 | """ 61 | return dict(_arp_scores(run)) 62 | 63 | 64 | def _topic_scores(run_scores): 65 | """ 66 | Helping function returning a generator for determining the topic scores for each measure. 67 | 68 | @param run_scores: The run scores of the previously evaluated run. 69 | @return: Generator with topic scores for each trec_eval evaluation measure. 
70 | """ 71 | measures_all = list(list(run_scores.values())[0].keys()) 72 | measures_valid = [m for m in measures_all if m not in exclude] 73 | topics = run_scores.keys() 74 | 75 | for measure in measures_valid: 76 | yield measure, [run_scores.get(topic).get(measure) for topic in topics] 77 | 78 | 79 | def topic_scores(run_scores): 80 | """ 81 | Use this function for a dictionary that contains the topic scores for each measure outputted by trec_eval. 82 | 83 | @param run_scores: The run scores of the previously evaluated run. 84 | @return: Dictionary containing the topic scores for every measure outputted by trec_eval. 85 | """ 86 | return dict(_topic_scores(run_scores)) 87 | 88 | 89 | def print_base_adv(measure_topic, repro_measure, base_value, adv_value=None): 90 | """ 91 | Pretty print output in trec_eval inspired style. Use this for printing baseline and/or advanced results. 92 | 93 | @param measure_topic: The topic number. 94 | @param repro_measure: Name of the reproduction/replication measure. 95 | @param base_value: Value of the evaluated baseline run. 96 | @param adv_value: Value of the evaluated advanced run. 97 | """ 98 | if adv_value: 99 | fill = ('{:3s}' if base_value < 0 else '{:4s}') 100 | print(('{:25s}{:8s}{:8s}{:.4f}' + fill + '{:8s}{:.4f}').format(measure_topic, repro_measure, 101 | 'BASE', base_value, ' ', 'ADV', adv_value)) 102 | else: 103 | print('{:25s}{:8s}{:8s}{:.4f}'.format(measure_topic, repro_measure, 'BASE', base_value)) 104 | 105 | 106 | def print_simple_line(measure, repro_measure, value): 107 | """ 108 | Use this for printing lines with trec_eval and reproduction/replication measures. 109 | Pretty print output in trec_eval inspired style. 110 | @param measure: Name of the trec_eval measure. 111 | @param repro_measure: Name of the reproduction/replication measure. 112 | @param value: Value of the evaluated run. 113 | @return: 114 | """ 115 | print('{:25s}{:8s}{:.4f}'.format(measure, repro_measure, value)) 116 | 117 | 118 | def break_ties(run): 119 | """ 120 | Use this function to break score ties like it is implemented in trec_eval. 121 | Documents with the same score will be sorted in reverse alphabetical order. 122 | :param run: Run with score ties. Nested dictionary structure (cf. 
pytrec_eval) 123 | :return: Reordered run 124 | """ 125 | for topic, ranking in run.items(): 126 | docid_score_tuple = list(ranking.items()) 127 | reordered_ranking = [] 128 | for k, v in itertools.groupby(docid_score_tuple, lambda item: item[1]): 129 | reordered_ranking.extend(sorted(v, reverse=True)) 130 | run[topic] = OrderedDict(reordered_ranking) 131 | return run 132 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | HERE = pathlib.Path(__file__).parent 5 | README = (HERE / "README.md").read_text() 6 | 7 | setup(name='repro_eval', 8 | version='0.4.0', 9 | description='A tool to quantify the replicability and reproducibility of system-oriented IR experiments.', 10 | long_description=README, 11 | long_description_content_type="text/markdown", 12 | url='http://github.com/irgroup/repro_eval', 13 | author='Timo Breuer', 14 | author_email='timo.breuer@th-koeln.de', 15 | license='MIT', 16 | packages=['repro_eval', 17 | 'repro_eval.measure', 18 | 'repro_eval.measure.external'], 19 | install_requires=[ 20 | 'pytrec_eval', 21 | 'numpy', 22 | 'scipy', 23 | 'tqdm', 24 | 'ruamel.yaml', 25 | 'GitPython', 26 | 'py-cpuinfo' 27 | ], 28 | include_package_data=True, 29 | package_data={'': ['resources/*.json']}, 30 | zip_safe=False) 31 | --------------------------------------------------------------------------------