├── pyproject.toml ├── images ├── evaluate.gif ├── nervaluate.gif └── evaluate_example.gif ├── setup.py ├── setup.cfg ├── LICENSE ├── .github └── workflows │ └── tests.yml ├── README.md ├── .gitignore ├── tests └── test_evaluate.py └── prodigy_evaluate └── __init__.py /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 220 -------------------------------------------------------------------------------- /images/evaluate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/evaluate.gif -------------------------------------------------------------------------------- /images/nervaluate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/nervaluate.gif -------------------------------------------------------------------------------- /images/evaluate_example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/evaluate_example.gif -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | if __name__ == "__main__": 4 | from setuptools import find_packages, setup 5 | 6 | setup(name="prodigy_evaluate", packages=find_packages()) 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = 0.1.0 3 | description = Recipes for evaluating spaCy pipelines 4 | url = https://github.com/explosion/prodigy-evaluate 5 | author = Explosion 6 | author_email = contact@explosion.ai 7 | 8 | [options] 9 | zip_safe = true 10 | python_requires = >=3.8 11 | install_requires = 12 | prodigy>=1.15.2,<2.0.0 13 | scikit-learn>=1.4.0,<1.5.0 14 | matplotlib>=3.8.0,<3.9.0 15 | nervaluate>=0.1.8,<0.2.0 16 | 17 | [options.entry_points] 18 | prodigy_recipes = 19 | evaluate.evaluate = prodigy_evaluate:evaluate 20 | evaluate.evaluate-example = prodigy_evaluate:evaluate_example 21 | evaluate.nervaluate = prodigy_evaluate:evaluate_nervaluate 22 | 23 | [bdist_wheel] 24 | universal = true 25 | 26 | [sdist] 27 | formats = gztar -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Explosion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | setup: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Python 3.9 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: 3.9 18 | cache: "pip" 19 | - name: Install Prodigy from private repo 20 | env: 21 | GITHUB_TOKEN: ${{ secrets.GHA_PRODIGY_READ }} # Use the secret here 22 | run: | 23 | export GIT_LFS_SKIP_SMUDGE=1 24 | pip install --upgrade pip 25 | git clone https://x-access-token:${GITHUB_TOKEN}@github.com/explosion/prodigy.git 26 | cd prodigy 27 | pip install setuptools wheel 28 | pip install -e . 29 | cd .. 30 | - name: Install additional dependencies 31 | run: | 32 | pip install -e . 33 | pip install pytest 34 | python -m spacy download en_core_web_sm 35 | pip install ruff black isort 36 | 37 | - name: Run pytest 38 | if: always() 39 | shell: bash 40 | run: python -m pytest tests 41 | 42 | - name: Run ruff 43 | if: always() 44 | shell: bash 45 | run: python -m ruff check prodigy_evaluate/ tests/ 46 | 47 | - name: Run black 48 | if: always() 49 | shell: bash 50 | run: python -m black --check prodigy_evaluate/ tests/ 51 | 52 | - name: Run isort 53 | if: always() 54 | shell: bash 55 | run: python -m isort prodigy_evaluate/ tests/ 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 🔎 Prodigy-evaluate 4 | 5 | This repository contains a Prodigy plugin for recipes to evaluate spaCy pipelines. It features multiple recipes: 6 | 7 | 1. `evaluate.evaluate`: Evaluate a spaCy pipeline on one or more datasets for different components. Passing flags like `--label-stats` or `--confusion-matrix` will compute a variety of evaluation metrics, including precision, recall, F1, accuracy, and more. 8 | 9 |

10 | 11 |
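For example, to score a pipeline on an NER dataset with per-label scores and a confusion matrix, a typical invocation looks like this (here `en_core_web_sm` and `my_eval_dataset` are placeholders for your own pipeline and Prodigy dataset):

```
prodigy evaluate.evaluate en_core_web_sm --ner my_eval_dataset --label-stats --confusion-matrix
```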

12 | 13 | 2. `evaluate.evaluate-example`: Evaluate a spaCy pipeline on one or more datasets for different components on a **per-example basis**. This is helpful for debugging and for understanding the hardest examples for your model. 14 | 15 |

16 | 17 |
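For example, to score each example individually and list the examples with the lowest NER F-score (again, the model and dataset names are placeholders; `ents_f` is one of the standard spaCy scorer keys):

```
prodigy evaluate.evaluate-example en_core_web_sm --ner my_eval_dataset --metric ents_f --n-results 10
```

You can also pass `--output-path` to write the lowest-scoring examples to a JSONL file for closer inspection.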

18 | 19 | 3. `evaluate.nervaluate`: Evaluate a spaCy NER component on one or more datasets. This recipe uses the `nervaluate` library to calculate various metrics for NER. You can learn more about the metrics in the [nervaluate documentation](https://github.com/MantisAI/nervaluate). This is helpful because the approach takes partial matches into account, which may be more relevant for your NER use case. 20 | 21 |

22 | 23 |
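For example (model and dataset names are again placeholders; `--per-label` additionally prints the nervaluate scores broken down by entity type):

```
prodigy evaluate.nervaluate en_core_web_sm --ner my_eval_dataset --per-label
```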

24 | 25 | 26 | You can install this plugin via `pip`. 27 | 28 | ``` 29 | pip install "prodigy-evaluate @ git+https://github.com/explosion/prodigy-evaluate" 30 | ``` 31 | 32 | To learn more about this plugin and additional functionality, you can check the [Prodigy docs](https://prodi.gy/docs/plugins/#evaluate). 33 | 34 | ## Issues? 35 | 36 | Are you having trouble with this plugin? Let us know on our [support forum](https://support.prodi.gy/) and we'll get back to you! -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | #ignore python environments 10 | *_env/ 11 | *_venv/ 12 | 13 | #ignore notebooks 14 | *.ipynb 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/#use-with-ide 117 | .pdm.toml 118 | 119 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 167 | #.idea/ 168 | 169 | .DS_Store -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Iterable, List 2 | 3 | import en_core_web_sm 4 | import pytest 5 | from prodigy.components.db import connect 6 | from prodigy.types import TaskType 7 | from spacy.training import Example 8 | 9 | from prodigy_evaluate import ( 10 | _create_ner_table, 11 | _display_eval_results, 12 | _get_actual_labels, 13 | _get_cf_actual_predicted, 14 | _get_predicted_labels, 15 | _get_score_for_metric, 16 | evaluate, 17 | evaluate_example, 18 | evaluate_nervaluate, 19 | ) 20 | 21 | 22 | @pytest.fixture 23 | def dataset() -> str: 24 | return "test_dataset" 25 | 26 | 27 | @pytest.fixture 28 | def spacy_model(): 29 | return "en_core_web_sm" 30 | 31 | 32 | @pytest.fixture 33 | def nlp(): 34 | return en_core_web_sm.load() 35 | 36 | 37 | @pytest.fixture 38 | def metric() -> str: 39 | return "ents_f" 40 | 41 | 42 | @pytest.fixture 43 | def data() -> Iterable[Dict]: 44 | return [ 45 | { 46 | "text": "My name is Freya.", 47 | "_input_hash": 896529854, 48 | "_task_hash": -1486695581, 49 | "tokens": [ 50 | {"text": "My", "start": 0, "end": 2, "id": 0, "ws": True}, 51 | {"text": "name", "start": 3, "end": 7, "id": 1, "ws": True}, 52 | {"text": "is", "start": 8, "end": 10, "id": 2, "ws": True}, 53 | {"text": "Freya", "start": 11, "end": 16, "id": 3, "ws": True}, 54 | {"text": ".", "start": 16, "end": 17, "id": 4, "ws": True}, 55 | ], 56 | "_view_id": "ner_manual", 57 | "spans": [ 58 | { 59 | "start": 11, 60 | "end": 16, 61 | "token_start": 3, 62 | "token_end": 3, 63 | "label": "PERSON", 64 | } 65 | ], 66 | "answer": "accept", 67 | "_timestamp": 1707211049, 68 | "_annotator_id": "2024-02-06_10-17-19", 69 | "_session_id": "2024-02-06_10-17-19", 70 | }, 71 | { 72 | "text": "My favorite city is London.", 73 | "_input_hash": -91551573, 74 | "_task_hash": -1162253049, 75 | "tokens": [ 76 | {"text": "My", "start": 0, "end": 2, "id": 0, "ws": True}, 77 | {"text": "favorite", "start": 3, "end": 11, "id": 1, "ws": True}, 78 | {"text": "city", "start": 12, "end": 16, "id": 2, "ws": True}, 79 | {"text": "is", "start": 17, "end": 19, "id": 3, "ws": True}, 80 | {"text": "London", "start": 20, "end": 26, "id": 4, "ws": True}, 81 | {"text": ".", "start": 26, "end": 27, 
"id": 5, "ws": False}, 82 | ], 83 | "_view_id": "ner_manual", 84 | "spans": [ 85 | { 86 | "start": 20, 87 | "end": 26, 88 | "token_start": 4, 89 | "token_end": 4, 90 | "label": "GPE", 91 | } 92 | ], 93 | "answer": "accept", 94 | "_timestamp": 1707211053, 95 | "_annotator_id": "2024-02-06_10-17-19", 96 | "_session_id": "2024-02-06_10-17-19", 97 | }, 98 | { 99 | "text": "I live in Berlin.", 100 | "_input_hash": -2101464790, 101 | "_task_hash": 1279282044, 102 | "tokens": [ 103 | {"text": "I", "start": 0, "end": 1, "id": 0, "ws": True}, 104 | {"text": "live", "start": 2, "end": 6, "id": 1, "ws": True}, 105 | {"text": "in", "start": 7, "end": 9, "id": 2, "ws": True}, 106 | {"text": "Berlin", "start": 10, "end": 16, "id": 3, "ws": True}, 107 | {"text": ".", "start": 16, "end": 17, "id": 4, "ws": True}, 108 | ], 109 | "_view_id": "ner_manual", 110 | "spans": [ 111 | { 112 | "start": 10, 113 | "end": 16, 114 | "token_start": 3, 115 | "token_end": 3, 116 | "label": "GPE", 117 | } 118 | ], 119 | "answer": "accept", 120 | "_timestamp": 1707211056, 121 | "_annotator_id": "2024-02-06_10-17-19", 122 | "_session_id": "2024-02-06_10-17-19", 123 | }, 124 | ] 125 | 126 | 127 | @pytest.fixture 128 | def scores() -> Dict[str, float]: 129 | return { 130 | "ents_f": 0.9, 131 | "ents_p": 0.8, 132 | "ents_r": 0.7, 133 | "tags_acc": 0.6, 134 | "sents_p": 0.5, 135 | "sents_r": 0.4, 136 | "sents_f": 0.3, 137 | } 138 | 139 | 140 | @pytest.fixture 141 | def db(dataset: str, data: List[TaskType]): 142 | database = connect() 143 | database.add_dataset(dataset) 144 | database.add_examples(data, datasets=[dataset]) 145 | return database 146 | 147 | 148 | @pytest.fixture 149 | def ner_examples(nlp): 150 | data = { 151 | "Apple Inc. is an American multinational technology company.": { 152 | "entities": [(0, 10, "ORG")] # Span covering "Apple Inc." 153 | }, 154 | "Musk is the CEO of Tesla, Inc.": { 155 | "entities": [ 156 | (0, 4, "PERSON"), 157 | (19, 30, "ORG"), 158 | ] # Spans covering "Musk" and "Tesla, Inc." 
159 | }, 160 | } 161 | examples = [] 162 | for text, annot in data.items(): 163 | examples.append(Example.from_dict(nlp.make_doc(text), annot)) 164 | 165 | return examples 166 | 167 | 168 | @pytest.fixture 169 | def textcat_examples(nlp): 170 | data = { 171 | "SpaCy is an amazing library for NLP.": {"POSITIVE": 1.0, "NEGATIVE": 0.0}, 172 | "I dislike rainy days.": {"POSITIVE": 0.0, "NEGATIVE": 1.0}, 173 | } 174 | 175 | examples = [] 176 | for text, annot in data.items(): 177 | doc = nlp.make_doc(text) 178 | doc.cats = annot 179 | ref_doc = nlp.make_doc(text) 180 | ref_doc.cats = annot 181 | example = Example(doc, ref_doc) 182 | examples.append(example) 183 | 184 | return examples 185 | 186 | 187 | @pytest.fixture 188 | def nervaluate_results(): 189 | return { 190 | "ent_type": { 191 | "correct": 2, 192 | "incorrect": 0, 193 | "partial": 0, 194 | "missed": 1, 195 | "spurious": 0, 196 | "possible": 3, 197 | "actual": 2, 198 | "precision": 1.0, 199 | "recall": 0.6666666666666666, 200 | "f1": 0.8, 201 | }, 202 | "partial": { 203 | "correct": 2, 204 | "incorrect": 0, 205 | "partial": 0, 206 | "missed": 1, 207 | "spurious": 0, 208 | "possible": 3, 209 | "actual": 2, 210 | "precision": 1.0, 211 | "recall": 0.6666666666666666, 212 | "f1": 0.8, 213 | }, 214 | "strict": { 215 | "correct": 2, 216 | "incorrect": 0, 217 | "partial": 0, 218 | "missed": 1, 219 | "spurious": 0, 220 | "possible": 3, 221 | "actual": 2, 222 | "precision": 1.0, 223 | "recall": 0.6666666666666666, 224 | "f1": 0.8, 225 | }, 226 | "exact": { 227 | "correct": 2, 228 | "incorrect": 0, 229 | "partial": 0, 230 | "missed": 1, 231 | "spurious": 0, 232 | "possible": 3, 233 | "actual": 2, 234 | "precision": 1.0, 235 | "recall": 0.6666666666666666, 236 | "f1": 0.8, 237 | }, 238 | } 239 | 240 | 241 | ######## evaluation tests ######## 242 | 243 | 244 | def test_evaluate_example(spacy_model, dataset, metric, db, capsys): 245 | evaluate_example(model=spacy_model, ner=dataset, metric=metric, n_results=5) 246 | 247 | captured = capsys.readouterr() 248 | 249 | assert "Scored Example" in captured.out 250 | 251 | db.drop_dataset(dataset) 252 | 253 | 254 | def test_evaluate(spacy_model, dataset, db, capsys): 255 | results = evaluate( 256 | model=spacy_model, 257 | ner=dataset, 258 | label_stats=True, 259 | cf_matrix=False, # False 260 | ) 261 | 262 | captured = capsys.readouterr() 263 | 264 | assert "P" in captured.out 265 | assert "R" in captured.out 266 | assert "F" in captured.out 267 | 268 | assert isinstance(results, dict) 269 | assert "token_acc" in results 270 | assert "token_p" in results 271 | assert results.get("token_p") == 1 272 | assert isinstance(results.get("ents_p"), float) 273 | assert results.get("speed") > 1 274 | 275 | db.drop_dataset(dataset) 276 | 277 | 278 | def test_nervaluate(spacy_model, dataset, db, capsys): 279 | results = evaluate_nervaluate( 280 | model=spacy_model, 281 | ner=dataset, 282 | ) 283 | captured = capsys.readouterr() 284 | 285 | assert "Correct" in captured.out 286 | assert "Metric" in captured.out 287 | assert "Ent type" in captured.out 288 | assert "Incorrect" in captured.out 289 | assert "Recall" in captured.out 290 | assert "F1" in captured.out 291 | assert "Partial" in captured.out 292 | 293 | assert isinstance(results, dict) 294 | assert "ent_type" in list(results["overall_results"].keys()) 295 | assert "partial" in results["overall_results"] 296 | 297 | assert results["overall_results"]["ent_type"]["f1"] == 1.0 298 | 299 | db.drop_dataset(dataset) 300 | 301 | 302 | def 
test_display_eval_results(scores, capsys): 303 | _display_eval_results(scores, "sc") 304 | captured = capsys.readouterr() 305 | 306 | assert "Results" in captured.out 307 | 308 | 309 | def test_get_score_for_metric(scores, metric: str): 310 | res = _get_score_for_metric(scores, metric) 311 | 312 | assert isinstance(res, float) 313 | assert isinstance(scores, dict) 314 | assert isinstance(metric, str) 315 | assert metric is not None 316 | 317 | 318 | def test_get_actual_labels_ner(ner_examples): 319 | ner_labels = _get_actual_labels(ner_examples, "ner") 320 | assert isinstance(ner_labels, list) 321 | assert len(ner_labels) == 2 322 | assert all(isinstance(label, str) for label in ner_labels[0]) 323 | assert all(isinstance(label, str) for label in ner_labels[1]) 324 | assert "O" in ner_labels[0] 325 | assert "B-ORG" in ner_labels[0] 326 | assert "U-PERSON" in ner_labels[1] 327 | 328 | 329 | def test_get_actual_labels_textcat(textcat_examples): 330 | textcat_labels = _get_actual_labels(textcat_examples, "textcat") 331 | assert isinstance(textcat_labels, list) 332 | assert len(textcat_labels) == 2 333 | assert "POSITIVE" in textcat_labels 334 | assert "NEGATIVE" in textcat_labels 335 | assert all(isinstance(label, str) for label in textcat_labels) 336 | 337 | 338 | # here we need a model as we're using one in _get_predicted_labels 339 | # because nlp.evaluate does not create example.predicted values 340 | def test_get_predicted_labels_ner(nlp, ner_examples): 341 | pred_ner_labels = _get_predicted_labels(nlp, ner_examples, "ner") 342 | assert isinstance(pred_ner_labels, list) 343 | assert len(pred_ner_labels) == 2 344 | assert all(isinstance(label, str) for label in pred_ner_labels[0]) 345 | assert all(isinstance(label, str) for label in pred_ner_labels[1]) 346 | 347 | assert "O" in pred_ner_labels[1] 348 | assert "B-ORG" in pred_ner_labels[0] 349 | 350 | 351 | def test_get_cf_actual_predicted(nlp, ner_examples): 352 | actual, predicted, labels, actual_flat, predicted_flat = _get_cf_actual_predicted( 353 | nlp, ner_examples, "ner" 354 | ) 355 | assert isinstance(actual[0], list) 356 | assert isinstance(actual_flat[0], str) 357 | 358 | assert isinstance(predicted[0], list) 359 | assert isinstance(predicted_flat[1], str) 360 | 361 | assert isinstance(actual, list) 362 | assert isinstance(predicted, list) 363 | assert isinstance(labels, list) 364 | assert "O" in actual[0] 365 | assert "B-ORG" in predicted[1] 366 | 367 | 368 | def test_create_ner_table(nervaluate_results, capsys): 369 | _create_ner_table(nervaluate_results) 370 | captured = capsys.readouterr() 371 | 372 | assert "Correct" in captured.out 373 | assert "Metric" in captured.out 374 | assert "Ent type" in captured.out 375 | assert "Incorrect" in captured.out 376 | assert "Recall" in captured.out 377 | assert "F1" in captured.out 378 | assert "Partial" in captured.out 379 | -------------------------------------------------------------------------------- /prodigy_evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union 6 | 7 | import matplotlib.pyplot as plt 8 | import spacy 9 | import srsly 10 | from nervaluate import Evaluator 11 | from prodigy.core import recipe 12 | from prodigy.errors import RecipeError 13 | from prodigy.recipes.data_utils import get_datasets_from_cli_eval, merge_corpus 14 | from 
prodigy.recipes.train import RECIPE_ARGS, set_log_level, setup_gpu 15 | from prodigy.util import SPANCAT_DEFAULT_KEY, msg 16 | from radicli import Arg 17 | 18 | # additional imports 19 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 20 | from spacy.cli.evaluate import handle_scores_per_type 21 | from spacy.language import Language 22 | from spacy.training import offsets_to_biluo_tags 23 | from spacy.training.example import Example 24 | 25 | 26 | @recipe( 27 | "evaluate.evaluate", 28 | # fmt: off 29 | model=Arg(help="Name or path of model to evaluate"), 30 | ner=RECIPE_ARGS["ner"], 31 | textcat=RECIPE_ARGS["textcat"], 32 | textcat_multilabel=RECIPE_ARGS["textcat_multilabel"], 33 | tagger=RECIPE_ARGS["tagger"], 34 | senter=RECIPE_ARGS["senter"], 35 | parser=RECIPE_ARGS["parser"], 36 | spancat=RECIPE_ARGS["spancat"], 37 | coref=RECIPE_ARGS["coref"], 38 | label_stats=Arg("--label-stats", "-LS", help="Show per-label scores"), 39 | gpu_id=RECIPE_ARGS["gpu_id"], 40 | verbose=RECIPE_ARGS["verbose"], 41 | silent=RECIPE_ARGS["silent"], 42 | cf_matrix = Arg("--confusion-matrix", "-CF", help="Show confusion matrix for the specified component"), 43 | cf_path = Arg("--cf-path", "-CP", help="Path to save the confusion matrix array"), 44 | spans_key=Arg("--spans-key", help="Optional spans key to evaluate if spancat component is used."), 45 | # fmt: on 46 | ) 47 | def evaluate( 48 | model: Union[str, Path], 49 | ner: Sequence[str] = tuple(), 50 | textcat: Sequence[str] = tuple(), 51 | textcat_multilabel: Sequence[str] = tuple(), 52 | tagger: Sequence[str] = tuple(), 53 | senter: Sequence[str] = tuple(), 54 | parser: Sequence[str] = tuple(), 55 | spancat: Sequence[str] = tuple(), 56 | coref: Sequence[str] = tuple(), 57 | label_stats: bool = False, 58 | gpu_id: int = -1, 59 | verbose: bool = False, 60 | silent: bool = True, 61 | cf_matrix: bool = False, 62 | cf_path: Optional[Path] = None, 63 | spans_key: str = SPANCAT_DEFAULT_KEY, 64 | ) -> Dict[str, Any]: 65 | """Evaluate a spaCy pipeline on one or more datasets for different components. 66 | 67 | This command takes care of merging all annotations on the same input data like the 68 | prodigy train command. 69 | 70 | You can also use the --label-stats flag to show per-label scores for NER and textcat 71 | components. This will show the precision, recall and F-score for each label. 72 | 73 | Finally, you can also use --confusion-matrix to show the confusion matrix for the 74 | specified component. This will only work for NER or textcat components. 
75 | 76 | Example Usage: 77 | 78 | ``` 79 | prodigy evaluate.evaluate en_core_web_sm --ner my_eval_dataset --confusion-matrix 80 | ``` 81 | """ 82 | set_log_level(verbose=verbose, silent=silent) 83 | setup_gpu(gpu_id) 84 | nlp = spacy.load(model) 85 | 86 | pipes = get_datasets_from_cli_eval( 87 | ner, 88 | textcat, 89 | textcat_multilabel, 90 | tagger, 91 | senter, 92 | parser, 93 | spancat, 94 | coref, 95 | ) 96 | pipe_key = [k for k in pipes if pipes.get(k)][0] 97 | 98 | compat_pipes = { 99 | pipe_name: ([], eval_sets) for pipe_name, eval_sets in pipes.items() 100 | } 101 | merged_corpus = merge_corpus(nlp, compat_pipes) 102 | dev_examples = merged_corpus["dev"](nlp) 103 | scores = nlp.evaluate(dev_examples) 104 | 105 | if pipe_key in ["ner", "textcat"]: 106 | ( 107 | actual_labels, 108 | predicted_labels, 109 | labels, 110 | flat_actual_labels, 111 | flat_predicted_labels, 112 | ) = _get_cf_actual_predicted( 113 | nlp=nlp, dev_examples=dev_examples, pipe_key=pipe_key 114 | ) 115 | labels_to_include = [label for label in labels if label != "O"] 116 | if pipe_key == "ner": 117 | actual_labels = flat_actual_labels 118 | predicted_labels = flat_predicted_labels 119 | 120 | cfarray = confusion_matrix( 121 | actual_labels, predicted_labels, labels=labels_to_include, normalize="true" 122 | ) 123 | 124 | _display_eval_results( 125 | scores, spans_key=spans_key, silent=False, per_type=label_stats 126 | ) 127 | 128 | if cf_matrix: 129 | if pipe_key not in ["ner", "textcat"]: 130 | msg.fail( 131 | f"Confusion matrix is not supported for {pipe_key} component", exits=1 132 | ) 133 | _display_confusion_matrix( 134 | cm=cfarray, 135 | labels=labels_to_include, 136 | ) 137 | msg.good("Confusion matrix displayed") 138 | 139 | if cf_path: 140 | if pipe_key not in ["ner", "textcat"]: 141 | msg.fail( 142 | f"Confusion matrix is not supported for {pipe_key} component", exits=1 143 | ) 144 | if not cf_path.exists(): 145 | os.makedirs(cf_path) 146 | 147 | full_cf_path = cf_path / "cf_array.json" 148 | srsly.write_json( 149 | full_cf_path, 150 | { 151 | "cf_array": cfarray.tolist(), 152 | "labels": labels_to_include, 153 | }, 154 | ) 155 | msg.good(f"Confusion matrix array saved to {full_cf_path}") 156 | 157 | return scores 158 | 159 | 160 | @recipe( 161 | "evaluate.evaluate-example", 162 | # fmt: off 163 | model=Arg(help="Path to model to evaluate"), 164 | ner=RECIPE_ARGS["ner"], 165 | textcat=RECIPE_ARGS["textcat"], 166 | textcat_multilabel=RECIPE_ARGS["textcat_multilabel"], 167 | tagger=RECIPE_ARGS["tagger"], 168 | senter=RECIPE_ARGS["senter"], 169 | parser=RECIPE_ARGS["parser"], 170 | spancat=RECIPE_ARGS["spancat"], 171 | coref=RECIPE_ARGS["coref"], 172 | gpu_id=RECIPE_ARGS["gpu_id"], 173 | verbose=RECIPE_ARGS["verbose"], 174 | silent=RECIPE_ARGS["silent"], 175 | metric=Arg("--metric", "-m", help="Metric to use for sorting examples"), 176 | n_results = Arg("--n-results", "-NR", help="Number of top examples to display"), 177 | output_path=Arg("--output-path", "-OP", help="Path to save the top examples and scores") 178 | # fmt: on 179 | ) 180 | def evaluate_example( 181 | model: Union[str, Path], 182 | ner: Sequence[str] = tuple(), 183 | textcat: Sequence[str] = tuple(), 184 | textcat_multilabel: Sequence[str] = tuple(), 185 | tagger: Sequence[str] = tuple(), 186 | senter: Sequence[str] = tuple(), 187 | parser: Sequence[str] = tuple(), 188 | spancat: Sequence[str] = tuple(), 189 | coref: Sequence[str] = tuple(), 190 | gpu_id: int = -1, 191 | verbose: bool = False, 192 | silent: bool = True, 193 | metric: 
Optional[str] = None, 194 | n_results: int = 10, 195 | output_path: Optional[Path] = None, 196 | ): 197 | """Evaluate a spaCy pipeline on one or more datasets for different components 198 | on a per-example basis. This command will run an evaluation on each example individually 199 | and then sort by the desired `--metric` argument. 200 | 201 | This is useful for debugging and understanding the easiest 202 | and hardest examples for your model. 203 | 204 | Example Usage: 205 | ``` 206 | prodigy evaluate.evaluate-example en_core_web_sm --ner my_eval_dataset --metric ents_f 207 | ``` 208 | 209 | This will sort examples by lowest NER F-score. 210 | """ 211 | if not metric: 212 | raise RecipeError( 213 | "You must pass a metric to sort examples via --metric argument. Refer to prodigy evaluate-example documentation for available metric types." 214 | ) 215 | 216 | set_log_level(verbose=verbose, silent=silent) 217 | setup_gpu(gpu_id) 218 | nlp = spacy.load(model) 219 | 220 | pipes = get_datasets_from_cli_eval( 221 | ner, 222 | textcat, 223 | textcat_multilabel, 224 | tagger, 225 | senter, 226 | parser, 227 | spancat, 228 | coref, 229 | ) 230 | compat_pipes = { 231 | pipe_name: ([], eval_sets) for pipe_name, eval_sets in pipes.items() 232 | } 233 | merged_corpus = merge_corpus(nlp, compat_pipes) 234 | dev_examples = merged_corpus["dev"](nlp) 235 | results: List[ScoredExample] = evaluate_each_example(nlp, dev_examples, metric) 236 | 237 | top_results: List[ScoredExample] = results[:n_results] 238 | 239 | if len(top_results) == 0: 240 | msg.fail(f"No examples found for the metric {metric}.", exits=1) 241 | avg_text_len = sum([len(ex.example.text) for ex in top_results]) / len(top_results) 242 | if avg_text_len > 100: 243 | msg.warn( 244 | f"Average # of characters of top examples is {round(avg_text_len, 2)}. This will not display well in the terminal. Consider saving the top examples to file with `--output-path` and investigating accordingly." 245 | ) 246 | 247 | def split_string_into_tuples(input_string: str, length: int = 50): 248 | """ 249 | This function takes a string and splits it into tuples of length `length`. 250 | Useful for wrapping long strings in tables. 
251 | """ 252 | input_string = input_string.rstrip() 253 | if len(input_string) > length: 254 | result = tuple( 255 | input_string[i : i + length].rstrip() 256 | for i in range(0, len(input_string), length) 257 | ) 258 | return result 259 | else: 260 | return input_string 261 | 262 | data = [ 263 | ( 264 | split_string_into_tuples(ex.example.text), 265 | round(ex.score, 2) if ex.score is not None else None, 266 | ) 267 | for ex in top_results 268 | ] 269 | headers = ["Example", metric] 270 | widths = (50, 9) 271 | aligns = ("l", "l") 272 | 273 | msg.divider("Scored Examples") 274 | msg.table( 275 | data, header=headers, divider=True, widths=widths, aligns=aligns, multiline=True 276 | ) 277 | 278 | if output_path: 279 | if not output_path.exists(): 280 | os.makedirs(output_path) 281 | 282 | results_path = output_path / "hardest_examples.jsonl" 283 | 284 | results_jsonl = [] 285 | for data in top_results: 286 | results_json = { 287 | "text": data.example.text, 288 | "meta": {"score": data.score, "metric": metric}, 289 | } 290 | results_jsonl.append(results_json) 291 | 292 | srsly.write_jsonl(results_path, results_jsonl) 293 | msg.good(f"The examples with the lowest scores saved to {results_path}") 294 | msg.info( 295 | "You can inspect the NER/spancat/textcat predictions on the hardest examples by running one of the Prodigy `*.correct` or `*.model-annotate` workflows. See documentation for more details: https://prodi.gy/docs/recipes" 296 | ) 297 | 298 | 299 | @recipe( 300 | "evaluate.nervaluate", 301 | # fmt: off 302 | model=Arg(help="Path to model to evaluate"), 303 | ner=RECIPE_ARGS["ner"], 304 | gpu_id=RECIPE_ARGS["gpu_id"], 305 | verbose=RECIPE_ARGS["verbose"], 306 | per_label=Arg("--per-label", "-PL", help="Show per-label NER nervaluate scores"), 307 | # fmt: on 308 | ) 309 | def evaluate_nervaluate( 310 | model: Union[str, Path], 311 | ner: Sequence[str], 312 | gpu_id: int = -1, 313 | verbose: bool = False, 314 | per_label: bool = False, 315 | ): 316 | """ 317 | Evaluate spaCy's NER component using nervaluate metrics. the `nervaluate` library 318 | provides full named-entity (i.e. not tag/token) evaluation metrics based on SemEval’13. 319 | 320 | For more information on these metric, see https://github.com/MantisAI/nervaluate. 321 | 322 | Example Usage: 323 | 324 | ``` 325 | prodigy evaluate.nervaluate en_core_web_sm --ner my_eval_dataset 326 | ``` 327 | """ 328 | set_log_level(verbose=verbose, silent=True) # silence component merging 329 | setup_gpu(gpu_id) 330 | nlp = spacy.load(model) 331 | merged_corpus = merge_corpus(nlp, {"ner": ([], [ner])}) 332 | dev_examples = merged_corpus["dev"](nlp) 333 | ( 334 | actual_labels, 335 | predicted_labels, 336 | labels, 337 | flat_actual_labels, 338 | flat_predicted_labels, 339 | ) = _get_cf_actual_predicted(nlp=nlp, dev_examples=dev_examples, pipe_key="ner") 340 | 341 | evaluator = Evaluator(actual_labels, predicted_labels, tags=labels, loader="list") 342 | ner_results, ner_results_by_tag = evaluator.evaluate() 343 | msg.divider("nervaluate NER metrics") 344 | msg.info( 345 | "Full named-entity (i.e., not tag/token) evaluation metrics based on SemEval’13. 
For more information on these metrics, see https://github.com/MantisAI/nervaluate" 346 | ) 347 | msg.text("NER: Overall") 348 | _create_ner_table(ner_results) 349 | 350 | if per_label: 351 | for tag, tag_results in ner_results_by_tag.items(): 352 | if tag != "O": 353 | msg.text(title=f"NER: {tag}") 354 | _create_ner_table(tag_results) 355 | 356 | return {"overall_results": ner_results, "results_by_tag": ner_results_by_tag} 357 | 358 | 359 | @dataclass 360 | class ScoredExample: 361 | example: Example 362 | score: Optional[float] 363 | scores: Dict[str, float] 364 | 365 | 366 | def _get_score_for_metric(scores: Dict[str, float], metric: str) -> Union[float, None]: 367 | """Returns the score for the specified metric. 368 | 369 | Args: 370 | scores (Dict[str, float]): Dictionary containing scores for different metrics 371 | metric (str): Metric to get the score for 372 | 373 | Returns: 374 | Union[float, None]: Score for the specified metric or None if not found 375 | """ 376 | 377 | return scores.get(metric, None) 378 | 379 | 380 | def evaluate_each_example( 381 | nlp: Language, 382 | dev_examples: Iterable[Example], 383 | metric: str, 384 | desc: bool = False, 385 | skip_none: bool = True, 386 | ) -> List[ScoredExample]: 387 | def sort_key(x: Tuple[Example, Dict[str, float]]) -> Union[float, int]: 388 | _, eval_scores = x 389 | res = _get_score_for_metric(eval_scores, metric) 390 | if res is None: 391 | res = 0 392 | if not isinstance(res, (float, int)): 393 | raise ValueError(f"Invalid metric to sort by: {metric}", res) 394 | return res 395 | 396 | per_example_scores = {} 397 | for example in dev_examples: 398 | scores = nlp.evaluate([example]) 399 | res = _get_score_for_metric(scores, metric) 400 | if res is None and skip_none: 401 | continue 402 | per_example_scores[example] = scores 403 | 404 | sorted_per_example_scores = [ 405 | ScoredExample( 406 | example=eg, 407 | score=_get_score_for_metric(example_scores, metric), 408 | scores=example_scores, 409 | ) 410 | for eg, example_scores in sorted( 411 | per_example_scores.items(), key=sort_key, reverse=desc 412 | ) 413 | ] 414 | return sorted_per_example_scores 415 | 416 | 417 | def _display_eval_results( 418 | scores: Dict[str, Any], spans_key: str, silent: bool = False, per_type: bool = False 419 | ) -> None: 420 | """Displays the evaluation results for the specified component. 421 | 422 | Args: 423 | scores (Dict[str, Any]): Dictionary containing evaluation scores from `nlp.evaluate` 424 | spans_key (str): Optional spans key to evaluate if spancat component is used. 425 | silent (bool, optional): Whether to display all results or not. Defaults to False. 
426 | """ 427 | metrics = { 428 | "TOK": "token_acc", 429 | "TAG": "tag_acc", 430 | "POS": "pos_acc", 431 | "MORPH": "morph_acc", 432 | "LEMMA": "lemma_acc", 433 | "UAS": "dep_uas", 434 | "LAS": "dep_las", 435 | "NER P": "ents_p", 436 | "NER R": "ents_r", 437 | "NER F": "ents_f", 438 | "TEXTCAT": "cats_score", 439 | "SENT P": "sents_p", 440 | "SENT R": "sents_r", 441 | "SENT F": "sents_f", 442 | "SPAN P": f"spans_{spans_key}_p", 443 | "SPAN R": f"spans_{spans_key}_r", 444 | "SPAN F": f"spans_{spans_key}_f", 445 | "SPEED": "speed", 446 | } 447 | results = {} 448 | data = {} 449 | for metric, key in metrics.items(): 450 | if key in scores: 451 | if key == "cats_score": 452 | metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" 453 | if isinstance(scores[key], (int, float)): 454 | if key == "speed": 455 | results[metric] = f"{scores[key]:.0f}" 456 | else: 457 | results[metric] = f"{scores[key]*100:.2f}" 458 | else: 459 | results[metric] = "-" 460 | data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] 461 | msg.table(results, title="Results") 462 | 463 | if per_type: 464 | data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) 465 | 466 | 467 | #### Confusion matrix functions #### 468 | 469 | 470 | def _get_actual_labels(dev_examples: Iterable[Example], pipe_key: str) -> List[Any]: 471 | """Returns the actual labels for the specified component. 472 | 473 | Args: 474 | dev_examples (Iterable[Example]): List of examples 475 | pipe_key (str): Name of the component 476 | 477 | Returns: 478 | List[Any]: List of actual labels 479 | """ 480 | actual_labels = [] 481 | for ex in dev_examples: 482 | ref = ex.reference # we have reference but we don't have predicted 483 | if pipe_key == "ner": 484 | ents = ex.get_aligned_ner() 485 | ents_clean = ["O" if x is None else x for x in ents] 486 | actual_labels.append(ents_clean) 487 | elif pipe_key == "textcat": 488 | text_labels = ref.cats 489 | most_likely_class = ( 490 | max(text_labels, key=lambda k: text_labels[k]) 491 | if text_labels != {} 492 | else "O" 493 | ) 494 | actual_labels.append(most_likely_class) 495 | 496 | return actual_labels 497 | 498 | 499 | def _get_predicted_labels( 500 | nlp: Language, dev_examples: Iterable[Example], pipe_key: str 501 | ) -> List[Any]: 502 | """Returns the predicted labels for the specified component. 503 | 504 | Args: 505 | nlp (Language): spaCy model 506 | dev_examples (Iterable[Example]): List of examples 507 | pipe_key (str): Name of the component 508 | 509 | Returns: 510 | List[Any]: List of predicted labels 511 | """ 512 | 513 | texts = [eg.text for eg in dev_examples] 514 | pred_labels = [] 515 | for eg in nlp.pipe(texts): 516 | if pipe_key == "ner": 517 | ents = [(ent.start_char, ent.end_char, ent.label_) for ent in eg.ents] 518 | biluo_tags = offsets_to_biluo_tags(eg, ents) 519 | pred_labels.append(biluo_tags) 520 | elif pipe_key == "textcat": 521 | text_labels = eg.cats 522 | most_likely_class = ( 523 | max(text_labels, key=lambda k: text_labels[k]) 524 | if text_labels != {} 525 | else "O" 526 | ) 527 | pred_labels.append(most_likely_class) 528 | 529 | return pred_labels 530 | 531 | 532 | def _get_cf_actual_predicted( 533 | nlp: Language, dev_examples: Iterable[Example], pipe_key: str 534 | ): 535 | """Returns the actual and predicted labels for the specified component. 
536 | 537 | Args: 538 | nlp (Language): spaCy model 539 | dev_examples (Iterable[Example]): List of examples 540 | pipe_key (str): Name of the component 541 | 542 | Returns: 543 | Tuple containing actual labels, predicted labels, labels, flat actual labels and flat predicted labels 544 | """ 545 | actual_labels = [label for label in _get_actual_labels(dev_examples, pipe_key)] 546 | predicted_labels = [ 547 | label for label in _get_predicted_labels(nlp, dev_examples, pipe_key) 548 | ] 549 | if pipe_key == "textcat": 550 | labels = set(predicted_labels).union(set(actual_labels)) 551 | return actual_labels, predicted_labels, list(labels), [], [] 552 | 553 | elif pipe_key == "ner": 554 | actual_labels_flat = [ 555 | label.split("-")[-1] for sublist in actual_labels for label in sublist 556 | ] 557 | predicted_labels_flat = [ 558 | label.split("-")[-1] for sublist in predicted_labels for label in sublist 559 | ] 560 | labels = set(predicted_labels_flat).union(set(actual_labels_flat)) 561 | 562 | return ( 563 | actual_labels, 564 | predicted_labels, 565 | list(labels), 566 | actual_labels_flat, 567 | predicted_labels_flat, 568 | ) 569 | 570 | 571 | def _display_confusion_matrix(cm: List[List[float]], labels: List[Any]) -> None: 572 | """Displays the confusion matrix for the specified component. 573 | 574 | Args: 575 | cm (List[List[float]]): Confusion matrix array 576 | labels (List[Any]): List of labels 577 | """ 578 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels) 579 | ax = disp.plot(colorbar=False, cmap="Blues") 580 | ax.ax_.set_title("Confusion Matrix") 581 | plt.show() 582 | 583 | 584 | def _create_ner_table(results: Dict[str, Dict[str, float]]): 585 | """Creates a table for NER results. 586 | 587 | Args: 588 | results (Dict[str, Dict[str, float]]): Dictionary containing NER results. 589 | """ 590 | 591 | ner_metrics = [ 592 | "correct", 593 | "incorrect", 594 | "partial", 595 | "missed", 596 | "spurious", 597 | "possible", 598 | "actual", 599 | "precision", 600 | "recall", 601 | "f1", 602 | ] 603 | headers = tuple(["Metric"] + [m.capitalize() for m in ner_metrics]) 604 | 605 | metrics_formatted = [] 606 | for eval_type, metrics in results.items(): 607 | row = [eval_type.replace("_", " ").capitalize()] 608 | row.extend( 609 | [ 610 | ( 611 | round(metrics.get(key, None), 2) 612 | if metrics.get(key, None) is not None 613 | else None 614 | ) 615 | for key in ner_metrics 616 | ] 617 | ) 618 | metrics_formatted.append(row) 619 | 620 | msg.table(metrics_formatted, header=headers, divider=True) 621 | --------------------------------------------------------------------------------