├── pyproject.toml
├── images
│   ├── evaluate.gif
│   ├── nervaluate.gif
│   └── evaluate_example.gif
├── setup.py
├── setup.cfg
├── LICENSE
├── .github
│   └── workflows
│       └── tests.yml
├── README.md
├── .gitignore
├── tests
│   └── test_evaluate.py
└── prodigy_evaluate
    └── __init__.py
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff]
2 | line-length = 220
--------------------------------------------------------------------------------
/images/evaluate.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/evaluate.gif
--------------------------------------------------------------------------------
/images/nervaluate.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/nervaluate.gif
--------------------------------------------------------------------------------
/images/evaluate_example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/evaluate_example.gif
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | if __name__ == "__main__":
4 | from setuptools import find_packages, setup
5 |
6 | setup(name="prodigy_evaluate", packages=find_packages())
7 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | version = 0.1.0
3 | description = Recipes for evaluating spaCy pipelines
4 | url = https://github.com/explosion/prodigy-evaluate
5 | author = Explosion
6 | author_email = contact@explosion.ai
7 |
8 | [options]
9 | zip_safe = true
10 | python_requires = >=3.8
11 | install_requires =
12 | prodigy>=1.15.2,<2.0.0
13 | scikit-learn>=1.4.0,<1.5.0
14 | matplotlib>=3.8.0,<3.9.0
15 | nervaluate>=0.1.8,<0.2.0
16 |
17 | [options.entry_points]
18 | prodigy_recipes =
19 | evaluate.evaluate = prodigy_evaluate:evaluate
20 | evaluate.evaluate-example = prodigy_evaluate:evaluate_example
21 | evaluate.nervaluate = prodigy_evaluate:evaluate_nervaluate
22 |
23 | [bdist_wheel]
24 | universal = true
25 |
26 | [sdist]
27 | formats = gztar
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Explosion
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches:
7 | - main
8 |
9 | jobs:
10 | setup:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v3
14 | - name: Set up Python 3.9
15 | uses: actions/setup-python@v4
16 | with:
17 | python-version: 3.9
18 | cache: "pip"
19 | - name: Install Prodigy from private repo
20 | env:
21 | GITHUB_TOKEN: ${{ secrets.GHA_PRODIGY_READ }} # Use the secret here
22 | run: |
23 | export GIT_LFS_SKIP_SMUDGE=1
24 | pip install --upgrade pip
25 | git clone https://x-access-token:${GITHUB_TOKEN}@github.com/explosion/prodigy.git
26 | cd prodigy
27 | pip install setuptools wheel
28 | pip install -e .
29 | cd ..
30 | - name: Install additional dependencies
31 | run: |
32 | pip install -e .
33 | pip install pytest
34 | python -m spacy download en_core_web_sm
35 | pip install ruff black isort
36 |
37 | - name: Run pytest
38 | if: always()
39 | shell: bash
40 | run: python -m pytest tests
41 |
42 | - name: Run ruff
43 | if: always()
44 | shell: bash
45 | run: python -m ruff check prodigy_evaluate/ tests/
46 |
47 | - name: Run black
48 | if: always()
49 | shell: bash
50 | run: python -m black --check prodigy_evaluate/ tests/
51 |
52 | - name: Run isort
53 | if: always()
54 | shell: bash
55 | run: python -m isort prodigy_evaluate/ tests/
56 |
--------------------------------------------------------------------------------
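For reference, the same checks can be reproduced locally, assuming Prodigy is already installed in your environment (the CI job above installs it from a private repository):

```
pip install -e .
pip install pytest ruff black isort
python -m spacy download en_core_web_sm
python -m pytest tests
python -m ruff check prodigy_evaluate/ tests/
python -m black --check prodigy_evaluate/ tests/
python -m isort prodigy_evaluate/ tests/
```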
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # 🔎 Prodigy-evaluate
4 |
5 | This repository contains a Prodigy plugin with recipes for evaluating spaCy pipelines. It features three recipes:
6 |
7 | 1. `evaluate.evaluate`: Evaluate a spaCy pipeline on one or more datasets for different components. The recipe reports metrics such as precision, recall, F1 and accuracy, and flags like `--label-stats` and `--confusion-matrix` add per-label scores and a confusion matrix.
8 |
9 |
10 |
11 |
12 |
13 | 2. `evaluate.evaluate-example`: Evaluate a spaCy pipeline on one or more datasets for different components on a **per-example basis**. This is helpful for debugging and for understanding the hardest examples for your model.
14 |
15 |
16 |
17 |
18 |
19 | 3. `evaluate.nervaluate`: Evaluate a spaCy NER component on one or more datasets. This recipe uses the `nervaluate` library to calculate various metrics for NER. You can learn more about the metrics in the [nervaluate documentation](https://github.com/MantisAI/nervaluate). This is helpful because the approach takes partial matches into account, which may be more relevant for your NER use case.
20 |
21 |
22 |
23 |
24 |
25 |
26 | You can install this plugin via `pip`.
27 |
28 | ```
29 | pip install "prodigy-evaluate @ git+https://github.com/explosion/prodigy-evaluate"
30 | ```
31 |
32 | To learn more about this plugin and additional functionality, you can check the [Prodigy docs](https://prodi.gy/docs/plugins/#evaluate).
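Once installed, the recipes are run with the `prodigy` CLI. The commands below follow the example invocations in the recipe docstrings; `my_eval_dataset` is a placeholder for one of your own evaluation datasets.

```
prodigy evaluate.evaluate en_core_web_sm --ner my_eval_dataset --label-stats --confusion-matrix
prodigy evaluate.evaluate-example en_core_web_sm --ner my_eval_dataset --metric ents_f --n-results 5
prodigy evaluate.nervaluate en_core_web_sm --ner my_eval_dataset --per-label
```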
33 |
34 | ## Issues?
35 |
36 | Having trouble with this plugin? Let us know on our [support forum](https://support.prodi.gy/) and we'll get back to you!
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | #ignore python environments
10 | *_env/
11 | *_venv/
12 |
13 | #ignore notebooks
14 | *.ipynb
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | *.py,cover
57 | .hypothesis/
58 | .pytest_cache/
59 | cover/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 | db.sqlite3
69 | db.sqlite3-journal
70 |
71 | # Flask stuff:
72 | instance/
73 | .webassets-cache
74 |
75 | # Scrapy stuff:
76 | .scrapy
77 |
78 | # Sphinx documentation
79 | docs/_build/
80 |
81 | # PyBuilder
82 | .pybuilder/
83 | target/
84 |
85 | # Jupyter Notebook
86 | .ipynb_checkpoints
87 |
88 | # IPython
89 | profile_default/
90 | ipython_config.py
91 |
92 | # pyenv
93 | # For a library or package, you might want to ignore these files since the code is
94 | # intended to run in multiple environments; otherwise, check them in:
95 | # .python-version
96 |
97 | # pipenv
98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
101 | # install all needed dependencies.
102 | #Pipfile.lock
103 |
104 | # poetry
105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106 | # This is especially recommended for binary packages to ensure reproducibility, and is more
107 | # commonly ignored for libraries.
108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109 | #poetry.lock
110 |
111 | # pdm
112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113 | #pdm.lock
114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115 | # in version control.
116 | # https://pdm.fming.dev/#use-with-ide
117 | .pdm.toml
118 |
119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
120 | __pypackages__/
121 |
122 | # Celery stuff
123 | celerybeat-schedule
124 | celerybeat.pid
125 |
126 | # SageMath parsed files
127 | *.sage.py
128 |
129 | # Environments
130 | .env
131 | .venv
132 | env/
133 | venv/
134 | ENV/
135 | env.bak/
136 | venv.bak/
137 |
138 | # Spyder project settings
139 | .spyderproject
140 | .spyproject
141 |
142 | # Rope project settings
143 | .ropeproject
144 |
145 | # mkdocs documentation
146 | /site
147 |
148 | # mypy
149 | .mypy_cache/
150 | .dmypy.json
151 | dmypy.json
152 |
153 | # Pyre type checker
154 | .pyre/
155 |
156 | # pytype static type analyzer
157 | .pytype/
158 |
159 | # Cython debug symbols
160 | cython_debug/
161 |
162 | # PyCharm
163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
165 | # and can be added to the global gitignore or merged into this file. For a more nuclear
166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
167 | #.idea/
168 |
169 | .DS_Store
--------------------------------------------------------------------------------
/tests/test_evaluate.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Iterable, List
2 |
3 | import en_core_web_sm
4 | import pytest
5 | from prodigy.components.db import connect
6 | from prodigy.types import TaskType
7 | from spacy.training import Example
8 |
9 | from prodigy_evaluate import (
10 | _create_ner_table,
11 | _display_eval_results,
12 | _get_actual_labels,
13 | _get_cf_actual_predicted,
14 | _get_predicted_labels,
15 | _get_score_for_metric,
16 | evaluate,
17 | evaluate_example,
18 | evaluate_nervaluate,
19 | )
20 |
21 |
22 | @pytest.fixture
23 | def dataset() -> str:
24 | return "test_dataset"
25 |
26 |
27 | @pytest.fixture
28 | def spacy_model():
29 | return "en_core_web_sm"
30 |
31 |
32 | @pytest.fixture
33 | def nlp():
34 | return en_core_web_sm.load()
35 |
36 |
37 | @pytest.fixture
38 | def metric() -> str:
39 | return "ents_f"
40 |
41 |
42 | @pytest.fixture
43 | def data() -> Iterable[Dict]:
44 | return [
45 | {
46 | "text": "My name is Freya.",
47 | "_input_hash": 896529854,
48 | "_task_hash": -1486695581,
49 | "tokens": [
50 | {"text": "My", "start": 0, "end": 2, "id": 0, "ws": True},
51 | {"text": "name", "start": 3, "end": 7, "id": 1, "ws": True},
52 | {"text": "is", "start": 8, "end": 10, "id": 2, "ws": True},
53 | {"text": "Freya", "start": 11, "end": 16, "id": 3, "ws": True},
54 | {"text": ".", "start": 16, "end": 17, "id": 4, "ws": True},
55 | ],
56 | "_view_id": "ner_manual",
57 | "spans": [
58 | {
59 | "start": 11,
60 | "end": 16,
61 | "token_start": 3,
62 | "token_end": 3,
63 | "label": "PERSON",
64 | }
65 | ],
66 | "answer": "accept",
67 | "_timestamp": 1707211049,
68 | "_annotator_id": "2024-02-06_10-17-19",
69 | "_session_id": "2024-02-06_10-17-19",
70 | },
71 | {
72 | "text": "My favorite city is London.",
73 | "_input_hash": -91551573,
74 | "_task_hash": -1162253049,
75 | "tokens": [
76 | {"text": "My", "start": 0, "end": 2, "id": 0, "ws": True},
77 | {"text": "favorite", "start": 3, "end": 11, "id": 1, "ws": True},
78 | {"text": "city", "start": 12, "end": 16, "id": 2, "ws": True},
79 | {"text": "is", "start": 17, "end": 19, "id": 3, "ws": True},
80 | {"text": "London", "start": 20, "end": 26, "id": 4, "ws": True},
81 | {"text": ".", "start": 26, "end": 27, "id": 5, "ws": False},
82 | ],
83 | "_view_id": "ner_manual",
84 | "spans": [
85 | {
86 | "start": 20,
87 | "end": 26,
88 | "token_start": 4,
89 | "token_end": 4,
90 | "label": "GPE",
91 | }
92 | ],
93 | "answer": "accept",
94 | "_timestamp": 1707211053,
95 | "_annotator_id": "2024-02-06_10-17-19",
96 | "_session_id": "2024-02-06_10-17-19",
97 | },
98 | {
99 | "text": "I live in Berlin.",
100 | "_input_hash": -2101464790,
101 | "_task_hash": 1279282044,
102 | "tokens": [
103 | {"text": "I", "start": 0, "end": 1, "id": 0, "ws": True},
104 | {"text": "live", "start": 2, "end": 6, "id": 1, "ws": True},
105 | {"text": "in", "start": 7, "end": 9, "id": 2, "ws": True},
106 | {"text": "Berlin", "start": 10, "end": 16, "id": 3, "ws": True},
107 | {"text": ".", "start": 16, "end": 17, "id": 4, "ws": True},
108 | ],
109 | "_view_id": "ner_manual",
110 | "spans": [
111 | {
112 | "start": 10,
113 | "end": 16,
114 | "token_start": 3,
115 | "token_end": 3,
116 | "label": "GPE",
117 | }
118 | ],
119 | "answer": "accept",
120 | "_timestamp": 1707211056,
121 | "_annotator_id": "2024-02-06_10-17-19",
122 | "_session_id": "2024-02-06_10-17-19",
123 | },
124 | ]
125 |
126 |
127 | @pytest.fixture
128 | def scores() -> Dict[str, float]:
129 | return {
130 | "ents_f": 0.9,
131 | "ents_p": 0.8,
132 | "ents_r": 0.7,
133 | "tags_acc": 0.6,
134 | "sents_p": 0.5,
135 | "sents_r": 0.4,
136 | "sents_f": 0.3,
137 | }
138 |
139 |
140 | @pytest.fixture
141 | def db(dataset: str, data: List[TaskType]):
142 | database = connect()
143 | database.add_dataset(dataset)
144 | database.add_examples(data, datasets=[dataset])
145 | return database
146 |
147 |
148 | @pytest.fixture
149 | def ner_examples(nlp):
150 | data = {
151 | "Apple Inc. is an American multinational technology company.": {
152 | "entities": [(0, 10, "ORG")] # Span covering "Apple Inc."
153 | },
154 | "Musk is the CEO of Tesla, Inc.": {
155 | "entities": [
156 | (0, 4, "PERSON"),
157 | (19, 30, "ORG"),
158 | ] # Spans covering "Musk" and "Tesla, Inc."
159 | },
160 | }
161 | examples = []
162 | for text, annot in data.items():
163 | examples.append(Example.from_dict(nlp.make_doc(text), annot))
164 |
165 | return examples
166 |
167 |
168 | @pytest.fixture
169 | def textcat_examples(nlp):
170 | data = {
171 | "SpaCy is an amazing library for NLP.": {"POSITIVE": 1.0, "NEGATIVE": 0.0},
172 | "I dislike rainy days.": {"POSITIVE": 0.0, "NEGATIVE": 1.0},
173 | }
174 |
175 | examples = []
176 | for text, annot in data.items():
177 | doc = nlp.make_doc(text)
178 | doc.cats = annot
179 | ref_doc = nlp.make_doc(text)
180 | ref_doc.cats = annot
181 | example = Example(doc, ref_doc)
182 | examples.append(example)
183 |
184 | return examples
185 |
186 |
187 | @pytest.fixture
188 | def nervaluate_results():
189 | return {
190 | "ent_type": {
191 | "correct": 2,
192 | "incorrect": 0,
193 | "partial": 0,
194 | "missed": 1,
195 | "spurious": 0,
196 | "possible": 3,
197 | "actual": 2,
198 | "precision": 1.0,
199 | "recall": 0.6666666666666666,
200 | "f1": 0.8,
201 | },
202 | "partial": {
203 | "correct": 2,
204 | "incorrect": 0,
205 | "partial": 0,
206 | "missed": 1,
207 | "spurious": 0,
208 | "possible": 3,
209 | "actual": 2,
210 | "precision": 1.0,
211 | "recall": 0.6666666666666666,
212 | "f1": 0.8,
213 | },
214 | "strict": {
215 | "correct": 2,
216 | "incorrect": 0,
217 | "partial": 0,
218 | "missed": 1,
219 | "spurious": 0,
220 | "possible": 3,
221 | "actual": 2,
222 | "precision": 1.0,
223 | "recall": 0.6666666666666666,
224 | "f1": 0.8,
225 | },
226 | "exact": {
227 | "correct": 2,
228 | "incorrect": 0,
229 | "partial": 0,
230 | "missed": 1,
231 | "spurious": 0,
232 | "possible": 3,
233 | "actual": 2,
234 | "precision": 1.0,
235 | "recall": 0.6666666666666666,
236 | "f1": 0.8,
237 | },
238 | }
239 |
240 |
241 | ######## evaluation tests ########
242 |
243 |
244 | def test_evaluate_example(spacy_model, dataset, metric, db, capsys):
245 | evaluate_example(model=spacy_model, ner=dataset, metric=metric, n_results=5)
246 |
247 | captured = capsys.readouterr()
248 |
249 | assert "Scored Example" in captured.out
250 |
251 | db.drop_dataset(dataset)
252 |
253 |
254 | def test_evaluate(spacy_model, dataset, db, capsys):
255 | results = evaluate(
256 | model=spacy_model,
257 | ner=dataset,
258 | label_stats=True,
259 | cf_matrix=False,
260 | )
261 |
262 | captured = capsys.readouterr()
263 |
264 | assert "P" in captured.out
265 | assert "R" in captured.out
266 | assert "F" in captured.out
267 |
268 | assert isinstance(results, dict)
269 | assert "token_acc" in results
270 | assert "token_p" in results
271 | assert results.get("token_p") == 1
272 | assert isinstance(results.get("ents_p"), float)
273 | assert results.get("speed") > 1
274 |
275 | db.drop_dataset(dataset)
276 |
277 |
278 | def test_nervaluate(spacy_model, dataset, db, capsys):
279 | results = evaluate_nervaluate(
280 | model=spacy_model,
281 | ner=dataset,
282 | )
283 | captured = capsys.readouterr()
284 |
285 | assert "Correct" in captured.out
286 | assert "Metric" in captured.out
287 | assert "Ent type" in captured.out
288 | assert "Incorrect" in captured.out
289 | assert "Recall" in captured.out
290 | assert "F1" in captured.out
291 | assert "Partial" in captured.out
292 |
293 | assert isinstance(results, dict)
294 | assert "ent_type" in list(results["overall_results"].keys())
295 | assert "partial" in results["overall_results"]
296 |
297 | assert results["overall_results"]["ent_type"]["f1"] == 1.0
298 |
299 | db.drop_dataset(dataset)
300 |
301 |
302 | def test_display_eval_results(scores, capsys):
303 | _display_eval_results(scores, "sc")
304 | captured = capsys.readouterr()
305 |
306 | assert "Results" in captured.out
307 |
308 |
309 | def test_get_score_for_metric(scores, metric: str):
310 | res = _get_score_for_metric(scores, metric)
311 |
312 | assert isinstance(res, float)
313 | assert isinstance(scores, dict)
314 | assert isinstance(metric, str)
315 | assert metric is not None
316 |
317 |
318 | def test_get_actual_labels_ner(ner_examples):
319 | ner_labels = _get_actual_labels(ner_examples, "ner")
320 | assert isinstance(ner_labels, list)
321 | assert len(ner_labels) == 2
322 | assert all(isinstance(label, str) for label in ner_labels[0])
323 | assert all(isinstance(label, str) for label in ner_labels[1])
324 | assert "O" in ner_labels[0]
325 | assert "B-ORG" in ner_labels[0]
326 | assert "U-PERSON" in ner_labels[1]
327 |
328 |
329 | def test_get_actual_labels_textcat(textcat_examples):
330 | textcat_labels = _get_actual_labels(textcat_examples, "textcat")
331 | assert isinstance(textcat_labels, list)
332 | assert len(textcat_labels) == 2
333 | assert "POSITIVE" in textcat_labels
334 | assert "NEGATIVE" in textcat_labels
335 | assert all(isinstance(label, str) for label in textcat_labels)
336 |
337 |
338 | # here we need a model as we're using one in _get_predicted_labels
339 | # because nlp.evaluate does not create example.predicted values
340 | def test_get_predicted_labels_ner(nlp, ner_examples):
341 | pred_ner_labels = _get_predicted_labels(nlp, ner_examples, "ner")
342 | assert isinstance(pred_ner_labels, list)
343 | assert len(pred_ner_labels) == 2
344 | assert all(isinstance(label, str) for label in pred_ner_labels[0])
345 | assert all(isinstance(label, str) for label in pred_ner_labels[1])
346 |
347 | assert "O" in pred_ner_labels[1]
348 | assert "B-ORG" in pred_ner_labels[0]
349 |
350 |
351 | def test_get_cf_actual_predicted(nlp, ner_examples):
352 | actual, predicted, labels, actual_flat, predicted_flat = _get_cf_actual_predicted(
353 | nlp, ner_examples, "ner"
354 | )
355 | assert isinstance(actual[0], list)
356 | assert isinstance(actual_flat[0], str)
357 |
358 | assert isinstance(predicted[0], list)
359 | assert isinstance(predicted_flat[1], str)
360 |
361 | assert isinstance(actual, list)
362 | assert isinstance(predicted, list)
363 | assert isinstance(labels, list)
364 | assert "O" in actual[0]
365 | assert "B-ORG" in predicted[1]
366 |
367 |
368 | def test_create_ner_table(nervaluate_results, capsys):
369 | _create_ner_table(nervaluate_results)
370 | captured = capsys.readouterr()
371 |
372 | assert "Correct" in captured.out
373 | assert "Metric" in captured.out
374 | assert "Ent type" in captured.out
375 | assert "Incorrect" in captured.out
376 | assert "Recall" in captured.out
377 | assert "F1" in captured.out
378 | assert "Partial" in captured.out
379 |
--------------------------------------------------------------------------------
/prodigy_evaluate/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from dataclasses import dataclass
4 | from pathlib import Path
5 | from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
6 |
7 | import matplotlib.pyplot as plt
8 | import spacy
9 | import srsly
10 | from nervaluate import Evaluator
11 | from prodigy.core import recipe
12 | from prodigy.errors import RecipeError
13 | from prodigy.recipes.data_utils import get_datasets_from_cli_eval, merge_corpus
14 | from prodigy.recipes.train import RECIPE_ARGS, set_log_level, setup_gpu
15 | from prodigy.util import SPANCAT_DEFAULT_KEY, msg
16 | from radicli import Arg
17 |
18 | # additional imports
19 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
20 | from spacy.cli.evaluate import handle_scores_per_type
21 | from spacy.language import Language
22 | from spacy.training import offsets_to_biluo_tags
23 | from spacy.training.example import Example
24 |
25 |
26 | @recipe(
27 | "evaluate.evaluate",
28 | # fmt: off
29 | model=Arg(help="Name or path of model to evaluate"),
30 | ner=RECIPE_ARGS["ner"],
31 | textcat=RECIPE_ARGS["textcat"],
32 | textcat_multilabel=RECIPE_ARGS["textcat_multilabel"],
33 | tagger=RECIPE_ARGS["tagger"],
34 | senter=RECIPE_ARGS["senter"],
35 | parser=RECIPE_ARGS["parser"],
36 | spancat=RECIPE_ARGS["spancat"],
37 | coref=RECIPE_ARGS["coref"],
38 | label_stats=Arg("--label-stats", "-LS", help="Show per-label scores"),
39 | gpu_id=RECIPE_ARGS["gpu_id"],
40 | verbose=RECIPE_ARGS["verbose"],
41 | silent=RECIPE_ARGS["silent"],
42 | cf_matrix = Arg("--confusion-matrix", "-CF", help="Show confusion matrix for the specified component"),
43 | cf_path = Arg("--cf-path", "-CP", help="Path to save the confusion matrix array"),
44 | spans_key=Arg("--spans-key", help="Optional spans key to evaluate if spancat component is used."),
45 | # fmt: on
46 | )
47 | def evaluate(
48 | model: Union[str, Path],
49 | ner: Sequence[str] = tuple(),
50 | textcat: Sequence[str] = tuple(),
51 | textcat_multilabel: Sequence[str] = tuple(),
52 | tagger: Sequence[str] = tuple(),
53 | senter: Sequence[str] = tuple(),
54 | parser: Sequence[str] = tuple(),
55 | spancat: Sequence[str] = tuple(),
56 | coref: Sequence[str] = tuple(),
57 | label_stats: bool = False,
58 | gpu_id: int = -1,
59 | verbose: bool = False,
60 | silent: bool = True,
61 | cf_matrix: bool = False,
62 | cf_path: Optional[Path] = None,
63 | spans_key: str = SPANCAT_DEFAULT_KEY,
64 | ) -> Dict[str, Any]:
65 | """Evaluate a spaCy pipeline on one or more datasets for different components.
66 |
67 | This command takes care of merging all annotations on the same input data like the
68 | prodigy train command.
69 |
70 | You can also use the --label-stats flag to show per-label scores for NER and textcat
71 | components. This will show the precision, recall and F-score for each label.
72 |
73 | Finally, you can also use --confusion-matrix to show the confusion matrix for the
74 | specified component. This will only work for NER or textcat components.
75 |
76 | Example Usage:
77 |
78 | ```
79 | prodigy evaluate.evaluate en_core_web_sm --ner my_eval_dataset --confusion-matrix
80 | ```
81 | """
82 | set_log_level(verbose=verbose, silent=silent)
83 | setup_gpu(gpu_id)
84 | nlp = spacy.load(model)
85 |
86 | pipes = get_datasets_from_cli_eval(
87 | ner,
88 | textcat,
89 | textcat_multilabel,
90 | tagger,
91 | senter,
92 | parser,
93 | spancat,
94 | coref,
95 | )
96 | pipe_key = [k for k in pipes if pipes.get(k)][0]
97 |
98 | compat_pipes = {
99 | pipe_name: ([], eval_sets) for pipe_name, eval_sets in pipes.items()
100 | }
101 | merged_corpus = merge_corpus(nlp, compat_pipes)
102 | dev_examples = merged_corpus["dev"](nlp)
103 | scores = nlp.evaluate(dev_examples)
104 |
105 | if pipe_key in ["ner", "textcat"]:
106 | (
107 | actual_labels,
108 | predicted_labels,
109 | labels,
110 | flat_actual_labels,
111 | flat_predicted_labels,
112 | ) = _get_cf_actual_predicted(
113 | nlp=nlp, dev_examples=dev_examples, pipe_key=pipe_key
114 | )
115 | labels_to_include = [label for label in labels if label != "O"]
116 | if pipe_key == "ner":
117 | actual_labels = flat_actual_labels
118 | predicted_labels = flat_predicted_labels
119 |
120 | cfarray = confusion_matrix(
121 | actual_labels, predicted_labels, labels=labels_to_include, normalize="true"
122 | )
123 |
124 | _display_eval_results(
125 | scores, spans_key=spans_key, silent=False, per_type=label_stats
126 | )
127 |
128 | if cf_matrix:
129 | if pipe_key not in ["ner", "textcat"]:
130 | msg.fail(
131 | f"Confusion matrix is not supported for {pipe_key} component", exits=1
132 | )
133 | _display_confusion_matrix(
134 | cm=cfarray,
135 | labels=labels_to_include,
136 | )
137 | msg.good("Confusion matrix displayed")
138 |
139 | if cf_path:
140 | if pipe_key not in ["ner", "textcat"]:
141 | msg.fail(
142 | f"Confusion matrix is not supported for {pipe_key} component", exits=1
143 | )
144 | if not cf_path.exists():
145 | os.makedirs(cf_path)
146 |
147 | full_cf_path = cf_path / "cf_array.json"
148 | srsly.write_json(
149 | full_cf_path,
150 | {
151 | "cf_array": cfarray.tolist(),
152 | "labels": labels_to_include,
153 | },
154 | )
155 | msg.good(f"Confusion matrix array saved to {full_cf_path}")
156 |
157 | return scores
158 |
159 |
160 | @recipe(
161 | "evaluate.evaluate-example",
162 | # fmt: off
163 | model=Arg(help="Path to model to evaluate"),
164 | ner=RECIPE_ARGS["ner"],
165 | textcat=RECIPE_ARGS["textcat"],
166 | textcat_multilabel=RECIPE_ARGS["textcat_multilabel"],
167 | tagger=RECIPE_ARGS["tagger"],
168 | senter=RECIPE_ARGS["senter"],
169 | parser=RECIPE_ARGS["parser"],
170 | spancat=RECIPE_ARGS["spancat"],
171 | coref=RECIPE_ARGS["coref"],
172 | gpu_id=RECIPE_ARGS["gpu_id"],
173 | verbose=RECIPE_ARGS["verbose"],
174 | silent=RECIPE_ARGS["silent"],
175 | metric=Arg("--metric", "-m", help="Metric to use for sorting examples"),
176 | n_results = Arg("--n-results", "-NR", help="Number of top examples to display"),
177 | output_path=Arg("--output-path", "-OP", help="Path to save the top examples and scores")
178 | # fmt: on
179 | )
180 | def evaluate_example(
181 | model: Union[str, Path],
182 | ner: Sequence[str] = tuple(),
183 | textcat: Sequence[str] = tuple(),
184 | textcat_multilabel: Sequence[str] = tuple(),
185 | tagger: Sequence[str] = tuple(),
186 | senter: Sequence[str] = tuple(),
187 | parser: Sequence[str] = tuple(),
188 | spancat: Sequence[str] = tuple(),
189 | coref: Sequence[str] = tuple(),
190 | gpu_id: int = -1,
191 | verbose: bool = False,
192 | silent: bool = True,
193 | metric: Optional[str] = None,
194 | n_results: int = 10,
195 | output_path: Optional[Path] = None,
196 | ):
197 | """Evaluate a spaCy pipeline on one or more datasets for different components
198 | on a per-example basis. This command will run an evaluation on each example individually
199 | and then sort by the desired `--metric` argument.
200 |
201 | This is useful for debugging and understanding the easiest
202 | and hardest examples for your model.
203 |
204 | Example Usage:
205 | ```
206 | prodigy evaluate.evaluate-example en_core_web_sm --ner my_eval_dataset --metric ents_f
207 | ```
208 |
209 | This will sort examples by lowest NER F-score.
210 | """
211 | if not metric:
212 | raise RecipeError(
213 | "You must pass a metric to sort examples via --metric argument. Refer to prodigy evaluate-example documentation for available metric types."
214 | )
215 |
216 | set_log_level(verbose=verbose, silent=silent)
217 | setup_gpu(gpu_id)
218 | nlp = spacy.load(model)
219 |
220 | pipes = get_datasets_from_cli_eval(
221 | ner,
222 | textcat,
223 | textcat_multilabel,
224 | tagger,
225 | senter,
226 | parser,
227 | spancat,
228 | coref,
229 | )
230 | compat_pipes = {
231 | pipe_name: ([], eval_sets) for pipe_name, eval_sets in pipes.items()
232 | }
233 | merged_corpus = merge_corpus(nlp, compat_pipes)
234 | dev_examples = merged_corpus["dev"](nlp)
235 | results: List[ScoredExample] = evaluate_each_example(nlp, dev_examples, metric)
236 |
237 | top_results: List[ScoredExample] = results[:n_results]
238 |
239 | if len(top_results) == 0:
240 | msg.fail(f"No examples found for the metric {metric}.", exits=1)
241 | avg_text_len = sum([len(ex.example.text) for ex in top_results]) / len(top_results)
242 | if avg_text_len > 100:
243 | msg.warn(
244 | f"Average # of characters of top examples is {round(avg_text_len, 2)}. This will not display well in the terminal. Consider saving the top examples to file with `--output-path` and investigating accordingly."
245 | )
246 |
247 | def split_string_into_tuples(input_string: str, length: int = 50):
248 | """
249 | Splits a string into a tuple of chunks of at most `length` characters; shorter strings are returned unchanged.
250 | Useful for wrapping long strings in tables.
251 | """
252 | input_string = input_string.rstrip()
253 | if len(input_string) > length:
254 | result = tuple(
255 | input_string[i : i + length].rstrip()
256 | for i in range(0, len(input_string), length)
257 | )
258 | return result
259 | else:
260 | return input_string
261 |
262 | data = [
263 | (
264 | split_string_into_tuples(ex.example.text),
265 | round(ex.score, 2) if ex.score is not None else None,
266 | )
267 | for ex in top_results
268 | ]
269 | headers = ["Example", metric]
270 | widths = (50, 9)
271 | aligns = ("l", "l")
272 |
273 | msg.divider("Scored Examples")
274 | msg.table(
275 | data, header=headers, divider=True, widths=widths, aligns=aligns, multiline=True
276 | )
277 |
278 | if output_path:
279 | if not output_path.exists():
280 | os.makedirs(output_path)
281 |
282 | results_path = output_path / "hardest_examples.jsonl"
283 |
284 | results_jsonl = []
285 | for data in top_results:
286 | results_json = {
287 | "text": data.example.text,
288 | "meta": {"score": data.score, "metric": metric},
289 | }
290 | results_jsonl.append(results_json)
291 |
292 | srsly.write_jsonl(results_path, results_jsonl)
293 | msg.good(f"The examples with the lowest scores saved to {results_path}")
294 | msg.info(
295 | "You can inspect the NER/spancat/textcat predictions on the hardest examples by running one of the Prodigy `*.correct` or `*.model-annotate` workflows. See documentation for more details: https://prodi.gy/docs/recipes"
296 | )
297 |
298 |
299 | @recipe(
300 | "evaluate.nervaluate",
301 | # fmt: off
302 | model=Arg(help="Path to model to evaluate"),
303 | ner=RECIPE_ARGS["ner"],
304 | gpu_id=RECIPE_ARGS["gpu_id"],
305 | verbose=RECIPE_ARGS["verbose"],
306 | per_label=Arg("--per-label", "-PL", help="Show per-label NER nervaluate scores"),
307 | # fmt: on
308 | )
309 | def evaluate_nervaluate(
310 | model: Union[str, Path],
311 | ner: Sequence[str],
312 | gpu_id: int = -1,
313 | verbose: bool = False,
314 | per_label: bool = False,
315 | ):
316 | """
317 | Evaluate spaCy's NER component using nervaluate metrics. The `nervaluate` library
318 | provides full named-entity (i.e. not tag/token) evaluation metrics based on SemEval’13.
319 |
320 | For more information on these metrics, see https://github.com/MantisAI/nervaluate.
321 |
322 | Example Usage:
323 |
324 | ```
325 | prodigy evaluate.nervaluate en_core_web_sm --ner my_eval_dataset
326 | ```
327 | """
328 | set_log_level(verbose=verbose, silent=True) # silence component merging
329 | setup_gpu(gpu_id)
330 | nlp = spacy.load(model)
331 | merged_corpus = merge_corpus(nlp, {"ner": ([], [ner])})
332 | dev_examples = merged_corpus["dev"](nlp)
333 | (
334 | actual_labels,
335 | predicted_labels,
336 | labels,
337 | flat_actual_labels,
338 | flat_predicted_labels,
339 | ) = _get_cf_actual_predicted(nlp=nlp, dev_examples=dev_examples, pipe_key="ner")
340 |
341 | evaluator = Evaluator(actual_labels, predicted_labels, tags=labels, loader="list")
342 | ner_results, ner_results_by_tag = evaluator.evaluate()
343 | msg.divider("nervaluate NER metrics")
344 | msg.info(
345 | "Full named-entity (i.e., not tag/token) evaluation metrics based on SemEval’13. For more information on these metrics, see https://github.com/MantisAI/nervaluate"
346 | )
347 | msg.text("NER: Overall")
348 | _create_ner_table(ner_results)
349 |
350 | if per_label:
351 | for tag, tag_results in ner_results_by_tag.items():
352 | if tag != "O":
353 | msg.text(title=f"NER: {tag}")
354 | _create_ner_table(tag_results)
355 |
356 | return {"overall_results": ner_results, "results_by_tag": ner_results_by_tag}
357 |
358 |
359 | @dataclass
360 | class ScoredExample:
361 | example: Example
362 | score: Optional[float]
363 | scores: Dict[str, float]
364 |
365 |
366 | def _get_score_for_metric(scores: Dict[str, float], metric: str) -> Union[float, None]:
367 | """Returns the score for the specified metric.
368 |
369 | Args:
370 | scores (Dict[str, float]): Dictionary containing scores for different metrics
371 | metric (str): Metric to get the score for
372 |
373 | Returns:
374 | Union[float, None]: Score for the specified metric or None if not found
375 | """
376 |
377 | return scores.get(metric, None)
378 |
379 |
380 | def evaluate_each_example(
381 | nlp: Language,
382 | dev_examples: Iterable[Example],
383 | metric: str,
384 | desc: bool = False,
385 | skip_none: bool = True,
386 | ) -> List[ScoredExample]:
387 | def sort_key(x: Tuple[Example, Dict[str, float]]) -> Union[float, int]:
388 | _, eval_scores = x
389 | res = _get_score_for_metric(eval_scores, metric)
390 | if res is None:
391 | res = 0
392 | if not isinstance(res, (float, int)):
393 | raise ValueError(f"Invalid metric to sort by: {metric}", res)
394 | return res
395 |
396 | per_example_scores = {}
397 | for example in dev_examples:
398 | scores = nlp.evaluate([example])
399 | res = _get_score_for_metric(scores, metric)
400 | if res is None and skip_none:
401 | continue
402 | per_example_scores[example] = scores
403 |
404 | sorted_per_example_scores = [
405 | ScoredExample(
406 | example=eg,
407 | score=_get_score_for_metric(example_scores, metric),
408 | scores=example_scores,
409 | )
410 | for eg, example_scores in sorted(
411 | per_example_scores.items(), key=sort_key, reverse=desc
412 | )
413 | ]
414 | return sorted_per_example_scores
415 |
416 |
417 | def _display_eval_results(
418 | scores: Dict[str, Any], spans_key: str, silent: bool = False, per_type: bool = False
419 | ) -> None:
420 | """Displays the evaluation results for the specified component.
421 |
422 | Args:
423 | scores (Dict[str, Any]): Dictionary containing evaluation scores from `nlp.evaluate`
424 | spans_key (str): Optional spans key to evaluate if spancat component is used.
425 | silent (bool, optional): Whether to display all results or not. Defaults to False.
426 | """
427 | metrics = {
428 | "TOK": "token_acc",
429 | "TAG": "tag_acc",
430 | "POS": "pos_acc",
431 | "MORPH": "morph_acc",
432 | "LEMMA": "lemma_acc",
433 | "UAS": "dep_uas",
434 | "LAS": "dep_las",
435 | "NER P": "ents_p",
436 | "NER R": "ents_r",
437 | "NER F": "ents_f",
438 | "TEXTCAT": "cats_score",
439 | "SENT P": "sents_p",
440 | "SENT R": "sents_r",
441 | "SENT F": "sents_f",
442 | "SPAN P": f"spans_{spans_key}_p",
443 | "SPAN R": f"spans_{spans_key}_r",
444 | "SPAN F": f"spans_{spans_key}_f",
445 | "SPEED": "speed",
446 | }
447 | results = {}
448 | data = {}
449 | for metric, key in metrics.items():
450 | if key in scores:
451 | if key == "cats_score":
452 | metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
453 | if isinstance(scores[key], (int, float)):
454 | if key == "speed":
455 | results[metric] = f"{scores[key]:.0f}"
456 | else:
457 | results[metric] = f"{scores[key]*100:.2f}"
458 | else:
459 | results[metric] = "-"
460 | data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
461 | msg.table(results, title="Results")
462 |
463 | if per_type:
464 | data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
465 |
466 |
467 | #### Confusion matrix functions ####
468 |
469 |
470 | def _get_actual_labels(dev_examples: Iterable[Example], pipe_key: str) -> List[Any]:
471 | """Returns the actual labels for the specified component.
472 |
473 | Args:
474 | dev_examples (Iterable[Example]): List of examples
475 | pipe_key (str): Name of the component
476 |
477 | Returns:
478 | List[Any]: List of actual labels
479 | """
480 | actual_labels = []
481 | for ex in dev_examples:
482 | ref = ex.reference # we have reference but we don't have predicted
483 | if pipe_key == "ner":
484 | ents = ex.get_aligned_ner()
485 | ents_clean = ["O" if x is None else x for x in ents]
486 | actual_labels.append(ents_clean)
487 | elif pipe_key == "textcat":
488 | text_labels = ref.cats
489 | most_likely_class = (
490 | max(text_labels, key=lambda k: text_labels[k])
491 | if text_labels != {}
492 | else "O"
493 | )
494 | actual_labels.append(most_likely_class)
495 |
496 | return actual_labels
497 |
498 |
499 | def _get_predicted_labels(
500 | nlp: Language, dev_examples: Iterable[Example], pipe_key: str
501 | ) -> List[Any]:
502 | """Returns the predicted labels for the specified component.
503 |
504 | Args:
505 | nlp (Language): spaCy model
506 | dev_examples (Iterable[Example]): List of examples
507 | pipe_key (str): Name of the component
508 |
509 | Returns:
510 | List[Any]: List of predicted labels
511 | """
512 |
513 | texts = [eg.text for eg in dev_examples]
514 | pred_labels = []
515 | for eg in nlp.pipe(texts):
516 | if pipe_key == "ner":
517 | ents = [(ent.start_char, ent.end_char, ent.label_) for ent in eg.ents]
518 | biluo_tags = offsets_to_biluo_tags(eg, ents)
519 | pred_labels.append(biluo_tags)
520 | elif pipe_key == "textcat":
521 | text_labels = eg.cats
522 | most_likely_class = (
523 | max(text_labels, key=lambda k: text_labels[k])
524 | if text_labels != {}
525 | else "O"
526 | )
527 | pred_labels.append(most_likely_class)
528 |
529 | return pred_labels
530 |
531 |
532 | def _get_cf_actual_predicted(
533 | nlp: Language, dev_examples: Iterable[Example], pipe_key: str
534 | ):
535 | """Returns the actual and predicted labels for the specified component.
536 |
537 | Args:
538 | nlp (Language): spaCy model
539 | dev_examples (Iterable[Example]): List of examples
540 | pipe_key (str): Name of the component
541 |
542 | Returns:
543 | Tuple containing actual labels, predicted labels, labels, flat actual labels and flat predicted labels
544 | """
545 | actual_labels = [label for label in _get_actual_labels(dev_examples, pipe_key)]
546 | predicted_labels = [
547 | label for label in _get_predicted_labels(nlp, dev_examples, pipe_key)
548 | ]
549 | if pipe_key == "textcat":
550 | labels = set(predicted_labels).union(set(actual_labels))
551 | return actual_labels, predicted_labels, list(labels), [], []
552 |
553 | elif pipe_key == "ner":
554 | actual_labels_flat = [
555 | label.split("-")[-1] for sublist in actual_labels for label in sublist
556 | ]
557 | predicted_labels_flat = [
558 | label.split("-")[-1] for sublist in predicted_labels for label in sublist
559 | ]
560 | labels = set(predicted_labels_flat).union(set(actual_labels_flat))
561 |
562 | return (
563 | actual_labels,
564 | predicted_labels,
565 | list(labels),
566 | actual_labels_flat,
567 | predicted_labels_flat,
568 | )
569 |
570 |
571 | def _display_confusion_matrix(cm: List[List[float]], labels: List[Any]) -> None:
572 | """Displays the confusion matrix for the specified component.
573 |
574 | Args:
575 | cm (List[List[float]]): Confusion matrix array
576 | labels (List[Any]): List of labels
577 | """
578 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
579 | ax = disp.plot(colorbar=False, cmap="Blues")
580 | ax.ax_.set_title("Confusion Matrix")
581 | plt.show()
582 |
583 |
584 | def _create_ner_table(results: Dict[str, Dict[str, float]]):
585 | """Creates a table for NER results.
586 |
587 | Args:
588 | results (Dict[str, Dict[str, float]]): Dictionary containing NER results.
589 | """
590 |
591 | ner_metrics = [
592 | "correct",
593 | "incorrect",
594 | "partial",
595 | "missed",
596 | "spurious",
597 | "possible",
598 | "actual",
599 | "precision",
600 | "recall",
601 | "f1",
602 | ]
603 | headers = tuple(["Metric"] + [m.capitalize() for m in ner_metrics])
604 |
605 | metrics_formatted = []
606 | for eval_type, metrics in results.items():
607 | row = [eval_type.replace("_", " ").capitalize()]
608 | row.extend(
609 | [
610 | (
611 | round(metrics.get(key, None), 2)
612 | if metrics.get(key, None) is not None
613 | else None
614 | )
615 | for key in ner_metrics
616 | ]
617 | )
618 | metrics_formatted.append(row)
619 |
620 | msg.table(metrics_formatted, header=headers, divider=True)
621 |
--------------------------------------------------------------------------------
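The recipe functions can also be imported and called directly from Python, which is how `tests/test_evaluate.py` exercises them. A minimal sketch, assuming `en_core_web_sm` is installed and a Prodigy dataset named `my_eval_dataset` already exists (the dataset name is illustrative):

```python
# Minimal sketch of programmatic use, mirroring tests/test_evaluate.py.
# Assumes en_core_web_sm is installed and a Prodigy dataset named
# "my_eval_dataset" exists; the dataset name is a placeholder.
from prodigy_evaluate import evaluate, evaluate_nervaluate

# Standard spaCy scores, as returned by nlp.evaluate()
scores = evaluate(model="en_core_web_sm", ner="my_eval_dataset", label_stats=True)
print(scores["ents_p"], scores["ents_r"], scores["ents_f"])

# SemEval'13-style nervaluate metrics (overall and per entity type)
results = evaluate_nervaluate(model="en_core_web_sm", ner="my_eval_dataset")
print(results["overall_results"]["ent_type"]["f1"])
```

Both recipes also print formatted tables to the terminal via `msg.table`, so the return values are mainly useful for further processing.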