├── pyproject.toml ├── images ├── evaluate.gif ├── nervaluate.gif └── evaluate_example.gif ├── setup.py ├── setup.cfg ├── LICENSE ├── .github └── workflows │ └── tests.yml ├── README.md ├── .gitignore ├── tests └── test_evaluate.py └── prodigy_evaluate └── __init__.py /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 220 -------------------------------------------------------------------------------- /images/evaluate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/evaluate.gif -------------------------------------------------------------------------------- /images/nervaluate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/nervaluate.gif -------------------------------------------------------------------------------- /images/evaluate_example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/prodigy-evaluate/HEAD/images/evaluate_example.gif -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | if __name__ == "__main__": 4 | from setuptools import find_packages, setup 5 | 6 | setup(name="prodigy_evaluate", packages=find_packages()) 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = 0.1.0 3 | description = Recipes for evaluating spaCy pipelines 4 | url = https://github.com/explosion/prodigy-evaluate 5 | author = Explosion 6 | author_email = contact@explosion.ai 7 | 8 | [options] 9 | zip_safe = true 10 | python_requires = >=3.8 11 | install_requires = 12 | prodigy>=1.15.2,<2.0.0 13 | scikit-learn>=1.4.0,<1.5.0 14 | matplotlib>=3.8.0,<3.9.0 15 | nervaluate>=0.1.8,<0.2.0 16 | 17 | [options.entry_points] 18 | prodigy_recipes = 19 | evaluate.evaluate = prodigy_evaluate:evaluate 20 | evaluate.evaluate-example = prodigy_evaluate:evaluate_example 21 | evaluate.nervaluate = prodigy_evaluate:evaluate_nervaluate 22 | 23 | [bdist_wheel] 24 | universal = true 25 | 26 | [sdist] 27 | formats = gztar -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Explosion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | setup: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Python 3.9 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: 3.9 18 | cache: "pip" 19 | - name: Install Prodigy from private repo 20 | env: 21 | GITHUB_TOKEN: ${{ secrets.GHA_PRODIGY_READ }} # Use the secret here 22 | run: | 23 | export GIT_LFS_SKIP_SMUDGE=1 24 | pip install --upgrade pip 25 | git clone https://x-access-token:${GITHUB_TOKEN}@github.com/explosion/prodigy.git 26 | cd prodigy 27 | pip install setuptools wheel 28 | pip install -e . 29 | cd .. 30 | - name: Install additional dependencies 31 | run: | 32 | pip install -e . 33 | pip install pytest 34 | python -m spacy download en_core_web_sm 35 | pip install ruff black isort 36 | 37 | - name: Run pytest 38 | if: always() 39 | shell: bash 40 | run: python -m pytest tests 41 | 42 | - name: Run ruff 43 | if: always() 44 | shell: bash 45 | run: python -m ruff check prodigy_evaluate/ tests/ 46 | 47 | - name: Run black 48 | if: always() 49 | shell: bash 50 | run: python -m black --check prodigy_evaluate/ tests/ 51 | 52 | - name: Run isort 53 | if: always() 54 | shell: bash 55 | run: python -m isort prodigy_evaluate/ tests/ 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 🔎 Prodigy-evaluate 4 | 5 | This repository contains a Prodigy plugin for recipes to evaluate spaCy pipelines. It features multiple recipes: 6 | 7 | 1. `evaluate.evaluate`: Evaluate a spaCy pipeline on one or more datasets for different components. Passing flags like `--label-stats` or `--confusion-matrix` will compute a variety of evaluation metrics, including precision, recall, F1, accuracy, and more. 8 | 9 |

10 | 11 |
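For example, to score a pipeline on an NER dataset with per-label scores and a confusion matrix, a typical invocation looks like this (here `en_core_web_sm` and `my_eval_dataset` are placeholders for your own pipeline and Prodigy dataset):

```
prodigy evaluate.evaluate en_core_web_sm --ner my_eval_dataset --label-stats --confusion-matrix
```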

12 | 13 | 2. `evaluate.evaluate-example`: Evaluate a spaCy pipeline on one or more datasets for different components on a **per-example basis**. This is helpful for debugging and for understanding the hardest examples for your model. 14 | 15 |

16 | 17 |
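For example, to score each example individually and list the examples with the lowest NER F-score (again, the model and dataset names are placeholders; `ents_f` is one of the standard spaCy scorer keys):

```
prodigy evaluate.evaluate-example en_core_web_sm --ner my_eval_dataset --metric ents_f --n-results 10
```

You can also pass `--output-path` to write the lowest-scoring examples to a JSONL file for closer inspection.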

18 | 19 | 3. `evaluate.nervaluate`: Evaluate a spaCy NER component on one or more datasets. This recipe uses the `nervaluate` library to calculate various metrics for NER. You can learn more about the metrics in the [nervaluate documentation](https://github.com/MantisAI/nervaluate). This is helpful because the approach takes partial matches into account, which may be more relevant for your NER use case. 20 | 21 |

22 | 23 |
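For example (model and dataset names are again placeholders; `--per-label` additionally prints the nervaluate scores broken down by entity type):

```
prodigy evaluate.nervaluate en_core_web_sm --ner my_eval_dataset --per-label
```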

24 | 25 | 26 | You can install this plugin via `pip`. 27 | 28 | ``` 29 | pip install "prodigy-evaluate @ git+https://github.com/explosion/prodigy-evaluate" 30 | ``` 31 | 32 | To learn more about this plugin and additional functionality, you can check the [Prodigy docs](https://prodi.gy/docs/plugins/#evaluate). 33 | 34 | ## Issues? 35 | 36 | Are you having trouble with this plugin? Let us know on our [support forum](https://support.prodi.gy/) and we'll get back to you! -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | #ignore python environments 10 | *_env/ 11 | *_venv/ 12 | 13 | #ignore notebooks 14 | *.ipynb 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/#use-with-ide 117 | .pdm.toml 118 | 119 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 167 | #.idea/ 168 | 169 | .DS_Store -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Iterable, List 2 | 3 | import en_core_web_sm 4 | import pytest 5 | from prodigy.components.db import connect 6 | from prodigy.types import TaskType 7 | from spacy.training import Example 8 | 9 | from prodigy_evaluate import ( 10 | _create_ner_table, 11 | _display_eval_results, 12 | _get_actual_labels, 13 | _get_cf_actual_predicted, 14 | _get_predicted_labels, 15 | _get_score_for_metric, 16 | evaluate, 17 | evaluate_example, 18 | evaluate_nervaluate, 19 | ) 20 | 21 | 22 | @pytest.fixture 23 | def dataset() -> str: 24 | return "test_dataset" 25 | 26 | 27 | @pytest.fixture 28 | def spacy_model(): 29 | return "en_core_web_sm" 30 | 31 | 32 | @pytest.fixture 33 | def nlp(): 34 | return en_core_web_sm.load() 35 | 36 | 37 | @pytest.fixture 38 | def metric() -> str: 39 | return "ents_f" 40 | 41 | 42 | @pytest.fixture 43 | def data() -> Iterable[Dict]: 44 | return [ 45 | { 46 | "text": "My name is Freya.", 47 | "_input_hash": 896529854, 48 | "_task_hash": -1486695581, 49 | "tokens": [ 50 | {"text": "My", "start": 0, "end": 2, "id": 0, "ws": True}, 51 | {"text": "name", "start": 3, "end": 7, "id": 1, "ws": True}, 52 | {"text": "is", "start": 8, "end": 10, "id": 2, "ws": True}, 53 | {"text": "Freya", "start": 11, "end": 16, "id": 3, "ws": True}, 54 | {"text": ".", "start": 16, "end": 17, "id": 4, "ws": True}, 55 | ], 56 | "_view_id": "ner_manual", 57 | "spans": [ 58 | { 59 | "start": 11, 60 | "end": 16, 61 | "token_start": 3, 62 | "token_end": 3, 63 | "label": "PERSON", 64 | } 65 | ], 66 | "answer": "accept", 67 | "_timestamp": 1707211049, 68 | "_annotator_id": "2024-02-06_10-17-19", 69 | "_session_id": "2024-02-06_10-17-19", 70 | }, 71 | { 72 | "text": "My favorite city is London.", 73 | "_input_hash": -91551573, 74 | "_task_hash": -1162253049, 75 | "tokens": [ 76 | {"text": "My", "start": 0, "end": 2, "id": 0, "ws": True}, 77 | {"text": "favorite", "start": 3, "end": 11, "id": 1, "ws": True}, 78 | {"text": "city", "start": 12, "end": 16, "id": 2, "ws": True}, 79 | {"text": "is", "start": 17, "end": 19, "id": 3, "ws": True}, 80 | {"text": "London", "start": 20, "end": 26, "id": 4, "ws": True}, 81 | {"text": ".", "start": 26, "end": 27, 
"id": 5, "ws": False}, 82 | ], 83 | "_view_id": "ner_manual", 84 | "spans": [ 85 | { 86 | "start": 20, 87 | "end": 26, 88 | "token_start": 4, 89 | "token_end": 4, 90 | "label": "GPE", 91 | } 92 | ], 93 | "answer": "accept", 94 | "_timestamp": 1707211053, 95 | "_annotator_id": "2024-02-06_10-17-19", 96 | "_session_id": "2024-02-06_10-17-19", 97 | }, 98 | { 99 | "text": "I live in Berlin.", 100 | "_input_hash": -2101464790, 101 | "_task_hash": 1279282044, 102 | "tokens": [ 103 | {"text": "I", "start": 0, "end": 1, "id": 0, "ws": True}, 104 | {"text": "live", "start": 2, "end": 6, "id": 1, "ws": True}, 105 | {"text": "in", "start": 7, "end": 9, "id": 2, "ws": True}, 106 | {"text": "Berlin", "start": 10, "end": 16, "id": 3, "ws": True}, 107 | {"text": ".", "start": 16, "end": 17, "id": 4, "ws": True}, 108 | ], 109 | "_view_id": "ner_manual", 110 | "spans": [ 111 | { 112 | "start": 10, 113 | "end": 16, 114 | "token_start": 3, 115 | "token_end": 3, 116 | "label": "GPE", 117 | } 118 | ], 119 | "answer": "accept", 120 | "_timestamp": 1707211056, 121 | "_annotator_id": "2024-02-06_10-17-19", 122 | "_session_id": "2024-02-06_10-17-19", 123 | }, 124 | ] 125 | 126 | 127 | @pytest.fixture 128 | def scores() -> Dict[str, float]: 129 | return { 130 | "ents_f": 0.9, 131 | "ents_p": 0.8, 132 | "ents_r": 0.7, 133 | "tags_acc": 0.6, 134 | "sents_p": 0.5, 135 | "sents_r": 0.4, 136 | "sents_f": 0.3, 137 | } 138 | 139 | 140 | @pytest.fixture 141 | def db(dataset: str, data: List[TaskType]): 142 | database = connect() 143 | database.add_dataset(dataset) 144 | database.add_examples(data, datasets=[dataset]) 145 | return database 146 | 147 | 148 | @pytest.fixture 149 | def ner_examples(nlp): 150 | data = { 151 | "Apple Inc. is an American multinational technology company.": { 152 | "entities": [(0, 10, "ORG")] # Span covering "Apple Inc." 153 | }, 154 | "Musk is the CEO of Tesla, Inc.": { 155 | "entities": [ 156 | (0, 4, "PERSON"), 157 | (19, 30, "ORG"), 158 | ] # Spans covering "Musk" and "Tesla, Inc." 
159 | }, 160 | } 161 | examples = [] 162 | for text, annot in data.items(): 163 | examples.append(Example.from_dict(nlp.make_doc(text), annot)) 164 | 165 | return examples 166 | 167 | 168 | @pytest.fixture 169 | def textcat_examples(nlp): 170 | data = { 171 | "SpaCy is an amazing library for NLP.": {"POSITIVE": 1.0, "NEGATIVE": 0.0}, 172 | "I dislike rainy days.": {"POSITIVE": 0.0, "NEGATIVE": 1.0}, 173 | } 174 | 175 | examples = [] 176 | for text, annot in data.items(): 177 | doc = nlp.make_doc(text) 178 | doc.cats = annot 179 | ref_doc = nlp.make_doc(text) 180 | ref_doc.cats = annot 181 | example = Example(doc, ref_doc) 182 | examples.append(example) 183 | 184 | return examples 185 | 186 | 187 | @pytest.fixture 188 | def nervaluate_results(): 189 | return { 190 | "ent_type": { 191 | "correct": 2, 192 | "incorrect": 0, 193 | "partial": 0, 194 | "missed": 1, 195 | "spurious": 0, 196 | "possible": 3, 197 | "actual": 2, 198 | "precision": 1.0, 199 | "recall": 0.6666666666666666, 200 | "f1": 0.8, 201 | }, 202 | "partial": { 203 | "correct": 2, 204 | "incorrect": 0, 205 | "partial": 0, 206 | "missed": 1, 207 | "spurious": 0, 208 | "possible": 3, 209 | "actual": 2, 210 | "precision": 1.0, 211 | "recall": 0.6666666666666666, 212 | "f1": 0.8, 213 | }, 214 | "strict": { 215 | "correct": 2, 216 | "incorrect": 0, 217 | "partial": 0, 218 | "missed": 1, 219 | "spurious": 0, 220 | "possible": 3, 221 | "actual": 2, 222 | "precision": 1.0, 223 | "recall": 0.6666666666666666, 224 | "f1": 0.8, 225 | }, 226 | "exact": { 227 | "correct": 2, 228 | "incorrect": 0, 229 | "partial": 0, 230 | "missed": 1, 231 | "spurious": 0, 232 | "possible": 3, 233 | "actual": 2, 234 | "precision": 1.0, 235 | "recall": 0.6666666666666666, 236 | "f1": 0.8, 237 | }, 238 | } 239 | 240 | 241 | ######## evaluation tests ######## 242 | 243 | 244 | def test_evaluate_example(spacy_model, dataset, metric, db, capsys): 245 | evaluate_example(model=spacy_model, ner=dataset, metric=metric, n_results=5) 246 | 247 | captured = capsys.readouterr() 248 | 249 | assert "Scored Example" in captured.out 250 | 251 | db.drop_dataset(dataset) 252 | 253 | 254 | def test_evaluate(spacy_model, dataset, db, capsys): 255 | results = evaluate( 256 | model=spacy_model, 257 | ner=dataset, 258 | label_stats=True, 259 | cf_matrix=False, # False 260 | ) 261 | 262 | captured = capsys.readouterr() 263 | 264 | assert "P" in captured.out 265 | assert "R" in captured.out 266 | assert "F" in captured.out 267 | 268 | assert isinstance(results, dict) 269 | assert "token_acc" in results 270 | assert "token_p" in results 271 | assert results.get("token_p") == 1 272 | assert isinstance(results.get("ents_p"), float) 273 | assert results.get("speed") > 1 274 | 275 | db.drop_dataset(dataset) 276 | 277 | 278 | def test_nervaluate(spacy_model, dataset, db, capsys): 279 | results = evaluate_nervaluate( 280 | model=spacy_model, 281 | ner=dataset, 282 | ) 283 | captured = capsys.readouterr() 284 | 285 | assert "Correct" in captured.out 286 | assert "Metric" in captured.out 287 | assert "Ent type" in captured.out 288 | assert "Incorrect" in captured.out 289 | assert "Recall" in captured.out 290 | assert "F1" in captured.out 291 | assert "Partial" in captured.out 292 | 293 | assert isinstance(results, dict) 294 | assert "ent_type" in list(results["overall_results"].keys()) 295 | assert "partial" in results["overall_results"] 296 | 297 | assert results["overall_results"]["ent_type"]["f1"] == 1.0 298 | 299 | db.drop_dataset(dataset) 300 | 301 | 302 | def 
test_display_eval_results(scores, capsys): 303 | _display_eval_results(scores, "sc") 304 | captured = capsys.readouterr() 305 | 306 | assert "Results" in captured.out 307 | 308 | 309 | def test_get_score_for_metric(scores, metric: str): 310 | res = _get_score_for_metric(scores, metric) 311 | 312 | assert isinstance(res, float) 313 | assert isinstance(scores, dict) 314 | assert isinstance(metric, str) 315 | assert metric is not None 316 | 317 | 318 | def test_get_actual_labels_ner(ner_examples): 319 | ner_labels = _get_actual_labels(ner_examples, "ner") 320 | assert isinstance(ner_labels, list) 321 | assert len(ner_labels) == 2 322 | assert all(isinstance(label, str) for label in ner_labels[0]) 323 | assert all(isinstance(label, str) for label in ner_labels[1]) 324 | assert "O" in ner_labels[0] 325 | assert "B-ORG" in ner_labels[0] 326 | assert "U-PERSON" in ner_labels[1] 327 | 328 | 329 | def test_get_actual_labels_textcat(textcat_examples): 330 | textcat_labels = _get_actual_labels(textcat_examples, "textcat") 331 | assert isinstance(textcat_labels, list) 332 | assert len(textcat_labels) == 2 333 | assert "POSITIVE" in textcat_labels 334 | assert "NEGATIVE" in textcat_labels 335 | assert all(isinstance(label, str) for label in textcat_labels) 336 | 337 | 338 | # here we need a model as we're using one in _get_predicted_labels 339 | # because nlp.evaluate does not create example.predicted values 340 | def test_get_predicted_labels_ner(nlp, ner_examples): 341 | pred_ner_labels = _get_predicted_labels(nlp, ner_examples, "ner") 342 | assert isinstance(pred_ner_labels, list) 343 | assert len(pred_ner_labels) == 2 344 | assert all(isinstance(label, str) for label in pred_ner_labels[0]) 345 | assert all(isinstance(label, str) for label in pred_ner_labels[1]) 346 | 347 | assert "O" in pred_ner_labels[1] 348 | assert "B-ORG" in pred_ner_labels[0] 349 | 350 | 351 | def test_get_cf_actual_predicted(nlp, ner_examples): 352 | actual, predicted, labels, actual_flat, predicted_flat = _get_cf_actual_predicted( 353 | nlp, ner_examples, "ner" 354 | ) 355 | assert isinstance(actual[0], list) 356 | assert isinstance(actual_flat[0], str) 357 | 358 | assert isinstance(predicted[0], list) 359 | assert isinstance(predicted_flat[1], str) 360 | 361 | assert isinstance(actual, list) 362 | assert isinstance(predicted, list) 363 | assert isinstance(labels, list) 364 | assert "O" in actual[0] 365 | assert "B-ORG" in predicted[1] 366 | 367 | 368 | def test_create_ner_table(nervaluate_results, capsys): 369 | _create_ner_table(nervaluate_results) 370 | captured = capsys.readouterr() 371 | 372 | assert "Correct" in captured.out 373 | assert "Metric" in captured.out 374 | assert "Ent type" in captured.out 375 | assert "Incorrect" in captured.out 376 | assert "Recall" in captured.out 377 | assert "F1" in captured.out 378 | assert "Partial" in captured.out 379 | -------------------------------------------------------------------------------- /prodigy_evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union 6 | 7 | import matplotlib.pyplot as plt 8 | import spacy 9 | import srsly 10 | from nervaluate import Evaluator 11 | from prodigy.core import recipe 12 | from prodigy.errors import RecipeError 13 | from prodigy.recipes.data_utils import get_datasets_from_cli_eval, merge_corpus 14 | from 
prodigy.recipes.train import RECIPE_ARGS, set_log_level, setup_gpu 15 | from prodigy.util import SPANCAT_DEFAULT_KEY, msg 16 | from radicli import Arg 17 | 18 | # additional imports 19 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 20 | from spacy.cli.evaluate import handle_scores_per_type 21 | from spacy.language import Language 22 | from spacy.training import offsets_to_biluo_tags 23 | from spacy.training.example import Example 24 | 25 | 26 | @recipe( 27 | "evaluate.evaluate", 28 | # fmt: off 29 | model=Arg(help="Name or path of model to evaluate"), 30 | ner=RECIPE_ARGS["ner"], 31 | textcat=RECIPE_ARGS["textcat"], 32 | textcat_multilabel=RECIPE_ARGS["textcat_multilabel"], 33 | tagger=RECIPE_ARGS["tagger"], 34 | senter=RECIPE_ARGS["senter"], 35 | parser=RECIPE_ARGS["parser"], 36 | spancat=RECIPE_ARGS["spancat"], 37 | coref=RECIPE_ARGS["coref"], 38 | label_stats=Arg("--label-stats", "-LS", help="Show per-label scores"), 39 | gpu_id=RECIPE_ARGS["gpu_id"], 40 | verbose=RECIPE_ARGS["verbose"], 41 | silent=RECIPE_ARGS["silent"], 42 | cf_matrix = Arg("--confusion-matrix", "-CF", help="Show confusion matrix for the specified component"), 43 | cf_path = Arg("--cf-path", "-CP", help="Path to save the confusion matrix array"), 44 | spans_key=Arg("--spans-key", help="Optional spans key to evaluate if spancat component is used."), 45 | # fmt: on 46 | ) 47 | def evaluate( 48 | model: Union[str, Path], 49 | ner: Sequence[str] = tuple(), 50 | textcat: Sequence[str] = tuple(), 51 | textcat_multilabel: Sequence[str] = tuple(), 52 | tagger: Sequence[str] = tuple(), 53 | senter: Sequence[str] = tuple(), 54 | parser: Sequence[str] = tuple(), 55 | spancat: Sequence[str] = tuple(), 56 | coref: Sequence[str] = tuple(), 57 | label_stats: bool = False, 58 | gpu_id: int = -1, 59 | verbose: bool = False, 60 | silent: bool = True, 61 | cf_matrix: bool = False, 62 | cf_path: Optional[Path] = None, 63 | spans_key: str = SPANCAT_DEFAULT_KEY, 64 | ) -> Dict[str, Any]: 65 | """Evaluate a spaCy pipeline on one or more datasets for different components. 66 | 67 | This command takes care of merging all annotations on the same input data like the 68 | prodigy train command. 69 | 70 | You can also use the --label-stats flag to show per-label scores for NER and textcat 71 | components. This will show the precision, recall and F-score for each label. 72 | 73 | Finally, you can also use --confusion-matrix to show the confusion matrix for the 74 | specified component. This will only work for NER or textcat components. 
75 | 76 | Example Usage: 77 | 78 | ``` 79 | prodigy evaluate.evaluate en_core_web_sm --ner my_eval_dataset --confusion-matrix 80 | ``` 81 | """ 82 | set_log_level(verbose=verbose, silent=silent) 83 | setup_gpu(gpu_id) 84 | nlp = spacy.load(model) 85 | 86 | pipes = get_datasets_from_cli_eval( 87 | ner, 88 | textcat, 89 | textcat_multilabel, 90 | tagger, 91 | senter, 92 | parser, 93 | spancat, 94 | coref, 95 | ) 96 | pipe_key = [k for k in pipes if pipes.get(k)][0] 97 | 98 | compat_pipes = { 99 | pipe_name: ([], eval_sets) for pipe_name, eval_sets in pipes.items() 100 | } 101 | merged_corpus = merge_corpus(nlp, compat_pipes) 102 | dev_examples = merged_corpus["dev"](nlp) 103 | scores = nlp.evaluate(dev_examples) 104 | 105 | if pipe_key in ["ner", "textcat"]: 106 | ( 107 | actual_labels, 108 | predicted_labels, 109 | labels, 110 | flat_actual_labels, 111 | flat_predicted_labels, 112 | ) = _get_cf_actual_predicted( 113 | nlp=nlp, dev_examples=dev_examples, pipe_key=pipe_key 114 | ) 115 | labels_to_include = [label for label in labels if label != "O"] 116 | if pipe_key == "ner": 117 | actual_labels = flat_actual_labels 118 | predicted_labels = flat_predicted_labels 119 | 120 | cfarray = confusion_matrix( 121 | actual_labels, predicted_labels, labels=labels_to_include, normalize="true" 122 | ) 123 | 124 | _display_eval_results( 125 | scores, spans_key=spans_key, silent=False, per_type=label_stats 126 | ) 127 | 128 | if cf_matrix: 129 | if pipe_key not in ["ner", "textcat"]: 130 | msg.fail( 131 | f"Confusion matrix is not supported for {pipe_key} component", exits=1 132 | ) 133 | _display_confusion_matrix( 134 | cm=cfarray, 135 | labels=labels_to_include, 136 | ) 137 | msg.good("Confusion matrix displayed") 138 | 139 | if cf_path: 140 | if pipe_key not in ["ner", "textcat"]: 141 | msg.fail( 142 | f"Confusion matrix is not supported for {pipe_key} component", exits=1 143 | ) 144 | if not cf_path.exists(): 145 | os.makedirs(cf_path) 146 | 147 | full_cf_path = cf_path / "cf_array.json" 148 | srsly.write_json( 149 | full_cf_path, 150 | { 151 | "cf_array": cfarray.tolist(), 152 | "labels": labels_to_include, 153 | }, 154 | ) 155 | msg.good(f"Confusion matrix array saved to {full_cf_path}") 156 | 157 | return scores 158 | 159 | 160 | @recipe( 161 | "evaluate.evaluate-example", 162 | # fmt: off 163 | model=Arg(help="Path to model to evaluate"), 164 | ner=RECIPE_ARGS["ner"], 165 | textcat=RECIPE_ARGS["textcat"], 166 | textcat_multilabel=RECIPE_ARGS["textcat_multilabel"], 167 | tagger=RECIPE_ARGS["tagger"], 168 | senter=RECIPE_ARGS["senter"], 169 | parser=RECIPE_ARGS["parser"], 170 | spancat=RECIPE_ARGS["spancat"], 171 | coref=RECIPE_ARGS["coref"], 172 | gpu_id=RECIPE_ARGS["gpu_id"], 173 | verbose=RECIPE_ARGS["verbose"], 174 | silent=RECIPE_ARGS["silent"], 175 | metric=Arg("--metric", "-m", help="Metric to use for sorting examples"), 176 | n_results = Arg("--n-results", "-NR", help="Number of top examples to display"), 177 | output_path=Arg("--output-path", "-OP", help="Path to save the top examples and scores") 178 | # fmt: on 179 | ) 180 | def evaluate_example( 181 | model: Union[str, Path], 182 | ner: Sequence[str] = tuple(), 183 | textcat: Sequence[str] = tuple(), 184 | textcat_multilabel: Sequence[str] = tuple(), 185 | tagger: Sequence[str] = tuple(), 186 | senter: Sequence[str] = tuple(), 187 | parser: Sequence[str] = tuple(), 188 | spancat: Sequence[str] = tuple(), 189 | coref: Sequence[str] = tuple(), 190 | gpu_id: int = -1, 191 | verbose: bool = False, 192 | silent: bool = True, 193 | metric: 
Optional[str] = None, 194 | n_results: int = 10, 195 | output_path: Optional[Path] = None, 196 | ): 197 | """Evaluate a spaCy pipeline on one or more datasets for different components 198 | on a per-example basis. This command will run an evaluation on each example individually 199 | and then sort by the desired `--metric` argument. 200 | 201 | This is useful for debugging and understanding the easiest 202 | and hardest examples for your model. 203 | 204 | Example Usage: 205 | ``` 206 | prodigy evaluate.evaluate-example en_core_web_sm --ner my_eval_dataset --metric ents_f 207 | ``` 208 | 209 | This will sort examples by lowest NER F-score. 210 | """ 211 | if not metric: 212 | raise RecipeError( 213 | "You must pass a metric to sort examples via --metric argument. Refer to prodigy evaluate-example documentation for available metric types." 214 | ) 215 | 216 | set_log_level(verbose=verbose, silent=silent) 217 | setup_gpu(gpu_id) 218 | nlp = spacy.load(model) 219 | 220 | pipes = get_datasets_from_cli_eval( 221 | ner, 222 | textcat, 223 | textcat_multilabel, 224 | tagger, 225 | senter, 226 | parser, 227 | spancat, 228 | coref, 229 | ) 230 | compat_pipes = { 231 | pipe_name: ([], eval_sets) for pipe_name, eval_sets in pipes.items() 232 | } 233 | merged_corpus = merge_corpus(nlp, compat_pipes) 234 | dev_examples = merged_corpus["dev"](nlp) 235 | results: List[ScoredExample] = evaluate_each_example(nlp, dev_examples, metric) 236 | 237 | top_results: List[ScoredExample] = results[:n_results] 238 | 239 | if len(top_results) == 0: 240 | msg.fail(f"No examples found for the metric {metric}.", exits=1) 241 | avg_text_len = sum([len(ex.example.text) for ex in top_results]) / len(top_results) 242 | if avg_text_len > 100: 243 | msg.warn( 244 | f"Average # of characters of top examples is {round(avg_text_len, 2)}. This will not display well in the terminal. Consider saving the top examples to file with `--output-path` and investigating accordingly." 245 | ) 246 | 247 | def split_string_into_tuples(input_string: str, length: int = 50): 248 | """ 249 | This function takes a string and splits it into tuples of length `length`. 250 | Useful for wrapping long strings in tables. 
251 | """ 252 | input_string = input_string.rstrip() 253 | if len(input_string) > length: 254 | result = tuple( 255 | input_string[i : i + length].rstrip() 256 | for i in range(0, len(input_string), length) 257 | ) 258 | return result 259 | else: 260 | return input_string 261 | 262 | data = [ 263 | ( 264 | split_string_into_tuples(ex.example.text), 265 | round(ex.score, 2) if ex.score is not None else None, 266 | ) 267 | for ex in top_results 268 | ] 269 | headers = ["Example", metric] 270 | widths = (50, 9) 271 | aligns = ("l", "l") 272 | 273 | msg.divider("Scored Examples") 274 | msg.table( 275 | data, header=headers, divider=True, widths=widths, aligns=aligns, multiline=True 276 | ) 277 | 278 | if output_path: 279 | if not output_path.exists(): 280 | os.makedirs(output_path) 281 | 282 | results_path = output_path / "hardest_examples.jsonl" 283 | 284 | results_jsonl = [] 285 | for data in top_results: 286 | results_json = { 287 | "text": data.example.text, 288 | "meta": {"score": data.score, "metric": metric}, 289 | } 290 | results_jsonl.append(results_json) 291 | 292 | srsly.write_jsonl(results_path, results_jsonl) 293 | msg.good(f"The examples with the lowest scores saved to {results_path}") 294 | msg.info( 295 | "You can inspect the NER/spancat/textcat predictions on the hardest examples by running one of the Prodigy `*.correct` or `*.model-annotate` workflows. See documentation for more details: https://prodi.gy/docs/recipes" 296 | ) 297 | 298 | 299 | @recipe( 300 | "evaluate.nervaluate", 301 | # fmt: off 302 | model=Arg(help="Path to model to evaluate"), 303 | ner=RECIPE_ARGS["ner"], 304 | gpu_id=RECIPE_ARGS["gpu_id"], 305 | verbose=RECIPE_ARGS["verbose"], 306 | per_label=Arg("--per-label", "-PL", help="Show per-label NER nervaluate scores"), 307 | # fmt: on 308 | ) 309 | def evaluate_nervaluate( 310 | model: Union[str, Path], 311 | ner: Sequence[str], 312 | gpu_id: int = -1, 313 | verbose: bool = False, 314 | per_label: bool = False, 315 | ): 316 | """ 317 | Evaluate spaCy's NER component using nervaluate metrics. the `nervaluate` library 318 | provides full named-entity (i.e. not tag/token) evaluation metrics based on SemEval’13. 319 | 320 | For more information on these metric, see https://github.com/MantisAI/nervaluate. 321 | 322 | Example Usage: 323 | 324 | ``` 325 | prodigy evaluate.nervaluate en_core_web_sm --ner my_eval_dataset 326 | ``` 327 | """ 328 | set_log_level(verbose=verbose, silent=True) # silence component merging 329 | setup_gpu(gpu_id) 330 | nlp = spacy.load(model) 331 | merged_corpus = merge_corpus(nlp, {"ner": ([], [ner])}) 332 | dev_examples = merged_corpus["dev"](nlp) 333 | ( 334 | actual_labels, 335 | predicted_labels, 336 | labels, 337 | flat_actual_labels, 338 | flat_predicted_labels, 339 | ) = _get_cf_actual_predicted(nlp=nlp, dev_examples=dev_examples, pipe_key="ner") 340 | 341 | evaluator = Evaluator(actual_labels, predicted_labels, tags=labels, loader="list") 342 | ner_results, ner_results_by_tag = evaluator.evaluate() 343 | msg.divider("nervaluate NER metrics") 344 | msg.info( 345 | "Full named-entity (i.e., not tag/token) evaluation metrics based on SemEval’13. 
For more information on these metrics, see https://github.com/MantisAI/nervaluate" 346 | ) 347 | msg.text("NER: Overall") 348 | _create_ner_table(ner_results) 349 | 350 | if per_label: 351 | for tag, tag_results in ner_results_by_tag.items(): 352 | if tag != "O": 353 | msg.text(title=f"NER: {tag}") 354 | _create_ner_table(tag_results) 355 | 356 | return {"overall_results": ner_results, "results_by_tag": ner_results_by_tag} 357 | 358 | 359 | @dataclass 360 | class ScoredExample: 361 | example: Example 362 | score: Optional[float] 363 | scores: Dict[str, float] 364 | 365 | 366 | def _get_score_for_metric(scores: Dict[str, float], metric: str) -> Union[float, None]: 367 | """Returns the score for the specified metric. 368 | 369 | Args: 370 | scores (Dict[str, float]): Dictionary containing scores for different metrics 371 | metric (str): Metric to get the score for 372 | 373 | Returns: 374 | Union[float, None]: Score for the specified metric or None if not found 375 | """ 376 | 377 | return scores.get(metric, None) 378 | 379 | 380 | def evaluate_each_example( 381 | nlp: Language, 382 | dev_examples: Iterable[Example], 383 | metric: str, 384 | desc: bool = False, 385 | skip_none: bool = True, 386 | ) -> List[ScoredExample]: 387 | def sort_key(x: Tuple[Example, Dict[str, float]]) -> Union[float, int]: 388 | _, eval_scores = x 389 | res = _get_score_for_metric(eval_scores, metric) 390 | if res is None: 391 | res = 0 392 | if not isinstance(res, (float, int)): 393 | raise ValueError(f"Invalid metric to sort by: {metric}", res) 394 | return res 395 | 396 | per_example_scores = {} 397 | for example in dev_examples: 398 | scores = nlp.evaluate([example]) 399 | res = _get_score_for_metric(scores, metric) 400 | if res is None and skip_none: 401 | continue 402 | per_example_scores[example] = scores 403 | 404 | sorted_per_example_scores = [ 405 | ScoredExample( 406 | example=eg, 407 | score=_get_score_for_metric(example_scores, metric), 408 | scores=example_scores, 409 | ) 410 | for eg, example_scores in sorted( 411 | per_example_scores.items(), key=sort_key, reverse=desc 412 | ) 413 | ] 414 | return sorted_per_example_scores 415 | 416 | 417 | def _display_eval_results( 418 | scores: Dict[str, Any], spans_key: str, silent: bool = False, per_type: bool = False 419 | ) -> None: 420 | """Displays the evaluation results for the specified component. 421 | 422 | Args: 423 | scores (Dict[str, Any]): Dictionary containing evaluation scores from `nlp.evaluate` 424 | spans_key (str): Optional spans key to evaluate if spancat component is used. 425 | silent (bool, optional): Whether to display all results or not. Defaults to False. 
426 | """ 427 | metrics = { 428 | "TOK": "token_acc", 429 | "TAG": "tag_acc", 430 | "POS": "pos_acc", 431 | "MORPH": "morph_acc", 432 | "LEMMA": "lemma_acc", 433 | "UAS": "dep_uas", 434 | "LAS": "dep_las", 435 | "NER P": "ents_p", 436 | "NER R": "ents_r", 437 | "NER F": "ents_f", 438 | "TEXTCAT": "cats_score", 439 | "SENT P": "sents_p", 440 | "SENT R": "sents_r", 441 | "SENT F": "sents_f", 442 | "SPAN P": f"spans_{spans_key}_p", 443 | "SPAN R": f"spans_{spans_key}_r", 444 | "SPAN F": f"spans_{spans_key}_f", 445 | "SPEED": "speed", 446 | } 447 | results = {} 448 | data = {} 449 | for metric, key in metrics.items(): 450 | if key in scores: 451 | if key == "cats_score": 452 | metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" 453 | if isinstance(scores[key], (int, float)): 454 | if key == "speed": 455 | results[metric] = f"{scores[key]:.0f}" 456 | else: 457 | results[metric] = f"{scores[key]*100:.2f}" 458 | else: 459 | results[metric] = "-" 460 | data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] 461 | msg.table(results, title="Results") 462 | 463 | if per_type: 464 | data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) 465 | 466 | 467 | #### Confusion matrix functions #### 468 | 469 | 470 | def _get_actual_labels(dev_examples: Iterable[Example], pipe_key: str) -> List[Any]: 471 | """Returns the actual labels for the specified component. 472 | 473 | Args: 474 | dev_examples (Iterable[Example]): List of examples 475 | pipe_key (str): Name of the component 476 | 477 | Returns: 478 | List[Any]: List of actual labels 479 | """ 480 | actual_labels = [] 481 | for ex in dev_examples: 482 | ref = ex.reference # we have reference but we don't have predicted 483 | if pipe_key == "ner": 484 | ents = ex.get_aligned_ner() 485 | ents_clean = ["O" if x is None else x for x in ents] 486 | actual_labels.append(ents_clean) 487 | elif pipe_key == "textcat": 488 | text_labels = ref.cats 489 | most_likely_class = ( 490 | max(text_labels, key=lambda k: text_labels[k]) 491 | if text_labels != {} 492 | else "O" 493 | ) 494 | actual_labels.append(most_likely_class) 495 | 496 | return actual_labels 497 | 498 | 499 | def _get_predicted_labels( 500 | nlp: Language, dev_examples: Iterable[Example], pipe_key: str 501 | ) -> List[Any]: 502 | """Returns the predicted labels for the specified component. 503 | 504 | Args: 505 | nlp (Language): spaCy model 506 | dev_examples (Iterable[Example]): List of examples 507 | pipe_key (str): Name of the component 508 | 509 | Returns: 510 | List[Any]: List of predicted labels 511 | """ 512 | 513 | texts = [eg.text for eg in dev_examples] 514 | pred_labels = [] 515 | for eg in nlp.pipe(texts): 516 | if pipe_key == "ner": 517 | ents = [(ent.start_char, ent.end_char, ent.label_) for ent in eg.ents] 518 | biluo_tags = offsets_to_biluo_tags(eg, ents) 519 | pred_labels.append(biluo_tags) 520 | elif pipe_key == "textcat": 521 | text_labels = eg.cats 522 | most_likely_class = ( 523 | max(text_labels, key=lambda k: text_labels[k]) 524 | if text_labels != {} 525 | else "O" 526 | ) 527 | pred_labels.append(most_likely_class) 528 | 529 | return pred_labels 530 | 531 | 532 | def _get_cf_actual_predicted( 533 | nlp: Language, dev_examples: Iterable[Example], pipe_key: str 534 | ): 535 | """Returns the actual and predicted labels for the specified component. 
536 | 537 | Args: 538 | nlp (Language): spaCy model 539 | dev_examples (Iterable[Example]): List of examples 540 | pipe_key (str): Name of the component 541 | 542 | Returns: 543 | Tuple containing actual labels, predicted labels, labels, flat actual labels and flat predicted labels 544 | """ 545 | actual_labels = [label for label in _get_actual_labels(dev_examples, pipe_key)] 546 | predicted_labels = [ 547 | label for label in _get_predicted_labels(nlp, dev_examples, pipe_key) 548 | ] 549 | if pipe_key == "textcat": 550 | labels = set(predicted_labels).union(set(actual_labels)) 551 | return actual_labels, predicted_labels, list(labels), [], [] 552 | 553 | elif pipe_key == "ner": 554 | actual_labels_flat = [ 555 | label.split("-")[-1] for sublist in actual_labels for label in sublist 556 | ] 557 | predicted_labels_flat = [ 558 | label.split("-")[-1] for sublist in predicted_labels for label in sublist 559 | ] 560 | labels = set(predicted_labels_flat).union(set(actual_labels_flat)) 561 | 562 | return ( 563 | actual_labels, 564 | predicted_labels, 565 | list(labels), 566 | actual_labels_flat, 567 | predicted_labels_flat, 568 | ) 569 | 570 | 571 | def _display_confusion_matrix(cm: List[List[float]], labels: List[Any]) -> None: 572 | """Displays the confusion matrix for the specified component. 573 | 574 | Args: 575 | cm (List[List[float]]): Confusion matrix array 576 | labels (List[Any]): List of labels 577 | """ 578 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels) 579 | ax = disp.plot(colorbar=False, cmap="Blues") 580 | ax.ax_.set_title("Confusion Matrix") 581 | plt.show() 582 | 583 | 584 | def _create_ner_table(results: Dict[str, Dict[str, float]]): 585 | """Creates a table for NER results. 586 | 587 | Args: 588 | results (Dict[str, Dict[str, float]]): Dictionary containing NER results. 589 | """ 590 | 591 | ner_metrics = [ 592 | "correct", 593 | "incorrect", 594 | "partial", 595 | "missed", 596 | "spurious", 597 | "possible", 598 | "actual", 599 | "precision", 600 | "recall", 601 | "f1", 602 | ] 603 | headers = tuple(["Metric"] + [m.capitalize() for m in ner_metrics]) 604 | 605 | metrics_formatted = [] 606 | for eval_type, metrics in results.items(): 607 | row = [eval_type.replace("_", " ").capitalize()] 608 | row.extend( 609 | [ 610 | ( 611 | round(metrics.get(key, None), 2) 612 | if metrics.get(key, None) is not None 613 | else None 614 | ) 615 | for key in ner_metrics 616 | ] 617 | ) 618 | metrics_formatted.append(row) 619 | 620 | msg.table(metrics_formatted, header=headers, divider=True) 621 | --------------------------------------------------------------------------------