├── seqscore
├── scripts
│ ├── __init__.py
│ └── seqscore.py
├── __init__.py
├── processing.py
├── util.py
├── model.py
├── validation.py
└── scoring.py
├── tests
├── test_files
│ ├── empty.txt
│ ├── map_LOC_GPE.json
│ ├── map_PERSON.json
│ ├── map_bad_value.json
│ ├── map_NAME.json
│ ├── map_bad_duplicate.json
│ ├── minimal_comments_1.bio
│ ├── minimal_comments_2.bio
│ ├── minimal_comments_3.bio
│ ├── count_minimal_ref.txt
│ ├── count_minimal_ref_comma.txt
│ ├── minimal_comments_4.bio
│ ├── count_minimal_twofiles_ref.txt
│ ├── minimal_no_names.bio
│ ├── minimal_no_LOC.bio
│ ├── minimal_GPE.bio
│ ├── space_delim.txt
│ ├── minimal_bio_copy.txt
│ ├── minimal_NAME.bio
│ ├── minimal_bio_extra_line.txt
│ └── minimal_comments.bio
├── test.sh
├── conll_annotation
│ ├── bad_label1.bio
│ ├── bad_label2.bio
│ ├── bad_label3.bio
│ ├── minimal2.bio
│ ├── invalid1_BIO_discard.txt
│ ├── invalid1.bio
│ ├── minimal.bio
│ ├── minimal.bioes
│ ├── minimal.io
│ ├── invalid1.bioes
│ ├── minimal_fields.bio
│ └── minimal_fields.iob
├── check.sh
├── test_all.sh
├── conll_predictions
│ ├── correct1.bio
│ ├── incorrect1.bio
│ ├── correct1_improper_sequence_ref.txt
│ ├── correct1_improper_sequence_pred.txt
│ ├── incorrect_type_not_in_reference.bio
│ ├── incorrect1_nopredictions.bio
│ └── correct1_improper_sequence.bio
├── conll_merged_predictions
│ ├── incorrect1_nopredictions.bio
│ ├── correct1.bio
│ ├── incorrect1.bio
│ └── correct1_improper_sequence.bio
├── create_release.sh
├── pre_commit.sh
├── import_all.py
├── test_seqscore_main.py
├── test_extract_text_click.py
├── test_utils.py
├── test_model.py
├── test_summarize_click.py
├── test_conll_format.py
├── test_repair_click.py
├── test_scoring_click.py
├── test_conversion_click.py
├── test_count_click.py
├── test_validation_click.py
├── test_process_click.py
├── test_scoring.py
├── test_validation.py
└── test_encoding.py
├── samples
├── type_map_NAME.json
├── invalid_count.csv
├── reference_count.csv
├── keep_ORG.bio
├── remove_ORG.bio
├── invalid.bio
├── invalid_repair_discard.bio
├── predicted.bio
├── reference.bio
├── all_NAME.bio
├── reference.bioes
└── invalid_repair_conlleval.bio
├── .coveragerc
├── pyproject.toml
├── .flake8
├── docs
├── index.rst
├── Makefile
├── make.bat
└── conf.py
├── requirements.txt
├── .readthedocs.yaml
├── LICENSE
├── .github
└── workflows
│ └── main.yml
├── setup.py
├── .gitignore
├── external
└── conlleval.pl
└── README.md
/seqscore/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_files/empty.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/seqscore/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.7.0"
2 |
--------------------------------------------------------------------------------
/samples/type_map_NAME.json:
--------------------------------------------------------------------------------
1 | {
2 | "NAME": ["LOC", "ORG"]
3 | }
4 |
--------------------------------------------------------------------------------
/tests/test_files/map_LOC_GPE.json:
--------------------------------------------------------------------------------
1 | {
2 | "GPE": ["LOC"]
3 | }
4 |
--------------------------------------------------------------------------------
/tests/test_files/map_PERSON.json:
-------------------------------------------------------------------------------- 1 | { 2 | "PERSON": ["PER"] 3 | } 4 | -------------------------------------------------------------------------------- /tests/test_files/map_bad_value.json: -------------------------------------------------------------------------------- 1 | { 2 | "GPE": "LOC" 3 | } 4 | -------------------------------------------------------------------------------- /tests/test_files/map_NAME.json: -------------------------------------------------------------------------------- 1 | { 2 | "NAME": ["LOC", "ORG"] 3 | } 4 | -------------------------------------------------------------------------------- /tests/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euxo pipefail 3 | 4 | pytest tests/ 5 | -------------------------------------------------------------------------------- /tests/conll_annotation/bad_label1.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is 3 | a O 4 | sentence O 5 | . O 6 | -------------------------------------------------------------------------------- /tests/conll_annotation/bad_label2.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence GPE 5 | . O 6 | -------------------------------------------------------------------------------- /tests/conll_annotation/bad_label3.bio: -------------------------------------------------------------------------------- 1 | This OUT 2 | is OUT 3 | a OUT 4 | sentence OUT 5 | . OUT 6 | -------------------------------------------------------------------------------- /tests/test_files/map_bad_duplicate.json: -------------------------------------------------------------------------------- 1 | { 2 | "GPE1": ["LOC"], 3 | "GPE2": ["LOC"] 4 | } 5 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | raise NotImplementedError 4 | pragma: no cover 5 | -------------------------------------------------------------------------------- /tests/test_files/minimal_comments_1.bio: -------------------------------------------------------------------------------- 1 | # 2 | This O 3 | is O 4 | a O 5 | sentence O 6 | . O 7 | 8 | -------------------------------------------------------------------------------- /samples/invalid_count.csv: -------------------------------------------------------------------------------- 1 | 1 ORG University of Pennsylvania 2 | 1 LOC West Philadelphia 3 | 1 LOC Pennsylvania 4 | -------------------------------------------------------------------------------- /tests/test_files/minimal_comments_2.bio: -------------------------------------------------------------------------------- 1 | # Comment 2 | This O 3 | is O 4 | a O 5 | sentence O 6 | . O 7 | 8 | -------------------------------------------------------------------------------- /samples/reference_count.csv: -------------------------------------------------------------------------------- 1 | 1 ORG University of Pennsylvania 2 | 1 LOC West Philadelphia 3 | 1 LOC Pennsylvania 4 | -------------------------------------------------------------------------------- /tests/test_files/minimal_comments_3.bio: -------------------------------------------------------------------------------- 1 | # Three fields 2 | This O 3 | is O 4 | a O 5 | sentence O 6 | . 
O 7 | 8 | -------------------------------------------------------------------------------- /tests/test_files/count_minimal_ref.txt: -------------------------------------------------------------------------------- 1 | 1 ORG University of Pennsylvania 2 | 1 LOC West Philadelphia 3 | 1 LOC Pennsylvania 4 | -------------------------------------------------------------------------------- /tests/test_files/count_minimal_ref_comma.txt: -------------------------------------------------------------------------------- 1 | 1,ORG,University of Pennsylvania 2 | 1,LOC,West Philadelphia 3 | 1,LOC,Pennsylvania 4 | -------------------------------------------------------------------------------- /tests/test_files/minimal_comments_4.bio: -------------------------------------------------------------------------------- 1 | # Now four fields 2 | # And a second line 3 | This O 4 | is O 5 | a O 6 | sentence O 7 | . O 8 | 9 | -------------------------------------------------------------------------------- /tests/check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euxo pipefail 3 | 4 | files=(seqscore/ tests/ setup.py) 5 | ruff check "${files[@]}" 6 | mypy "${files[@]}" 7 | -------------------------------------------------------------------------------- /tests/test_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euxo pipefail 3 | 4 | files=(seqscore/ tests/ setup.py) 5 | black --check "${files[@]}" 6 | flake8 "${files[@]}" 7 | mypy "${files[@]}" 8 | pytest tests/ 9 | -------------------------------------------------------------------------------- /tests/test_files/count_minimal_twofiles_ref.txt: -------------------------------------------------------------------------------- 1 | 2 LOC Waltham 2 | 1 ORG University of Pennsylvania 3 | 1 LOC West Philadelphia 4 | 1 LOC Pennsylvania 5 | 1 ORG Brandeis University 6 | 1 LOC Massachusetts 7 | -------------------------------------------------------------------------------- /samples/keep_ORG.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West O 13 | Philadelphia O 14 | , O 15 | Pennsylvania O 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /samples/remove_ORG.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University O 8 | of O 9 | Pennsylvania O 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/conll_annotation/minimal2.bio: -------------------------------------------------------------------------------- 1 | Brandeis B-ORG 2 | University I-ORG 3 | is O 4 | located O 5 | in O 6 | Waltham B-LOC 7 | , O 8 | Massachusetts B-LOC 9 | . O 10 | 11 | Waltham B-LOC 12 | is O 13 | lovely O 14 | . O 15 | 16 | -------------------------------------------------------------------------------- /tests/test_files/minimal_no_names.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . 
O 6 | 7 | University O 8 | of O 9 | Pennsylvania O 10 | is O 11 | in O 12 | West O 13 | Philadelphia O 14 | , O 15 | Pennsylvania O 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /samples/invalid.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University I-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /samples/invalid_repair_discard.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University O 8 | of O 9 | Pennsylvania O 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /samples/predicted.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia B-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /samples/reference.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /samples/all_NAME.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-NAME 8 | of I-NAME 9 | Pennsylvania I-NAME 10 | is O 11 | in O 12 | West B-NAME 13 | Philadelphia I-NAME 14 | , O 15 | Pennsylvania B-NAME 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /samples/reference.bioes: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania E-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia E-LOC 14 | , O 15 | Pennsylvania S-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/conll_annotation/invalid1_BIO_discard.txt: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University O 8 | of O 9 | Pennsylvania O 10 | is O 11 | in O 12 | West O 13 | Philadelphia O 14 | , O 15 | Pennsylvania O 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/test_files/minimal_no_LOC.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West O 13 | Philadelphia O 14 | , O 15 | Pennsylvania O 16 | . 
O 17 | 18 | -------------------------------------------------------------------------------- /tests/conll_predictions/correct1.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | -------------------------------------------------------------------------------- /tests/conll_predictions/incorrect1.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia B-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | -------------------------------------------------------------------------------- /tests/test_files/minimal_GPE.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-GPE 13 | Philadelphia I-GPE 14 | , O 15 | Pennsylvania B-GPE 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/test_files/space_delim.txt: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /samples/invalid_repair_conlleval.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/conll_annotation/invalid1.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University I-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West I-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania I-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/conll_annotation/minimal.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/conll_annotation/minimal.bioes: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania E-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia E-LOC 14 | , O 15 | Pennsylvania S-LOC 16 | . 
O 17 | 18 | -------------------------------------------------------------------------------- /tests/conll_annotation/minimal.io: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University I-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West I-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania I-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/test_files/minimal_bio_copy.txt: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/test_files/minimal_NAME.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-NAME 8 | of I-NAME 9 | Pennsylvania I-NAME 10 | is O 11 | in O 12 | West B-NAME 13 | Philadelphia I-NAME 14 | , O 15 | Pennsylvania B-NAME 16 | . O 17 | 18 | -------------------------------------------------------------------------------- /tests/conll_predictions/correct1_improper_sequence_ref.txt: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | -------------------------------------------------------------------------------- /tests/test_files/minimal_bio_extra_line.txt: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/conll_predictions/correct1_improper_sequence_pred.txt: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University I-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | -------------------------------------------------------------------------------- /tests/conll_predictions/incorrect_type_not_in_reference.bio: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence B-SPURIOUS 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West B-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | -------------------------------------------------------------------------------- /tests/conll_predictions/incorrect1_nopredictions.bio: -------------------------------------------------------------------------------- 1 | This O O 2 | is O O 3 | a O O 4 | sentence O O 5 | . O O 6 | 7 | University B-ORG O 8 | of I-ORG O 9 | Pennsylvania I-ORG O 10 | is O O 11 | in O O 12 | West B-LOC O 13 | Philadelphia I-LOC O 14 | , O O 15 | Pennsylvania B-LOC O 16 | . 
O O 17 | -------------------------------------------------------------------------------- /tests/conll_merged_predictions/incorrect1_nopredictions.bio: -------------------------------------------------------------------------------- 1 | This O O 2 | is O O 3 | a O O 4 | sentence O O 5 | . O O 6 | 7 | University B-ORG O 8 | of I-ORG O 9 | Pennsylvania I-ORG O 10 | is O O 11 | in O O 12 | West B-LOC O 13 | Philadelphia I-LOC O 14 | , O O 15 | Pennsylvania B-LOC O 16 | . O O 17 | -------------------------------------------------------------------------------- /tests/create_release.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | set -euxo pipefail 3 | 4 | rm -rf dist/* 5 | python -m pip install --upgrade build twine 6 | python -m build 7 | 8 | # When the above is done, run the following. 9 | # This is intentionally not run in the script. 10 | # python -m twine upload dist/* 11 | -------------------------------------------------------------------------------- /tests/conll_merged_predictions/correct1.bio: -------------------------------------------------------------------------------- 1 | This O O 2 | is O O 3 | a O O 4 | sentence O O 5 | . O O 6 | 7 | University B-ORG B-ORG 8 | of I-ORG I-ORG 9 | Pennsylvania I-ORG I-ORG 10 | is O O 11 | in O O 12 | West B-LOC B-LOC 13 | Philadelphia I-LOC I-LOC 14 | , O O 15 | Pennsylvania B-LOC B-LOC 16 | . O O 17 | -------------------------------------------------------------------------------- /tests/conll_merged_predictions/incorrect1.bio: -------------------------------------------------------------------------------- 1 | This O O 2 | is O O 3 | a O O 4 | sentence O O 5 | . O O 6 | 7 | University B-ORG B-ORG 8 | of I-ORG I-ORG 9 | Pennsylvania I-ORG I-ORG 10 | is O O 11 | in O O 12 | West B-LOC B-LOC 13 | Philadelphia I-LOC B-LOC 14 | , O O 15 | Pennsylvania B-LOC B-LOC 16 | . O O 17 | -------------------------------------------------------------------------------- /tests/conll_predictions/correct1_improper_sequence.bio: -------------------------------------------------------------------------------- 1 | This O O 2 | is O O 3 | a O O 4 | sentence O O 5 | . O O 6 | 7 | University B-ORG I-ORG 8 | of I-ORG I-ORG 9 | Pennsylvania I-ORG I-ORG 10 | is O O 11 | in O O 12 | West B-LOC B-LOC 13 | Philadelphia I-LOC I-LOC 14 | , O O 15 | Pennsylvania B-LOC B-LOC 16 | . O O 17 | -------------------------------------------------------------------------------- /tests/conll_merged_predictions/correct1_improper_sequence.bio: -------------------------------------------------------------------------------- 1 | This O O 2 | is O O 3 | a O O 4 | sentence O O 5 | . O O 6 | 7 | University B-ORG I-ORG 8 | of I-ORG I-ORG 9 | Pennsylvania I-ORG I-ORG 10 | is O O 11 | in O O 12 | West B-LOC B-LOC 13 | Philadelphia I-LOC I-LOC 14 | , O O 15 | Pennsylvania B-LOC B-LOC 16 | . 
O O 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.mypy] 2 | python_version = 3.9 3 | strict_optional = false 4 | disallow_untyped_defs = true 5 | disallow_untyped_calls = true 6 | 7 | [[tool.mypy.overrides]] 8 | module = [ 9 | "setuptools", 10 | "click.*", 11 | ] 12 | ignore_missing_imports = true 13 | 14 | [tool.ruff] 15 | line-length = 90 16 | target-version = "py39" 17 | -------------------------------------------------------------------------------- /tests/pre_commit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euxo pipefail 3 | 4 | files=(seqscore/ tests/ *.py) 5 | ruff check --fix "${files[@]}" 6 | ruff check --select I --fix "${files[@]}" # Organize imports 7 | ruff format "${files[@]}" 8 | ruff check "${files[@]}" # Redundant but ensures CI will pass 9 | mypy "${files[@]}" 10 | pytest --cov-report term-missing --cov=seqscore tests/ 11 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | ignore = 4 | # False positives on imports used only in generic type annotations 5 | F401 6 | # In conflict with modern PEP8, see https://gitlab.com/pycqa/flake8/issues/139 7 | W503 8 | # Raises false positives on @overload 9 | F811 10 | # In conflict with PEP8, space is allowed before colon in slicing 11 | E203 12 | # Don't warn about line length 13 | E501 14 | -------------------------------------------------------------------------------- /tests/test_files/minimal_comments.bio: -------------------------------------------------------------------------------- 1 | # 2 | This O 3 | is O 4 | a O 5 | sentence O 6 | with O 7 | comment O 8 | characters O 9 | # O 10 | ## O 11 | #1 O 12 | . O 13 | 14 | # Comment 15 | This O 16 | is O 17 | a O 18 | sentence O 19 | . O 20 | 21 | # Three fields 22 | This O 23 | is O 24 | a O 25 | sentence O 26 | . O 27 | 28 | # Now four fields 29 | # And a second line 30 | This O 31 | is O 32 | a O 33 | sentence O 34 | . O 35 | 36 | -------------------------------------------------------------------------------- /tests/conll_annotation/invalid1.bioes: -------------------------------------------------------------------------------- 1 | This O 2 | is O 3 | a O 4 | sentence O 5 | . O 6 | 7 | University B-ORG 8 | of I-ORG 9 | Pennsylvania I-ORG 10 | is O 11 | in O 12 | West S-LOC 13 | Philadelphia I-LOC 14 | , O 15 | Pennsylvania B-LOC 16 | . O 17 | 18 | University B-ORG 19 | of I-ORG 20 | Maryland I-ORG 21 | 22 | Department I-ORG 23 | of I-ORG 24 | Commerce E-ORG 25 | 26 | University I-ORG 27 | of I-ORG 28 | Maryland I-ORG 29 | 30 | Massachusetts B-LOC 31 | 32 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. SeqScore documentation master file, created by 2 | sphinx-quickstart on Wed Nov 10 05:11:47 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | SeqScore 7 | ======== 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file only contains dependencies needed for development. 2 | # setup.py contains the actual package dependencies, and the package 3 | # should be installed before these requirements. 4 | 5 | # Type annotations for tabulate 6 | types-tabulate 7 | 8 | # For testing 9 | pytest==8.3.5 10 | pytest-cov==5.0.0 11 | 12 | # For development 13 | mypy==1.14.1 14 | ruff==0.9.10 15 | 16 | # Documentation build 17 | # Disabled for now since we don't need them 18 | # sphinx 19 | # sphinx-rtd-theme 20 | -------------------------------------------------------------------------------- /tests/conll_annotation/minimal_fields.bio: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- -X- O 2 | 3 | This field3 field4 O 4 | is field5 field6 O 5 | a field7 field8 O 6 | sentence field9 field10 O 7 | . field11 field12 O 8 | 9 | -DOCSTART- -X- -X- O 10 | 11 | University field13 field14 B-ORG 12 | of field15 field16 I-ORG 13 | Pennsylvania field17 field18 I-ORG 14 | is field19 field20 O 15 | in field21 field22 O 16 | West field23 field24 B-LOC 17 | Philadelphia field25 field26 I-LOC 18 | , field27 field28 O 19 | Pennsylvania field29 field30 B-LOC 20 | . field31 field32 O 21 | 22 | -------------------------------------------------------------------------------- /tests/conll_annotation/minimal_fields.iob: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- -X- O 2 | 3 | This field3 field4 O 4 | is field5 field6 O 5 | a field7 field8 O 6 | sentence field9 field10 O 7 | . field11 field12 O 8 | 9 | -DOCSTART- -X- -X- O 10 | 11 | University field13 field14 I-ORG 12 | of field15 field16 I-ORG 13 | Pennsylvania field17 field18 I-ORG 14 | is field19 field20 O 15 | in field21 field22 O 16 | West field23 field24 I-LOC 17 | Philadelphia field25 field26 I-LOC 18 | , field27 field28 O 19 | Pennsylvania field29 field30 I-LOC 20 | . field31 field32 O 21 | 22 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.8" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - method: pip 22 | path: . 23 | -------------------------------------------------------------------------------- /tests/import_all.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """Import all top-level modules in seqscore. 3 | 4 | This imports all the top-level modules as a smoke test for making sure all needed 5 | dependencies are installed. 
We use this as basic test before pytest is installed 6 | to make sure that there are no dependencies that we are accidentally relying on 7 | pytest to install for us. 8 | """ 9 | 10 | import seqscore.scripts.seqscore # noqa: F401 11 | from seqscore import conll, encoding, model, scoring, util, validation # noqa: F401 12 | 13 | print(f"{__file__}:", "Successfully imported all top-level modules") 14 | -------------------------------------------------------------------------------- /tests/test_seqscore_main.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import seqscore 4 | 5 | HELP_OUTPUT = "Usage: seqscore [OPTIONS] COMMAND [ARGS]..." 6 | 7 | 8 | def test_seqscore_help() -> None: 9 | result = subprocess.run(["seqscore", "--help"], capture_output=True, encoding="UTF-8") 10 | assert result.returncode == 0 11 | assert result.stdout.startswith(HELP_OUTPUT) 12 | 13 | 14 | def test_seqscore_version() -> None: 15 | result = subprocess.run( 16 | ["seqscore", "--version"], capture_output=True, encoding="UTF-8" 17 | ) 18 | assert result.returncode == 0 19 | assert result.stdout == f"seqscore, version {seqscore.__version__}\n" 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Constantine Lignos, Chester Palen-Michel, and Nolan Holley 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - dev 8 | pull_request: 9 | branches: 10 | - main 11 | - dev 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-22.04 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install package and test imports 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install . 34 | ./tests/import_all.py 35 | 36 | - name: Install test dependencies 37 | run: | 38 | pip install pytest 39 | 40 | - name: Run tests 41 | run: | 42 | ./tests/test.sh 43 | 44 | - name: Install quality check dependencies 45 | run: | 46 | pip install -r requirements.txt 47 | 48 | - name: Run quality checks 49 | run: | 50 | ./tests/check.sh 51 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | from os import path 4 | 5 | from setuptools import find_packages, setup 6 | 7 | from seqscore import __version__ 8 | 9 | 10 | def setup_package() -> None: 11 | root = path.abspath(path.dirname(__file__)) 12 | with open(path.join(root, "README.md"), encoding="utf-8") as f: 13 | long_description = f.read() 14 | 15 | setup( 16 | name="seqscore", 17 | version=__version__, 18 | packages=find_packages(include=("seqscore", "seqscore.*")), 19 | # Package type information 20 | package_data={"seqscore": ["py.typed"]}, 21 | python_requires=">=3.9", 22 | license="MIT", 23 | description="SeqScore: Scoring for named entity recognition and other sequence labeling tasks", 24 | long_description=long_description, 25 | install_requires=[ 26 | "attrs>=19.2.0", 27 | "click", 28 | "tabulate", 29 | ], 30 | entry_points=""" 31 | [console_scripts] 32 | seqscore=seqscore.scripts.seqscore:cli 33 | """, 34 | classifiers=[ 35 | "Development Status :: 4 - Beta", 36 | "License :: OSI Approved :: MIT License", 37 | "Programming Language :: Python :: 3.9", 38 | "Programming Language :: Python :: 3.10", 39 | "Programming Language :: Python :: 3.11", 40 | "Programming Language :: Python :: 3.12", 41 | "Programming Language :: Python :: 3.13", 42 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 43 | ], 44 | url="https://github.com/bltlab/seqscore", 45 | long_description_content_type="text/markdown", 46 | author="Constantine Lignos", 47 | author_email="lignos@brandeis.edu", 48 | ) 49 | 50 | 51 | if __name__ == "__main__": 52 | setup_package() 53 | -------------------------------------------------------------------------------- /tests/test_extract_text_click.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | from typing import Optional, Union 4 | 5 | from click.testing import CliRunner 6 | 7 | from seqscore.scripts.seqscore import extract_text 8 | 9 | TMP_DIR: Optional[tempfile.TemporaryDirectory] = None 10 | MINIMAL_SENTENCES = [ 11 | ["This", "is", "a", "sentence", "."], 12 | [ 13 | "University", 14 | "of", 15 | "Pennsylvania", 16 | "is", 17 | "in", 18 | "West", 19 | "Philadelphia", 20 | ",", 21 | "Pennsylvania", 22 | ".", 23 | ], 24 | ] 25 | 26 | 27 | def setup_module() -> None: 28 | """Create temporary directory used by tests.""" 29 | global TMP_DIR 30 | TMP_DIR = tempfile.TemporaryDirectory() 31 | 32 | 33 | def teardown_module() -> None: 34 | """Remove temporary directory used by tests.""" 35 | TMP_DIR.cleanup() 36 | 37 | 38 | def test_single_file() -> None: 39 | runner = CliRunner() 40 | input_path = str(Path("tests") / "conll_annotation" / "minimal.bio") 41 | output_path = str(Path(TMP_DIR.name) / "out.txt") 42 | result = runner.invoke( 43 | extract_text, 44 | [ 45 | input_path, 46 | output_path, 47 | ], 48 | ) 49 | assert result.exit_code == 0 50 | actual_text = _read_tokenized_lines(output_path) 51 | assert actual_text == MINIMAL_SENTENCES 52 | 53 | 54 | def test_multiple_files() -> None: 55 | runner = CliRunner() 56 | input_path = str(Path("tests") / "conll_annotation" / "minimal.bio") 57 | output_path = str(Path(TMP_DIR.name) / "out.txt") 58 | result = runner.invoke( 59 | extract_text, 60 | [ 61 | input_path, 62 | input_path, # Put it again as a second file 63 | output_path, 64 | ], 65 | ) 66 | assert result.exit_code == 0 67 | # It's the same sentences, but with a blank line between them for the document break 68 | expected_text = MINIMAL_SENTENCES[:] 69 | expected_text.append([""]) 70 | 
expected_text.extend(MINIMAL_SENTENCES) 71 | 72 | actual_text = _read_tokenized_lines(output_path) 73 | assert actual_text == expected_text 74 | 75 | 76 | def _read_tokenized_lines(path: Union[str, Path]) -> list[list[str]]: 77 | return [line.rstrip("\n").split(" ") for line in open(path, encoding="utf8")] 78 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from seqscore.util import file_fields_match, file_lines_match, tuplify_strs 4 | 5 | 6 | def test_tuplify_strs() -> None: 7 | strs = ["a", "b", "c"] 8 | tup = tuplify_strs(strs) 9 | assert tup == ("a", "b", "c") 10 | 11 | 12 | def test_identical_files() -> None: 13 | assert file_fields_match( 14 | os.path.join("tests", "test_files", "minimal_bio_copy.txt"), 15 | os.path.join("tests", "conll_annotation", "minimal.bio"), 16 | ) 17 | assert file_lines_match( 18 | os.path.join("tests", "test_files", "minimal_bio_copy.txt"), 19 | os.path.join("tests", "conll_annotation", "minimal.bio"), 20 | ) 21 | 22 | 23 | def test_empty_file() -> None: 24 | assert not file_fields_match( 25 | os.path.join("tests", "test_files", "empty.txt"), 26 | os.path.join("tests", "conll_annotation", "minimal.bio"), 27 | ) 28 | assert not file_lines_match( 29 | os.path.join("tests", "test_files", "empty.txt"), 30 | os.path.join("tests", "conll_annotation", "minimal.bio"), 31 | ) 32 | 33 | 34 | def test_differing_whitespace() -> None: 35 | assert file_fields_match( 36 | os.path.join("tests", "test_files", "space_delim.txt"), 37 | os.path.join("tests", "conll_annotation", "minimal.bio"), 38 | ) 39 | assert not file_lines_match( 40 | os.path.join("tests", "test_files", "space_delim.txt"), 41 | os.path.join("tests", "conll_annotation", "minimal.bio"), 42 | ) 43 | 44 | 45 | def test_differing_file_fields() -> None: 46 | assert not file_fields_match( 47 | os.path.join("tests", "conll_annotation", "minimal.bio"), 48 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 49 | ) 50 | assert not file_lines_match( 51 | os.path.join("tests", "conll_annotation", "minimal.bio"), 52 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 53 | ) 54 | 55 | 56 | def test_extra_line() -> None: 57 | assert not file_fields_match( 58 | os.path.join("tests", "test_files", "minimal_bio_extra_line.txt"), 59 | os.path.join("tests", "conll_annotation", "minimal.bio"), 60 | ) 61 | assert not file_lines_match( 62 | os.path.join("tests", "test_files", "minimal_bio_extra_line.txt"), 63 | os.path.join("tests", "conll_annotation", "minimal.bio"), 64 | ) 65 | -------------------------------------------------------------------------------- /tests/test_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from seqscore.model import LabeledSequence, Mention, SequenceProvenance, Span 4 | 5 | 6 | def test_span() -> None: 7 | assert len(Span(0, 1)) == 1 8 | assert len(Span(1, 2)) == 1 9 | assert len(Span(0, 2)) == 2 10 | 11 | with pytest.raises(ValueError): 12 | Span(-1, 0) 13 | 14 | with pytest.raises(ValueError): 15 | Span(0, 0) 16 | 17 | 18 | def test_mention() -> None: 19 | m1 = Mention(Span(0, 1), "PER") 20 | assert m1.type == "PER" 21 | assert m1.span == Span(0, 1) 22 | assert len(m1) == 1 23 | 24 | with pytest.raises(ValueError): 25 | Mention(Span(0, 1), "") 26 | 27 | with pytest.raises(TypeError): 28 | # Intentionally incorrect type 29 | Mention(Span(0, 1), None) # type: 
ignore 30 | 31 | 32 | def test_labeled_sentence() -> None: 33 | s1 = LabeledSequence( 34 | ["a", "b"], 35 | ["B-PER", "I-PER"], 36 | provenance=SequenceProvenance(7, "test"), 37 | ) 38 | assert s1.tokens == ("a", "b") 39 | assert s1[0] == "a" 40 | assert s1[0:2] == ("a", "b") 41 | assert list(s1) == ["a", "b"] 42 | assert s1.labels == ("B-PER", "I-PER") 43 | assert s1.provenance == SequenceProvenance(7, "test") 44 | assert str(s1) == "a/B-PER b/I-PER" 45 | assert s1.tokens_with_labels() == (("a", "B-PER"), ("b", "I-PER")) 46 | assert s1.span_tokens(Span(0, 1)) == ("a",) 47 | assert s1.mention_tokens(Mention(Span(0, 1), "PER")) == ("a",) 48 | 49 | s2 = LabeledSequence(s1.tokens, s1.labels) 50 | # Provenance not included in equality 51 | assert s1 == s2 52 | 53 | with pytest.raises(ValueError): 54 | # Mismatched length 55 | LabeledSequence(["a", "b"], ["B-PER"]) 56 | 57 | with pytest.raises(ValueError): 58 | # Empty 59 | LabeledSequence([], []) 60 | 61 | with pytest.raises(ValueError): 62 | # Bad label 63 | LabeledSequence(["a"], [""]) 64 | 65 | with pytest.raises(ValueError): 66 | # Bad token 67 | LabeledSequence([""], ["B-PER"]) 68 | 69 | s2 = s1.with_mentions([Mention(Span(0, 2), "PER")]) 70 | assert s2.mentions == (Mention(Span(0, 2), "PER"),) 71 | 72 | with pytest.raises(ValueError): 73 | # Mismatched length between tokens and other_fields 74 | LabeledSequence(["a", "b"], ["B-PER", "I-PER"], other_fields=[["DT"]]) 75 | -------------------------------------------------------------------------------- /seqscore/processing.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | from seqscore.model import LabeledSequence, Mention 4 | 5 | 6 | class TypeMapper: 7 | def __init__( 8 | self, 9 | keep_types: Iterable[str], 10 | remove_types: Iterable[str], 11 | type_map: dict[str, list[str]], 12 | ): 13 | # Copy keep/remove as sets 14 | self.keep_types: set[str] = set(keep_types) 15 | self.remove_types: set[str] = set(remove_types) 16 | # Since the CLI prevents these from both being specified, this can't be hit by tests 17 | if self.keep_types and self.remove_types: # pragma: no cover 18 | raise ValueError("Cannot specify both keep_types and remove_types") 19 | 20 | # Invert the type map 21 | self.type_map: dict[str, str] = {} 22 | for to_type, from_types in type_map.items(): 23 | assert to_type # Type cannot be blank 24 | for from_type in from_types: 25 | assert from_type # Type cannot be blank 26 | if from_type in self.type_map: 27 | raise ValueError( 28 | f"Multiple mappings specified for type {repr(from_type)} in type map" 29 | ) 30 | else: 31 | self.type_map[from_type] = to_type 32 | 33 | def map_types(self, sequence: LabeledSequence) -> LabeledSequence: 34 | new_mentions: list[Mention] = [] 35 | for mention in sequence.mentions: 36 | if mention.type in self.type_map: 37 | mention = mention.with_type(self.type_map[mention.type]) 38 | 39 | if (self.keep_types and mention.type not in self.keep_types) or ( 40 | self.remove_types and mention.type in self.remove_types 41 | ): 42 | continue 43 | 44 | new_mentions.append(mention) 45 | 46 | return sequence.with_mentions(new_mentions) 47 | 48 | 49 | def modify_types( 50 | docs: list[list[LabeledSequence]], 51 | keep_types: set[str], 52 | remove_types: set[str], 53 | type_map: dict[str, list[str]], 54 | ) -> list[list[LabeledSequence]]: 55 | mapper = TypeMapper(keep_types, remove_types, type_map) 56 | mapped_docs: list[list[LabeledSequence]] = [] 57 | for doc in docs: 58 | 
mapped_docs.append([mapper.map_types(sequence) for sequence in doc]) 59 | 60 | return mapped_docs 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Editors and IDEs 132 | *~ 133 | *.swp 134 | .idea/ 135 | 136 | # OS dotfiles 137 | .DS_Store 138 | 139 | # Development directories 140 | /data 141 | /external 142 | /output 143 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | from seqscore import __version__ 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "SeqScore" 23 | copyright = "2021, Constantine Lignos, Chester Palen-Michel, and Nolan Holley" 24 | author = "Constantine Lignos, Chester Palen-Michel, and Nolan Holley" 25 | 26 | version = __version__ 27 | # The full version, including alpha/beta/rc tags 28 | release = version 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | "sphinx.ext.duration", 38 | "sphinx.ext.doctest", 39 | "sphinx.ext.autodoc", 40 | "sphinx.ext.autosummary", 41 | "sphinx.ext.intersphinx", 42 | ] 43 | 44 | intersphinx_mapping = { 45 | "python": ("https://docs.python.org/3/", None), 46 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 47 | } 48 | intersphinx_disabled_domains = ["std"] 49 | 50 | # Add any paths that contain templates here, relative to this directory. 51 | templates_path = ["_templates"] 52 | 53 | # List of patterns, relative to source directory, that match files and 54 | # directories to ignore when looking for source files. 55 | # This pattern also affects html_static_path and html_extra_path. 56 | exclude_patterns = [] 57 | 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = "sphinx_rtd_theme" 65 | 66 | # Add any paths that contain custom static files (such as style sheets) here, 67 | # relative to this directory. They are copied after the builtin static files, 68 | # so a file named "default.css" will overwrite the builtin "default.css". 69 | html_static_path = ["_static"] 70 | 71 | # -- Options for EPUB output 72 | epub_show_urls = "footnote" 73 | -------------------------------------------------------------------------------- /seqscore/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections.abc import Iterable 3 | from itertools import zip_longest 4 | from os import PathLike 5 | from pathlib import Path 6 | from typing import Any, Optional, Union 7 | 8 | from attr import Attribute, validators 9 | 10 | # Union[str, Path] isn't enough to appease PyCharm's type checker, so adding Path here 11 | # avoids warnings. 12 | PathType = Union[str, Path, PathLike] 13 | 14 | 15 | # Type-specific implementations to work around type checker limitations. No, writing these as 16 | # generic functions with type variables does not satisfy all type checkers. 
17 | def tuplify_strs(strs: Iterable[str]) -> tuple[str, ...]: 18 | return tuple(strs) 19 | 20 | 21 | def tuplify_optional_nested_strs( 22 | items: Optional[Iterable[Iterable[str]]], 23 | ) -> Optional[tuple[tuple[str, ...], ...]]: 24 | if items is not None: 25 | return tuple(tuple(item) for item in items) 26 | else: 27 | return None 28 | 29 | 30 | def file_fields_match(path1: PathType, path2: PathType, *, debug: bool = False) -> bool: 31 | """Return whether the whitespace-delimited fields of two files are identical.""" 32 | with open(path1, encoding="utf8") as f1, open(path2, encoding="utf8") as f2: 33 | for l1, l2 in zip_longest(f1, f2): 34 | if l1 is None or l2 is None or l1.split() != l2.split(): 35 | if debug: # pragma: no cover 36 | print("Non-matching lines:") 37 | print(repr(l1)) 38 | print(repr(l2)) 39 | return False 40 | return True 41 | 42 | 43 | def file_lines_match(path1: PathType, path2: PathType, debug: bool = False) -> bool: 44 | """Return whether lines of two files are identical ignoring line endings.""" 45 | with open(path1, encoding="utf8") as f1, open(path2, encoding="utf8") as f2: 46 | for l1, l2 in zip_longest(f1, f2): 47 | if l1 is None or l2 is None or l1.rstrip("\r\n") != l2.rstrip("\r\n"): 48 | if debug: # pragma: no cover 49 | print("Lines differ:") 50 | print(l1.strip() if l1 else l1) 51 | print(l2.strip() if l2 else l2) 52 | return False 53 | return True 54 | 55 | 56 | def normalize_str_with_path(s: str) -> str: 57 | """Normalize the OS path separator to '/'.""" 58 | return s.replace(os.path.sep, "/") 59 | 60 | 61 | # Instantiate in advance for _validator_nonempty_str 62 | _instance_of_str = validators.instance_of(str) 63 | 64 | 65 | def validator_nonempty_str(_inst: Any, attr: Attribute, value: Any) -> None: 66 | # Check type 67 | _instance_of_str(value, attr, value) 68 | # Check string isn't empty 69 | if not value: 70 | raise ValueError(f"Empty string: {repr(value)}") 71 | -------------------------------------------------------------------------------- /tests/test_summarize_click.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from click.testing import CliRunner 4 | 5 | from seqscore.scripts.seqscore import summarize 6 | 7 | 8 | def test_summarize_bio_onedoc() -> None: 9 | runner = CliRunner() 10 | result = runner.invoke( 11 | summarize, 12 | [ 13 | "--labels", 14 | "BIO", 15 | os.path.join("tests", "conll_annotation", "minimal.bio"), 16 | ], 17 | ) 18 | assert result.exit_code == 0 19 | assert ( 20 | result.output 21 | == """File 'tests/conll_annotation/minimal.bio' contains 1 document(s) and 2 sentences 22 | | Entity Type | Count | 23 | |---------------|---------| 24 | | LOC | 2 | 25 | | ORG | 1 | 26 | """ 27 | ) 28 | 29 | 30 | def test_summarize_bio_onedoc_quiet() -> None: 31 | runner = CliRunner() 32 | result = runner.invoke( 33 | summarize, 34 | [ 35 | "--labels", 36 | "BIO", 37 | "--quiet", 38 | os.path.join("tests", "conll_annotation", "minimal.bio"), 39 | ], 40 | ) 41 | assert result.exit_code == 0 42 | assert ( 43 | result.output 44 | == """| Entity Type | Count | 45 | |---------------|---------| 46 | | LOC | 2 | 47 | | ORG | 1 | 48 | """ 49 | ) 50 | 51 | 52 | def test_summarize_iob_twodoc() -> None: 53 | runner = CliRunner() 54 | result = runner.invoke( 55 | summarize, 56 | [ 57 | "--labels", 58 | "IOB", 59 | os.path.join("tests", "conll_annotation", "minimal_fields.iob"), 60 | ], 61 | ) 62 | assert result.exit_code == 0 63 | assert ( 64 | result.output 65 | == """File 
'tests/conll_annotation/minimal_fields.iob' contains 2 document(s) and 2 sentences 66 | | Entity Type | Count | 67 | |---------------|---------| 68 | | LOC | 2 | 69 | | ORG | 1 | 70 | """ 71 | ) 72 | 73 | 74 | def test_summarize_bio_twofiles() -> None: 75 | runner = CliRunner() 76 | result = runner.invoke( 77 | summarize, 78 | [ 79 | "--labels", 80 | "BIO", 81 | os.path.join("tests", "conll_annotation", "minimal.bio"), 82 | os.path.join("tests", "conll_annotation", "minimal2.bio"), 83 | ], 84 | ) 85 | assert result.exit_code == 0 86 | assert ( 87 | result.output 88 | == """File 'tests/conll_annotation/minimal.bio' contains 1 document(s) and 2 sentences 89 | File 'tests/conll_annotation/minimal2.bio' contains 1 document(s) and 2 sentences 90 | Total 2 document(s) and 4 sentences 91 | | Entity Type | Count | 92 | |---------------|---------| 93 | | LOC | 5 | 94 | | ORG | 2 | 95 | """ 96 | ) 97 | -------------------------------------------------------------------------------- /tests/test_conll_format.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from seqscore.conll import CoNLLFormatError, CoNLLIngester 6 | from seqscore.encoding import REPAIR_NONE, get_encoding 7 | from seqscore.validation import InvalidLabelError 8 | 9 | 10 | def test_parse_comments_true() -> None: 11 | mention_encoding = get_encoding("BIO") 12 | ingester = CoNLLIngester(mention_encoding, parse_comment_lines=True) 13 | comments_path = Path("tests") / "test_files" / "minimal_comments.bio" 14 | with comments_path.open(encoding="utf8") as file: 15 | documents = list(ingester.ingest(file, "test", REPAIR_NONE)) 16 | 17 | assert len(documents) == 1 18 | sequences = documents[0] 19 | assert len(sequences) == 4 20 | assert sequences[0].comment == "#" 21 | assert sequences[1].comment == "# Comment" 22 | assert sequences[2].comment == "# Three fields" 23 | assert sequences[3].comment == "# Now four fields\n# And a second line" 24 | 25 | first_sent = sequences[0] 26 | 27 | assert first_sent[0] == "This" 28 | assert first_sent[7] == "#" 29 | assert first_sent[8] == "##" 30 | assert first_sent[9] == "#1" 31 | 32 | 33 | def test_parse_comments_false() -> None: 34 | mention_encoding = get_encoding("BIO") 35 | ingester = CoNLLIngester(mention_encoding) 36 | 37 | comments_path = Path("tests") / "test_files" / "minimal_comments_1.bio" 38 | with comments_path.open(encoding="utf8") as file: 39 | # err1 needs to not be reused below because the exception is a different type 40 | with pytest.raises(CoNLLFormatError) as err1: 41 | list(ingester.ingest(file, "test", REPAIR_NONE)) 42 | assert ( 43 | str(err1.value) 44 | == "Line 1 of test does not appear to be delimited and begins with #. Perhaps you want to use the --parse-comment-lines flag? Line contents: '#'" 45 | ) 46 | 47 | comments_path = Path("tests") / "test_files" / "minimal_comments_2.bio" 48 | with comments_path.open(encoding="utf8") as file: 49 | with pytest.raises(InvalidLabelError) as err: 50 | list(ingester.ingest(file, "test", REPAIR_NONE)) 51 | assert ( 52 | str(err.value) 53 | == "Could not parse label 'Comment' on line 1 of test during validation: Label 'Comment' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." 
54 | ) 55 | 56 | comments_path = Path("tests") / "test_files" / "minimal_comments_3.bio" 57 | with comments_path.open(encoding="utf8") as file: 58 | with pytest.raises(InvalidLabelError) as err: 59 | list(ingester.ingest(file, "test", REPAIR_NONE)) 60 | assert ( 61 | str(err.value) 62 | == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." 63 | ) 64 | 65 | comments_path = Path("tests") / "test_files" / "minimal_comments_4.bio" 66 | with comments_path.open(encoding="utf8") as file: 67 | with pytest.raises(InvalidLabelError) as err: 68 | list(ingester.ingest(file, "test", REPAIR_NONE)) 69 | assert ( 70 | str(err.value) 71 | == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." 72 | ) 73 | -------------------------------------------------------------------------------- /tests/test_repair_click.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from typing import Optional 4 | 5 | from click.testing import CliRunner 6 | 7 | from seqscore.scripts.seqscore import repair 8 | from seqscore.util import file_fields_match, normalize_str_with_path 9 | 10 | TMP_DIR: Optional[tempfile.TemporaryDirectory] = None 11 | 12 | 13 | def setup_module() -> None: 14 | """Create temporary directory used by tests.""" 15 | global TMP_DIR 16 | TMP_DIR = tempfile.TemporaryDirectory() 17 | 18 | 19 | def teardown_module() -> None: 20 | """Remove temporary directory used by tests.""" 21 | TMP_DIR.cleanup() 22 | 23 | 24 | def test_repair_BIO_conlleval() -> None: 25 | runner = CliRunner() 26 | result = runner.invoke( 27 | repair, 28 | [ 29 | "--repair-method", 30 | "conlleval", 31 | "--labels", 32 | "BIO", 33 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 34 | os.path.join(TMP_DIR.name, "invalid_BIO_repaired_conlleval.txt"), 35 | ], 36 | ) 37 | assert result.exit_code == 0 38 | assert ( 39 | normalize_str_with_path( 40 | "Validation errors in sequence at line 7 of tests/conll_annotation/invalid1.bio:" 41 | ) 42 | in result.output 43 | ) 44 | assert ( 45 | "Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7" 46 | in result.output 47 | ) 48 | assert ( 49 | "Invalid transition 'O' -> 'I-LOC' for token 'West' on line 12" in result.output 50 | ) 51 | assert ( 52 | "Invalid transition 'O' -> 'I-LOC' for token 'Pennsylvania' on line 15" 53 | in result.output 54 | ) 55 | assert "Used method conlleval to repair:" in result.output 56 | assert ( 57 | "Old: ('I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'I-LOC', 'I-LOC', 'O', 'I-LOC', 'O')" 58 | in result.output 59 | ) 60 | assert ( 61 | "New: ('B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'O')" 62 | in result.output 63 | ) 64 | assert file_fields_match( 65 | os.path.join(TMP_DIR.name, "invalid_BIO_repaired_conlleval.txt"), 66 | os.path.join("tests", "conll_annotation", "minimal.bio"), 67 | ) 68 | 69 | 70 | def test_repair_BIO_discard() -> None: 71 | runner = CliRunner() 72 | result = runner.invoke( 73 | repair, 74 | [ 75 | "--labels", 76 | "BIO", 77 | 
"--repair-method", 78 | "discard", 79 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 80 | os.path.join(TMP_DIR.name, "invalid_BIO_repaired_discard.txt"), 81 | ], 82 | ) 83 | assert result.exit_code == 0 84 | assert ( 85 | normalize_str_with_path( 86 | "Validation errors in sequence at line 7 of tests/conll_annotation/invalid1.bio:" 87 | ) 88 | in result.output 89 | ) 90 | assert ( 91 | "Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7" 92 | in result.output 93 | ) 94 | assert ( 95 | "Invalid transition 'O' -> 'I-LOC' for token 'West' on line 12" in result.output 96 | ) 97 | assert ( 98 | "Invalid transition 'O' -> 'I-LOC' for token 'Pennsylvania' on line 15" 99 | in result.output 100 | ) 101 | assert "Used method discard to repair:" in result.output 102 | assert ( 103 | "Old: ('I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'I-LOC', 'I-LOC', 'O', 'I-LOC', 'O')" 104 | in result.output 105 | ) 106 | assert "New: ('O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O')" in result.output 107 | assert file_fields_match( 108 | os.path.join(TMP_DIR.name, "invalid_BIO_repaired_discard.txt"), 109 | os.path.join("tests", "conll_annotation", "invalid1_BIO_discard.txt"), 110 | ) 111 | 112 | 113 | def test_invalid_label() -> None: 114 | runner = CliRunner() 115 | result = runner.invoke( 116 | repair, 117 | [ 118 | "--labels", 119 | "BIO", 120 | "--repair-method", 121 | "conlleval", 122 | os.path.join("tests", "conll_annotation", "invalid1.bioes"), 123 | os.path.join(TMP_DIR.name, "temp.txt"), 124 | ], 125 | ) 126 | assert result.exit_code != 0 127 | 128 | 129 | def test_repair_none_raises_error() -> None: 130 | runner = CliRunner() 131 | result = runner.invoke( 132 | repair, 133 | [ 134 | "--labels", 135 | "BIO", 136 | "--repair-method", 137 | "none", 138 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 139 | os.path.join(TMP_DIR.name, "temp.txt"), 140 | ], 141 | ) 142 | assert result.exit_code != 0 143 | -------------------------------------------------------------------------------- /seqscore/model.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable, Iterator, Sequence 2 | from itertools import repeat 3 | from typing import Any, Optional, Union, overload 4 | 5 | from attr import Attribute, attrib, attrs 6 | 7 | from seqscore.util import ( 8 | tuplify_optional_nested_strs, 9 | tuplify_strs, 10 | validator_nonempty_str, 11 | ) 12 | 13 | 14 | def _validator_nonnegative(_inst: Any, _attr: Attribute, value: Any) -> None: 15 | if value < 0: 16 | raise ValueError(f"Negative value: {repr(value)}") 17 | 18 | 19 | def _tuplify_mentions( 20 | mentions: Iterable["Mention"], 21 | ) -> tuple["Mention", ...]: 22 | return tuple(mentions) 23 | 24 | 25 | @attrs(frozen=True, slots=True) 26 | class Span: 27 | start: int = attrib(validator=_validator_nonnegative) 28 | end: int = attrib(validator=_validator_nonnegative) 29 | 30 | def __attrs_post_init__(self) -> None: 31 | if not self.end > self.start: 32 | raise ValueError( 33 | f"End of span ({self.end}) must be greater than start ({self.start}" 34 | ) 35 | 36 | def __len__(self) -> int: 37 | return self.end - self.start 38 | 39 | 40 | @attrs(frozen=True, slots=True) 41 | class Mention: 42 | span: Span = attrib() 43 | type: str = attrib(validator=validator_nonempty_str) 44 | 45 | def __len__(self) -> int: 46 | return len(self.span) 47 | 48 | def with_type(self, new_type: str) -> "Mention": 49 | return Mention(self.span, new_type) 50 | 51 | 52 | @attrs(frozen=True, 
slots=True) 53 | class SequenceProvenance: 54 | starting_line: int = attrib() 55 | source: Optional[str] = attrib() 56 | 57 | 58 | @attrs(frozen=True, slots=True) 59 | class LabeledSequence(Sequence[str]): 60 | tokens: tuple[str, ...] = attrib(converter=tuplify_strs) 61 | labels: tuple[str, ...] = attrib(converter=tuplify_strs) 62 | mentions: tuple[Mention, ...] = attrib(default=(), converter=_tuplify_mentions) 63 | other_fields: Optional[tuple[tuple[str, ...], ...]] = attrib( 64 | default=None, kw_only=True, converter=tuplify_optional_nested_strs 65 | ) 66 | provenance: Optional[SequenceProvenance] = attrib( 67 | default=None, eq=False, kw_only=True 68 | ) 69 | comment: Optional[str] = attrib(default=None, eq=False, kw_only=True) 70 | 71 | def __attrs_post_init__(self) -> None: 72 | # TODO: Check for overlapping mentions 73 | 74 | if len(self.tokens) != len(self.labels): 75 | raise ValueError( 76 | f"Tokens ({len(self.tokens)}) and labels ({len(self.labels)}) " 77 | "must be of the same length" 78 | ) 79 | if not self.tokens: 80 | raise ValueError("Tokens and labels must be non-empty") 81 | 82 | if self.other_fields and len(self.tokens) != len(self.other_fields): 83 | raise ValueError( 84 | f"Tokens ({len(self.tokens)}) and other_fields ({len(self.other_fields)}) " 85 | "must be of the same length" 86 | ) 87 | 88 | for label in self.labels: 89 | # Labels cannot be None or an empty string 90 | if not label: 91 | raise ValueError(f"Invalid label: {repr(label)}") 92 | 93 | for token in self.tokens: 94 | # Tokens cannot be None or an empty string 95 | if not token: 96 | raise ValueError(f"Invalid token: {repr(token)}") 97 | 98 | def with_mentions(self, mentions: Sequence[Mention]) -> "LabeledSequence": 99 | return LabeledSequence( 100 | self.tokens, self.labels, mentions, provenance=self.provenance 101 | ) 102 | 103 | @overload 104 | def __getitem__(self, index: int) -> str: 105 | raise NotImplementedError 106 | 107 | @overload 108 | def __getitem__(self, index: slice) -> tuple[str, ...]: 109 | raise NotImplementedError 110 | 111 | def __getitem__(self, i: Union[int, slice]) -> Union[str, tuple[str, ...]]: 112 | return self.tokens[i] 113 | 114 | def __iter__(self) -> Iterator[str]: 115 | return iter(self.tokens) 116 | 117 | def __len__(self) -> int: 118 | # Guaranteed that labels and tokens are same length by construction 119 | return len(self.tokens) 120 | 121 | def __str__(self) -> str: 122 | return " ".join( 123 | "/".join((token, label)) for token, label in zip(self.tokens, self.labels) 124 | ) 125 | 126 | def tokens_with_labels(self) -> tuple[tuple[str, str], ...]: 127 | return tuple(zip(self.tokens, self.labels)) 128 | 129 | def tokens_with_other_fields( 130 | self, 131 | ) -> tuple[tuple[str, Optional[tuple[str, ...]]], ...]: 132 | if self.other_fields: 133 | return tuple(zip(self.tokens, self.other_fields)) 134 | else: 135 | return tuple(zip(self.tokens, repeat(None))) 136 | 137 | def span_tokens(self, span: Span) -> tuple[str, ...]: 138 | return self.tokens[span.start : span.end] 139 | 140 | def mention_tokens(self, mention: Mention) -> tuple[str, ...]: 141 | return self.span_tokens(mention.span) 142 | -------------------------------------------------------------------------------- /seqscore/validation.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable, Sequence 2 | from typing import Any, Optional 3 | 4 | from attr import attrib, attrs 5 | 6 | from seqscore.encoding import _ENCODING_NAMES, Encoding,
EncodingError 7 | from seqscore.util import tuplify_strs 8 | 9 | # All encodings can be validated 10 | VALIDATION_SUPPORTED_ENCODINGS: Sequence[str] = tuple(_ENCODING_NAMES) 11 | 12 | 13 | @attrs 14 | class ValidationError: 15 | msg: str = attrib() 16 | label: str = attrib() 17 | type: str = attrib() 18 | state: str = attrib() 19 | token: Optional[str] = attrib(default=None) 20 | line_num: Optional[int] = attrib(default=None) 21 | source_name: Optional[str] = attrib(default=None) 22 | 23 | 24 | class InvalidStateError(ValidationError): 25 | pass 26 | 27 | 28 | class InvalidTransitionError(ValidationError): 29 | pass 30 | 31 | 32 | class InvalidLabelError(EncodingError): 33 | def __init__(self, label: str, *args: Any, **kwargs: Any) -> None: 34 | super().__init__(*args, **kwargs) 35 | self.label: str = label 36 | 37 | 38 | def tuplify_errors(errors: Iterable[ValidationError]) -> tuple[ValidationError, ...]: 39 | return tuple(errors) 40 | 41 | 42 | @attrs 43 | class SequenceValidationResult: 44 | errors: Sequence[ValidationError] = attrib(converter=tuplify_errors) 45 | n_tokens: int = attrib() 46 | repaired_labels: Optional[tuple[str, ...]] = attrib( 47 | converter=tuplify_strs, default=() 48 | ) 49 | 50 | def is_valid(self) -> bool: 51 | return not self.errors 52 | 53 | def invalid_state_errors(self) -> list[InvalidStateError]: 54 | return [error for error in self.errors if isinstance(error, InvalidStateError)] 55 | 56 | def __len__(self) -> int: 57 | return len(self.errors) 58 | 59 | 60 | @attrs(frozen=True) 61 | class ValidationResult: 62 | errors: Sequence[ValidationError] = attrib(converter=tuplify_errors) 63 | n_tokens: int = attrib() 64 | n_sequences: int = attrib() 65 | n_docs: int = attrib() 66 | 67 | 68 | def validate_labels( 69 | labels: Sequence[str], 70 | encoding: Encoding, 71 | *, 72 | repair: Optional[str] = None, 73 | tokens: Optional[Sequence[str]] = None, 74 | line_nums: Optional[Sequence[int]] = None, 75 | source_name: Optional[str] = None, 76 | ) -> SequenceValidationResult: 77 | assert not tokens or len(tokens) == len(labels), ( 78 | "Tokens and labels must be the same length" 79 | ) 80 | assert not line_nums or len(line_nums) == len(labels), ( 81 | "Line numbers and labels must be the same length" 82 | ) 83 | 84 | errors: list[ValidationError] = [] 85 | outside = encoding.dialect.outside 86 | 87 | # Treat sequence as if preceded by outside 88 | prev_label = outside 89 | prev_state, prev_entity_type = encoding.split_label(prev_label) 90 | 91 | # Enumerate so we can look up tokens and labels if needed 92 | for idx, label in enumerate(labels): 93 | try: 94 | state, entity_type = encoding.split_label(label) 95 | except EncodingError as e: 96 | line_msg = f" on line {line_nums[idx]}" if line_nums else "" 97 | source_msg = f" of {source_name}" if source_name else "" 98 | raise InvalidLabelError( 99 | label, 100 | f"Could not parse label {repr(label)}{line_msg}{source_msg} during validation: " 101 | + str(e), 102 | ) from e 103 | 104 | if not encoding.is_valid_state(state): 105 | msg = f"Invalid state {repr(state)} in label {repr(label)}" 106 | if tokens: 107 | token = tokens[idx] 108 | msg += f" for token {repr(token)}" 109 | else: 110 | token = None 111 | 112 | if line_nums: 113 | line_num = line_nums[idx] 114 | msg += f" on line {line_num}" 115 | else: 116 | line_num = None 117 | 118 | if source_name: 119 | msg += f" of {source_name}" 120 | 121 | errors.append( 122 | InvalidStateError( 123 | msg, label, entity_type, state, token, line_num, source_name 124 | ) 125 | ) 126 | 
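# Having checked the state itself, also check that the transition from the previous label to
# the current one is permitted by the encoding (for example, 'O' -> 'I-ORG' is an invalid
# transition under BIO).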
127 | if not encoding.is_valid_transition( 128 | prev_state, prev_entity_type, state, entity_type 129 | ): 130 | msg = f"Invalid transition {repr(prev_label)} -> {repr(label)}" 131 | if tokens: 132 | token = tokens[idx] 133 | msg += f" for token {repr(token)}" 134 | else: 135 | token = None 136 | 137 | if line_nums: 138 | line_num = line_nums[idx] 139 | msg += f" on line {line_num}" 140 | else: 141 | line_num = None 142 | 143 | if source_name: 144 | msg += f" of {source_name}" 145 | 146 | errors.append( 147 | InvalidTransitionError( 148 | msg, label, entity_type, state, token, line_num, source_name 149 | ) 150 | ) 151 | prev_label, prev_state, prev_entity_type = ( 152 | label, 153 | state, 154 | entity_type, 155 | ) 156 | 157 | # Treat sequence as if followed by outside 158 | label = outside 159 | state, entity_type = encoding.split_label(label) 160 | if not encoding.is_valid_transition(prev_state, prev_entity_type, state, entity_type): 161 | msg = f"Invalid transition {repr(prev_label)} -> {repr(label)}" 162 | if tokens: 163 | token = tokens[-1] 164 | msg += f" after token {repr(token)}" 165 | else: 166 | token = None 167 | 168 | if line_nums: 169 | line_num = line_nums[-1] 170 | msg += f" on line {line_num}" 171 | else: 172 | line_num = None 173 | 174 | msg += " at end of sequence" 175 | 176 | errors.append( 177 | InvalidTransitionError( 178 | msg, prev_label, prev_entity_type, prev_state, token, line_num 179 | ) 180 | ) 181 | 182 | if errors and repair: 183 | repaired_labels = encoding.repair_labels(labels, repair) 184 | return SequenceValidationResult(errors, len(labels), repaired_labels) 185 | else: 186 | return SequenceValidationResult(errors, len(labels)) 187 | -------------------------------------------------------------------------------- /tests/test_scoring_click.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | from click.testing import CliRunner 5 | 6 | from seqscore.scripts.seqscore import score 7 | 8 | 9 | def test_score_correct_labels() -> None: 10 | runner = CliRunner() 11 | result = runner.invoke( 12 | score, 13 | [ 14 | "--labels", 15 | "BIO", 16 | "--reference", 17 | os.path.join("tests", "conll_annotation", "minimal.bio"), 18 | "--score-format", 19 | "delim", 20 | os.path.join("tests", "conll_predictions", "correct1.bio"), 21 | ], 22 | ) 23 | assert result.exit_code == 0 24 | assert "Type\tPrecision\tRecall\tF1\tReference\tPredicted\tCorrect" in result.output 25 | assert "ALL\t100.00\t100.00\t100.00\t3\t3\t3" in result.output 26 | assert "LOC\t100.00\t100.00\t100.00\t2\t2\t2" in result.output 27 | assert "ORG\t100.00\t100.00\t100.00\t1\t1\t1" in result.output 28 | 29 | 30 | def test_score_invalid_sequence_conlleval() -> None: 31 | runner = CliRunner() 32 | result = runner.invoke( 33 | score, 34 | [ 35 | "--repair-method", 36 | "conlleval", 37 | "--labels", 38 | "BIO", 39 | "--reference", 40 | os.path.join("tests", "conll_annotation", "minimal.bio"), 41 | "--score-format", 42 | "delim", 43 | os.path.join( 44 | "tests", "conll_predictions", "correct1_improper_sequence_pred.txt" 45 | ), 46 | ], 47 | ) 48 | assert result.exit_code == 0 49 | assert "Used method conlleval to repair:" in result.output 50 | assert "Type\tPrecision\tRecall\tF1\tReference\tPredicted\tCorrect" in result.output 51 | assert "ALL\t100.00\t100.00\t100.00\t3\t3\t3" in result.output 52 | assert "LOC\t100.00\t100.00\t100.00\t2\t2\t2" in result.output 53 | assert "ORG\t100.00\t100.00\t100.00\t1\t1\t1" in result.output 54 | 55 | 56 | 
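# The next test exercises the discard repair: as the repair tests earlier in this suite show,
# discard replaces the labels of invalid entity spans with 'O', so the improperly encoded ORG
# mention in the predictions is presumably dropped, leaving only the two LOC mentions and
# lowering recall to 66.67 in the expected scores below.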
def test_score_invalid_sequence_discard() -> None: 57 | runner = CliRunner() 58 | result = runner.invoke( 59 | score, 60 | [ 61 | "--labels", 62 | "BIO", 63 | "--repair-method", 64 | "discard", 65 | "--reference", 66 | os.path.join("tests", "conll_annotation", "minimal.bio"), 67 | "--score-format", 68 | "delim", 69 | os.path.join( 70 | "tests", "conll_predictions", "correct1_improper_sequence_pred.txt" 71 | ), 72 | ], 73 | ) 74 | assert result.exit_code == 0 75 | assert "Used method discard to repair:" in result.output 76 | assert "Type\tPrecision\tRecall\tF1\tReference\tPredicted\tCorrect" in result.output 77 | assert "ALL\t100.00\t66.67\t80.00\t3\t2\t2" in result.output 78 | assert "LOC\t100.00\t100.00\t100.00\t2\t2\t2" in result.output 79 | assert "ORG\t0.00\t0.00\t0.00\t1\t0\t0" in result.output 80 | 81 | 82 | def test_score_invalid_sequence_none() -> None: 83 | runner = CliRunner() 84 | result = runner.invoke( 85 | score, 86 | [ 87 | "--labels", 88 | "BIO", 89 | "--repair-method", 90 | "none", 91 | "--reference", 92 | os.path.join("tests", "conll_annotation", "minimal.bio"), 93 | "--score-format", 94 | "delim", 95 | os.path.join( 96 | "tests", "conll_predictions", "correct1_improper_sequence_pred.txt" 97 | ), 98 | ], 99 | ) 100 | assert result.exit_code != 0 101 | 102 | 103 | def test_score_valid_incorrect_sequence() -> None: 104 | runner = CliRunner() 105 | result = runner.invoke( 106 | score, 107 | [ 108 | "--repair-method", 109 | "conlleval", 110 | "--labels", 111 | "BIO", 112 | "--reference", 113 | os.path.join("tests", "conll_annotation", "minimal.bio"), 114 | "--score-format", 115 | "delim", 116 | os.path.join("tests", "conll_predictions", "incorrect1.bio"), 117 | ], 118 | ) 119 | assert result.exit_code == 0 120 | assert "Type\tPrecision\tRecall\tF1\tReference\tPredicted\tCorrect" in result.output 121 | assert "ALL\t50.00\t66.67\t57.14\t3\t4\t2" in result.output 122 | assert "LOC\t33.33\t50.00\t40.00\t2\t3\t1" in result.output 123 | assert "ORG\t100.00\t100.00\t100.00\t1\t1\t1" in result.output 124 | 125 | 126 | def test_score_entity_type_not_in_reference() -> None: 127 | runner = CliRunner() 128 | result = runner.invoke( 129 | score, 130 | [ 131 | "--labels", 132 | "BIO", 133 | "--reference", 134 | os.path.join("tests", "conll_annotation", "minimal.bio"), 135 | "--score-format", 136 | "delim", 137 | os.path.join( 138 | "tests", "conll_predictions", "incorrect_type_not_in_reference.bio" 139 | ), 140 | ], 141 | ) 142 | assert result.exit_code == 0 143 | output_fields = [line.split("\t") for line in result.output.rstrip("\n").split("\n")] 144 | assert output_fields == [ 145 | ["Type", "Precision", "Recall", "F1", "Reference", "Predicted", "Correct"], 146 | ["ALL", "75.00", "100.00", "85.71", "3", "4", "3"], 147 | ["LOC", "100.00", "100.00", "100.00", "2", "2", "2"], 148 | ["ORG", "100.00", "100.00", "100.00", "1", "1", "1"], 149 | ["SPURIOUS", "0.00", "0.00", "0.00", "0", "1", "0"], 150 | ] 151 | 152 | 153 | def test_score_invalid_labels() -> None: 154 | runner = CliRunner() 155 | result = runner.invoke( 156 | score, 157 | [ 158 | "--labels", 159 | "BIO", 160 | "--reference", 161 | os.path.join("tests", "conll_annotation", "minimal.bioes"), 162 | "--score-format", 163 | "delim", 164 | os.path.join("tests", "conll_predictions", "incorrect1.bio"), 165 | ], 166 | ) 167 | assert result.exit_code != 0 168 | 169 | 170 | def test_score_multiple_files() -> None: 171 | runner = CliRunner() 172 | result = runner.invoke( 173 | score, 174 | [ 175 | "--labels", 176 | "BIO", 177 | 
"--reference", 178 | os.path.join("tests", "conll_annotation", "minimal.bio"), 179 | "--score-format", 180 | "delim", 181 | ] 182 | + glob.glob(os.path.join("tests", "conll_predictions", "*1.bio")), 183 | ) 184 | assert result.exit_code == 0 185 | assert "SD\tALL\tNA\tNA\t30.30\tNA\tNA\tNA" in result.output 186 | assert "Mean\tALL\tNA\tNA\t78.57\tNA\tNA\tNA" in result.output 187 | -------------------------------------------------------------------------------- /tests/test_conversion_click.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from typing import Optional 4 | 5 | from click.testing import CliRunner 6 | 7 | from seqscore.scripts.seqscore import convert 8 | from seqscore.util import file_fields_match 9 | 10 | TMP_DIR: Optional[tempfile.TemporaryDirectory] = None 11 | 12 | 13 | def setup_module() -> None: 14 | """Create temporary directory used by tests.""" 15 | global TMP_DIR 16 | TMP_DIR = tempfile.TemporaryDirectory() 17 | 18 | 19 | def teardown_module() -> None: 20 | """Remove temporary directory used by tests.""" 21 | TMP_DIR.cleanup() 22 | 23 | 24 | def test_invalid_conversion_BIO() -> None: 25 | runner = CliRunner() 26 | result = runner.invoke( 27 | convert, 28 | [ 29 | "--input-labels", 30 | "BIO", 31 | "--output-labels", 32 | "BIOES", 33 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 34 | os.path.join(TMP_DIR.name, "temp.txt"), 35 | ], 36 | ) 37 | assert result.exit_code != 0 38 | 39 | 40 | def test_invalid_conversion_BIOES() -> None: 41 | runner = CliRunner() 42 | result = runner.invoke( 43 | convert, 44 | [ 45 | "--input-labels", 46 | "BIOES", 47 | "--output-labels", 48 | "BIO", 49 | os.path.join("tests", "conll_annotation", "invalid1.bioes"), 50 | os.path.join(TMP_DIR.name, "temp.txt"), 51 | ], 52 | ) 53 | assert result.exit_code != 0 54 | 55 | 56 | def test_BIO_to_BIOES() -> None: 57 | runner = CliRunner() 58 | result = runner.invoke( 59 | convert, 60 | [ 61 | "--input-labels", 62 | "BIO", 63 | "--output-labels", 64 | "BIOES", 65 | os.path.join("tests", "conll_annotation", "minimal.bio"), 66 | os.path.join(TMP_DIR.name, "BIOtoBIOES.txt"), 67 | ], 68 | ) 69 | assert result.exit_code == 0 70 | assert file_fields_match( 71 | os.path.join(TMP_DIR.name, "BIOtoBIOES.txt"), 72 | os.path.join("tests", "conll_annotation", "minimal.bioes"), 73 | ) 74 | 75 | 76 | def test_BIOES_to_BIO() -> None: 77 | runner = CliRunner() 78 | result = runner.invoke( 79 | convert, 80 | [ 81 | "--input-labels", 82 | "BIOES", 83 | "--output-labels", 84 | "BIO", 85 | os.path.join("tests", "conll_annotation", "minimal.bioes"), 86 | os.path.join(TMP_DIR.name, "BIOEStoBIO.txt"), 87 | ], 88 | ) 89 | assert result.exit_code == 0 90 | assert file_fields_match( 91 | os.path.join(TMP_DIR.name, "BIOEStoBIO.txt"), 92 | os.path.join("tests", "conll_annotation", "minimal.bio"), 93 | ) 94 | 95 | 96 | def test_BIO_to_IO() -> None: 97 | runner = CliRunner() 98 | result = runner.invoke( 99 | convert, 100 | [ 101 | "--input-labels", 102 | "BIO", 103 | "--output-labels", 104 | "IO", 105 | os.path.join("tests", "conll_annotation", "minimal.bio"), 106 | os.path.join(TMP_DIR.name, "BIOtoIO.txt"), 107 | ], 108 | ) 109 | assert result.exit_code == 0 110 | assert file_fields_match( 111 | os.path.join(TMP_DIR.name, "BIOtoIO.txt"), 112 | os.path.join("tests", "conll_annotation", "minimal.io"), 113 | ) 114 | 115 | 116 | def test_IO_to_BIO() -> None: 117 | runner = CliRunner() 118 | result = runner.invoke( 119 | convert, 120 | [ 121 | 
"--input-labels", 122 | "IO", 123 | "--output-labels", 124 | "BIO", 125 | os.path.join("tests", "conll_annotation", "minimal.io"), 126 | os.path.join(TMP_DIR.name, "IOtoBIO.txt"), 127 | ], 128 | ) 129 | assert result.exit_code == 0 130 | # conversion will not necessarily reproduce BIO correctly but does in this case 131 | assert file_fields_match( 132 | os.path.join(TMP_DIR.name, "IOtoBIO.txt"), 133 | os.path.join("tests", "conll_annotation", "minimal.bio"), 134 | ) 135 | 136 | 137 | def test_BIO_to_IOB_fields() -> None: 138 | runner = CliRunner() 139 | result = runner.invoke( 140 | convert, 141 | [ 142 | "--input-labels", 143 | "BIO", 144 | "--output-labels", 145 | "IOB", 146 | os.path.join("tests", "conll_annotation", "minimal_fields.bio"), 147 | os.path.join(TMP_DIR.name, "BIOtoIOB.txt"), 148 | ], 149 | ) 150 | assert result.exit_code == 0 151 | assert file_fields_match( 152 | os.path.join(TMP_DIR.name, "BIOtoIOB.txt"), 153 | os.path.join("tests", "conll_annotation", "minimal_fields.iob"), 154 | ) 155 | 156 | 157 | def test_IOB_to_BIO_fields() -> None: 158 | runner = CliRunner() 159 | result = runner.invoke( 160 | convert, 161 | [ 162 | "--input-labels", 163 | "IOB", 164 | "--output-labels", 165 | "BIO", 166 | os.path.join("tests", "conll_annotation", "minimal_fields.iob"), 167 | os.path.join(TMP_DIR.name, "IOBtoBIO.txt"), 168 | ], 169 | ) 170 | assert result.exit_code == 0 171 | assert file_fields_match( 172 | os.path.join(TMP_DIR.name, "IOBtoBIO.txt"), 173 | os.path.join("tests", "conll_annotation", "minimal_fields.bio"), 174 | ) 175 | 176 | 177 | def test_IO_to_BIOES() -> None: 178 | runner = CliRunner() 179 | result = runner.invoke( 180 | convert, 181 | [ 182 | "--input-labels", 183 | "IO", 184 | "--output-labels", 185 | "BIOES", 186 | os.path.join("tests", "conll_annotation", "minimal.io"), 187 | os.path.join(TMP_DIR.name, "IOtoBIOES.txt"), 188 | ], 189 | ) 190 | assert result.exit_code == 0 191 | # conversion will not necessarily reproduce BIOES correctly but does in this case 192 | assert file_fields_match( 193 | os.path.join(TMP_DIR.name, "IOtoBIOES.txt"), 194 | os.path.join("tests", "conll_annotation", "minimal.bioes"), 195 | ) 196 | 197 | 198 | def test_BIOES_to_IO() -> None: 199 | runner = CliRunner() 200 | result = runner.invoke( 201 | convert, 202 | [ 203 | "--input-labels", 204 | "BIOES", 205 | "--output-labels", 206 | "IO", 207 | os.path.join("tests", "conll_annotation", "minimal.bioes"), 208 | os.path.join(TMP_DIR.name, "BIOEStoIO.txt"), 209 | ], 210 | ) 211 | assert result.exit_code == 0 212 | assert file_fields_match( 213 | os.path.join(TMP_DIR.name, "BIOEStoIO.txt"), 214 | os.path.join("tests", "conll_annotation", "minimal.io"), 215 | ) 216 | 217 | 218 | def test_same_input_and_output_labels_raises_error() -> None: 219 | runner = CliRunner() 220 | result = runner.invoke( 221 | convert, 222 | [ 223 | "--input-labels", 224 | "BIO", 225 | "--output-labels", 226 | "BIO", 227 | os.path.join("tests", "conll_annotation", "minimal.bio"), 228 | os.path.join(TMP_DIR.name, "temp.txt"), 229 | ], 230 | ) 231 | assert result.exit_code != 0 232 | -------------------------------------------------------------------------------- /tests/test_count_click.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from typing import Optional 4 | 5 | from click.testing import CliRunner 6 | 7 | from seqscore.scripts.seqscore import count 8 | from seqscore.util import file_lines_match 9 | 10 | TMP_DIR: 
Optional[tempfile.TemporaryDirectory] = None 11 | 12 | 13 | def setup_module() -> None: 14 | """Create temporary directory used by tests.""" 15 | global TMP_DIR 16 | TMP_DIR = tempfile.TemporaryDirectory() 17 | 18 | 19 | def teardown_module() -> None: 20 | """Remove temporary directory used by tests.""" 21 | TMP_DIR.cleanup() 22 | 23 | 24 | def test_count_BIO() -> None: 25 | runner = CliRunner() 26 | result = runner.invoke( 27 | count, 28 | [ 29 | "--labels", 30 | "BIO", 31 | os.path.join("tests", "conll_annotation", "minimal.bio"), 32 | "--output-file", 33 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 34 | ], 35 | ) 36 | assert result.exit_code == 0 37 | assert file_lines_match( 38 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 39 | os.path.join("tests", "test_files", "count_minimal_ref.txt"), 40 | ) 41 | 42 | 43 | def test_count_BIO_stdout() -> None: 44 | runner = CliRunner() 45 | result = runner.invoke( 46 | count, 47 | [ 48 | "--labels", 49 | "BIO", 50 | os.path.join("tests", "conll_annotation", "minimal.bio"), 51 | ], 52 | ) 53 | assert result.exit_code == 0 54 | expected_lines = open( 55 | os.path.join("tests", "test_files", "count_minimal_ref.txt") 56 | ).read() 57 | assert result.stdout == expected_lines 58 | 59 | 60 | def test_count_BIO_twofiles() -> None: 61 | runner = CliRunner() 62 | result = runner.invoke( 63 | count, 64 | [ 65 | "--labels", 66 | "BIO", 67 | os.path.join("tests", "conll_annotation", "minimal.bio"), 68 | os.path.join("tests", "conll_annotation", "minimal2.bio"), 69 | "--output-file", 70 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 71 | ], 72 | ) 73 | assert result.exit_code == 0 74 | assert file_lines_match( 75 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 76 | os.path.join("tests", "test_files", "count_minimal_twofiles_ref.txt"), 77 | debug=True, 78 | ) 79 | 80 | 81 | def test_count_BIO_tab1() -> None: 82 | runner = CliRunner() 83 | result = runner.invoke( 84 | count, 85 | [ 86 | "--labels", 87 | "BIO", 88 | "--output-delim", 89 | "\t", # Actual tab 90 | os.path.join("tests", "conll_annotation", "minimal.bio"), 91 | "--output-file", 92 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 93 | ], 94 | ) 95 | assert result.exit_code == 0 96 | assert file_lines_match( 97 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 98 | os.path.join("tests", "test_files", "count_minimal_ref.txt"), 99 | ) 100 | 101 | 102 | def test_count_BIO_tab2() -> None: 103 | runner = CliRunner() 104 | result = runner.invoke( 105 | count, 106 | [ 107 | "--labels", 108 | "BIO", 109 | "--output-delim", 110 | r"\t", # Backslash and t 111 | os.path.join("tests", "conll_annotation", "minimal.bio"), 112 | "--output-file", 113 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 114 | ], 115 | ) 116 | assert result.exit_code == 0 117 | assert file_lines_match( 118 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 119 | os.path.join("tests", "test_files", "count_minimal_ref.txt"), 120 | ) 121 | 122 | 123 | def test_count_BIO_tab3() -> None: 124 | runner = CliRunner() 125 | result = runner.invoke( 126 | count, 127 | [ 128 | "--labels", 129 | "BIO", 130 | "--output-delim", 131 | "tab", # Tab spelled out 132 | os.path.join("tests", "conll_annotation", "minimal.bio"), 133 | "--output-file", 134 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 135 | ], 136 | ) 137 | assert result.exit_code == 0 138 | assert file_lines_match( 139 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 140 | os.path.join("tests", "test_files", "count_minimal_ref.txt"), 141 | ) 142 | 143 | 144 | def 
test_count_BIO_comma() -> None: 145 | runner = CliRunner() 146 | result = runner.invoke( 147 | count, 148 | [ 149 | "--labels", 150 | "BIO", 151 | "--output-delim", 152 | ",", 153 | os.path.join("tests", "conll_annotation", "minimal.bio"), 154 | "--output-file", 155 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 156 | ], 157 | ) 158 | assert result.exit_code == 0 159 | assert file_lines_match( 160 | os.path.join(TMP_DIR.name, "count_BIO_out.txt"), 161 | os.path.join("tests", "test_files", "count_minimal_ref_comma.txt"), 162 | ) 163 | 164 | 165 | def test_count_BIOES() -> None: 166 | runner = CliRunner() 167 | result = runner.invoke( 168 | count, 169 | [ 170 | "--labels", 171 | "BIOES", 172 | "--repair-method", 173 | "none", 174 | os.path.join("tests", "conll_annotation", "minimal.bioes"), 175 | "--output-file", 176 | os.path.join(TMP_DIR.name, "count_BIOES_out.txt"), 177 | ], 178 | ) 179 | assert result.exit_code == 0 180 | assert file_lines_match( 181 | os.path.join(TMP_DIR.name, "count_BIOES_out.txt"), 182 | os.path.join("tests", "test_files", "count_minimal_ref.txt"), 183 | ) 184 | 185 | 186 | def test_count_IO() -> None: 187 | runner = CliRunner() 188 | result = runner.invoke( 189 | count, 190 | [ 191 | "--labels", 192 | "IO", 193 | "--repair-method", 194 | "none", 195 | os.path.join("tests", "conll_annotation", "minimal.io"), 196 | "--output-file", 197 | os.path.join(TMP_DIR.name, "count_IO_out.txt"), 198 | ], 199 | ) 200 | assert result.exit_code == 0 201 | assert file_lines_match( 202 | os.path.join(TMP_DIR.name, "count_IO_out.txt"), 203 | os.path.join("tests", "test_files", "count_minimal_ref.txt"), 204 | ) 205 | 206 | 207 | def test_count_BIO_invalid_conlleval() -> None: 208 | runner = CliRunner() 209 | result = runner.invoke( 210 | count, 211 | [ 212 | "--labels", 213 | "BIO", 214 | "--repair-method", 215 | "conlleval", 216 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 217 | "--output-file", 218 | os.path.join(TMP_DIR.name, "count_BIO_conlleval_out.txt"), 219 | ], 220 | ) 221 | assert result.exit_code == 0 222 | assert file_lines_match( 223 | os.path.join(TMP_DIR.name, "count_BIO_conlleval_out.txt"), 224 | os.path.join("tests", "test_files", "count_minimal_ref.txt"), 225 | ) 226 | 227 | 228 | def test_count_BIO_invalid_discard() -> None: 229 | runner = CliRunner() 230 | result = runner.invoke( 231 | count, 232 | [ 233 | "--labels", 234 | "BIO", 235 | "--repair-method", 236 | "discard", 237 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 238 | "--output-file", 239 | os.path.join(TMP_DIR.name, "count_BIO_discard_out.txt"), 240 | ], 241 | ) 242 | assert result.exit_code == 0 243 | # all entities have invalid label sequences 244 | with open( 245 | os.path.join(TMP_DIR.name, "count_BIO_discard_out.txt"), encoding="utf8" 246 | ) as output: 247 | assert not output.readlines() 248 | -------------------------------------------------------------------------------- /tests/test_validation_click.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from click.testing import CliRunner 4 | 5 | from seqscore.scripts.seqscore import validate 6 | from seqscore.util import normalize_str_with_path 7 | 8 | 9 | def test_valid_bio() -> None: 10 | runner = CliRunner() 11 | result = runner.invoke( 12 | validate, 13 | ["--labels", "BIO", os.path.join("tests", "conll_annotation", "minimal.bio")], 14 | ) 15 | assert ( 16 | result.output 17 | == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in 
tests/conll_annotation/minimal.bio\n" 18 | ) 19 | assert result.exit_code == 0 20 | 21 | 22 | def test_valid_bio_quiet() -> None: 23 | runner = CliRunner() 24 | result = runner.invoke( 25 | validate, 26 | [ 27 | "--labels", 28 | "BIO", 29 | "-q", 30 | os.path.join("tests", "conll_annotation", "minimal.bio"), 31 | ], 32 | ) 33 | assert result.output == "" 34 | assert result.exit_code == 0 35 | 36 | 37 | def test_valid_bio_twofiles() -> None: 38 | runner = CliRunner() 39 | result = runner.invoke( 40 | validate, 41 | [ 42 | "--labels", 43 | "BIO", 44 | os.path.join("tests", "conll_annotation", "minimal.bio"), 45 | os.path.join("tests", "conll_annotation", "minimal2.bio"), 46 | ], 47 | ) 48 | assert result.output == ( 49 | "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/minimal.bio\n" 50 | "No errors found in 13 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/minimal2.bio\n" 51 | ) 52 | assert result.exit_code == 0 53 | 54 | 55 | def test_mixed_valid_bio_twofiles_quiet() -> None: 56 | runner = CliRunner() 57 | result = runner.invoke( 58 | validate, 59 | [ 60 | "--quiet", 61 | "--labels", 62 | "BIO", 63 | os.path.join("tests", "conll_annotation", "minimal.bio"), 64 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 65 | ], 66 | ) 67 | assert result.output == ( 68 | "Encountered 3 errors in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/invalid1.bio\n" 69 | "Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7\n" 70 | "Invalid transition 'O' -> 'I-LOC' for token 'West' on line 12\n" 71 | "Invalid transition 'O' -> 'I-LOC' for token 'Pennsylvania' on line 15\n" 72 | ) 73 | assert result.exit_code != 0 74 | 75 | 76 | def test_mixed_valid_bio_twofiles() -> None: 77 | runner = CliRunner() 78 | result = runner.invoke( 79 | validate, 80 | [ 81 | "--labels", 82 | "BIO", 83 | os.path.join("tests", "conll_annotation", "minimal.bio"), 84 | os.path.join("tests", "conll_annotation", "invalid1.bio"), 85 | ], 86 | ) 87 | assert result.output == ( 88 | "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/minimal.bio\n" 89 | "Encountered 3 errors in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/invalid1.bio\n" 90 | "Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7\n" 91 | "Invalid transition 'O' -> 'I-LOC' for token 'West' on line 12\n" 92 | "Invalid transition 'O' -> 'I-LOC' for token 'Pennsylvania' on line 15\n" 93 | ) 94 | assert result.exit_code != 0 95 | 96 | 97 | def test_valid_bioes() -> None: 98 | runner = CliRunner() 99 | result = runner.invoke( 100 | validate, 101 | ["--labels", "BIOES", os.path.join("tests", "conll_annotation", "minimal.bioes")], 102 | ) 103 | assert ( 104 | result.output 105 | == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/minimal.bioes\n" 106 | ) 107 | assert result.exit_code == 0 108 | 109 | 110 | def test_invalid_bio() -> None: 111 | runner = CliRunner() 112 | result = runner.invoke( 113 | validate, 114 | ["--labels", "BIO", os.path.join("tests", "conll_annotation", "invalid1.bio")], 115 | ) 116 | assert result.exit_code != 0 117 | assert ( 118 | normalize_str_with_path( 119 | "Encountered 3 errors in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/invalid1.bio" 120 | ) 121 | in result.output 122 | ) 123 | assert ( 124 | "Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7" 125 | in result.output 126 | ) 127 | assert ( 128 | 
"Invalid transition 'O' -> 'I-LOC' for token 'West' on line 12" in result.output 129 | ) 130 | assert ( 131 | "Invalid transition 'O' -> 'I-LOC' for token 'Pennsylvania' on line 15" 132 | in result.output 133 | ) 134 | 135 | 136 | def test_invalid_bioes() -> None: 137 | runner = CliRunner() 138 | result = runner.invoke( 139 | validate, 140 | [ 141 | "--labels", 142 | "BIOES", 143 | os.path.join("tests", "conll_annotation", "invalid1.bioes"), 144 | ], 145 | ) 146 | assert result.exit_code != 0 147 | assert ( 148 | normalize_str_with_path( 149 | "Encountered 9 errors in 25 tokens, 6 sequences, and 1 document(s) in tests/conll_annotation/invalid1.bioes" 150 | ) 151 | in result.output 152 | ) 153 | assert "Invalid transition 'I-ORG' -> 'O' for token 'is' on line 10" in result.output 154 | assert ( 155 | "Invalid transition 'S-LOC' -> 'I-LOC' for token 'Philadelphia' on line 13" 156 | in result.output 157 | ) 158 | assert "Invalid transition 'I-LOC' -> 'O' for token ',' on line 14" in result.output 159 | assert "Invalid transition 'B-LOC' -> 'O' for token '.' on line 16" in result.output 160 | assert ( 161 | "Invalid transition 'I-ORG' -> 'O' after token 'Maryland' on line 20" 162 | in result.output 163 | ) 164 | assert ( 165 | "Invalid transition 'O' -> 'I-ORG' for token 'Department' on line 22" 166 | in result.output 167 | ) 168 | assert ( 169 | "Invalid transition 'O' -> 'I-ORG' for token 'University' on line 26" 170 | in result.output 171 | ) 172 | assert ( 173 | "Invalid transition 'I-ORG' -> 'O' after token 'Maryland' on line 28" 174 | in result.output 175 | ) 176 | assert ( 177 | "Invalid transition 'B-LOC' -> 'O' after token 'Massachusetts' on line 30" 178 | in result.output 179 | ) 180 | 181 | 182 | def test_invalid_state() -> None: 183 | runner = CliRunner() 184 | result = runner.invoke( 185 | validate, 186 | # Intentionally declaring IO labels for a BIO file 187 | ["--labels", "IO", os.path.join("tests", "conll_annotation", "minimal.bio")], 188 | ) 189 | assert result.exit_code != 0 190 | output = result.output.split("\n") 191 | assert output == [ 192 | "Encountered 9 errors in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/minimal.bio", 193 | "Invalid state 'B' in label 'B-ORG' for token 'University' on line 7", 194 | "Invalid transition 'O' -> 'B-ORG' for token 'University' on line 7", 195 | "Invalid transition 'B-ORG' -> 'I-ORG' for token 'of' on line 8", 196 | "Invalid state 'B' in label 'B-LOC' for token 'West' on line 12", 197 | "Invalid transition 'O' -> 'B-LOC' for token 'West' on line 12", 198 | "Invalid transition 'B-LOC' -> 'I-LOC' for token 'Philadelphia' on line 13", 199 | "Invalid state 'B' in label 'B-LOC' for token 'Pennsylvania' on line 15", 200 | "Invalid transition 'O' -> 'B-LOC' for token 'Pennsylvania' on line 15", 201 | "Invalid transition 'B-LOC' -> 'O' for token '.' on line 16", 202 | "", 203 | ] 204 | 205 | 206 | def test_bad_label() -> None: 207 | runner = CliRunner() 208 | result = runner.invoke( 209 | validate, 210 | ["--labels", "BIO", os.path.join("tests", "conll_annotation", "bad_label2.bio")], 211 | ) 212 | assert result.exit_code != 0 213 | assert ( 214 | str(result.exception) 215 | == "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." 
216 | ) 217 | -------------------------------------------------------------------------------- /tests/test_process_click.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | from click.testing import CliRunner 6 | 7 | from seqscore.scripts.seqscore import process 8 | from seqscore.util import file_fields_match 9 | 10 | TMP_DIR: Optional[tempfile.TemporaryDirectory] = None 11 | ANNOTATION_DIR = Path("tests", "conll_annotation") 12 | TEST_FILES_DIR = Path("tests", "test_files") 13 | 14 | 15 | def setup_module() -> None: 16 | """Create temporary directory used by tests.""" 17 | global TMP_DIR 18 | TMP_DIR = tempfile.TemporaryDirectory() 19 | 20 | 21 | def teardown_module() -> None: 22 | """Remove temporary directory used by tests.""" 23 | TMP_DIR.cleanup() 24 | 25 | 26 | def test_keep_types1() -> None: 27 | runner = CliRunner() 28 | input_path = str(ANNOTATION_DIR / "minimal.bio") 29 | output_path = str(Path(TMP_DIR.name) / "out.bio") 30 | result = runner.invoke( 31 | process, 32 | [ 33 | "--keep-types", 34 | "ORG", 35 | "--labels", 36 | "BIO", 37 | input_path, 38 | output_path, 39 | ], 40 | ) 41 | assert result.exit_code == 0 42 | # Output will not have LOC since ORG was kept 43 | assert file_fields_match(TEST_FILES_DIR / "minimal_no_LOC.bio", output_path) 44 | 45 | 46 | def test_keep_types2() -> None: 47 | runner = CliRunner() 48 | input_path = str(ANNOTATION_DIR / "minimal.bio") 49 | output_path = str(Path(TMP_DIR.name) / "out.bio") 50 | result = runner.invoke( 51 | process, 52 | [ 53 | "--keep-types", 54 | "LOC,ORG", 55 | "--labels", 56 | "BIO", 57 | input_path, 58 | output_path, 59 | ], 60 | ) 61 | assert result.exit_code == 0 62 | # Input should be unchanged since all types were kept 63 | assert file_fields_match(input_path, output_path) 64 | 65 | 66 | def test_remove_types1() -> None: 67 | runner = CliRunner() 68 | input_path = str(ANNOTATION_DIR / "minimal.bio") 69 | output_path = str(Path(TMP_DIR.name) / "out.bio") 70 | result = runner.invoke( 71 | process, 72 | [ 73 | "--remove-types", 74 | "LOC", 75 | "--labels", 76 | "BIO", 77 | input_path, 78 | output_path, 79 | ], 80 | ) 81 | assert result.exit_code == 0 82 | # Output will not have LOC 83 | assert file_fields_match(TEST_FILES_DIR / "minimal_no_LOC.bio", output_path) 84 | 85 | 86 | def test_remove_types2() -> None: 87 | runner = CliRunner() 88 | input_path = str(ANNOTATION_DIR / "minimal.bio") 89 | output_path = str(Path(TMP_DIR.name) / "out.bio") 90 | result = runner.invoke( 91 | process, 92 | [ 93 | "--remove-types", 94 | "MISC", 95 | "--labels", 96 | "BIO", 97 | input_path, 98 | output_path, 99 | ], 100 | ) 101 | assert result.exit_code == 0 102 | # Input should be unchanged since MISC isn't in the data 103 | assert file_fields_match(input_path, output_path) 104 | 105 | 106 | def test_remove_types3() -> None: 107 | runner = CliRunner() 108 | input_path = str(ANNOTATION_DIR / "minimal.bio") 109 | output_path = str(Path(TMP_DIR.name) / "out.bio") 110 | result = runner.invoke( 111 | process, 112 | [ 113 | "--remove-types", 114 | "LOC,ORG", 115 | "--labels", 116 | "BIO", 117 | input_path, 118 | output_path, 119 | ], 120 | ) 121 | assert result.exit_code == 0 122 | # Output won't have any names since all types were removed 123 | assert file_fields_match(TEST_FILES_DIR / "minimal_no_names.bio", output_path) 124 | 125 | 126 | def test_map_types1() -> None: 127 | runner = CliRunner() 128 | map_path = 
str(TEST_FILES_DIR / "map_LOC_GPE.json") 129 | input_path = str(ANNOTATION_DIR / "minimal.bio") 130 | output_path = str(Path(TMP_DIR.name) / "out.bio") 131 | result = runner.invoke( 132 | process, 133 | [ 134 | "--type-map", 135 | map_path, 136 | "--labels", 137 | "BIO", 138 | input_path, 139 | output_path, 140 | ], 141 | ) 142 | assert result.exit_code == 0 143 | # Output will have GPE instead of LOC 144 | assert file_fields_match(TEST_FILES_DIR / "minimal_GPE.bio", output_path) 145 | 146 | 147 | def test_map_types2() -> None: 148 | runner = CliRunner() 149 | map_path = str(TEST_FILES_DIR / "map_NAME.json") 150 | input_path = str(ANNOTATION_DIR / "minimal.bio") 151 | output_path = str(Path(TMP_DIR.name) / "out.bio") 152 | result = runner.invoke( 153 | process, 154 | [ 155 | "--type-map", 156 | map_path, 157 | "--labels", 158 | "BIO", 159 | input_path, 160 | output_path, 161 | ], 162 | ) 163 | assert result.exit_code == 0 164 | # All types will be NAME in output 165 | assert file_fields_match(TEST_FILES_DIR / "minimal_NAME.bio", output_path) 166 | 167 | 168 | def test_map_types3() -> None: 169 | runner = CliRunner() 170 | map_path = str(TEST_FILES_DIR / "map_PERSON.json") 171 | input_path = str(ANNOTATION_DIR / "minimal.bio") 172 | output_path = str(Path(TMP_DIR.name) / "out.bio") 173 | result = runner.invoke( 174 | process, 175 | [ 176 | "--type-map", 177 | map_path, 178 | "--labels", 179 | "BIO", 180 | input_path, 181 | output_path, 182 | ], 183 | ) 184 | assert result.exit_code == 0 185 | # Input will be unchanged since map doesn't affect LOC and ORG 186 | assert file_fields_match(input_path, output_path) 187 | 188 | 189 | def test_map_types_remove_types() -> None: 190 | runner = CliRunner() 191 | map_path = str(TEST_FILES_DIR / "map_LOC_GPE.json") 192 | input_path = str(ANNOTATION_DIR / "minimal.bio") 193 | output_path = str(Path(TMP_DIR.name) / "out.bio") 194 | result = runner.invoke( 195 | process, 196 | [ 197 | "--type-map", 198 | map_path, 199 | "--remove-types", 200 | "LOC", 201 | "--labels", 202 | "BIO", 203 | input_path, 204 | output_path, 205 | ], 206 | ) 207 | assert result.exit_code == 0 208 | # LOC will be mapped to GPE since mapping applies before removal 209 | assert file_fields_match(TEST_FILES_DIR / "minimal_GPE.bio", output_path) 210 | 211 | 212 | def test_map_types_keep_types() -> None: 213 | runner = CliRunner() 214 | map_path = str(TEST_FILES_DIR / "map_LOC_GPE.json") 215 | input_path = str(ANNOTATION_DIR / "minimal.bio") 216 | output_path = str(Path(TMP_DIR.name) / "out.bio") 217 | result = runner.invoke( 218 | process, 219 | [ 220 | "--type-map", 221 | map_path, 222 | "--keep-types", 223 | "LOC", 224 | "--labels", 225 | "BIO", 226 | input_path, 227 | output_path, 228 | ], 229 | ) 230 | assert result.exit_code == 0 231 | # No names since LOC will be mapped to GPE and only LOC will be kept 232 | assert file_fields_match(TEST_FILES_DIR / "minimal_no_names.bio", output_path) 233 | 234 | 235 | def test_map_types_invalid_map() -> None: 236 | runner = CliRunner() 237 | map_path = str(TEST_FILES_DIR / "map_bad_value.json") 238 | input_path = str(ANNOTATION_DIR / "minimal.bio") 239 | output_path = str(Path(TMP_DIR.name) / "out.bio") 240 | result = runner.invoke( 241 | process, 242 | [ 243 | "--type-map", 244 | map_path, 245 | "--labels", 246 | "BIO", 247 | input_path, 248 | output_path, 249 | ], 250 | ) 251 | # Malformed map, dictionary value is a string and not a list 252 | assert result.exit_code != 0 253 | 254 | 255 | def test_map_types_duplicate_mapping() -> None: 256 | 
runner = CliRunner() 257 | map_path = str(TEST_FILES_DIR / "map_bad_duplicate.json") 258 | input_path = str(ANNOTATION_DIR / "minimal.bio") 259 | output_path = str(Path(TMP_DIR.name) / "out.bio") 260 | result = runner.invoke( 261 | process, 262 | [ 263 | "--type-map", 264 | map_path, 265 | "--labels", 266 | "BIO", 267 | input_path, 268 | output_path, 269 | ], 270 | ) 271 | # Malformed map, the same original type is listed under two different new types 272 | assert result.exit_code != 0 273 | 274 | 275 | def test_keep_and_remove_types() -> None: 276 | runner = CliRunner() 277 | input_path = str(ANNOTATION_DIR / "minimal.bio") 278 | output_path = str(Path(TMP_DIR.name) / "out.bio") 279 | result = runner.invoke( 280 | process, 281 | [ 282 | "--keep-types", 283 | "LOC,ORG", 284 | "--remove-types", 285 | "MISC", 286 | "--labels", 287 | "BIO", 288 | input_path, 289 | output_path, 290 | ], 291 | ) 292 | # Can't specify both keep and remove 293 | assert result.exit_code != 0 294 | -------------------------------------------------------------------------------- /tests/test_scoring.py: -------------------------------------------------------------------------------- 1 | from decimal import Decimal 2 | 3 | import pytest 4 | 5 | from seqscore.encoding import EncodingError 6 | from seqscore.model import LabeledSequence, Mention, SequenceProvenance, Span 7 | from seqscore.scoring import ( 8 | AccuracyScore, 9 | ClassificationScore, 10 | TokenCountError, 11 | compute_scores, 12 | convert_score, 13 | score_label_sequences, 14 | score_sequence_label_accuracy, 15 | score_sequence_mentions, 16 | ) 17 | 18 | 19 | def test_score_sentence_labels_correct() -> None: 20 | ref_labels = ["O", "B-ORG", "I-ORG", "O"] 21 | pred_labels = ref_labels[:] 22 | score = AccuracyScore() 23 | score_sequence_label_accuracy(pred_labels, ref_labels, score) 24 | assert score.total == 4 25 | assert score.hits == 4 26 | assert score.accuracy == 1.0 27 | 28 | 29 | def test_score_sentence_labels_incorrect() -> None: 30 | ref_labels = ["O", "B-ORG", "I-ORG", "O"] 31 | pred_labels = ref_labels[:] 32 | pred_labels[2] = "B-LOC" 33 | score = AccuracyScore() 34 | score_sequence_label_accuracy(pred_labels, ref_labels, score) 35 | assert score.total == 4 36 | assert score.hits == 3 37 | assert score.accuracy == pytest.approx(3 / 4) 38 | 39 | 40 | def test_score_sentence_labels_invalid() -> None: 41 | ref_labels = ["O", "B-ORG", "I-ORG", "O"] 42 | # Shorter predictions than reference 43 | pred_labels = ref_labels[:-1] 44 | with pytest.raises(ValueError): 45 | score_sequence_label_accuracy(pred_labels, ref_labels, AccuracyScore()) 46 | 47 | 48 | def test_score_sentence_mentions_correct() -> None: 49 | ref_mentions = [Mention(Span(0, 2), "PER"), Mention(Span(4, 5), "ORG")] 50 | pred_mentions = [Mention(Span(0, 2), "PER"), Mention(Span(4, 5), "ORG")] 51 | score = ClassificationScore() 52 | score_sequence_mentions(pred_mentions, ref_mentions, score) 53 | assert score.true_pos == 2 54 | assert score.false_pos == 0 55 | assert score.false_neg == 0 56 | assert score.type_scores == { 57 | "PER": ClassificationScore(true_pos=1), 58 | "ORG": ClassificationScore(true_pos=1), 59 | } 60 | assert score.total_ref == 2 61 | assert score.total_pos == 2 62 | assert score.precision == 1.0 63 | assert score.recall == 1.0 64 | assert score.f1 == 1.0 65 | 66 | 67 | def test_score_sentence_mentions_incorrect1() -> None: 68 | ref_mentions = [ 69 | Mention(Span(0, 2), "LOC"), 70 | Mention(Span(4, 5), "PER"), 71 | Mention(Span(7, 8), "MISC"), 72 | Mention(Span(9, 11), "MISC"), 73 | ] 74 | 
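# The predictions below differ from the reference in three ways: the first span is typed ORG
# rather than LOC, the MISC mention at (7, 8) is missed, and a spurious mention at (6, 7) is
# added, which accounts for the 2 true positives, 2 false positives, and 2 false negatives
# asserted below.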
pred_mentions = [ 75 | Mention(Span(0, 2), "ORG"), 76 | Mention(Span(4, 5), "PER"), 77 | Mention( 78 | Span(6, 7), "SPURIOUS" 79 | ), # Note that this type isn't even in the reference 80 | Mention(Span(9, 11), "MISC"), 81 | ] 82 | score = ClassificationScore() 83 | score_sequence_mentions(pred_mentions, ref_mentions, score) 84 | assert score.true_pos == 2 85 | assert score.false_pos == 2 86 | assert score.false_neg == 2 87 | assert score.type_scores == { 88 | "PER": ClassificationScore(true_pos=1), 89 | "LOC": ClassificationScore(false_neg=1), 90 | "MISC": ClassificationScore(false_neg=1, true_pos=1), 91 | "ORG": ClassificationScore(false_pos=1), 92 | "SPURIOUS": ClassificationScore(false_pos=1), 93 | } 94 | assert score.total_ref == 4 95 | assert score.total_pos == 4 96 | assert score.precision == pytest.approx(2 / 4) 97 | assert score.recall == pytest.approx(2 / 4) 98 | # Note that we have already checked the precision and recall values 99 | assert score.f1 == pytest.approx( 100 | 2 * (score.precision * score.recall) / (score.precision + score.recall) 101 | ) 102 | 103 | 104 | def test_score_label_sequences_correct() -> None: 105 | ref_labels = [["O", "B-ORG", "I-ORG", "O"], ["B-PER", "I-PER"]] 106 | pred_labels = ref_labels[:] 107 | classification, accuracy = score_label_sequences( 108 | pred_labels, ref_labels, "BIO", repair=None 109 | ) 110 | 111 | assert accuracy.total == 6 112 | assert accuracy.hits == 6 113 | assert accuracy.accuracy == 1.0 114 | 115 | assert classification.true_pos == 2 116 | assert classification.false_pos == 0 117 | assert classification.false_neg == 0 118 | assert classification.type_scores["ORG"] == ClassificationScore(true_pos=1) 119 | assert classification.type_scores["PER"] == ClassificationScore(true_pos=1) 120 | 121 | 122 | def test_score_label_sequences_invalid_norepair() -> None: 123 | ref_labels = [["O", "B-ORG", "I-ORG", "O"], ["B-PER", "I-PER"]] 124 | pred_labels = [["O", "B-ORG", "I-ORG", "O"], ["I-PER", "I-PER"]] 125 | with pytest.raises(EncodingError): 126 | score_label_sequences(pred_labels, ref_labels, "BIO", repair=None) 127 | 128 | 129 | def test_score_label_sequences_invalid_repair() -> None: 130 | ref_labels = [["O", "B-ORG", "I-ORG", "O"], ["B-PER", "I-PER"]] 131 | pred_labels = [["O", "I-ORG", "I-ORG", "O"], ["O", "I-PER"]] 132 | classification, accuracy = score_label_sequences( 133 | pred_labels, ref_labels, "BIO", repair="conlleval" 134 | ) 135 | 136 | assert accuracy.total == 6 137 | assert accuracy.hits == 4 138 | assert accuracy.accuracy == 4 / 6 139 | 140 | assert classification.true_pos == 1 141 | assert classification.false_pos == 1 142 | assert classification.false_neg == 1 143 | assert classification.type_scores["ORG"] == ClassificationScore(true_pos=1) 144 | assert classification.type_scores["PER"] == ClassificationScore( 145 | false_pos=1, false_neg=1 146 | ) 147 | 148 | 149 | def test_score_label_sequences_different_lengths() -> None: 150 | ref_labels = [["O", "B-ORG", "I-ORG", "O"], ["B-PER", "I-PER"]] 151 | pred_labels = [["O", "B-ORG", "I-ORG", "O"]] 152 | with pytest.raises(ValueError): 153 | score_label_sequences(pred_labels, ref_labels, "BIO", repair=None) 154 | 155 | 156 | def test_classification_score_empty() -> None: 157 | score = ClassificationScore() 158 | assert score.precision == 0.0 159 | assert score.recall == 0.0 160 | assert score.f1 == 0.0 161 | 162 | 163 | def test_classification_score_update() -> None: 164 | score1 = ClassificationScore() 165 | score1.true_pos += 1 166 | score1.type_scores["PER"].true_pos 
+= 1 167 | score1.false_pos += 1 168 | score1.type_scores["ORG"].false_pos += 1 169 | 170 | score2 = ClassificationScore() 171 | score2.false_pos += 1 172 | score2.type_scores["ORG"].false_pos += 1 173 | score2.false_neg += 1 174 | score2.type_scores["MISC"].false_neg += 1 175 | score2.true_pos += 4 176 | score2.type_scores["ORG"].true_pos += 4 177 | 178 | score1.update(score2) 179 | 180 | assert score1.true_pos == 5 181 | assert score1.false_pos == 2 182 | assert score1.false_neg == 1 183 | assert score1.type_scores == { 184 | "PER": ClassificationScore(true_pos=1), 185 | "ORG": ClassificationScore(true_pos=4, false_pos=2), 186 | "MISC": ClassificationScore(false_neg=1), 187 | } 188 | 189 | 190 | def test_accuracy_score_empty() -> None: 191 | score = AccuracyScore() 192 | assert score.accuracy == 0.0 193 | 194 | 195 | def test_token_count_error() -> None: 196 | ref_labels = ["O", "B-ORG", "I-ORG", "O"] 197 | pred_labels = ["O", "B-ORG", "I-ORG", "O", "O"] 198 | ref_sequence = LabeledSequence( 199 | ["a", "b", "c", "d"], ref_labels, provenance=SequenceProvenance(0, "test") 200 | ) 201 | pred_sequence = LabeledSequence( 202 | ["a", "b", "c", "d", "e"], pred_labels, provenance=SequenceProvenance(0, "test") 203 | ) 204 | with pytest.raises(TokenCountError): 205 | compute_scores([[pred_sequence]], [[ref_sequence]]) 206 | 207 | 208 | def test_provenance_none_raises_error() -> None: 209 | labels = ["O", "B-ORG"] 210 | sequence = LabeledSequence(["a", "b"], labels, provenance=None) 211 | with pytest.raises(ValueError): 212 | TokenCountError.from_predicted_sequence(2, sequence) 213 | 214 | 215 | def test_differing_num_docs() -> None: 216 | ref_labels = ["O", "B-ORG"] 217 | pred_labels = ["O", "B-LOC"] 218 | ref_sequence = LabeledSequence( 219 | ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test") 220 | ) 221 | pred_sequence = LabeledSequence( 222 | ["a", "b"], pred_labels, provenance=SequenceProvenance(0, "test") 223 | ) 224 | with pytest.raises(ValueError): 225 | compute_scores([[pred_sequence]], [[ref_sequence], [ref_sequence]]) 226 | 227 | 228 | def test_differing_doc_length() -> None: 229 | ref_labels = ["O", "B-ORG"] 230 | pred_labels = ["O", "B-LOC"] 231 | ref_sequence = LabeledSequence( 232 | ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test") 233 | ) 234 | pred_sequence = LabeledSequence( 235 | ["a", "b"], pred_labels, provenance=SequenceProvenance(0, "test") 236 | ) 237 | with pytest.raises(ValueError): 238 | compute_scores([[pred_sequence]], [[ref_sequence, ref_sequence]]) 239 | 240 | 241 | def test_differing_pred_and_ref_tokens() -> None: 242 | ref_labels = ["O", "B-ORG"] 243 | pred_labels = ["O", "B-LOC"] 244 | ref_sequence = LabeledSequence( 245 | ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test") 246 | ) 247 | pred_sequence = LabeledSequence( 248 | ["a", "c"], pred_labels, provenance=SequenceProvenance(0, "test") 249 | ) 250 | with pytest.raises(ValueError): 251 | compute_scores([[pred_sequence]], [[ref_sequence]]) 252 | 253 | 254 | def test_convert_score() -> None: 255 | # Check basic rounding up/down 256 | assert convert_score(0.92156, False) == Decimal("92.16") 257 | assert convert_score(0.92154, False) == Decimal("92.15") 258 | 259 | # Check half rounding 260 | # Note: due to inexact float representation, changing the test values 261 | # can lead to unexpected failures. If the final 5 is actually represented 262 | # as 49999 instead, it will cause rounding down. 
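    # (One way to vet a candidate value: constructing a Decimal directly from the
    # float literal, e.g. Decimal(0.03205), shows the exact binary value that is
    # actually stored, so you can see whether it falls above or below the true
    # halfway point before adding it to the assertions below.)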
263 | # See: https://docs.python.org/3/library/functions.html#round 264 | assert convert_score(0.03205, False) == Decimal("3.21") 265 | assert convert_score(0.03225, False) == Decimal("3.23") 266 | assert convert_score(0.02205, False) == Decimal("2.21") 267 | assert convert_score(0.02245, False) == Decimal("2.25") 268 | 269 | # Check that the number of decimal places is constant 270 | assert convert_score(1.0, False) == Decimal("100.00") 271 | assert convert_score(0.5, False) == Decimal("50.00") 272 | assert convert_score(0.0, False) == Decimal("0.00") 273 | 274 | # Check full precision 275 | assert convert_score(1 / 3, True) == 1 / 3 276 | assert convert_score(1 / 7, True) == 1 / 7 277 | assert convert_score(1 / 9, True) == 1 / 9 278 | -------------------------------------------------------------------------------- /seqscore/scoring.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from collections.abc import Iterable, Sequence 3 | from decimal import ROUND_HALF_UP, Decimal 4 | from typing import DefaultDict, Optional, Union 5 | 6 | from attr import Factory, attrib, attrs 7 | 8 | from seqscore.encoding import Encoding, EncodingError, get_encoding 9 | from seqscore.model import LabeledSequence, Mention 10 | from seqscore.util import tuplify_strs, validator_nonempty_str 11 | from seqscore.validation import validate_labels 12 | 13 | 14 | def _defaultdict_classification_score() -> DefaultDict[str, "ClassificationScore"]: 15 | return defaultdict(ClassificationScore) 16 | 17 | 18 | @attrs(frozen=True, slots=True) 19 | class TokensWithType: 20 | tokens: tuple[str, ...] = attrib(converter=tuplify_strs) 21 | type: str = attrib(validator=validator_nonempty_str) 22 | 23 | 24 | class TokenCountError(ValueError): 25 | def __init__( 26 | self, 27 | reference_token_count: int, 28 | pred_token_count: int, 29 | line_num: int, 30 | source: Optional[str], 31 | ): 32 | self.reference_token_count: int = reference_token_count 33 | self.other_token_count: int = pred_token_count 34 | self.line_num: int = line_num 35 | self.source: Optional[str] = source 36 | 37 | # Insertable string if source is specified 38 | src = f" of {source}" if source else "" 39 | msg = "\n".join( 40 | [ 41 | f"Token count mismatch at line {line_num}{src}", 42 | f"Reference sequence contains {reference_token_count} tokens; " 43 | + f"predicted sequence contains {pred_token_count}.", 44 | "Correct the predictions to have the same number of tokens as the reference.", 45 | ] 46 | ) 47 | super().__init__(msg) 48 | 49 | @classmethod 50 | def from_predicted_sequence( 51 | cls, reference_token_count: int, pred_sequence: LabeledSequence 52 | ) -> "TokenCountError": 53 | if pred_sequence.provenance is None: 54 | raise ValueError( 55 | f"Cannot create {cls.__name__} from sequence without provenance" 56 | ) 57 | return cls( 58 | reference_token_count, 59 | len(pred_sequence), 60 | pred_sequence.provenance.starting_line, 61 | pred_sequence.provenance.source, 62 | ) 63 | 64 | 65 | @attrs 66 | class ClassificationScore: 67 | true_pos: int = attrib(default=0, kw_only=True) 68 | false_pos: int = attrib(default=0, kw_only=True) 69 | false_neg: int = attrib(default=0, kw_only=True) 70 | type_scores: DefaultDict[str, "ClassificationScore"] = attrib( 71 | default=Factory(_defaultdict_classification_score), kw_only=True 72 | ) 73 | false_pos_examples: Counter[TokensWithType] = attrib(default=Factory(Counter)) 74 | false_neg_examples: Counter[TokensWithType] = 
attrib(default=Factory(Counter)) 75 | 76 | def count_false_positive(self, tokens: Iterable[str], type_: str) -> None: 77 | self.false_pos_examples[TokensWithType(tuple(tokens), type_)] += 1 78 | 79 | def count_false_negative(self, tokens: Iterable[str], type_: str) -> None: 80 | self.false_neg_examples[TokensWithType(tuple(tokens), type_)] += 1 81 | 82 | def update(self, score: "ClassificationScore") -> None: 83 | self.true_pos += score.true_pos 84 | self.false_pos += score.false_pos 85 | self.false_neg += score.false_neg 86 | for entity_type, entity_score in score.type_scores.items(): 87 | self.type_scores[entity_type].update(entity_score) 88 | 89 | @property 90 | def total_pos(self) -> int: 91 | return self.true_pos + self.false_pos 92 | 93 | @property 94 | def total_ref(self) -> int: 95 | return self.true_pos + self.false_neg 96 | 97 | @property 98 | def precision(self) -> float: 99 | total = self.total_pos 100 | if not total: 101 | return 0.0 102 | return self.true_pos / total 103 | 104 | @property 105 | def recall(self) -> float: 106 | total = self.total_ref 107 | if not total: 108 | return 0.0 109 | return self.true_pos / total 110 | 111 | @property 112 | def f1(self) -> float: 113 | precision = self.precision 114 | recall = self.recall 115 | if not precision or not recall: 116 | return 0.0 117 | return 2 * (precision * recall) / (precision + recall) 118 | 119 | 120 | @attrs 121 | class AccuracyScore: 122 | hits: int = attrib(default=0, kw_only=True) 123 | total: int = attrib(default=0, kw_only=True) 124 | 125 | @property 126 | def accuracy(self) -> float: 127 | if self.total == 0: 128 | return 0.0 129 | return self.hits / self.total 130 | 131 | 132 | def compute_scores( 133 | pred_docs: Sequence[Sequence[LabeledSequence]], 134 | ref_docs: Sequence[Sequence[LabeledSequence]], 135 | *, 136 | count_fp_fn: bool = False, 137 | ) -> tuple[ClassificationScore, AccuracyScore]: 138 | accuracy = AccuracyScore() 139 | classification = ClassificationScore() 140 | 141 | # TODO: Recommend use of ignore_document_boundaries if this error is encountered 142 | if len(pred_docs) != len(ref_docs): 143 | raise ValueError( 144 | f"Prediction has {len(pred_docs)} documents, reference has {len(ref_docs)}" 145 | ) 146 | 147 | for pred_doc, ref_doc in zip(pred_docs, ref_docs): 148 | if len(pred_doc) != len(ref_doc): 149 | raise ValueError( 150 | f"Prediction has {len(pred_doc)} sequences, reference has {len(ref_doc)}" 151 | ) 152 | 153 | for pred_sequence, ref_sequence in zip(pred_doc, ref_doc): 154 | if len(pred_sequence) != len(ref_sequence): 155 | raise TokenCountError.from_predicted_sequence( 156 | len(ref_sequence), pred_sequence 157 | ) 158 | 159 | # Fail if tokens have been changed 160 | # TODO: Consider removing this check or providing a flag to disable it 161 | # TODO: Change to a more verbose error that uses the provenance 162 | if pred_sequence.tokens != ref_sequence.tokens: 163 | raise ValueError( 164 | "Tokens do not match between predictions and reference.\n" 165 | f"Prediction: {pred_sequence.tokens}\n" 166 | f"Reference: {ref_sequence.tokens}" 167 | ) 168 | 169 | score_sequence_label_accuracy( 170 | pred_sequence.labels, ref_sequence.labels, accuracy 171 | ) 172 | score_sequence_mentions( 173 | pred_sequence.mentions, 174 | ref_sequence.mentions, 175 | classification, 176 | tokens=ref_sequence.tokens, 177 | count_fp_fn=count_fp_fn, 178 | ) 179 | 180 | return classification, accuracy 181 | 182 | 183 | def score_sequence_label_accuracy( 184 | pred_labels: Sequence[str], 185 | ref_labels: 
Sequence[str], 186 | score: AccuracyScore, 187 | ) -> None: 188 | """Update an AccuracyScore for a single sequence's labels.""" 189 | 190 | if len(pred_labels) != len(ref_labels): 191 | raise ValueError( 192 | f"Prediction has {len(pred_labels)} labels, reference has {len(ref_labels)}" 193 | ) 194 | 195 | # Compute label accuracy 196 | for pred_label, ref_label in zip(pred_labels, ref_labels): 197 | if pred_label == ref_label: 198 | score.hits += 1 199 | score.total += 1 200 | 201 | 202 | def score_sequence_mentions( 203 | pred_mentions: Sequence[Mention], 204 | ref_mentions: Sequence[Mention], 205 | score: ClassificationScore, 206 | *, 207 | tokens: Optional[Sequence[str]] = (), 208 | count_fp_fn: bool = False, 209 | ) -> None: 210 | """Update a ClassificationScore for a single sequence's mentions. 211 | 212 | Since mentions are defined per-sequence, the behavior is not defined 213 | if you provide mentions corresponding to multiple sequences. 214 | """ 215 | # Compute span accuracy 216 | pred_mentions_set = set(pred_mentions) 217 | ref_mentions_set = set(ref_mentions) 218 | 219 | # Positives 220 | for pred in pred_mentions_set: 221 | if pred in ref_mentions_set: 222 | # True positive 223 | score.true_pos += 1 224 | score.type_scores[pred.type].true_pos += 1 225 | else: 226 | # False positive 227 | score.false_pos += 1 228 | score.type_scores[pred.type].false_pos += 1 229 | if count_fp_fn: 230 | error_tokens = tokens[pred.span.start : pred.span.end] 231 | score.count_false_positive(error_tokens, pred.type) 232 | 233 | # Negatives 234 | for ref in ref_mentions_set: 235 | if ref not in pred_mentions_set: 236 | score.false_neg += 1 237 | score.type_scores[ref.type].false_neg += 1 238 | if count_fp_fn: 239 | error_tokens = tokens[ref.span.start : ref.span.end] 240 | score.count_false_negative(error_tokens, ref.type) 241 | 242 | 243 | def score_label_sequences( 244 | pred_label_sequences: Sequence[Sequence[str]], 245 | ref_label_sequences: Sequence[Sequence[str]], 246 | encoding_name: str, 247 | *, 248 | repair: Optional[str], 249 | ) -> tuple[ClassificationScore, AccuracyScore]: 250 | """Return accuracy and classification scores for predicted and reference label sequences.""" 251 | if len(pred_label_sequences) != len(ref_label_sequences): 252 | raise ValueError( 253 | f"Different number of sequences in predicted ({len(pred_label_sequences)}) and " 254 | + f"reference ({len(ref_label_sequences)})" 255 | ) 256 | 257 | encoder = get_encoding(encoding_name) 258 | 259 | classification_score = ClassificationScore() 260 | accuracy_score = AccuracyScore() 261 | 262 | for pred_labels, ref_labels in zip(pred_label_sequences, ref_label_sequences): 263 | # This takes care of checking that the lengths of the labels match 264 | score_sequence_label_accuracy(pred_labels, ref_labels, accuracy_score) 265 | pred_mentions = _repair_label_sequence(pred_labels, encoder, repair) 266 | ref_mentions = _repair_label_sequence(ref_labels, encoder, repair) 267 | score_sequence_mentions(pred_mentions, ref_mentions, classification_score) 268 | 269 | return classification_score, accuracy_score 270 | 271 | 272 | def _repair_label_sequence( 273 | labels: Sequence[str], encoder: Encoding, repair: Optional[str] 274 | ) -> Sequence[Mention]: 275 | validation = validate_labels(labels, encoder, repair=repair) 276 | if not validation.is_valid(): 277 | if repair: 278 | labels = validation.repaired_labels 279 | else: 280 | raise EncodingError( 281 | "Cannot score sequence due to validation errors.\n" 282 | + f"Labels:\n{labels}\n" 
283 | + "Errors:\n" 284 | + "\n".join(err.msg for err in validation.errors) 285 | ) 286 | return encoder.decode_labels(labels) 287 | 288 | 289 | def convert_score(num: float, full_precision: bool) -> Union[Decimal, float]: 290 | if full_precision: 291 | # Leave it unchanged 292 | return num 293 | else: 294 | # Convert a 0-1 score to the 0-100 range with two decimal places 295 | dec = Decimal(num) * 100 296 | return dec.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP) 297 | -------------------------------------------------------------------------------- /tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from attr import attrs 3 | 4 | from seqscore.encoding import REPAIR_NONE, EncodingError, get_encoding 5 | from seqscore.validation import validate_labels 6 | 7 | 8 | @attrs(auto_attribs=True) 9 | class RepairTest: 10 | original_labels: list[str] 11 | n_errors: int 12 | repaired_labels: dict[str, list[str]] 13 | 14 | 15 | BIO_REPAIRS = [ 16 | RepairTest( 17 | ["I-PER"], 18 | 1, 19 | {"conlleval": ["B-PER"], "discard": ["O"]}, 20 | ), 21 | RepairTest( 22 | ["I-PER", "I-PER"], 23 | 1, 24 | {"conlleval": ["B-PER", "I-PER"], "discard": ["O", "O"]}, 25 | ), 26 | RepairTest( 27 | ["O", "I-PER", "I-PER"], 28 | 1, 29 | {"conlleval": ["O", "B-PER", "I-PER"], "discard": ["O", "O", "O"]}, 30 | ), 31 | RepairTest( 32 | ["B-ORG", "I-PER", "I-PER"], 33 | 1, 34 | {"conlleval": ["B-ORG", "B-PER", "I-PER"], "discard": ["B-ORG", "O", "O"]}, 35 | ), 36 | RepairTest( 37 | ["I-ORG", "I-PER", "I-PER"], 38 | 2, 39 | {"conlleval": ["B-ORG", "B-PER", "I-PER"], "discard": ["O", "O", "O"]}, 40 | ), 41 | RepairTest( 42 | ["O", "I-ORG", "I-PER", "I-ORG"], 43 | 3, 44 | {"conlleval": ["O", "B-ORG", "B-PER", "B-ORG"], "discard": ["O", "O", "O", "O"]}, 45 | ), 46 | RepairTest( 47 | ["O", "B-ORG", "B-PER", "I-PER"], 48 | 0, 49 | { 50 | "conlleval": ["O", "B-ORG", "B-PER", "I-PER"], 51 | "discard": ["O", "B-ORG", "B-PER", "I-PER"], 52 | }, 53 | ), 54 | ] 55 | IOB_REPAIRS = [ 56 | RepairTest( 57 | ["B-PER"], 58 | 1, 59 | {"conlleval": ["I-PER"]}, 60 | ), 61 | RepairTest( 62 | ["B-PER", "I-PER"], 63 | 1, 64 | {"conlleval": ["I-PER", "I-PER"]}, 65 | ), 66 | RepairTest( 67 | ["O", "B-PER", "I-PER"], 68 | 1, 69 | {"conlleval": ["O", "I-PER", "I-PER"]}, 70 | ), 71 | RepairTest( 72 | ["B-ORG", "B-PER", "I-PER"], 73 | 2, 74 | {"conlleval": ["I-ORG", "I-PER", "I-PER"]}, 75 | ), 76 | RepairTest( 77 | ["I-ORG", "B-PER", "I-PER"], 78 | 1, 79 | {"conlleval": ["I-ORG", "I-PER", "I-PER"]}, 80 | ), 81 | RepairTest( 82 | ["O", "I-ORG", "B-PER", "I-ORG"], 83 | 1, 84 | {"conlleval": ["O", "I-ORG", "I-PER", "I-ORG"]}, 85 | ), 86 | RepairTest( 87 | ["O", "B-ORG", "B-PER", "I-PER"], 88 | 2, 89 | { 90 | "conlleval": ["O", "I-ORG", "I-PER", "I-PER"], 91 | }, 92 | ), 93 | RepairTest( 94 | ["O", "B-ORG", "B-ORG", "I-PER"], 95 | 1, 96 | { 97 | "conlleval": ["O", "I-ORG", "B-ORG", "I-PER"], 98 | }, 99 | ), 100 | ] 101 | 102 | REPAIRS = { 103 | "BIO": BIO_REPAIRS, 104 | "IOB": IOB_REPAIRS, 105 | } 106 | REPAIR_UNSUPPORTED = {"IO", "BIOES"} 107 | 108 | 109 | def test_repair() -> None: 110 | for encoding_name, repairs in REPAIRS.items(): 111 | encoding = get_encoding(encoding_name) 112 | 113 | # Invalid repair method name 114 | with pytest.raises(ValueError): 115 | encoding.repair_labels(["O"], "unk") 116 | assert "unk" not in encoding.supported_repair_methods() 117 | 118 | for case in repairs: 119 | result = validate_labels(case.original_labels, encoding) 120 | assert 
len(result) == case.n_errors 121 | if case.n_errors: 122 | assert not result.is_valid() 123 | 124 | for method, repaired in case.repaired_labels.items(): 125 | assert encoding.repair_labels(case.original_labels, method) == repaired 126 | 127 | # Check that using no repair method raises an error 128 | with pytest.raises(ValueError): 129 | encoding.repair_labels(case.original_labels, REPAIR_NONE) 130 | 131 | for encoding_name in REPAIR_UNSUPPORTED: 132 | encoding = get_encoding(encoding_name) 133 | assert not encoding.supported_repair_methods() 134 | 135 | 136 | def test_validation_invalid_state() -> None: 137 | encoding = get_encoding("BIO") 138 | 139 | result = validate_labels(["O", "S-PER"], encoding) 140 | assert not result.is_valid() 141 | 142 | with pytest.raises(EncodingError): 143 | validate_labels(["OUTSIDE", "B-PER"], encoding) 144 | 145 | with pytest.raises(EncodingError): 146 | validate_labels(["O", "PER"], encoding) 147 | 148 | 149 | def test_validation_messages() -> None: 150 | encoding = get_encoding("BIO") 151 | 152 | tokens = ["Dr.", "Jonas", "Salk"] 153 | line_nums = [7, 8, 9] 154 | labels = ["O", "I-PER", "I-PER"] 155 | result = validate_labels(labels, encoding, tokens=tokens, line_nums=line_nums) 156 | assert not result.is_valid() 157 | assert len(result) == 1 158 | assert ( 159 | result.errors[0].msg 160 | == "Invalid transition 'O' -> 'I-PER' for token 'Jonas' on line 8" 161 | ) 162 | 163 | tokens = ["foo"] 164 | line_nums = [7] 165 | labels = ["S-FOO"] 166 | result = validate_labels(labels, encoding, tokens=tokens, line_nums=line_nums) 167 | assert not result.is_valid() 168 | assert len(result) == 3 169 | assert ( 170 | result.errors[0].msg 171 | == "Invalid state 'S' in label 'S-FOO' for token 'foo' on line 7" 172 | ) 173 | assert ( 174 | result.errors[1].msg 175 | == "Invalid transition 'O' -> 'S-FOO' for token 'foo' on line 7" 176 | ) 177 | assert ( 178 | result.errors[2].msg 179 | == "Invalid transition 'S-FOO' -> 'O' after token 'foo' on line 7 at end of sequence" 180 | ) 181 | 182 | 183 | def test_validation_bio() -> None: 184 | encoding = get_encoding("BIO") 185 | 186 | result = validate_labels(("B-PER",), encoding) 187 | assert not result.errors 188 | result = validate_labels(("B-PER", "I-PER"), encoding) 189 | assert not result.errors 190 | result = validate_labels(("B-PER", "B-PER", "I-PER"), encoding) 191 | assert not result.errors 192 | result = validate_labels(("B-ORG", "B-PER", "I-PER"), encoding) 193 | assert not result.errors 194 | result = validate_labels(("B-ORG", "I-PER"), encoding) 195 | assert len(result.errors) == 1 196 | result = validate_labels(("I-PER",), encoding) 197 | assert len(result.errors) == 1 198 | result = validate_labels(("I-PER", "I-PER"), encoding) 199 | assert len(result.errors) == 1 200 | result = validate_labels(("I-PER", "I-ORG"), encoding) 201 | assert len(result.errors) == 2 # start to I, I to I 202 | 203 | 204 | def test_validation_iob() -> None: 205 | encoding = get_encoding("IOB") 206 | 207 | result = validate_labels(("I-PER",), encoding) 208 | assert not result.errors 209 | result = validate_labels(("I-PER", "I-PER"), encoding) 210 | assert not result.errors 211 | result = validate_labels(("I-PER", "I-PER", "I-ORG"), encoding) 212 | assert not result.errors 213 | result = validate_labels(("I-PER", "I-ORG", "I-ORG"), encoding) 214 | assert not result.errors 215 | result = validate_labels(("I-PER", "O", "I-ORG", "O", "I-ORG"), encoding) 216 | assert not result.errors 217 | result = validate_labels(("I-PER", "B-PER"), 
encoding) 218 | assert not result.errors 219 | 220 | # We are strict in only allowing B-X after I-X 221 | result = validate_labels(("I-PER", "B-ORG"), encoding) 222 | assert len(result.errors) == 1 223 | result = validate_labels(("B-PER", "I-PER"), encoding) 224 | assert len(result.errors) == 1 225 | 226 | 227 | def test_validation_bioes_start() -> None: 228 | encoding = get_encoding("BIOES") 229 | 230 | result = validate_labels(("O",), encoding) 231 | assert not result.errors 232 | 233 | # Can't start length one mention with B 234 | result = validate_labels(("B-PER",), encoding) 235 | assert len(result.errors) == 1 236 | 237 | # Can't start any mention with I or E 238 | result = validate_labels(("I-PER", "E-PER"), encoding) 239 | assert len(result.errors) == 1 240 | result = validate_labels(("E-PER",), encoding) 241 | assert len(result.errors) == 1 242 | 243 | # Can start length one mention with S 244 | result = validate_labels(("S-PER",), encoding) 245 | assert not result.errors 246 | 247 | # Can start length two or three mention with B 248 | result = validate_labels(("B-PER", "E-PER"), encoding) 249 | assert not result.errors 250 | result = validate_labels(("B-PER", "I-PER", "E-PER"), encoding) 251 | assert not result.errors 252 | 253 | # S after E 254 | result = validate_labels(("B-ORG", "I-ORG", "E-ORG", "S-MISC"), encoding) 255 | assert not result.errors 256 | 257 | # B after S 258 | result = validate_labels(("S-PER", "B-ORG", "E-ORG"), encoding) 259 | assert not result.errors 260 | 261 | 262 | def test_validation_bioes_continue() -> None: 263 | encoding = get_encoding("BIOES") 264 | 265 | # Cannot continue from S to I 266 | result = validate_labels(("S-PER", "I-PER"), encoding) 267 | assert len(result.errors) == 2 # Two errors: S to I and I to end 268 | 269 | # Cannot continue from E to I 270 | result = validate_labels(("B-PER", "I-PER", "E-PER", "I-PER"), encoding) 271 | assert len(result.errors) == 2 # Two errors: E to I and I to end 272 | 273 | # Cannot change type mid-mention 274 | result = validate_labels(("B-PER", "E-ORG"), encoding) 275 | assert len(result.errors) == 1 276 | result = validate_labels(("B-PER", "I-ORG", "E-ORG"), encoding) 277 | assert len(result.errors) == 1 278 | result = validate_labels(("B-PER", "I-PER", "E-ORG"), encoding) 279 | assert len(result.errors) == 1 280 | 281 | # B after B 282 | result = validate_labels(("B-PER", "B-PER"), encoding) 283 | assert len(result.errors) == 2 # B to B and B to end 284 | result = validate_labels(("B-PER", "B-ORG"), encoding) 285 | assert len(result.errors) == 2 # B to B and B to end 286 | 287 | # S after B 288 | result = validate_labels(("B-PER", "S-PER"), encoding) 289 | assert len(result.errors) == 1 290 | result = validate_labels(("B-PER", "S-ORG"), encoding) 291 | assert len(result.errors) == 1 292 | 293 | # S after I 294 | result = validate_labels(("B-PER", "I-PER", "S-PER"), encoding) 295 | assert len(result.errors) == 1 296 | result = validate_labels(("B-PER", "I-PER", "S-ORG"), encoding) 297 | assert len(result.errors) == 1 298 | 299 | # S after E (allowed) 300 | result = validate_labels(("B-PER", "E-PER", "S-PER"), encoding) 301 | assert not result.errors 302 | result = validate_labels(("B-PER", "E-PER", "S-ORG"), encoding) 303 | assert not result.errors 304 | 305 | 306 | def test_validation_bioes_end() -> None: 307 | encoding = get_encoding("BIOES") 308 | 309 | # Can't end with I 310 | result = validate_labels(("B-PER", "I-PER"), encoding) 311 | assert len(result.errors) == 1 312 | result = 
validate_labels(("B-PER", "I-PER", "I-PER"), encoding) 313 | assert len(result.errors) == 1 314 | 315 | # Can end with E 316 | result = validate_labels(("B-PER", "E-PER"), encoding) 317 | assert not result.errors 318 | result = validate_labels(("B-PER", "I-PER", "E-PER"), encoding) 319 | assert not result.errors 320 | 321 | 322 | def test_validation_bioes_adjacent_mentions() -> None: 323 | encoding = get_encoding("BIOES") 324 | 325 | result = validate_labels(("S-PER", "B-ORG", "E-ORG"), encoding) 326 | assert not result.errors 327 | result = validate_labels(("S-PER", "B-PER", "E-PER"), encoding) 328 | assert not result.errors 329 | result = validate_labels(("B-PER", "E-PER", "S-ORG"), encoding) 330 | assert not result.errors 331 | result = validate_labels(("B-PER", "E-PER", "S-PER"), encoding) 332 | assert not result.errors 333 | 334 | 335 | def test_validation_bad_label() -> None: 336 | encoding = get_encoding("BIO") 337 | 338 | tokens = ["Dr.", "Jonas", "Salk"] 339 | line_nums = [7, 8, 9] 340 | labels = ["O", "PER", "PER"] 341 | with pytest.raises(EncodingError) as err: 342 | validate_labels(labels, encoding, tokens=tokens, line_nums=line_nums) 343 | assert ( 344 | str(err.value) 345 | == "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<state>-<type>'." 346 | ) 347 | -------------------------------------------------------------------------------- /tests/test_encoding.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from attr import attrs 3 | 4 | from seqscore.encoding import ( 5 | _ENCODING_NAMES, 6 | BIO, 7 | BIOES, 8 | IO, 9 | IOB, 10 | SUPPORTED_ENCODINGS, 11 | BILOUDialect, 12 | BIOESDialect, 13 | BMEOWDialect, 14 | BMESDialect, 15 | EncodingError, 16 | get_encoding, 17 | ) 18 | from seqscore.model import LabeledSequence, Mention, Span 19 | 20 | FULL_SENTENCE_LABELS = { 21 | "IO": ["I-PER", "O", "I-ORG", "I-ORG", "I-ORG", "I-ORG", "I-ORG", "I-LOC"], 22 | "IOB": ["I-PER", "O", "I-ORG", "I-ORG", "B-ORG", "I-ORG", "I-ORG", "I-LOC"], 23 | "BIO": ["B-PER", "O", "B-ORG", "I-ORG", "B-ORG", "I-ORG", "I-ORG", "B-LOC"], 24 | "BIOES": ["S-PER", "O", "B-ORG", "E-ORG", "B-ORG", "I-ORG", "E-ORG", "S-LOC"], 25 | "BILOU": ["U-PER", "O", "B-ORG", "L-ORG", "B-ORG", "I-ORG", "L-ORG", "U-LOC"], 26 | "BMES": ["S-PER", "O", "B-ORG", "E-ORG", "B-ORG", "M-ORG", "E-ORG", "S-LOC"], 27 | "BMEOW": ["W-PER", "O", "B-ORG", "E-ORG", "B-ORG", "M-ORG", "E-ORG", "W-LOC"], 28 | } 29 | FULL_SENTENCE_MENTS = [ 30 | Mention(Span(0, 1), "PER"), 31 | Mention(Span(2, 4), "ORG"), 32 | Mention(Span(4, 7), "ORG"), 33 | Mention(Span(7, 8), "LOC"), 34 | ] 35 | # IO cannot faithfully encode this sentence, so there is just one org 36 | FULL_SENTENCE_MENTS_IO = [ 37 | Mention(Span(0, 1), "PER"), 38 | Mention(Span(2, 7), "ORG"), 39 | Mention(Span(7, 8), "LOC"), 40 | ] 41 | # Map to sets of encodings that allow that state 42 | VALID_ENCODING_STATES = { 43 | "B": {"IOB", "BIO", "BIOES", "BILOU", "BMES", "BMEOW"}, 44 | "I": {"IOB", "BIO", "BIOES", "BILOU", "IO"}, 45 | "O": {"IOB", "IO", "BIO", "BIOES", "BILOU", "BMES", "BMEOW"}, 46 | "E": {"BIOES", "BMES", "BMEOW"}, 47 | "M": {"BMES", "BMEOW"}, 48 | "L": {"BILOU"}, 49 | "W": {"BMEOW"}, 50 | "Z": {}, 51 | } 52 | 53 | 54 | @attrs(auto_attribs=True) 55 | class EdgeTestSentence: 56 | name: str 57 | mentions: list[Mention] 58 | encoding_labels: list[tuple[list[str], list[str]]] 59 | 60 | 61 | EDGE_TEST_SENTENCES = [ 62 
| EdgeTestSentence( 63 | "One token, one mention", 64 | [Mention(Span(0, 1), "PER")], 65 | [ 66 | (["BIO"], ["B-PER"]), 67 | (["BIOES", "BMES"], ["S-PER"]), 68 | (["BILOU"], ["U-PER"]), 69 | (["BMEOW"], ["W-PER"]), 70 | (["IO", "IOB"], ["I-PER"]), 71 | ], 72 | ), 73 | EdgeTestSentence( 74 | "Two tokens, one mention covering them all", 75 | [Mention(Span(0, 2), "PER")], 76 | [ 77 | (["BIO"], ["B-PER", "I-PER"]), 78 | (["BIOES", "BMES", "BMEOW"], ["B-PER", "E-PER"]), 79 | (["BILOU"], ["B-PER", "L-PER"]), 80 | (["IO", "IOB"], ["I-PER", "I-PER"]), 81 | ], 82 | ), 83 | EdgeTestSentence( 84 | "Three tokens, one mention covering them all", 85 | [Mention(Span(0, 3), "PER")], 86 | [ 87 | (["BIO"], ["B-PER", "I-PER", "I-PER"]), 88 | (["BIOES"], ["B-PER", "I-PER", "E-PER"]), 89 | (["BMES", "BMEOW"], ["B-PER", "M-PER", "E-PER"]), 90 | (["BILOU"], ["B-PER", "I-PER", "L-PER"]), 91 | (["IO", "IOB"], ["I-PER", "I-PER", "I-PER"]), 92 | ], 93 | ), 94 | EdgeTestSentence( 95 | "Adjacent same-type one-token mentions", 96 | [Mention(Span(0, 1), "PER"), Mention(Span(1, 2), "PER")], 97 | [ 98 | (["BIO"], ["B-PER", "B-PER"]), 99 | (["BIOES", "BMES"], ["S-PER", "S-PER"]), 100 | (["BILOU"], ["U-PER", "U-PER"]), 101 | (["BMEOW"], ["W-PER", "W-PER"]), 102 | # IO is not included because it cannot faithfully handle this 103 | (["IOB"], ["I-PER", "B-PER"]), 104 | ], 105 | ), 106 | EdgeTestSentence( 107 | "Adjacent different-type one-token mentions", 108 | [Mention(Span(0, 1), "PER"), Mention(Span(1, 2), "ORG")], 109 | [ 110 | (["BIO"], ["B-PER", "B-ORG"]), 111 | (["BIOES", "BMES"], ["S-PER", "S-ORG"]), 112 | (["BILOU"], ["U-PER", "U-ORG"]), 113 | (["BMEOW"], ["W-PER", "W-ORG"]), 114 | (["IO", "IOB"], ["I-PER", "I-ORG"]), 115 | ], 116 | ), 117 | EdgeTestSentence( 118 | "Adjacent same-type two-token mentions", 119 | [Mention(Span(0, 2), "PER"), Mention(Span(2, 4), "PER")], 120 | [ 121 | (["BIO"], ["B-PER", "I-PER", "B-PER", "I-PER"]), 122 | (["BIOES", "BMES", "BMEOW"], ["B-PER", "E-PER", "B-PER", "E-PER"]), 123 | (["BILOU"], ["B-PER", "L-PER", "B-PER", "L-PER"]), 124 | # IO is not included because it cannot faithfully handle this 125 | (["IOB"], ["I-PER", "I-PER", "B-PER", "I-PER"]), 126 | ], 127 | ), 128 | EdgeTestSentence( 129 | "Adjacent different-type two-token mentions", 130 | [Mention(Span(0, 2), "PER"), Mention(Span(2, 4), "ORG")], 131 | [ 132 | (["BIO"], ["B-PER", "I-PER", "B-ORG", "I-ORG"]), 133 | (["BIOES", "BMES", "BMEOW"], ["B-PER", "E-PER", "B-ORG", "E-ORG"]), 134 | (["BILOU"], ["B-PER", "L-PER", "B-ORG", "L-ORG"]), 135 | (["IO", "IOB"], ["I-PER", "I-PER", "I-ORG", "I-ORG"]), 136 | ], 137 | ), 138 | ] 139 | 140 | 141 | def test_basic_decoding() -> None: 142 | for encoding_name in SUPPORTED_ENCODINGS: 143 | encoding = get_encoding(encoding_name) 144 | labels = FULL_SENTENCE_LABELS[encoding_name] 145 | mentions = ( 146 | FULL_SENTENCE_MENTS_IO if encoding_name == "IO" else FULL_SENTENCE_MENTS 147 | ) 148 | assert encoding.decode_labels(labels) == mentions 149 | 150 | 151 | def test_basic_encoding() -> None: 152 | for encoding_name in SUPPORTED_ENCODINGS: 153 | encoding = get_encoding(encoding_name) 154 | labels = FULL_SENTENCE_LABELS[encoding_name] 155 | mentions = ( 156 | FULL_SENTENCE_MENTS_IO if encoding_name == "IO" else FULL_SENTENCE_MENTS 157 | ) 158 | 159 | assert encoding.encode_mentions(mentions, len(labels)) == labels 160 | # Also test encoding sentence object, intentionally putting no mentions in the 161 | # sentence labels to make sure encoding using the mentions, not the labels 162 | sentence = 
LabeledSequence(["a"] * len(labels), ["O"] * len(labels), mentions) 163 | assert encoding.encode_sequence(sentence) == labels 164 | 165 | 166 | def test_round_trip() -> None: 167 | for encoding_name in set(SUPPORTED_ENCODINGS) & set(SUPPORTED_ENCODINGS): 168 | # Skip IO since it can't round-trip 169 | if encoding_name == "IO": 170 | continue 171 | 172 | encoding = get_encoding(encoding_name) 173 | labels = FULL_SENTENCE_LABELS[encoding_name] 174 | mentions = FULL_SENTENCE_MENTS 175 | 176 | # Encode, then decode 177 | out_labels = encoding.encode_mentions(mentions, len(labels)) 178 | assert encoding.decode_labels(out_labels) == mentions 179 | 180 | # Decode, then encode 181 | out_mentions = encoding.decode_labels(labels) 182 | assert encoding.encode_mentions(out_mentions, len(labels)) == labels 183 | 184 | 185 | def test_valid_states() -> None: 186 | all_encoding_names = set(_ENCODING_NAMES) 187 | for state, valid_encoding_names in VALID_ENCODING_STATES.items(): 188 | for encoding_name in all_encoding_names: 189 | encoding = get_encoding(encoding_name) 190 | if encoding_name in valid_encoding_names: 191 | assert encoding.is_valid_state(state) 192 | else: 193 | assert not encoding.is_valid_state(state) 194 | 195 | 196 | def test_edge_case_encoding() -> None: 197 | for case in EDGE_TEST_SENTENCES: 198 | mentions = case.mentions 199 | for encoding_names, labels in case.encoding_labels: 200 | for encoding_name in encoding_names: 201 | encoding = get_encoding(encoding_name) 202 | assert encoding.encode_mentions(mentions, len(labels)) == labels 203 | 204 | 205 | def test_get_encodings() -> None: 206 | assert isinstance(get_encoding("IO"), IO) 207 | assert isinstance(get_encoding("IOB"), IOB) 208 | assert isinstance(get_encoding("BIO"), BIO) 209 | 210 | # Test the dialects for BIOES and derivatives 211 | enc = get_encoding("BIOES") 212 | assert isinstance(enc, BIOES) 213 | assert isinstance(enc.dialect, BIOESDialect) 214 | 215 | enc = get_encoding("BILOU") 216 | assert isinstance(enc, BIOES) 217 | assert isinstance(enc.dialect, BILOUDialect) 218 | 219 | enc = get_encoding("BMES") 220 | assert isinstance(enc, BIOES) 221 | assert isinstance(enc.dialect, BMESDialect) 222 | 223 | enc = get_encoding("BMEOW") 224 | assert isinstance(enc, BIOES) 225 | assert isinstance(enc.dialect, BMEOWDialect) 226 | 227 | 228 | def test_get_unknown_encoding() -> None: 229 | with pytest.raises(ValueError): 230 | get_encoding("FOO") 231 | 232 | 233 | def test_split_label() -> None: 234 | # This logic is shared across all encodings, we just need any instantiable one 235 | encoding = get_encoding("BIO") 236 | 237 | assert encoding.split_label("O") == ("O", None) 238 | assert encoding.split_label("B-PER") == ("B", "PER") 239 | # Only splits the first delim 240 | assert encoding.split_label("I-ORG-CORP") == ("I", "ORG-CORP") 241 | 242 | with pytest.raises(EncodingError): 243 | assert encoding.split_label("B") 244 | 245 | with pytest.raises(EncodingError): 246 | assert encoding.split_label("O-ORG") 247 | 248 | with pytest.raises(EncodingError): 249 | assert encoding.split_label("") 250 | 251 | 252 | def test_join_label() -> None: 253 | # This logic is shared across all encodings, we just need any instantiable one 254 | encoding = get_encoding("BIO") 255 | 256 | assert encoding.join_label("B", "PER") == "B-PER" 257 | assert encoding.join_label("O", None) == "O" 258 | 259 | with pytest.raises(AssertionError): 260 | encoding.join_label("B", None) 261 | 262 | with pytest.raises(AssertionError): 263 | encoding.join_label("O", 
"PER") 264 | 265 | 266 | def test_labeled_sequence() -> None: 267 | # Test length mismatch 268 | with pytest.raises(ValueError): 269 | LabeledSequence( 270 | ["a"] * 10, 271 | ["O"] * 9, 272 | ) 273 | 274 | 275 | def test_decode_bio_invalid_continue() -> None: 276 | decoder = get_encoding("BIO") 277 | sent1 = LabeledSequence(("a", "b"), ("B-PER", "I-LOC")) 278 | with pytest.raises(AssertionError): 279 | assert decoder.decode_sequence(sent1) 280 | 281 | 282 | def test_decode_iob_invalid_begin() -> None: 283 | decoder = get_encoding("IOB") 284 | sent = LabeledSequence(("a", "b"), ("I-PER", "B-LOC")) 285 | with pytest.raises(AssertionError): 286 | assert decoder.decode_sequence(sent) 287 | 288 | 289 | def test_decode_bioes_invalid_start() -> None: 290 | decoder = get_encoding("BIOES") 291 | sents = [ 292 | LabeledSequence(("a",), ("I-PER",)), 293 | LabeledSequence(("a",), ("E-PER",)), 294 | ] 295 | for sent in sents: 296 | with pytest.raises(AssertionError): 297 | assert decoder.decode_sequence(sent) 298 | 299 | 300 | def test_decode_bioes_invalid_end() -> None: 301 | decoder = get_encoding("BIOES") 302 | sents = [ 303 | # Single-token mentions must start (and end) with S 304 | LabeledSequence(("a", "b"), ("B-PER", "S-PER")), 305 | # Multi-token mentions must end in E 306 | LabeledSequence(("a",), ("B-PER",)), 307 | LabeledSequence(("a", "b"), ("B-PER", "I-PER")), 308 | # Ends with wrong type 309 | LabeledSequence(("a", "b", "c"), ("B-PER", "I-PER", "E-ORG")), 310 | # Multi-token mentions cannot end in S 311 | LabeledSequence(("a", "b", "c"), ("B-PER", "I-PER", "S-PER")), 312 | ] 313 | for sent in sents: 314 | with pytest.raises(AssertionError): 315 | assert decoder.decode_sequence(sent) 316 | 317 | 318 | def test_decode_bioes_invalid_continue() -> None: 319 | decoder = get_encoding("BIOES") 320 | sents = [ 321 | # B must be followed by I or E of the same type 322 | LabeledSequence(("a", "b"), ("B-PER", "B-PER")), 323 | # Cannot change types mid-mention 324 | LabeledSequence(("a", "b"), ("B-PER", "E-ORG")), 325 | LabeledSequence(("a", "b", "c"), ("B-PER", "I-PER", "E-ORG")), 326 | ] 327 | for sent in sents: 328 | with pytest.raises(AssertionError): 329 | assert decoder.decode_sequence(sent) 330 | -------------------------------------------------------------------------------- /external/conlleval.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://www.clips.uantwerpen.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 
17 | # url: http://www.clips.uantwerpen.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while (<STDIN>) { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if 
($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = $features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 
177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 265 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 266 | 267 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 269 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 270 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 271 | 272 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 273 | $chunkEnd = $true; 274 | } 275 | 276 | # corrected 1998-12-22: these chunks are assumed to have length 1 277 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 279 | 280 | return($chunkEnd); 281 | } 282 | 283 | # startOfChunk: checks if a chunk started between the previous and current word 284 | # arguments: previous and current chunk tags, previous and current types 285 | # note: this code is capable of handling other chunk representations 286 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 287 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 288 | 289 | sub startOfChunk { 290 | my $prevTag = shift(@_); 291 | my $tag = shift(@_); 292 | my $prevType = shift(@_); 293 | my $type = shift(@_); 294 | my $chunkStart = $false; 295 | 296 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 297 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 298 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 299 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 300 | 301 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 302 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 303 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 304 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 305 | 306 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 307 | $chunkStart = $true; 308 | } 309 | 310 | # corrected 1998-12-22: these chunks are assumed to have length 1 311 | if ( $tag eq "[" ) { $chunkStart = $true; } 312 | if ( $tag eq "]" ) { $chunkStart = $true; } 313 | 314 | return($chunkStart); 315 | } 316 | -------------------------------------------------------------------------------- /seqscore/scripts/seqscore.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from collections import Counter 4 | from contextlib import nullcontext 5 | from typing import Callable, Optional 6 | 7 | import click 8 | from tabulate import tabulate 9 | 10 | import seqscore 11 | from seqscore.conll import ( 12 | FORMAT_DELIM, 13 | SUPPORTED_SCORE_FORMATS, 14 | ingest_conll_file, 15 | repair_conll_file, 16 | score_conll_files, 17 | validate_conll_file, 18 | write_docs_using_encoding, 19 | ) 20 | from seqscore.encoding import ( 21 | DEFAULT_OUTSIDE, 22 | REPAIR_NONE, 23 | SUPPORTED_ENCODINGS, 24 | SUPPORTED_REPAIR_METHODS, 25 | ) 26 | from seqscore.processing import modify_types 27 | 28 | 29 | # Set up a click command group 30 | @click.group( 31 | help=f"Provides scoring and analysis tools for NER/chunking files (version {seqscore.__version__})" 32 | ) 33 | @click.version_option(seqscore.__version__) 34 | # This is tested by a subprocess call in test_seqscore_main so coverage will miss it 35 | def cli() -> None: # pragma: no cover 36 | pass 37 | 38 | 39 | # Argument helpers for commands 40 | def _input_file_options() -> list[Callable]: 41 | return [ 42 | click.option("--file-encoding", default="UTF-8", show_default=True), 43 | click.option("--parse-comment-lines", is_flag=True), 44 | click.option( 45 | "--ignore-document-boundaries/--use-document-boundaries", default=False 46 | ), 47 | ] 48 | 49 | 50 | def _single_input_file_arguments(func: Callable) -> Callable: 51 | # In the order they can be used on the command line 52 | decorators = [ 53 | click.argument("file", type=click.Path(dir_okay=False)), 54 | ] + _input_file_options() 55 | # 
Need to apply these backwards to match decorator application order 56 | for decorator in decorators[::-1]: 57 | func = decorator(func) 58 | return func 59 | 60 | 61 | def _multi_input_file_arguments(func: Callable) -> Callable: 62 | # In the order they can be used on the command line 63 | decorators = [ 64 | click.argument("file", type=click.Path(dir_okay=False), nargs=-1, required=True), 65 | ] + _input_file_options() 66 | # Need to apply these backwards to match decorator application order 67 | for decorator in decorators[::-1]: 68 | func = decorator(func) 69 | return func 70 | 71 | 72 | def _repair_option() -> Callable: 73 | return click.option( 74 | "--repair-method", 75 | type=click.Choice(SUPPORTED_REPAIR_METHODS), 76 | default=REPAIR_NONE, 77 | show_default=True, 78 | ) 79 | 80 | 81 | def _repair_required_option() -> Callable: 82 | return click.option( 83 | "--repair-method", 84 | type=click.Choice(SUPPORTED_REPAIR_METHODS), 85 | ) 86 | 87 | 88 | def _labels_option() -> Callable: 89 | return click.option("--labels", required=True, type=click.Choice(SUPPORTED_ENCODINGS)) 90 | 91 | 92 | def _labels_option_default_bio() -> Callable: 93 | return click.option( 94 | "--labels", 95 | default="BIO", 96 | show_default=True, 97 | type=click.Choice(SUPPORTED_ENCODINGS), 98 | ) 99 | 100 | 101 | def _quiet_option() -> Callable: 102 | return click.option( 103 | "--quiet", 104 | "-q", 105 | is_flag=True, 106 | help="do not log the repairs performed and suppress other non-critical messages", 107 | ) 108 | 109 | 110 | @cli.command(help="validate labels") 111 | @_multi_input_file_arguments 112 | @_labels_option() 113 | @_quiet_option() 114 | def validate( 115 | file: list[str], # Name is "file" to make sense on the command line, but it's a list 116 | labels: str, 117 | file_encoding: str, 118 | *, 119 | ignore_document_boundaries: bool, 120 | parse_comment_lines: bool, 121 | quiet: bool, 122 | ) -> None: 123 | error = False 124 | for each_file in file: 125 | result = validate_conll_file( 126 | each_file, 127 | labels, 128 | file_encoding, 129 | ignore_document_boundaries=ignore_document_boundaries, 130 | parse_comment_lines=parse_comment_lines, 131 | ) 132 | if result.errors: 133 | print( 134 | f"Encountered {len(result.errors)} errors in {result.n_tokens} tokens, " 135 | + f"{result.n_sequences} sequences, and {result.n_docs} document(s) in {each_file}" 136 | ) 137 | print("\n".join(err.msg for err in result.errors)) 138 | error = True 139 | elif not quiet: 140 | print( 141 | f"No errors found in {result.n_tokens} tokens, {result.n_sequences} sequences, " 142 | + f"and {result.n_docs} document(s) in {each_file}" 143 | ) 144 | 145 | if error: 146 | sys.exit(1) 147 | 148 | 149 | @cli.command(help="repair invalid label transitions") 150 | @_single_input_file_arguments 151 | @click.argument("output_file") 152 | @_repair_required_option() 153 | @_labels_option() 154 | @click.option("--output-delim", default=" ", help="[default: space]") 155 | @_quiet_option() 156 | def repair( 157 | file: str, 158 | output_file: str, 159 | labels: str, 160 | file_encoding: str, 161 | repair_method: str, 162 | output_delim: str, 163 | *, 164 | ignore_document_boundaries: bool, 165 | parse_comment_lines: bool, 166 | quiet: bool, 167 | ) -> None: 168 | if repair_method == REPAIR_NONE: 169 | raise ValueError(f"Cannot repair with repair strategy {repr(repair_method)}") 170 | 171 | repair_conll_file( 172 | file, 173 | output_file, 174 | labels, 175 | repair_method, 176 | file_encoding, 177 | output_delim, 178 | 
ignore_document_boundaries=ignore_document_boundaries, 179 | parse_comment_lines=parse_comment_lines, 180 | quiet=quiet, 181 | ) 182 | 183 | 184 | @cli.command(help="convert between mention encodings") 185 | @_single_input_file_arguments 186 | @click.argument("output_file") 187 | @click.option("--output-delim", default=" ", help="[default: space]") 188 | @click.option("--input-labels", required=True, type=click.Choice(SUPPORTED_ENCODINGS)) 189 | @click.option("--output-labels", required=True, type=click.Choice(SUPPORTED_ENCODINGS)) 190 | def convert( 191 | file: str, 192 | output_file: str, 193 | file_encoding: str, 194 | output_delim: str, 195 | input_labels: str, 196 | output_labels: str, 197 | *, 198 | ignore_document_boundaries: bool, 199 | parse_comment_lines: bool, 200 | ) -> None: 201 | if input_labels == output_labels: 202 | raise ValueError("Conversion requires different input and output labels") 203 | 204 | docs = ingest_conll_file( 205 | file, 206 | input_labels, 207 | file_encoding, 208 | ignore_document_boundaries=ignore_document_boundaries, 209 | parse_comment_lines=parse_comment_lines, 210 | ) 211 | 212 | write_docs_using_encoding( 213 | docs, output_labels, file_encoding, output_delim, output_file 214 | ) 215 | 216 | 217 | @cli.command(help="transform entity types by keeping/removing/mapping") 218 | @_single_input_file_arguments 219 | @click.argument("output_file") 220 | @_labels_option_default_bio() 221 | @click.option( 222 | "--keep-types", 223 | default="", 224 | help="entity types to keep, comma-separated [example: PER,LOC,ORG]", 225 | ) 226 | @click.option( 227 | "--remove-types", 228 | default="", 229 | help="entity types to remove, comma-separated [example: MISC,DATE]", 230 | ) 231 | @click.option( 232 | "--type-map", 233 | type=click.Path(dir_okay=False), 234 | help="a JSON file containing types to be modified, in the format of a dict with keys as the target type and values as the source type [example file: {'MISC': ['WorkOfArt', 'Event']}]", 235 | ) 236 | @click.option("--output-delim", default=" ", help="[default: space]") 237 | def process( 238 | file: str, 239 | output_file: str, 240 | file_encoding: str, 241 | output_delim: str, 242 | labels: str, 243 | keep_types: str, 244 | remove_types: str, 245 | type_map: str, 246 | *, 247 | ignore_document_boundaries: bool, 248 | parse_comment_lines: bool, 249 | ) -> None: 250 | keep_types_set = _parse_type_list(keep_types) 251 | remove_types_set = _parse_type_list(remove_types) 252 | type_map_dict: dict[str, list[str]] = _load_type_map(type_map, file_encoding) 253 | 254 | if keep_types_set and remove_types_set: 255 | raise ValueError("Cannot specify both keep-types and remove-types") 256 | 257 | if not keep_types_set and not remove_types_set and not type_map: 258 | raise ValueError( 259 | "Must specify at least one of keep-types, remove-types, or type-map" 260 | ) 261 | 262 | docs = ingest_conll_file( 263 | file, 264 | labels, 265 | file_encoding, 266 | ignore_document_boundaries=ignore_document_boundaries, 267 | parse_comment_lines=parse_comment_lines, 268 | ) 269 | 270 | mod_docs = modify_types(docs, keep_types_set, remove_types_set, type_map_dict) 271 | 272 | write_docs_using_encoding(mod_docs, labels, file_encoding, output_delim, output_file) 273 | 274 | 275 | @cli.command(help="show counts for all the mentions contained in a file") 276 | @_multi_input_file_arguments 277 | @click.option( 278 | "--output-file", 279 | default=None, 280 | help="path to write output to [default: stdout]", 281 | ) 282 | @_repair_option() 
283 | @_labels_option_default_bio() 284 | @click.option( 285 | "--output-delim", 286 | default="\t", 287 | help="the delimiter to be used for output (has no effect on input) [default: tab]", 288 | ) 289 | @_quiet_option() 290 | def count( 291 | file: list[str], # Name is "file" to make sense on the command line, but it's a list 292 | file_encoding: str, 293 | output_file: Optional[str], 294 | labels: str, 295 | *, 296 | ignore_document_boundaries: bool, 297 | parse_comment_lines: bool, 298 | output_delim: str, 299 | repair_method: str, 300 | quiet: bool, 301 | ) -> None: 302 | if repair_method == REPAIR_NONE: 303 | repair_method = None 304 | 305 | output_delim = _normalize_tab(output_delim) 306 | if output_delim != "\t": 307 | print( 308 | "Warning: Using a delimiter other than tab is not recommended as fields are not quoted", 309 | file=sys.stderr, 310 | ) 311 | 312 | counts: Counter[tuple[str, tuple[str, ...]]] = Counter() 313 | for each_file in file: 314 | docs = ingest_conll_file( 315 | each_file, 316 | labels, 317 | file_encoding, 318 | ignore_document_boundaries=ignore_document_boundaries, 319 | parse_comment_lines=parse_comment_lines, 320 | repair=repair_method, 321 | quiet=quiet, 322 | ) 323 | 324 | for doc in docs: 325 | for sequence in doc: 326 | for mention in sequence.mentions: 327 | key = (mention.type, sequence.mention_tokens(mention)) 328 | counts[key] += 1 329 | 330 | with ( 331 | open(output_file, "w", encoding=file_encoding) 332 | if output_file 333 | else nullcontext(sys.stdout) as output 334 | ): 335 | for item, item_count in counts.most_common(): 336 | print( 337 | output_delim.join((str(item_count), item[0], " ".join(item[1]))), 338 | file=output, 339 | ) 340 | 341 | 342 | @cli.command(help="show counts of the documents, sentences, and entity types") 343 | @_multi_input_file_arguments 344 | @_repair_option() 345 | @_labels_option_default_bio() 346 | @_quiet_option() 347 | def summarize( 348 | file: list[str], # Name is "file" to make sense on the command line, but it's a list 349 | file_encoding: str, 350 | labels: str, 351 | *, 352 | ignore_document_boundaries: bool, 353 | parse_comment_lines: bool, 354 | repair_method: str, 355 | quiet: bool, 356 | ) -> None: 357 | if repair_method == REPAIR_NONE: 358 | repair_method = None 359 | 360 | type_counts: Counter[str] = Counter() 361 | total_documents = 0 362 | total_sentences = 0 363 | for each_file in file: 364 | docs = ingest_conll_file( 365 | each_file, 366 | labels, 367 | file_encoding, 368 | ignore_document_boundaries=ignore_document_boundaries, 369 | parse_comment_lines=parse_comment_lines, 370 | repair=repair_method, 371 | quiet=quiet, 372 | ) 373 | 374 | for doc in docs: 375 | for sequence in doc: 376 | for mention in sequence.mentions: 377 | type_counts[mention.type] += 1 378 | 379 | if not quiet: 380 | # Count sentences 381 | sentence_count = sum(len(doc) for doc in docs) 382 | print( 383 | f"File {repr(each_file)} contains {len(docs)} document(s) and {sentence_count} sentences" 384 | ) 385 | total_documents += len(docs) 386 | total_sentences += sentence_count 387 | 388 | if not quiet and len(file) > 1: 389 | print(f"Total {total_documents} document(s) and {total_sentences} sentences") 390 | 391 | header = ["Entity Type", "Count"] 392 | rows = sorted(type_counts.items()) 393 | print(tabulate(rows, header, tablefmt="github", floatfmt="6.2f")) 394 | 395 | 396 | @cli.command(help="score a file and report performance or an error count table") 397 | @_multi_input_file_arguments 398 | @click.option("--reference", 
required=True) 399 | @_labels_option() 400 | @_repair_option() 401 | @click.option( 402 | "--score-format", 403 | default="pretty", 404 | type=click.Choice(SUPPORTED_SCORE_FORMATS), 405 | show_default=True, 406 | ) 407 | @click.option( 408 | "--delim", 409 | default="\t", 410 | help="the delimiter to be used for delimited output (has no effect on input) [default: tab]", 411 | ) 412 | @click.option( 413 | "--error-counts", 414 | is_flag=True, 415 | help="whether to output counts of false positives and negatives instead of scores", 416 | ) 417 | @click.option( 418 | "--full-precision", 419 | is_flag=True, 420 | help="whether to output floating values at full precision instead of rounding half even at two decimal places", 421 | ) 422 | @_quiet_option() 423 | def score( 424 | file: list[str], # Name is "file" to make sense on the command line, but it's a list 425 | file_encoding: str, 426 | labels: str, 427 | *, 428 | ignore_document_boundaries: bool, 429 | parse_comment_lines: bool, 430 | reference: str, 431 | score_format: str, 432 | delim: str, 433 | repair_method: str, 434 | error_counts: bool, 435 | full_precision: bool, 436 | quiet: bool, 437 | ) -> None: 438 | if repair_method == REPAIR_NONE: 439 | repair_method = None 440 | 441 | if full_precision and score_format != FORMAT_DELIM: 442 | raise ValueError(f"Can only use full-precision with score-format {FORMAT_DELIM}") 443 | 444 | if error_counts and len(file) > 1: 445 | raise ValueError("Cannot use error-counts with multiple files to be scored") 446 | 447 | delim = _normalize_tab(delim) 448 | 449 | score_conll_files( 450 | file, 451 | reference, 452 | labels, 453 | repair_method, 454 | file_encoding, 455 | ignore_document_boundaries=ignore_document_boundaries, 456 | parse_comment_lines=parse_comment_lines, 457 | output_format=score_format, 458 | delim=delim, 459 | error_counts=error_counts, 460 | full_precision=full_precision, 461 | quiet=quiet, 462 | ) 463 | 464 | 465 | @cli.command(help="extract text from a file") 466 | @_multi_input_file_arguments 467 | @_labels_option_default_bio() 468 | @click.argument("output_file") 469 | def extract_text( 470 | file: list[str], # Name is "file" to make sense on the command line, but it's a list 471 | file_encoding: str, 472 | labels: str, 473 | output_file: str, 474 | *, 475 | ignore_document_boundaries: bool, 476 | parse_comment_lines: bool, 477 | ) -> None: 478 | all_docs = [] 479 | for each_file in file: 480 | docs = ingest_conll_file( 481 | each_file, 482 | labels, 483 | file_encoding, 484 | ignore_document_boundaries=ignore_document_boundaries, 485 | parse_comment_lines=parse_comment_lines, 486 | ) 487 | all_docs.extend(docs) 488 | 489 | with open(output_file, "w", encoding="utf8") as output: 490 | first_doc = True 491 | for doc in all_docs: 492 | # Print empty line between documents 493 | if not first_doc: 494 | print(file=output) 495 | else: 496 | first_doc = False 497 | for sentence in doc: 498 | print(" ".join(sentence), file=output) 499 | 500 | 501 | def _normalize_tab(s: str) -> str: 502 | if s == "tab": 503 | return "\t" 504 | else: 505 | # Clean up the string r"\t" if it's been given 506 | return s.replace(r"\t", "\t") 507 | 508 | 509 | def _parse_type_list(types: str) -> set[str]: 510 | # Remove any whitespace we got in the types somehow 511 | split_types = [t.strip() for t in types.split(",") if t.strip()] 512 | # Check for outside type 513 | for entity_type in split_types: 514 | if entity_type == DEFAULT_OUTSIDE: 515 | raise ValueError( 516 | f"Cannot specify the outside type 
{DEFAULT_OUTSIDE} in keep/remove types" 517 | ) 518 | return set(split_types) 519 | 520 | 521 | def _load_type_map( 522 | type_map_path: Optional[str], file_encoding: str 523 | ) -> dict[str, list[str]]: 524 | if not type_map_path: 525 | return {} 526 | 527 | try: 528 | with open(type_map_path, encoding=file_encoding) as file: 529 | type_map = json.load(file) 530 | except FileNotFoundError as err: 531 | raise ValueError(f"Could not open type map file {repr(type_map_path)}") from err 532 | except json.decoder.JSONDecodeError as err: 533 | raise ValueError( 534 | f"Type map provided in file {repr(type_map_path)} is not valid JSON" 535 | ) from err 536 | 537 | # Validate types 538 | if not isinstance(type_map, dict): 539 | raise ValueError( 540 | f"Type map provided in file {repr(type_map_path)} is not a dictionary" 541 | ) 542 | 543 | for from_type, to_types in type_map.items(): 544 | if not isinstance(from_type, str) or not from_type: 545 | raise ValueError( 546 | f"Key {repr(from_type)} in type map {repr(type_map_path)} is not a non-empty string" 547 | ) 548 | if from_type == DEFAULT_OUTSIDE: 549 | raise ValueError( 550 | f"Key {repr(from_type)} in type map {repr(type_map_path)} is the outside type {DEFAULT_OUTSIDE}" 551 | ) 552 | 553 | if not isinstance(to_types, list): 554 | raise ValueError( 555 | f"Value {repr(to_types)} in type map {repr(type_map_path)} is not a list" 556 | ) 557 | 558 | for to_type in to_types: 559 | if not isinstance(to_type, str) or not to_type: 560 | raise ValueError( 561 | f"Value {repr(to_type)} in type map {repr(type_map_path)} is not a non-empty string" 562 | ) 563 | if to_type == DEFAULT_OUTSIDE: 564 | raise ValueError( 565 | f"Value {repr(to_type)} in type map {repr(type_map_path)} is the outside type {DEFAULT_OUTSIDE}" 566 | ) 567 | 568 | return type_map 569 | 570 | 571 | # This is tested by a subprocess call in test_seqscore_main so coverage will miss it 572 | if __name__ == "__main__": # pragma: no cover 573 | cli() 574 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SeqScore 2 | 3 | ![Build Status](https://github.com/bltlab/seqscore/actions/workflows/main.yml/badge.svg) 4 | [![image](https://img.shields.io/pypi/v/seqscore.svg)](https://pypi.python.org/pypi/seqscore) 5 | [![image](https://img.shields.io/pypi/l/seqscore.svg)](https://pypi.python.org/pypi/seqscore) 6 | [![image](https://img.shields.io/pypi/pyversions/seqscore.svg)](https://pypi.python.org/pypi/seqscore) 7 | 8 | SeqScore provides scoring for named entity recognition and other 9 | chunking tasks evaluated over sequence labels. 10 | 11 | SeqScore is maintained by the BLT Lab at Brandeis University. Please 12 | open an issue if you find incorrect behavior or features you would like 13 | to see added. Due to the risk of introducing regressions or incorrect 14 | scoring behavior, *we generally do not accept pull requests*. Please do not 15 | open a pull request unless you are asked to do so by a maintainer in an 16 | issue. 17 | 18 | ## Installation 19 | 20 | To install the latest official release of SeqScore, run: `pip install seqscore`. 21 | This will install the package and add the command `seqscore` in your Python 22 | environment. 23 | 24 | SeqScore requires Python 3.9 or higher. It is tested on Python 3.9, 25 | 3.10, 3.11, 3.12, and 3.13. 26 | 27 | ## License 28 | 29 | SeqScore is distributed under the MIT License. 
30 | 31 | ## Citation 32 | 33 | If you use SeqScore, please cite 34 | [SeqScore: Addressing Barriers to Reproducible Named Entity Recognition Evaluation](https://aclanthology.org/2021.eval4nlp-1.5/) 35 | and 36 | [Improving NER Research Workflows with SeqScore](https://aclanthology.org/2023.nlposs-1.17/). 37 | 38 | BibTeX: 39 | 40 | ``` 41 | @inproceedings{palen-michel-etal-2021-seqscore, 42 | title = "{S}eq{S}core: Addressing Barriers to Reproducible Named Entity Recognition Evaluation", 43 | author = "Palen-Michel, Chester and 44 | Holley, Nolan and 45 | Lignos, Constantine", 46 | booktitle = "Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems", 47 | month = nov, 48 | year = "2021", 49 | address = "Punta Cana, Dominican Republic", 50 | publisher = "Association for Computational Linguistics", 51 | url = "https://aclanthology.org/2021.eval4nlp-1.5", 52 | pages = "40--50" 53 | } 54 | 55 | @inproceedings{lignos-etal-2023-improving, 56 | title = "Improving {NER} Research Workflows with {S}eq{S}core", 57 | author = "Lignos, Constantine and 58 | Kruse, Maya and 59 | Rueda, Andrew", 60 | editor = "Tan, Liling and 61 | Milajevs, Dmitrijs and 62 | Chauhan, Geeticka and 63 | Gwinnup, Jeremy and 64 | Rippeth, Elijah", 65 | booktitle = "Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)", 66 | month = dec, 67 | year = "2023", 68 | address = "Singapore", 69 | publisher = "Association for Computational Linguistics", 70 | url = "https://aclanthology.org/2023.nlposs-1.17/", 71 | doi = "10.18653/v1/2023.nlposs-1.17", 72 | pages = "147--152" 73 | } 74 | ``` 75 | 76 | Other papers related to SeqScore include: 77 | * [If You Build Your Own NER Scorer, Non-replicable Results Will Come](https://aclanthology.org/2020.insights-1.15.pdf) 78 | * [Toward More Meaningful Resources for Lower-resourced Languages](https://aclanthology.org/2022.findings-acl.44/) 79 | * [CoNLL#: Fine-grained Error Analysis and a Corrected Test Set for CoNLL-03 English](https://aclanthology.org/2024.lrec-main.330/) 80 | 81 | 82 | # Usage 83 | 84 | ## Overview 85 | 86 | For a list of commands, run `seqscore --help`: 87 | 88 | ``` 89 | $ seqscore --help 90 | Usage: seqscore [OPTIONS] COMMAND [ARGS]... 91 | 92 | Provides scoring and analysis tools for NER/chunking files (version 0.6.0) 93 | 94 | Options: 95 | --version Show the version and exit. 96 | --help Show this message and exit. 97 | 98 | Commands: 99 | convert convert between mention encodings 100 | count show counts for all the mentions contained in a file 101 | extract-text extract text from a file 102 | process transform entity types by keeping/removing/mapping 103 | repair repair invalid label transitions 104 | score score a file and report performance or an error count table 105 | summarize show counts of the documents, sentences, and entity types 106 | validate validate labels 107 | ``` 108 | 109 | ## Scoring 110 | 111 | The most common application of SeqScore is scoring CoNLL-format NER 112 | predictions. Let's assume you have two files, one containing the 113 | correct labels (annotation) and the other containing the predictions 114 | (system output). 115 | 116 | The correct labels are in the file [samples/reference.bio](samples/reference.bio): 117 | 118 | ``` 119 | This O 120 | is O 121 | a O 122 | sentence O 123 | . O 124 | 125 | University B-ORG 126 | of I-ORG 127 | Pennsylvania I-ORG 128 | is O 129 | in O 130 | West B-LOC 131 | Philadelphia I-LOC 132 | , O 133 | Pennsylvania B-LOC 134 | . 
O 135 | 136 | ``` 137 | 138 | The predictions are in the file [samples/predicted.bio](samples/predicted.bio): 139 | 140 | ``` 141 | This O 142 | is O 143 | a O 144 | sentence O 145 | . O 146 | 147 | University B-ORG 148 | of I-ORG 149 | Pennsylvania I-ORG 150 | is O 151 | in O 152 | West B-LOC 153 | Philadelphia B-LOC 154 | , O 155 | Pennsylvania B-LOC 156 | . O 157 | 158 | ``` 159 | 160 | To score the predictions, run: 161 | `seqscore score --labels BIO --reference samples/reference.bio samples/predicted.bio` 162 | 163 | ``` 164 | | Type | Precision | Recall | F1 | Reference | Predicted | Correct | 165 | |--------|-------------|----------|--------|-------------|-------------|-----------| 166 | | ALL | 50.00 | 66.67 | 57.14 | 3 | 4 | 2 | 167 | | LOC | 33.33 | 50.00 | 40.00 | 2 | 3 | 1 | 168 | | ORG | 100.00 | 100.00 | 100.00 | 1 | 1 | 1 | 169 | ``` 170 | 171 | A few things to note: 172 | 173 | * The reference file must be specified with the `--reference` flag. 174 | * The chunk encoding (BIO, BIOES, etc.) must be specified using the 175 | `--labels` flag. 176 | * Both files need to use the same chunk encoding. If you have 177 | files that use different chunk encodings, use the `convert` command. 178 | * You can get output in different formats using the `--score-format` 179 | flag. Using `--score-format delim` will produce tab-delimited 180 | output. In the delimited format, you can specify the `--full-precision` 181 | flag to output higher numerical precision. 182 | * In the default (pretty) output format, numbers are rounded "half up" 183 | at two decimal places. In other words, 57.124 will round to 57.12, 184 | and 57.125 will round to 57.13. This is different than the "half even" 185 | rounding used by `conlleval` and other libraries that rely on `printf` 186 | behavior for rounding. Half up rounding is used as it is more likely to 187 | match the rounding a user would perform if shown three decimal places. 188 | If you request `conlleval` output format, the same rounding used by 189 | `conlleval` will be used. 190 | 191 | The above scoring command will work for files that do not have any 192 | invalid transitions, that is, those that perfectly follow what the 193 | encoding allows. However, consider this BIO-encoded file, 194 | [samples/invalid.bio](samples/invalid.bio): 195 | 196 | ``` 197 | This O 198 | is O 199 | a O 200 | sentence O 201 | . O 202 | 203 | University I-ORG 204 | of I-ORG 205 | Pennsylvania I-ORG 206 | is O 207 | in O 208 | West B-LOC 209 | Philadelphia I-LOC 210 | , O 211 | Pennsylvania B-LOC 212 | . O 213 | 214 | ``` 215 | 216 | Note that the token `University` has the label `I-ORG`, but there is 217 | no preceding `B-ORG`. If we score it as before with 218 | `seqscore score --labels BIO --reference samples/reference.bio samples/invalid.bio`, 219 | scoring will fail: 220 | 221 | ``` 222 | seqscore.encoding.EncodingError: Stopping due to validation errors in invalid.bio: 223 | Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7 224 | ``` 225 | 226 | To score output with invalid transitions, we need to specify a repair 227 | method which can correct them. 
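Conceptually, a repair method rewrites the label sequence so that every transition is valid under the chosen encoding. As a rough illustration only (this sketch is not SeqScore's actual implementation), a "begin"-style repair of BIO labels can be thought of as rewriting any `I-X` that does not continue an entity of type `X` into `B-X`:

```
def begin_repair(labels):
    """Sketch of "begin"-style repair for BIO labels (illustration only).

    Any I-X that does not continue an entity of type X (because it follows
    O, a label of a different type, or starts the sequence) is rewritten
    as B-X.
    """
    repaired = []
    previous = "O"
    for label in labels:
        if label.startswith("I-"):
            entity_type = label[2:]
            if previous not in (f"B-{entity_type}", f"I-{entity_type}"):
                label = f"B-{entity_type}"
        repaired.append(label)
        previous = label
    return repaired


labels = ["I-ORG", "I-ORG", "I-ORG", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O"]
print(begin_repair(labels))
# ['B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'O']
```

Applied to the invalid example above, this turns the leading `I-ORG` into `B-ORG`, which is the repair shown in the logging below.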
We can tell SeqScore to use the same 228 | approach that conlleval uses (which we refer to as "begin" repair in our 229 | paper): 230 | `seqscore score --labels BIO --repair-method conlleval --reference samples/reference.bio samples/invalid.bio`: 231 | 232 | ``` 233 | Validation errors in sequence at line 7 of invalid.bio: 234 | Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7 235 | Used method conlleval to repair: 236 | Old: ('I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'O') 237 | New: ('B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'O') 238 | | Type | Precision | Recall | F1 | Reference | Predicted | Correct | 239 | |--------|-------------|----------|--------|-------------|-------------|-----------| 240 | | ALL | 100.00 | 100.00 | 100.00 | 3 | 3 | 3 | 241 | | LOC | 100.00 | 100.00 | 100.00 | 2 | 2 | 2 | 242 | | ORG | 100.00 | 100.00 | 100.00 | 1 | 1 | 1 | 243 | ``` 244 | 245 | You can use the `-q` flag to suppress the logging of all of the repairs 246 | applied. For example, running the command 247 | `seqscore score -q --labels BIO --repair-method conlleval --reference samples/reference.bio samples/invalid.bio` 248 | will hide the repairs: 249 | 250 | ``` 251 | | Type | Precision | Recall | F1 | Reference | Predicted | Correct | 252 | |--------|-------------|----------|--------|-------------|-------------|-----------| 253 | | ALL | 100.00 | 100.00 | 100.00 | 3 | 3 | 3 | 254 | | LOC | 100.00 | 100.00 | 100.00 | 2 | 2 | 2 | 255 | | ORG | 100.00 | 100.00 | 100.00 | 1 | 1 | 1 | 256 | ``` 257 | 258 | You may want to also explore the `discard` repair, which can 259 | produce higher scores for output from models without a CRF/constrained 260 | decoding as they are more likely to produce invalid transitions. 261 | 262 | SeqScore can also display all errors (false positives and false negatives) 263 | encountered in scoring using the `--error-counts` flag. For example, running the 264 | command 265 | `seqscore score --labels BIO --error-counts --reference samples/reference.bio samples/predicted.bio` 266 | will produce the following output: 267 | 268 | ``` 269 | | Count | Error | Type | Tokens | 270 | |---------|---------|--------|-------------------| 271 | | 1 | FP | LOC | Philadelphia | 272 | | 1 | FP | LOC | West | 273 | | 1 | FN | LOC | West Philadelphia | 274 | ``` 275 | 276 | The output shows that the system produced two false positives and missed one 277 | mention in the reference (false negative). The most frequent errors appear at 278 | the top. The `--error-counts` flag can be combined with `--score-format delim` 279 | to write a delimited table that can be read as a spreadsheet. 280 | 281 | ## Validation 282 | 283 | To check if a file has any invalid transitions, we can run 284 | `seqscore validate --labels BIO samples/reference.bio`: 285 | 286 | ``` 287 | No errors found in 0 tokens, 2 sequences, and 1 documents in reference.bio 288 | ``` 289 | 290 | For the example of the [samples/invalid.bio](samples/invalid.bio), we can run 291 | `seqscore validate --labels BIO samples/invalid.bio`: 292 | 293 | ``` 294 | Encountered 1 errors in 1 tokens, 2 sequences, and 1 documents in invalid.bio 295 | Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7 296 | ``` 297 | 298 | ## Conversion 299 | 300 | We can convert a file from one chunk encoding to another. 
For example, 301 | `seqscore convert --input-labels BIO --output-labels BIOES samples/reference.bio samples/reference.bioes` 302 | will read [samples/reference.bio](samples/reference.bio) in BIO 303 | encoding and write the BIOES-converted file to [samples/reference.bioes](samples/reference.bioes): 304 | 305 | ``` 306 | This O 307 | is O 308 | a O 309 | sentence O 310 | . O 311 | 312 | University B-ORG 313 | of I-ORG 314 | Pennsylvania E-ORG 315 | is O 316 | in O 317 | West B-LOC 318 | Philadelphia E-LOC 319 | , O 320 | Pennsylvania S-LOC 321 | . O 322 | 323 | ``` 324 | 325 | We can get a list of available chunk encodings by running `seqscore convert --help`: 326 | 327 | ``` 328 | Usage: seqscore convert [OPTIONS] FILE OUTPUT_FILE 329 | 330 | Options: 331 | --file-encoding TEXT [default: UTF-8] 332 | --ignore-comment-lines 333 | --ignore-document-boundaries / --use-document-boundaries 334 | --output-delim TEXT [default: space] 335 | --input-labels [BIO|BIOES|BILOU|BMES|BMEOW|IO|IOB] 336 | [required] 337 | --output-labels [BIO|BIOES|BILOU|BMES|BMEOW|IO|IOB] 338 | [required] 339 | --help Show this message and exit. 340 | ``` 341 | 342 | ## Repair 343 | 344 | We can also apply repair methods to a file, creating an output file 345 | with only valid transitions. For example, we can run 346 | `seqscore repair --labels BIO --repair-method conlleval samples/invalid.bio samples/invalid_repair_conlleval.bio`, 347 | which will apply the conlleval repair method to the 348 | [samples/invalid.bio](samples/invalid.bio) and write the repaired 349 | labels to 350 | [samples/invalid_repair_conlleval.bio](samples/invalid_repair_conlleval.bio): 351 | 352 | ``` 353 | This O 354 | is O 355 | a O 356 | sentence O 357 | . O 358 | 359 | University B-ORG 360 | of I-ORG 361 | Pennsylvania I-ORG 362 | is O 363 | in O 364 | West B-LOC 365 | Philadelphia I-LOC 366 | , O 367 | Pennsylvania B-LOC 368 | . O 369 | 370 | ``` 371 | 372 | If we want to apply the discard repair method, we can run 373 | `seqscore repair --labels BIO --repair-method discard samples/invalid.bio samples/invalid_repair_discard.bio` 374 | and the output will be written to [samples/invalid_repair_discard.bio](samples/invalid_repair_discard.bio): 375 | 376 | ``` 377 | This O 378 | is O 379 | a O 380 | sentence O 381 | . O 382 | 383 | University O 384 | of O 385 | Pennsylvania O 386 | is O 387 | in O 388 | West B-LOC 389 | Philadelphia I-LOC 390 | , O 391 | Pennsylvania B-LOC 392 | . O 393 | 394 | ``` 395 | 396 | Repairing the file before performing other operations is available in the 397 | `count` and `summarize` subcommands. 398 | 399 | ## Summarize 400 | 401 | The `summarize` subcommand can produce counts of the types of chunks 402 | in the input file. For example, if we run 403 | `seqscore summarize --labels BIO samples/reference.bio` 404 | we get the following output: 405 | 406 | ``` 407 | File 'samples/reference.bio' contains 1 document(s) with the following mentions: 408 | | Entity Type | Count | 409 | |---------------|---------| 410 | | LOC | 2 | 411 | | ORG | 1 | 412 | ``` 413 | 414 | If the quiet (`-q`) flag is provided, the first line giving the filename 415 | and document count is not printed. 416 | 417 | ## Count 418 | 419 | The `count` subcommand can produce the counts of chunks in the input 420 | file. Unlike `summarize`, it counts chunk-type pairs, not just types. 
421 | For example, if we run 422 | `seqscore count --labels BIO samples/reference.bio --output-file counts.csv`, 423 | tab-delimited counts would be written to `counts.csv` as follows: 424 | 425 | ``` 426 | 1 ORG University of Pennsylvania 427 | 1 LOC West Philadelphia 428 | 1 LOC Pennsylvania 429 | ``` 430 | 431 | You can also call `count` without the `--output-file` argument to print counts to 432 | standard output. However, you may encounter Unicode issues if your terminal is not 433 | configured properly. 434 | 435 | You can use the `--output-delim` argument to change the delimiter used in the counts. 436 | The default delimiter of tab is strongly recommended, as there is no escaping or 437 | quoting of the names in the output. 438 | 439 | ## Process 440 | 441 | The `process` subcommand can remove entity types from a file or map them to 442 | other types. Removing types can be performed by specifying one of `--keep-types` 443 | or `--remove-types`. 444 | 445 | For example, if we wanted to keep only the ORG type, we could run: 446 | `seqscore process --labels BIO --keep-types ORG samples/reference.bio samples/keep_ORG.bio`, 447 | and the following output will be written to [samples/keep_ORG.bio](samples/keep_ORG.bio): 448 | 449 | ``` 450 | This O 451 | is O 452 | a O 453 | sentence O 454 | . O 455 | 456 | University B-ORG 457 | of I-ORG 458 | Pennsylvania I-ORG 459 | is O 460 | in O 461 | West O 462 | Philadelphia O 463 | , O 464 | Pennsylvania O 465 | . O 466 | ``` 467 | 468 | You can also keep multiple types by specifying a comma-separated list of types: 469 | `--keep-types LOC,ORG`. 470 | 471 | Instead of specifying which types to keep, we can also specify which types to 472 | remove using `--remove-types`. For example, if we wanted to remove only the 473 | ORG type, we could run: 474 | `seqscore process --labels BIO --remove-types ORG samples/reference.bio samples/remove_ORG.bio`, 475 | and the following output will be written to [samples/remove_ORG.bio](samples/remove_ORG.bio): 476 | 477 | ``` 478 | This O 479 | is O 480 | a O 481 | sentence O 482 | . O 483 | 484 | University O 485 | of O 486 | Pennsylvania O 487 | is O 488 | in O 489 | West B-LOC 490 | Philadelphia I-LOC 491 | , O 492 | Pennsylvania B-LOC 493 | . O 494 | ``` 495 | 496 | As with keep, you can specify multiple tags to remove, for example 497 | `--remove-types LOC,ORG`. 498 | 499 | The `--type-map` argument allows you to specify a JSON file that specifies a 500 | mapping between types and other types. Suppose you want to collapse several 501 | types into a more generic NAME type. In that case, the type map would be 502 | specified as follows: 503 | 504 | ``` 505 | { 506 | "NAME": ["LOC", "ORG"] 507 | } 508 | ``` 509 | 510 | The type map must be a JSON dictionary. The keys are the types to be mapped to, 511 | while the value for each key is a list of types to be mapped from. Note that 512 | the value must always be a list, even if it would only contain one element. 513 | 514 | We can apply the above type map to a file using the following command: 515 | `seqscore process --labels BIO --type-map samples/type_map_NAME.json samples/reference.bio samples/all_NAME.bio`, 516 | resulting in this output: 517 | 518 | ``` 519 | This O 520 | is O 521 | a O 522 | sentence O 523 | . O 524 | 525 | University B-NAME 526 | of I-NAME 527 | Pennsylvania I-NAME 528 | is O 529 | in O 530 | West B-NAME 531 | Philadelphia I-NAME 532 | , O 533 | Pennsylvania B-NAME 534 | . 
O 535 | ``` 536 | 537 | When `--type-map` is specified at the same time as `--keep-types` or 538 | `--remove-types`, the type mapping is applied **before** the keep/remove 539 | filtering is applied. 540 | 541 | ## Text extraction 542 | 543 | The `extract-text` subcommand extracts the text from a CoNLL-format file. 544 | 545 | For example, to extract the text from `samples/reference.bio` and write it to 546 | `reference.txt`, run the following command: 547 | `seqscore extract-text samples/reference.bio reference.txt` 548 | 549 | This would result in `reference.txt` having the following contents: 550 | 551 | ``` 552 | This is a sentence . 553 | University of Pennsylvania is in West Philadelphia , Pennsylvania . 554 | ``` 555 | 556 | Each sentence is written on one line with space-delimited tokens. 557 | 558 | 559 | # FAQ 560 | 561 | ## Why can't I score output files that are in the format `conlleval` expects? 562 | 563 | SeqScore intentionally does not support the "merged" 564 | format used by `conlleval` where each line contains a token, correct 565 | tag, and predicted tag: 566 | 567 | ``` 568 | University B-ORG B-ORG 569 | of I-ORG I-ORG 570 | Pennsylvania I-ORG I-ORG 571 | is O O 572 | in O O 573 | West B-LOC B-LOC 574 | Philadelphia I-LOC B-LOC 575 | , O O 576 | Pennsylvania B-LOC B-LOC 577 | . O O 578 | ``` 579 | 580 | We do not support this format because we have found that creating 581 | predictions in this format is a common source of errors in scoring 582 | pipelines. 583 | 584 | ## When do I need to specify the `--labels` argument? 585 | 586 | The `--labels` argument must be specified for commands where knowing the label 587 | encoding is essential to getting correct answers. These commands are `validate`, 588 | `repair`, and `score`. For all other commands, `--labels BIO` is assumed by 589 | default but can be overridden. 590 | 591 | # Development 592 | 593 | The following instructions are for the project maintainers only. 594 | 595 | For development, check out the `dev` branch (latest, but less tested 596 | than `main`). 597 | 598 | To install from a clone of this repository, use: 599 | `pip install -e .` 600 | 601 | ## Setting up an environment for development 602 | 603 | 1. Create an environment: `conda create -yn seqscore python=3.9` 604 | 2. Activate the environment: `conda activate seqscore` 605 | 3. Install seqscore: `pip install -e .` 606 | 4. Install development dependencies: `pip install -r requirements.txt` 607 | 608 | # Contributors 609 | 610 | SeqScore was developed by the BLT Lab at Brandeis University under the 611 | direction of PI and lead developer Constantine Lignos. Chester Palen-Michel 612 | and Nolan Holley contributed to its development. Gordon Dou, Maya Kruse, and 613 | Andrew Rueda gave feedback on its features and assisted in README writing. 614 | --------------------------------------------------------------------------------