├── tests
│   ├── __init__.py
│   ├── test_token.py
│   ├── test_doubly_linked_list.py
│   ├── test_utils.py
│   ├── test_alignment.py
│   ├── test_somajo.py
│   └── test_tokenizer_internal.py
├── requirements_dev.txt
├── doc
│   ├── source
│   │   ├── modules.rst
│   │   ├── somajo.rst
│   │   ├── index.rst
│   │   └── conf.py
│   ├── Makefile
│   ├── make.bat
│   └── build
│       └── markdown
│           └── somajo.md
├── utils
│   ├── run_tests.sh
│   ├── evaluate_on_gum.sh
│   ├── annotate_cmc.sh
│   ├── annotate_web.sh
│   ├── evaluate_on_konvens.sh
│   ├── evaluate_on_test_cmc.sh
│   ├── evaluate_on_test_web.sh
│   ├── evaluate_on_ewt.sh
│   ├── baseline.sh
│   ├── errors_trial.txt
│   ├── errors_train.txt
│   ├── evaluate.py
│   └── errors_test.txt
├── .gitignore
├── src
│   └── somajo
│       ├── data
│       │   ├── non-breaking_hyphenated_words_en.txt
│       │   ├── single_tokens_en.txt
│       │   ├── single_token_abbreviations_en.txt
│       │   ├── single_tokens_de.txt
│       │   ├── eos_abbreviations.txt
│       │   ├── non-breaking_suffixes_en.txt
│       │   ├── units.txt
│       │   ├── single_token_abbreviations_de.txt
│       │   ├── non-breaking_prefixes_en.txt
│       │   ├── tokens_with_plus_or_ampersand.txt
│       │   ├── abbreviations_en.txt
│       │   └── abbreviations_de.txt
│       ├── __init__.py
│       ├── doubly_linked_list.py
│       ├── token.py
│       ├── cli.py
│       ├── alignment.py
│       └── sentence_splitter.py
├── .github
│   └── workflows
│       └── test.yml
├── README.rst
├── pyproject.toml
├── CHANGES.txt
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | build
2 | sphinx
3 | sphinx-markdown-builder
4 | twine
5 |
--------------------------------------------------------------------------------
/doc/source/modules.rst:
--------------------------------------------------------------------------------
1 | somajo
2 | ======
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | somajo
8 |
--------------------------------------------------------------------------------
/utils/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR/..
5 |
6 | # Test Discovery
7 | python3 -m unittest discover
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /data/
2 | /dist/
3 | /doc/build/markdown/index.md
4 | /doc/build/markdown/modules.md
5 | /doc/build/doctrees/
6 | /src/SoMaJo.egg-info/
7 | /venv/
8 | __pycache__/
9 | *~
10 | *.pyc
11 |
--------------------------------------------------------------------------------
/src/somajo/data/non-breaking_hyphenated_words_en.txt:
--------------------------------------------------------------------------------
1 | # Hyphenated words in the following list are not split into multiple tokens.
2 |
3 | mm-hm
4 | mm-mm
5 | o-kay
6 | uh-huh
7 | uh-oh
8 | x-ray
9 | x-rayed
10 | x-rays
11 |
--------------------------------------------------------------------------------
/src/somajo/data/single_tokens_en.txt:
--------------------------------------------------------------------------------
1 | # A list of tokens that should not be split.
2 | #
3 | # Lines starting with “#” are treated as comments and will be ignored.
4 |
5 | tl;dr
6 |
7 | # mobile telephony
8 | 3G
9 | 4G
10 | 5G
11 |
--------------------------------------------------------------------------------
/src/somajo/data/single_token_abbreviations_en.txt:
--------------------------------------------------------------------------------
1 | # A list of multi-dot abbreviations that represent single tokens and
2 | # should not be split.
3 | #
4 | # Lines starting with “#” are treated as comments and will be ignored.
5 |
6 | e.g.
7 | i.e.
8 | a.m.
9 | p.m.
10 | P.S.
11 | T.V.
12 |
--------------------------------------------------------------------------------
/src/somajo/data/single_tokens_de.txt:
--------------------------------------------------------------------------------
1 | # A list of tokens that should not be split.
2 | #
3 | # Lines starting with “#” are treated as comments and will be ignored.
4 |
5 | .Net
6 | /rant
7 | /s
8 | E/E
9 | tl;dr
10 | zl;ng
11 |
12 | # SAP Versions
13 | S/4
14 | R/3
15 |
16 | # mobile telephony
17 | 3G
18 | 4G
19 | 5G
20 |
--------------------------------------------------------------------------------
/src/somajo/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib.metadata
2 |
3 | from . import (
4 | sentence_splitter,
5 | somajo,
6 | tokenizer
7 | )
8 |
9 | __version__ = importlib.metadata.version(__package__ or __name__)
10 |
11 | Tokenizer = tokenizer.Tokenizer
12 | SentenceSplitter = sentence_splitter.SentenceSplitter
13 | SoMaJo = somajo.SoMaJo
14 |
--------------------------------------------------------------------------------
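
The package resolves its version at import time via importlib.metadata and re-exports the three public classes. A minimal sketch of what that buys a caller (illustrative only; it assumes the package is installed, e.g. from PyPI, so that the distribution metadata is available):

    import somajo

    # __version__ is read from the installed distribution's metadata,
    # so it always matches the version declared in pyproject.toml.
    print(somajo.__version__)

    # The re-exports make the main entry point importable from the top level.
    tokenizer = somajo.SoMaJo("de_CMC")
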
/doc/source/somajo.rst:
--------------------------------------------------------------------------------
1 | somajo package
2 | ==============
3 |
4 | somajo.somajo module
5 | --------------------
6 |
7 | .. automodule:: somajo.somajo
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 |
12 | somajo.token module
13 | -------------------
14 |
15 | .. automodule:: somajo.token
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 |
--------------------------------------------------------------------------------
/utils/evaluate_on_gum.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR
5 |
6 | mkdir tmp
7 | for f in ../data/GUM/text/*
8 | do
9 | filename=$(basename $f)
10 | somajo-tokenizer -l en $f > tmp/$filename
11 | done
12 | perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_gum.txt tmp ../data/GUM/tokenized
13 | rm -r tmp/
14 |
--------------------------------------------------------------------------------
/src/somajo/data/eos_abbreviations.txt:
--------------------------------------------------------------------------------
1 | # A list of abbreviations that frequently occur at the end of a
2 | # sentence. If such an abbreviation is followed by a potential
3 | # sentence start, e.g. by a capital letter, it will be interpreted as
4 | # the end of a sentence.
5 | #
6 | # Lines starting with “#” are treated as comments and will be ignored.
7 |
8 | usw.
9 | usf.
10 | etc.
11 | uvam.
12 |
--------------------------------------------------------------------------------
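
A rough sketch of how this heuristic surfaces at the API level. This is only an illustration; it assumes the top-level SoMaJo class and its tokenize_text method, and the split shown in the comment is the intended behaviour rather than a guaranteed result:

    from somajo import SoMaJo

    tokenizer = SoMaJo("de_CMC", split_sentences=True)
    paragraphs = ["Wir kauften Brot, Milch usw. Danach gingen wir nach Hause."]
    for sentence in tokenizer.tokenize_text(paragraphs):
        # "usw." is listed in eos_abbreviations.txt and is followed by a
        # capitalized word, so the splitter may treat it as a sentence boundary.
        print(" ".join(token.text for token in sentence))
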
/utils/annotate_cmc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR
5 |
6 | for f in ../../data/empirist_test_tok_cmc/raw/*
7 | do
8 | filename=$(basename $f)
9 | ../bin/tokenizer --split_camel_case $f > ../../data/cmc_tok_SoMaJo/$filename
10 | # ../bin/tokenizer $f > ../../data/cmc_tok_SoMaJo/$filename
11 | done
12 | perl ../../data/empirist_test_tok_cmc/tools/validate_tokenization.perl -x ../../data/cmc_tok_SoMaJo/ ../../data/empirist_test_tok_cmc/raw/
13 |
--------------------------------------------------------------------------------
/utils/annotate_web.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR
5 |
6 | for f in ../../data/empirist_test_tok_web/raw/*
7 | do
8 | filename=$(basename $f)
9 | ../bin/tokenizer --split_camel_case $f > ../../data/web_tok_SoMaJo/$filename
10 | # ../bin/tokenizer $f > ../../data/web_tok_SoMaJo/$filename
11 | done
12 | perl ../../data/empirist_test_tok_web/tools/validate_tokenization.perl -x ../../data/web_tok_SoMaJo/ ../../data/empirist_test_tok_web/raw/
13 |
--------------------------------------------------------------------------------
/utils/evaluate_on_konvens.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR
5 |
6 | mkdir tmp
7 | for f in ../data/Ortmann_et_al/txt/*.txt
8 | do
9 | filename=$(basename $f)
10 | somajo-tokenizer --split-sentences $f > tmp/$filename
11 | done
12 | perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/Ortmann_et_al/tokens
13 | ./evaluate.py -d --sentences -e errors.txt tmp/ ../data/Ortmann_et_al/tokens/
14 | rm -r tmp/
15 |
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | .. SoMaJo documentation master file, created by
2 | sphinx-quickstart on Thu Dec 19 08:01:21 2019.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to SoMaJo's documentation!
7 | ==================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 |
14 |
15 | Indices and tables
16 | ==================
17 |
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 |
--------------------------------------------------------------------------------
/utils/evaluate_on_test_cmc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR
5 |
6 | mkdir tmp
7 | for f in ../data/empirist_gold_standard/test_cmc/raw/*
8 | do
9 | filename=$(basename $f)
10 | somajo-tokenizer --split_camel_case $f > tmp/$filename
11 | done
12 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/empirist_gold_standard/test_cmc/tokenized
13 | ./evaluate.py -d -e errors.txt --ignore-xml tmp/ ../data/empirist_gold_standard/test_cmc/tokenized/
14 | rm -r tmp/
15 |
--------------------------------------------------------------------------------
/utils/evaluate_on_test_web.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR
5 |
6 | mkdir tmp
7 | for f in ../data/empirist_gold_standard/test_web/raw/*
8 | do
9 | filename=$(basename $f)
10 | somajo-tokenizer --split_camel_case $f > tmp/$filename
11 | done
12 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/empirist_gold_standard/test_web/tokenized
13 | ./evaluate.py -d -e errors.txt --ignore-xml tmp/ ../data/empirist_gold_standard/test_web/tokenized/
14 | rm -r tmp/
15 |
--------------------------------------------------------------------------------
/src/somajo/data/non-breaking_suffixes_en.txt:
--------------------------------------------------------------------------------
1 | # Hyphenated suffixes in the following list are not split into multiple tokens.
2 | # Euro-centric and element-wise are both single tokens.
3 |
4 | able
5 | ahol
6 | aholic
7 | ation
8 | centric
9 | cracy
10 | crat
11 | dom
12 | er
13 | ery
14 | esque
15 | ette
16 | fest
17 | fold
18 | ful
19 | gate
20 | gon
21 | hood
22 | ian
23 | ible
24 | ing
25 | isation
26 | ise
27 | ising
28 | ism
29 | ist
30 | itis
31 | ization
32 | ize
33 | izing
34 | less
35 | logist
36 | logy
37 | ly
38 | most
39 | o-torium
40 | rama
41 | wise
42 |
--------------------------------------------------------------------------------
/src/somajo/data/units.txt:
--------------------------------------------------------------------------------
1 | # A list of units preceded by numbers. The list is case-insensitive.
2 | #
3 | # Lines starting with “#” are treated as comments and will be ignored.
4 |
5 | bit
6 | cent
7 | cm
8 | cm2
9 | cm3
10 | cm^2
11 | cm^3
12 | cm²
13 | cm³
14 | dm
15 | dm2
16 | dm3
17 | dm^2
18 | dm^3
19 | dm²
20 | dm³
21 | eur
22 | f
23 | ft
24 | g
25 | gbit/s
26 | ghz
27 | h
28 | hz
29 | kg
30 | km
31 | km/h
32 | km2
33 | km3
34 | km^2
35 | km^3
36 | km²
37 | km³
38 | l
39 | lb
40 | m
41 | m2
42 | m3
43 | m^2
44 | m^3
45 | mbit/s
46 | min
47 | ml
48 | mm
49 | mm2
50 | mm3
51 | mm^2
52 | mm^3
53 | mm²
54 | mm³
55 | m²
56 | m³
57 | qm
58 | s
59 | sek
60 |
--------------------------------------------------------------------------------
/utils/evaluate_on_ewt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR
5 |
6 | mkdir tmp
7 | for f in ../data/English_Web_Treebank/en-ud-*.txt
8 | do
9 | filename=$(basename $f)
10 | somajo-tokenizer -l en_PTB $f > tmp/$filename
11 | done
12 | echo "GOLD"
13 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_ewt.txt tmp ../data/English_Web_Treebank/gold
14 | ./evaluate.py -d -e errors.txt tmp/ ../data/English_Web_Treebank/gold/
15 | # echo ""
16 | # echo "SEMIGOLD"
17 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_ewt_semi.txt tmp ../data/English_Web_Treebank/semigold
18 | rm -r tmp/
19 |
--------------------------------------------------------------------------------
/tests/test_token.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import unittest
4 |
5 | from somajo.token import Token
6 |
7 |
8 | class TestToken(unittest.TestCase):
9 | def test_token_01(self):
10 | text = "FooBar"
11 | t = Token(text)
12 | self.assertEqual(str(t), text)
13 |
14 | def test_token_02(self):
15 | t = Token("FooBar", space_after=False, original_spelling="Foo Bar")
16 | self.assertEqual(t.extra_info, 'SpaceAfter=No, OriginalSpelling="Foo Bar"')
17 |
18 | def test_token_03(self):
19 | t = Token("<p>", markup=True, markup_class="start", markup_eos=True)
20 | self.assertEqual(t.markup_class, "start")
21 | self.assertTrue(t.markup_eos)
22 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/src/somajo/data/single_token_abbreviations_de.txt:
--------------------------------------------------------------------------------
1 | # A list of multi-dot abbreviations that represent single tokens and
2 | # should not be split.
3 | #
4 | # Lines starting with “#” are treated as comments and will be ignored.
5 |
6 | ak.mas
7 | Art.-Nr.
8 | At.-Gew.
9 | Best.-Nr.
10 | BT-Drs.
11 | Dipl.-Ing.
12 | E.ON
13 | Forsch.frage
14 | GV.NRW.
15 | H.-I.
16 | H.-Qu.
17 | IT.NRW
18 | klass.-lat.
19 | Komm.formen
20 | Krim.-Ob.-Insp.
21 | Kto.-Nr.
22 | L.-Abg.
23 | M.-Schr.
24 | Mat.-Nr.
25 | MBl.NRW.
26 | o.k.
27 | Pan.do/ra
28 | Priv.-Doz.
29 | prov.-fr.
30 | Proz.-Bev.
31 | r.-k.
32 | Reg.-Bez.
33 | Rg.-Präs.
34 | röm.-kath.
35 | Sat.1
36 | SMBl.NRW.
37 | soz.-päd.
38 | SP.ON
39 | T.V.
40 | Uni-Kl.
41 | USt-IdNr.
42 | Zeitschr.titel
43 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on: [push, pull_request, workflow_dispatch]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | strategy:
9 | matrix:
10 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
11 |
12 | steps:
13 | - name: Checkout sources
14 | uses: actions/checkout@v4
15 |
16 | - name: Set up Python ${{ matrix.python-version }}
17 | uses: actions/setup-python@v4
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 |
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | python -m pip install .
25 |
26 | - name: Test
27 | run: |
28 | python -m unittest discover
29 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | SoMaJo
2 | ======
3 |
4 | SoMaJo is a rule-based tokenizer and sentence splitter that implements
5 | tokenization guidelines for German and English. It has a strong focus
6 | on web and social media texts (it was originally created as the
7 | winning submission to the `EmpiriST 2015 shared task
8 | `_ on automatic
9 | linguistic annotation of computer-mediated communication / social
10 | media) and is particularly well-suited to perform tokenization on all
11 | kinds of written discourse, for example chats, forums, wiki talk
12 | pages, tweets, blog comments, social networks, SMS and WhatsApp
13 | dialogues. Of course it also works on more formal texts.
14 |
15 | More detailed documentation is available `here
16 | `_.
17 |
--------------------------------------------------------------------------------
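
A minimal usage sketch to accompany the README (not part of the repository; it assumes the tokenize_text method of the top-level SoMaJo class and simply prints whatever the tokenizer produces):

    from somajo import SoMaJo

    tokenizer = SoMaJo("en_PTB", split_sentences=True)
    paragraphs = ["That aint bad!:D"]
    for sentence in tokenizer.tokenize_text(paragraphs):
        for token in sentence:
            # extra_info reports SpaceAfter=No and, where the tokenizer changed
            # the surface form, OriginalSpelling="..."
            print(f"{token.text}\t{token.token_class}\t{token.extra_info}")
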
/utils/baseline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname $BASH_SOURCE)
4 | cd $SCRIPTDIR
5 |
6 | mkdir tmp
7 | for f in ../../data/all_test/raw/*
8 | # for f in ../../data/empirist_test_pos_cmc/raw/*
9 | # for f in ../../data/empirist_test_pos_web/raw/*
10 | do
11 | filename=$(basename $f)
12 | sed -re "/^<[^>]+>$/! { s/([.!?,;:+*()\"'–])/ \1 /g; s/\s+/\n/g }" $f > tmp/$filename
13 | done
14 | perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -x -e errors_baseline_test.txt tmp ../../data/all_test/tokenized
15 | # perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -e errors_test.txt tmp ../../data/empirist_test_pos_cmc/tokenized
16 | # perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -e errors_test.txt tmp ../../data/empirist_test_pos_web/tokenized
17 | rm -r tmp/
18 |
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/src/somajo/data/non-breaking_prefixes_en.txt:
--------------------------------------------------------------------------------
1 | # Hyphenated prefixes in the following list are not split into multiple tokens.
2 | # E-mail and re-evaluation are both single tokens.
3 |
4 | a
5 | adeno
6 | agro
7 | ambi
8 | ante
9 | anti
10 | aorto
11 | arch
12 | axio
13 | be
14 | bi
15 | bio
16 | broncho
17 | centi
18 | circum
19 | cis
20 | co
21 | colo
22 | contra
23 | cortico
24 | counter
25 | cran
26 | cross
27 | crypto
28 | cyber
29 | de
30 | deca
31 | demi
32 | dis
33 | e
34 | eco
35 | electro
36 | ennea
37 | ex
38 | extra
39 | ferro
40 | gastro
41 | giga
42 | hemi
43 | hepta
44 | hexa
45 | hypo
46 | ideo
47 | idio
48 | in
49 | infra
50 | inter
51 | intra
52 | iso
53 | judeo
54 | macro
55 | medi
56 | mega
57 | micro
58 | mid
59 | milli
60 | mini
61 | mono
62 | multi
63 | musculo
64 | neo
65 | neuro
66 | nitro
67 | non
68 | novem
69 | octa
70 | octo
71 | ortho
72 | over
73 | paleo
74 | pan
75 | para
76 | pelvi
77 | penta
78 | peri
79 | pheno
80 | phospho
81 | pica
82 | pneumo
83 | poly
84 | post
85 | pre
86 | preter
87 | pro
88 | pseudo
89 | quadri
90 | quasi
91 | quinque
92 | re
93 | recto
94 | salpingo
95 | semi
96 | sept
97 | sero
98 | soci
99 | sub
100 | super
101 | supra
102 | sur
103 | tele
104 | tera
105 | tetra
106 | tri
107 | u
108 | uber
109 | ultra
110 | un
111 | uni
112 | veno
113 | ventriculo
114 | vice
115 | x
116 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | # 1. Build distribution files:
2 | # python3 -m build
3 | #
4 | # 2. Upload to PyPI:
5 | # twine upload dist/*
6 | #
7 | # 3. Check if everything looks all right:
8 | # https://pypi.python.org/pypi/SoMaJo
9 | #
10 | # 4. Go to https://github.com/tsproisl/SoMaJo/releases/new and create
11 | # a new release
12 | [project]
13 | name = "SoMaJo"
14 | version = "2.4.3"
15 | description = "A tokenizer and sentence splitter for German and English web and social media texts."
16 | readme = "README.md"
17 | requires-python = ">=3.8"
18 | license = {file = "LICENSE.txt"}
19 | keywords = ["tokenizer", "sentence-splitter"]
20 | authors = [
21 | {name = "Thomas Proisl, Peter Uhrig", email = "thomas.proisl@fau.de"}
22 | ]
23 | maintainers = [
24 | {name = "Thomas Proisl", email = "thomas.proisl@fau.de"}
25 | ]
26 | classifiers = [
27 | "Development Status :: 5 - Production/Stable",
28 | "Environment :: Console",
29 | "Intended Audience :: Developers",
30 | "Intended Audience :: Science/Research",
31 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
32 | "Natural Language :: German",
33 | "Natural Language :: English",
34 | "Operating System :: OS Independent",
35 | "Programming Language :: Python :: 3",
36 | "Topic :: Text Processing :: Linguistic",
37 | ]
38 |
39 | dependencies = [
40 | "regex>=2019.02.18",
41 | ]
42 |
43 | [project.urls]
44 | "Homepage" = "https://github.com/tsproisl/SoMaJo"
45 | "API documentation" = "https://github.com/tsproisl/SoMaJo/blob/master/doc/build/markdown/somajo.md"
46 |
47 | [project.scripts]
48 | somajo-tokenizer = "somajo.cli:main"
49 |
50 | [build-system]
51 | requires = ["setuptools>=61.0"]
52 | build-backend = "setuptools.build_meta"
53 |
54 | [tool.setuptools.packages.find]
55 | where = ["src"]
56 |
57 | [tool.setuptools.package-data]
58 | "somajo.data" = ["*.txt"]
59 |
--------------------------------------------------------------------------------
/utils/errors_trial.txt:
--------------------------------------------------------------------------------
1 | __________________________________________________________________________________________________
2 | tmp/blog_comments.txt ../../data/all_trial/tokenized/blog_comments.txt
3 |
4 | False Positive (linebreak inserted left):
5 | 145: WIE 145: WIE
6 | 146: ICH 146: ICH
7 | 147: * WEI 147: * WEI?
8 | 148: * ? 148: HABT
9 | 149: HABT 149: IHR
10 | 150: IHR 150: BEIDE
11 |
12 | __________________________________________________________________________________________________
13 | tmp/social_chat.txt ../../data/all_trial/tokenized/social_chat.txt
14 |
15 | False Positive (linebreak inserted left):
16 | 157: marc 157: marc
17 | 158: . 158: .
18 | 159: * . 159: * .)))
19 | 160: * ))) 160:
20 | 161: 161:
21 | 162: 162: ups
22 |
23 | __________________________________________________________________________________________________
24 | tmp/wikipedia_talk_pages.txt .../all_trial/tokenized/wikipedia_talk_pages.txt
25 |
26 | False Negative (linebreak inserted right):
27 | 495: meine 495: meine
28 | 496: ich 496: ich
29 | 497: * ;O)) 497: * ;O)
30 | 498: . 498: * )
31 | 499: .
32 |
33 |
34 |
--------------------------------------------------------------------------------
/doc/source/conf.py:
--------------------------------------------------------------------------------
1 | # pip install sphinx sphinx-markdown-builder
2 | # mkdir doc
3 | # cd doc/
4 | # sphinx-quickstart --sep -p SoMaJo -a "Thomas Proisl, Peter Uhrig" -v "2.0.0" --ext-autodoc --extensions sphinx.ext.napoleon
5 | # # edit source/conf.py:
6 | # # import os
7 | # # import sys
8 | # # sys.path.insert(0, os.path.abspath('../..'))
9 | # cd ..
10 | # sphinx-apidoc -f -o doc/source/ somajo
11 | # cd doc
12 | # make markdown
13 |
14 |
15 | # Configuration file for the Sphinx documentation builder.
16 | #
17 | # This file only contains a selection of the most common options. For a full
18 | # list see the documentation:
19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
20 |
21 | # -- Path setup --------------------------------------------------------------
22 |
23 | # If extensions (or modules to document with autodoc) are in another directory,
24 | # add these directories to sys.path here. If the directory is relative to the
25 | # documentation root, use os.path.abspath to make it absolute, like shown here.
26 | #
27 | import os
28 | import sys
29 | sys.path.insert(0, os.path.abspath('../..'))
30 |
31 |
32 | # -- Project information -----------------------------------------------------
33 |
34 | project = 'SoMaJo'
35 | copyright = '2019, Thomas Proisl, Peter Uhrig'
36 | author = 'Thomas Proisl, Peter Uhrig'
37 |
38 | # The short X.Y version
39 | version = '2.0.0'
40 |
41 | # The full version, including alpha/beta/rc tags
42 | release = '2.0.0'
43 |
44 |
45 | # -- General configuration ---------------------------------------------------
46 |
47 | # Add any Sphinx extension module names here, as strings. They can be
48 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
49 | # ones.
50 | extensions = [
51 | 'sphinx.ext.autodoc',
52 | 'sphinx.ext.napoleon',
53 | 'sphinx_markdown_builder',
54 | ]
55 |
56 | # Add any paths that contain templates here, relative to this directory.
57 | templates_path = ['_templates']
58 |
59 | # List of patterns, relative to source directory, that match files and
60 | # directories to ignore when looking for source files.
61 | # This pattern also affects html_static_path and html_extra_path.
62 | exclude_patterns = []
63 |
64 |
65 | # -- Options for HTML output -------------------------------------------------
66 |
67 | # The theme to use for HTML and HTML Help pages. See the documentation for
68 | # a list of builtin themes.
69 | #
70 | html_theme = 'alabaster'
71 |
72 | # Add any paths that contain custom static files (such as style sheets) here,
73 | # relative to this directory. They are copied after the builtin static files,
74 | # so a file named "default.css" will overwrite the builtin "default.css".
75 | html_static_path = ['_static']
76 |
77 |
78 | # -- Extension configuration -------------------------------------------------
79 |
--------------------------------------------------------------------------------
/src/somajo/doubly_linked_list.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import operator
4 |
5 |
6 | class DLLElement:
7 | def __init__(self, val=None, prv=None, nxt=None, lst=None):
8 | if isinstance(val, DLLElement):
9 | val = val.value
10 | self.prev = prv
11 | self.next = nxt
12 | self.value = val
13 | self.list = lst
14 | if prv is not None:
15 | prv.next = self
16 | if nxt is not None:
17 | nxt.prev = self
18 |
19 |
20 | class DLL:
21 | def __init__(self, iterable=None):
22 | self.first = None
23 | self.last = None
24 | self.size = 0
25 | if iterable is not None:
26 | self.extend(iterable)
27 |
28 | def __iter__(self, start=None):
29 | current = self.first
30 | if start is not None:
31 | current = start
32 | while current is not None:
33 | yield current
34 | current = current.next
35 |
36 | def __reversed__(self, start=None):
37 | current = self.last
38 | if start is not None:
39 | current = start
40 | while current is not None:
41 | yield current
42 | current = current.prev
43 |
44 | def __len__(self):
45 | return self.size
46 |
47 | def __str__(self):
48 | return str(self.to_list())
49 |
50 | def _find_matching_element(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None, forward=True):
51 | current = item
52 | direction = operator.attrgetter("next")
53 | if not forward:
54 | direction = operator.attrgetter("prev")
55 | while direction(current) is not None:
56 | current = direction(current)
57 | if ignore_attrgetter is not None:
58 | if ignore_attrgetter(current) == ignore_value:
59 | continue
60 | if attrgetter(current) == value:
61 | return current
62 | return None
63 |
64 | def append(self, item):
65 | element = DLLElement(item, self.last, None, self)
66 | if self.first is None:
67 | self.first = element
68 | self.last = element
69 | self.size += 1
70 |
71 | def append_left(self, item):
72 | element = DLLElement(item, None, self.first, self)
73 | if self.last is None:
74 | self.last = element
75 | self.first = element
76 | self.size += 1
77 |
78 | def extend(self, iterable):
79 | for item in iterable:
80 | self.append(item)
81 |
82 | def insert_left(self, item, ref_element):
83 | element = DLLElement(item, ref_element.prev, ref_element, self)
84 | ref_element.prev = element
85 | if self.first is ref_element:
86 | self.first = element
87 | self.size += 1
88 |
89 | def insert_right(self, item, ref_element):
90 | element = DLLElement(item, ref_element, ref_element.next, self)
91 | ref_element.next = element
92 | if self.last is ref_element:
93 | self.last = element
94 | self.size += 1
95 |
96 | def is_left_of(self, element, ref_element):
97 | current = ref_element
98 | while current is not self.first:
99 | current = current.prev
100 | if current is element:
101 | return True
102 | return False
103 |
104 | def is_right_of(self, element, ref_element):
105 | return self.is_left_of(ref_element, element)
106 |
107 | def next_matching(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None):
108 | return self._find_matching_element(item, attrgetter, value, ignore_attrgetter, ignore_value, forward=True)
109 |
110 | def pop(self):
111 | if self.size == 0:
112 | raise IndexError
113 | element = self.last
114 | self.remove(element)
115 | return element.value
116 |
117 | def previous_matching(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None):
118 | return self._find_matching_element(item, attrgetter, value, ignore_attrgetter, ignore_value, forward=False)
119 |
120 | def remove(self, element):
121 | if self.first is element:
122 | self.first = element.next
123 | if self.last is element:
124 | self.last = element.prev
125 | if element.prev is not None:
126 | element.prev.next = element.next
127 | if element.next is not None:
128 | element.next.prev = element.prev
129 | self.size -= 1
130 |
131 | def to_list(self):
132 | return [e.value for e in self]
133 |
--------------------------------------------------------------------------------
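
A short usage sketch for the DLL helper above (illustrative only; it exercises just the methods defined in this module):

    import operator

    from somajo.doubly_linked_list import DLL

    dll = DLL(["a", "b", "c"])
    dll.append("d")               # ["a", "b", "c", "d"]
    dll.append_left("start")      # ["start", "a", "b", "c", "d"]

    # Search forward from the first element for the element whose value is "c"
    # and insert a new value directly before it.
    c = dll.next_matching(dll.first, operator.attrgetter("value"), "c")
    dll.insert_left("b2", c)

    print(dll.to_list())          # ['start', 'a', 'b', 'b2', 'c', 'd']
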
/src/somajo/token.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 |
4 | class Token:
5 | """Token objects store a piece of text (in the end a single token) with additional information.
6 |
7 | Parameters
8 | ----------
9 | text : str
10 | The text that makes up the token object
11 | markup : bool, (default=False)
12 | Is the token a markup token?
13 | markup_class : {'start', 'end'}, optional (default=None)
14 | If `markup=True`, then `markup_class` must be either "start" or "end".
15 | markup_eos : bool, optional (default=None)
16 | Is the markup token a sentence boundary?
17 | locked : bool, (default=False)
18 | Mark the token as locked.
19 | token_class : {'URL', 'XML_entity', 'XML_tag', 'abbreviation', 'action_word', 'amount', 'date', 'email_address', 'emoticon', 'hashtag', 'measurement', 'mention', 'number', 'ordinal', 'regular', 'semester', 'symbol', 'time'}, optional (default=None)
20 | The class of the token, e.g. "regular", "emoticon", "URL", etc.
21 | space_after : bool, (default=True)
22 | Was there a space after the token in the original data?
23 | original_spelling : str, optional (default=None)
24 | The original spelling of the token, if it is different from the one in `text`.
25 | first_in_sentence : bool, (default=False)
26 | Is it the first token of a sentence?
27 | last_in_sentence : bool, (default=False)
28 | Is it the last token of a sentence?
29 | character_offset : tuple, (default=None)
30 | Character offset of the token in the input as tuple `(start, end)`
31 | such that `input[start:end] == text` (if there are no changes to
32 | the token text during tokenization)
33 |
34 | """
35 |
36 | token_classes = {
37 | "URL",
38 | "XML_entity",
39 | "XML_tag",
40 | "abbreviation",
41 | "action_word",
42 | "amount",
43 | "date",
44 | "email_address",
45 | "emoticon",
46 | "hashtag",
47 | "measurement",
48 | "mention",
49 | "number",
50 | "ordinal",
51 | "regular",
52 | "semester",
53 | "symbol",
54 | "time",
55 | }
56 |
57 | def __init__(
58 | self,
59 | text,
60 | *,
61 | markup=False,
62 | markup_class=None,
63 | markup_eos=None,
64 | locked=False,
65 | token_class=None,
66 | space_after=True,
67 | original_spelling=None,
68 | first_in_sentence=False,
69 | last_in_sentence=False,
70 | character_offset=None
71 | ):
72 | self.text = text
73 | if markup:
74 | assert markup_class is not None, "You need to specify a `markup_class` for markup tokens."
75 | assert markup_eos is not None, "You need to provide a value for `markup_eos` for markup tokens."
76 | if markup_class is not None:
77 | assert markup, "You can only specify a `markup_class` for markup tokens."
78 | assert markup_class == "start" or markup_class == "end", f"'{markup_class}' is not a recognized markup class."
79 | if markup_eos is not None:
80 | assert markup, "You can only use `markup_eos` for markup tokens."
81 | assert isinstance(markup_eos, bool), f"'{markup_eos}' is not a Boolean value."
82 | if token_class is not None:
83 | assert token_class in self.token_classes, f"'{token_class}' is not a recognized token class."
84 | self.markup = markup
85 | self.markup_class = markup_class
86 | self.markup_eos = markup_eos
87 | self._locked = locked
88 | self.token_class = token_class
89 | self.space_after = space_after
90 | self.original_spelling = original_spelling
91 | self.first_in_sentence = first_in_sentence
92 | self.last_in_sentence = last_in_sentence
93 | self.character_offset = character_offset
94 |
95 | def __str__(self):
96 | return self.text
97 |
98 | @property
99 | def extra_info(self):
100 | """String representation of extra information.
101 |
102 | Returns
103 | -------
104 | str
105 | A string representation of the `space_after` and `original_spelling` attributes.
106 |
107 | Examples
108 | --------
109 | >>> tok = Token(":)", token_class="regular", space_after=False, original_spelling=": )")
110 | >>> print(tok.text)
111 | :)
112 | >>> print(tok.extra_info)
113 | SpaceAfter=No, OriginalSpelling=": )"
114 |
115 | """
116 | info = []
117 | if not self.space_after:
118 | info.append("SpaceAfter=No")
119 | if self.original_spelling is not None:
120 | info.append("OriginalSpelling=\"%s\"" % self.original_spelling)
121 | return ", ".join(info)
122 |
--------------------------------------------------------------------------------
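
A small sketch of the constructor contract above: markup tokens must carry markup_class and markup_eos, and extra_info serializes space_after and original_spelling:

    from somajo.token import Token

    # A regular token whose surface form was normalized during tokenization.
    tok = Token(":)", token_class="regular", space_after=False, original_spelling=": )")
    print(tok.extra_info)  # SpaceAfter=No, OriginalSpelling=": )"

    # Markup tokens require both markup_class ("start" or "end") and markup_eos;
    # omitting either trips the assertions in __init__.
    start_tag = Token("<p>", markup=True, markup_class="start", markup_eos=True)
    print(start_tag.markup_class, start_tag.markup_eos)  # start True
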
/tests/test_doubly_linked_list.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import operator
4 | import unittest
5 |
6 | from somajo.doubly_linked_list import DLL
7 |
8 |
9 | class TestDLL(unittest.TestCase):
10 | def test_dll_01(self):
11 | lst = ["Foo", "", 0, -1, False, True, None]
12 | dll = DLL(lst)
13 | self.assertEqual(dll.to_list(), lst)
14 |
15 | def test_dll_02(self):
16 | lst = ["Foo", "", 0, -1, False, True, None]
17 | dll = DLL(lst)
18 | self.assertEqual(DLL(reversed(dll)).to_list(), list(reversed(lst)))
19 |
20 | def test_dll_03(self):
21 | lst = ["Foo", "", 0, -1, False, True, None]
22 | dll = DLL(lst)
23 | self.assertEqual(len(dll), len(lst))
24 |
25 | def test_dll_04(self):
26 | lst = ["Foo", "", 0, -1, False, True, None]
27 | dll = DLL(["Foo", "", 0, -1, False, True, None])
28 | self.assertEqual(str(dll), str(lst))
29 |
30 | def test_dll_05(self):
31 | dll = DLL([4, 5, 6])
32 | dll.append_left(3)
33 | self.assertEqual(dll.to_list(), [3, 4, 5, 6])
34 |
35 | def test_dll_06(self):
36 | dll = DLL([4, 5, 6])
37 | dll.append(7)
38 | self.assertEqual(dll.to_list(), [4, 5, 6, 7])
39 |
40 | def test_dll_07(self):
41 | dll = DLL([4, 5, 6])
42 | dll.extend([7, 8, 9])
43 | self.assertEqual(dll.to_list(), [4, 5, 6, 7, 8, 9])
44 |
45 | def test_dll_08(self):
46 | dll = DLL([4, 5, 6, 7])
47 | last = dll.pop()
48 | self.assertEqual(last, 7)
49 | self.assertEqual(len(dll), 3)
50 | self.assertEqual(dll.to_list(), [4, 5, 6])
51 |
52 | def test_dll_09(self):
53 | dll = DLL([])
54 | self.assertEqual(len(dll), 0)
55 | self.assertEqual(dll.to_list(), [])
56 |
57 | def test_dll_10(self):
58 | dll = DLL([4])
59 | last = dll.pop()
60 | self.assertEqual(last, 4)
61 | self.assertEqual(len(dll), 0)
62 | self.assertEqual(dll.to_list(), [])
63 |
64 | def test_dll_11(self):
65 | dll = DLL([4])
66 | last = dll.pop()
67 | self.assertEqual(last, 4)
68 | self.assertRaises(IndexError, dll.pop)
69 |
70 | def test_dll_12(self):
71 | dll = DLL([])
72 | dll.append_left(1)
73 | self.assertEqual(dll.to_list(), [1])
74 |
75 | def test_dll_13(self):
76 | dll = DLL([1, 2, 3, 4])
77 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2)
78 | self.assertEqual(x.value, 2)
79 | self.assertEqual([e.value for e in dll.__iter__(start=x)], [2, 3, 4])
80 |
81 | def test_dll_14(self):
82 | dll = DLL([1, 2, 3, 4])
83 | x = dll.previous_matching(dll.last, operator.attrgetter("value"), 3)
84 | self.assertEqual(x.value, 3)
85 | self.assertEqual([e.value for e in dll.__reversed__(start=x)], [3, 2, 1])
86 |
87 | def test_dll_15(self):
88 | dll = DLL([1, 2, 3, 4])
89 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 4, operator.attrgetter("value"), 3)
90 | self.assertEqual(x.value, 4)
91 |
92 | def test_dll_16(self):
93 | dll = DLL([1, 2, 3, 4])
94 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 7)
95 | self.assertIs(x, None)
96 |
97 | def test_dll_17(self):
98 | dll = DLL([1, 2, 3])
99 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2)
100 | dll.insert_left(7, x)
101 | self.assertEqual(dll.to_list(), [1, 7, 2, 3])
102 |
103 | def test_dll_18(self):
104 | dll = DLL([1, 2, 3])
105 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2)
106 | dll.insert_right(7, x)
107 | self.assertEqual(dll.to_list(), [1, 2, 7, 3])
108 |
109 | def test_dll_19(self):
110 | dll = DLL([1, 2, 3])
111 | self.assertTrue(dll.is_left_of(dll.first, dll.last))
112 |
113 | def test_dll_20(self):
114 | dll = DLL([1, 2, 3])
115 | self.assertTrue(dll.is_right_of(dll.last, dll.first))
116 |
117 | def test_dll_21(self):
118 | dll = DLL([1, 2, 3])
119 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2)
120 | dll.remove(x)
121 | self.assertEqual(dll.to_list(), [1, 3])
122 |
123 | def test_dll_22(self):
124 | dll = DLL([1, 2, 3])
125 | dll.remove(dll.first)
126 | self.assertEqual(dll.to_list(), [2, 3])
127 |
128 | def test_dll_23(self):
129 | dll = DLL([1, 2, 3])
130 | dll.remove(dll.last)
131 | self.assertEqual(dll.to_list(), [1, 2])
132 |
133 | def test_dll_24(self):
134 | dll = DLL([1, 2, 3])
135 | dll.insert_left(0, dll.first)
136 | self.assertEqual(dll.to_list(), [0, 1, 2, 3])
137 |
138 | def test_dll_25(self):
139 | dll = DLL([1, 2, 3])
140 | dll.insert_right(4, dll.last)
141 | self.assertEqual(dll.to_list(), [1, 2, 3, 4])
142 |
143 | def test_dll_26(self):
144 | dll = DLL([1, 2, 3])
145 | self.assertFalse(dll.is_left_of(dll.last, dll.first))
146 |
147 | def test_dll_27(self):
148 | dll = DLL([1])
149 | dll.remove(dll.last)
150 | self.assertEqual(dll.to_list(), [])
151 |
152 | def test_dll_28(self):
153 | dll = DLL([1])
154 | dll.remove(dll.first)
155 | self.assertEqual(dll.to_list(), [])
156 |
--------------------------------------------------------------------------------
/src/somajo/cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import logging
5 | import time
6 |
7 | from . import (
8 | SoMaJo,
9 | __version__
10 | )
11 |
12 | logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
13 |
14 |
15 | def arguments():
16 | """"""
17 | parser = argparse.ArgumentParser(description="A tokenizer and sentence splitter for German and English texts. Currently, two tokenization guidelines are implemented: The EmpiriST guidelines for German web and social media texts (de_CMC) and the \"new\" Penn Treebank conventions for English texts (en_PTB).")
18 | parser.add_argument("-l", "--language", choices=SoMaJo.supported_languages, default=SoMaJo._default_language, help="Choose a language. Currently supported are German EmpiriST-style tokenization (de_CMC) and English Penn-Treebank-style tokenization(en_PTB). (Default: de_CMC)")
19 | parser.add_argument("-s", "--paragraph_separator", choices=SoMaJo.paragraph_separators, default=SoMaJo._default_parsep, help="How are paragraphs separated in the input text? Will be ignored if option -x/--xml is used. (Default: empty_lines)")
20 | parser.add_argument("-x", "--xml", action="store_true", help="The input is an XML file. You can specify tags that always constitute a sentence break (e.g. HTML p tags) via the --tag option.")
21 | parser.add_argument("--tag", action="append", help="Start and end tags of this type constitute sentence breaks, i.e. they do not occur in the middle of a sentence. Can be used multiple times to specify multiple tags, e.g. --tag p --tag br. Implies option -x/--xml. (Default: --tag title --tag h1 --tag h2 --tag h3 --tag h4 --tag h5 --tag h6 --tag p --tag br --tag hr --tag div --tag ol --tag ul --tag dl --tag table)")
22 | parser.add_argument("--prune", action="append", help="Tags of this type will be removed from the input before tokenization. Can be used multiple times to specify multiple tags, e.g. --tag script --tag style. Implies option -x/--xml. By default, no tags are pruned.")
23 | parser.add_argument("--strip-tags", action="store_true", help="Suppresses output of XML tags. Implies option -x/--xml.")
24 | parser.add_argument("-c", "--split_camel_case", action="store_true", help="Split items in written in camelCase (excluding established names and terms).")
25 | parser.add_argument("--split_sentences", "--split-sentences", action="store_true", help="Also split the input into sentences.")
26 | parser.add_argument("--sentence_tag", "--sentence-tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence_tag s). If this option is specified, sentences will be delimited by XML tags (e.g. …) instead of empty lines. This option implies --split_sentences")
27 | parser.add_argument("-t", "--token_classes", action="store_true", help="Output the token classes (number, XML tag, abbreviation, etc.) in addition to the tokens.")
28 | parser.add_argument("-e", "--extra_info", action="store_true", help='Output additional information for each token: SpaceAfter=No if the token was not followed by a space and OriginalSpelling="…" if the token contained whitespace.')
29 | parser.add_argument("--character-offsets", action="store_true", help='Output character offsets in the input for each token.')
30 | parser.add_argument("--parallel", type=int, default=1, metavar="N", help="Run N worker processes (up to the number of CPUs) to speed up tokenization.")
31 | parser.add_argument("-v", "--version", action="version", version="SoMaJo %s" % __version__, help="Output version information and exit.")
32 | parser.add_argument("FILE", type=argparse.FileType("r", encoding="utf-8"), help="The input file (UTF-8-encoded) or \"-\" to read from STDIN.")
33 | args = parser.parse_args()
34 | return args
35 |
36 |
37 | def main():
38 | args = arguments()
39 | n_tokens = 0
40 | n_sentences = 0
41 | t0 = time.perf_counter()
42 | is_xml = False
43 | if args.xml or args.strip_tags or (args.tag is not None) or (args.prune is not None):
44 | is_xml = True
45 | if args.sentence_tag:
46 | args.split_sentences = True
47 | tokenizer = SoMaJo(
48 | args.language,
49 | split_camel_case=args.split_camel_case,
50 | split_sentences=args.split_sentences,
51 | xml_sentences=args.sentence_tag,
52 | character_offsets=args.character_offsets
53 | )
54 | if is_xml:
55 | eos_tags = args.tag
56 | if eos_tags is None:
57 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
58 | chunks = tokenizer.tokenize_xml_file(args.FILE, eos_tags, strip_tags=args.strip_tags, parallel=args.parallel, prune_tags=args.prune)
59 | else:
60 | chunks = tokenizer.tokenize_text_file(args.FILE, args.paragraph_separator, parallel=args.parallel)
61 | for chunk in chunks:
62 | n_sentences += 1
63 | for token in chunk:
64 | output = token.text
65 | if not token.markup:
66 | n_tokens += 1
67 | if args.token_classes:
68 | output += "\t" + token.token_class
69 | if args.extra_info:
70 | output += "\t" + token.extra_info
71 | if args.character_offsets:
72 | output += f"\t{token.character_offset[0]}, {token.character_offset[1]}"
73 | print(output)
74 | if args.split_sentences and args.sentence_tag is None:
75 | print()
76 | t1 = time.perf_counter()
77 | if args.split_sentences:
78 | logging.info("Tokenized %d tokens (%d sentences) in %d seconds (%d tokens/s)" % (n_tokens, n_sentences, t1 - t0, n_tokens / (t1 - t0)))
79 | else:
80 | logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" % (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
81 |
--------------------------------------------------------------------------------
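
The command-line entry point above is a thin wrapper around the library API. A programmatic equivalent of the plain-text code path looks roughly like this (a sketch that mirrors main(); the file name and the chosen options are placeholders):

    from somajo import SoMaJo

    tokenizer = SoMaJo(
        "de_CMC",
        split_camel_case=True,
        split_sentences=True,
        character_offsets=True,
    )

    # Mirrors tokenize_text_file(args.FILE, args.paragraph_separator, ...);
    # with split_sentences=True each yielded chunk is one sentence.
    with open("input.txt", encoding="utf-8") as f:
        for sentence in tokenizer.tokenize_text_file(f, "empty_lines", parallel=1):
            for token in sentence:
                start, end = token.character_offset
                print(f"{token.text}\t{start}, {end}")
            print()
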
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import unittest
4 |
5 | from somajo import utils
6 |
7 |
8 | class TestXmlChunkGenerator(unittest.TestCase):
9 | def _equal(self, raw, chunks, prune_tags=None):
10 | eos_tags = set(["p"])
11 | if prune_tags is not None:
12 | prune_tags = set(prune_tags)
13 | chunk_info = list(utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, prune_tags=prune_tags))
14 | chunk_lists = (ci[0] for ci in chunk_info)
15 | chunk_lists = [[t.text for t in gc] for gc in chunk_lists]
16 | self.assertEqual(chunk_lists, chunks)
17 |
18 | def _equal_offsets(self, raw, chunks, prune_tags=None):
19 | eos_tags = set(["p"])
20 | if prune_tags is not None:
21 | prune_tags = set(prune_tags)
22 | chunk_info = list(utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, prune_tags=prune_tags, character_offsets=True))
23 | chunk_lists, raws, positions = zip(*chunk_info)
24 | offsets = [[t.character_offset for t in cl] for cl in chunk_lists]
25 | extracted_chunks = [[raw[s:e] for s, e in o] for o in offsets]
26 | self.assertEqual(extracted_chunks, chunks)
27 |
28 | def test_xml_chunk_generator_01(self):
29 | self._equal("foo bar", [["", "foo bar", ""]])
30 |
31 | def test_xml_chunk_generator_02(self):
32 | self._equal("foo
bar
", [["", "", "foo", "
"], ["", "bar", "
", ""]])
33 |
34 | def test_xml_chunk_generator_03(self):
35 | self._equal("\n\nfoo\n
\n\nbar\n
\n", [["", "\n", "", "\nfoo\n", "
"], ["\n", "", "\nbar\n", "
", "\n", ""]])
36 |
37 | def test_xml_chunk_generator_04(self):
38 | self._equal(
39 | "\n \n foo\n
\n \n bar\n
\n",
40 | [["", "\n ", "", "\n foo\n ", "
"], ["\n ", "", "\n bar\n ", "
", "\n", ""]]
41 | )
42 |
43 | def test_xml_chunk_generator_05(self):
44 | self._equal(
45 | "foo
bazbar
baz",
46 | [["", "", "foo", "
"], ["", "baz", ""], ["", "bar", "
"], ["", "baz", "", ""]]
47 | )
48 |
49 | def test_xml_chunk_generator_06(self):
50 | self._equal(
51 | "foo
bar
",
52 | [["", "", "foo", "
"], ["
", "", "", "bar", "
"], ["
", "", ""]]
53 | )
54 |
55 | def test_xml_chunk_generator_07(self):
56 | self._equal("foobar", [["", "", "bar", "", ""]], prune_tags=["del"])
57 |
58 | def test_xml_chunk_generator_08(self):
59 | self._equal("foobar
", [["", "", "bar", "
", ""]], prune_tags=["del"])
60 |
61 | def test_xml_chunk_generator_09(self):
62 | self._equal("bar\n foo\nbaz", [["", "bar\n \nbaz", ""]], prune_tags=["del"])
63 |
64 | def test_xml_chunk_offsets_01(self):
65 | self._equal_offsets("Test", [["", "Test", ""]])
66 |
67 | def test_xml_chunk_offsets_02(self):
68 | self._equal_offsets("3 < 5", [["", "3 < 5", ""]])
69 |
70 | def test_xml_chunk_offsets_03(self):
71 | self._equal_offsets("Testfall", [["", "Testfall", ""]])
72 |
73 | def test_xml_chunk_offsets_04(self):
74 | self._equal_offsets("Testfall", [["", "Testfall", ""]])
75 |
76 | def test_xml_chunk_offsets_05(self):
77 | """Single combining mark"""
78 | self._equal_offsets("foo xÄx foo", [["", "foo xÄx foo", ""]])
79 |
80 | def test_xml_chunk_offsets_06(self):
81 | """Multiple combining marks"""
82 | self._equal_offsets("foo xṩx foo", [["", "foo xṩx foo", ""]])
83 |
84 | def test_xml_chunk_offsets_07(self):
85 | """Multiple combining marks"""
86 | self._equal_offsets("foo xṩx foo", [["", "foo xṩx foo", ""]])
87 |
88 | def test_xml_chunk_offsets_08(self):
89 | """Multiple combining marks"""
90 | self._equal_offsets("foo xsḍ̇x foo", [["", "foo xsḍ̇x foo", ""]])
91 |
92 | def test_xml_chunk_offsets_09(self):
93 | """Multiple combining marks"""
94 | self._equal_offsets("foo xq̣̇x foo", [["", "foo xq̣̇x foo", ""]])
95 |
96 | def test_xml_chunk_offsets_10(self):
97 | self._equal_offsets("Foo", [["", "Foo", ""]])
98 |
99 | def test_xml_chunk_offsets_11(self):
100 | self._equal_offsets("Foo", [["", "Foo", ""]])
101 |
102 | def test_xml_chunk_offsets_12(self):
103 | self._equal_offsets(" Foo ", [["", " Foo ", ""]])
104 |
105 | def test_xml_chunk_offsets_13(self):
106 | self._equal_offsets("Foo \"Bar\" 'Baz'", [["", "Foo \"Bar\" 'Baz'", ""]])
107 |
108 | def test_xml_chunk_offsets_14(self):
109 | self._equal_offsets('\n Foo\n', [['', "\n Foo\n", ""]])
110 |
111 | def test_xml_chunk_offsets_15(self):
112 | self._equal_offsets("Hallo
Tschüß", [["", "Hallo", "
", "", "Tschüß", ""]])
113 |
114 | def test_xml_chunk_offsets_16(self):
115 | self._equal_offsets("Hallo
Tschüß", [["", "Hallo", "
", "", "Tschüß", ""]])
116 |
117 | def test_xml_chunk_offsets_17(self):
118 | self._equal_offsets("\u0303foo", [["", "\u0303foo", ""]])
119 |
120 | def test_xml_chunk_offsets_18(self):
121 | self._equal_offsets("foobar
", [["", "foo"], ["", "bar", "
", ""]])
122 |
123 | @unittest.expectedFailure
124 | def test_xml_chunk_offsets_19(self):
125 | self._equal_offsets("bar futsch baz", [["", "bar baz", ""]], prune_tags=["del"])
126 |
--------------------------------------------------------------------------------
/utils/errors_train.txt:
--------------------------------------------------------------------------------
1 | __________________________________________________________________________________________________
2 | tmp/cmc_train_twitter_2.txt ...a/all_train/tokenized/cmc_train_twitter_2.txt
3 |
4 | False Negative (linebreak inserted right):
5 | 238: 238:
6 | 239: @aPfeL4321 239: @aPfeL4321
7 | 240: * DasTB 240: * Das
8 | 241: sollte 241: * TB
9 | 242: allerdings 242: sollte
10 | 243: gut 243: allerdings
11 |
12 | False Negative (linebreak inserted right):
13 | 654: Vernachlässigung 655: Vernachlässigung
14 | 655: ? 656: ?
15 | 656: * Wenn2 657: * Wenn
16 | 657: : 658: * 2
17 | 658: warum 659: :
18 | 659: ? 660: warum
19 |
20 | __________________________________________________________________________________________________
21 | tmp/cmc_train_blog_comment.txt ...ll_train/tokenized/cmc_train_blog_comment.txt
22 |
23 | False Positive (linebreak inserted left):
24 | 145: WIE 145: WIE
25 | 146: ICH 146: ICH
26 | 147: * WEI 147: * WEI?
27 | 148: * ? 148: HABT
28 | 149: HABT 149: IHR
29 | 150: IHR 150: BEIDE
30 |
31 | __________________________________________________________________________________________________
32 | tmp/cmc_train_social_chat.txt ...all_train/tokenized/cmc_train_social_chat.txt
33 |
34 | False Positive (linebreak inserted left):
35 | 158: marc 157: marc
36 | 159: . 158: .
37 | 160: * . 159: * .)))
38 | 161: * ))) 160:
39 | 162: 161:
40 | 163: 162: ups
41 |
42 | False Positive (linebreak inserted left):
43 | 652: 650:
44 | 653: 651:
45 | 654: * 8 652: * 8:)
46 | 655: * :) 653:
47 | 656: 654: 1014:
52 | 1018: * 1015: *
53 | 1019: * 51cm 1016: * 51
54 | 1020: * 1017: * cm
55 | 1021: 1018: *
56 | 1022: 1019:
57 |
58 | False Negative (linebreak inserted right):
59 | 1340: 1338:
60 | 1341: 1339:
61 | 1342: * bochum-münster 1340: * bochum
62 | 1343: ohne 1341: * -
63 | 1344: küche 1342: münster
64 | 1345: 3500 1343: ohne
65 |
66 | False Negative (linebreak inserted right):
67 | 1340: 1339:
68 | 1341: 1340: bochum
69 | 1342: * bochum-münster 1341: * -
70 | 1343: ohne 1342: * münster
71 | 1344: küche 1343: ohne
72 | 1345: 3500 1344: küche
73 |
74 | __________________________________________________________________________________________________
75 | tmp/cmc_train_professional_chat.txt ...ain/tokenized/cmc_train_professional_chat.txt
76 |
77 | False Negative (linebreak inserted right):
78 | 898: im 898: im
79 | 899: Pott 899: Pott
80 | 900: * :-)) 900: * :-)
81 | 901: ? 901: * )
82 | 902: 902: ?
83 | 903:
--------------------------------------------------------------------------------
/src/somajo/alignment.py:
--------------------------------------------------------------------------------
35 | while (nfc_j < len(nfc)) and (unicodedata.combining(nfc[nfc_j]) > 0):
36 | nfc_j += 1
37 | orig_j = orig_i + 1
38 | while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0):
39 | orig_j += 1
40 | assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j]), f"'{nfc[nfc_i:nfc_j]}' != unicodedata.normalize('NFC', '{orig[orig_i:orig_j]}')"
41 | alignment[(nfc_i, nfc_j)] = (orig_i, orig_j)
42 | nfc_i = nfc_j
43 | orig_i = orig_j
44 | assert orig_j == len(orig), f"{orig_j} != {len(orig)}; nfc: '{nfc}', orig: '{orig}'"
45 | return alignment
46 |
47 |
48 | def _determine_offsets(tokens, raw, position):
49 | """Determine start and end positions of tokens in the original raw (NFC) input."""
50 | offsets = []
51 | raw_i = 0
52 | raw = re.sub(r"\s", " ", raw)
53 | for token in tokens:
54 | if token.markup:
55 | start, end = token.character_offset
56 | start -= position
57 | end -= position
58 | else:
59 | text = token.text
60 | if token.original_spelling is not None:
61 | text = token.original_spelling
62 | text = re.sub(r"\s", " ", text)
63 | if raw[raw_i:].startswith(text):
64 | start = raw_i
65 | end = start + len(text)
66 | elif raw[raw_i:].startswith(" " + text):
67 | start = raw_i + 1
68 | end = start + len(text)
69 | else:
70 | raw_start = raw_i
71 | for i, char in enumerate(text):
72 | for j in range(raw_start, len(raw)):
73 | if raw[j] == char:
74 | if i == 0:
75 | start = j
76 | if i == len(text) - 1:
77 | end = j + 1
78 | break
79 | else:
80 | assert raw[j] in _skipable_characters, f"'{raw[j]}' ({hex(ord(raw[j]))}) is not a skipable character; token: '{text}', raw: '{raw[raw_i:]}'"
81 | raw_start = j + 1
82 | offsets.append((start, end))
83 | raw_i = end
84 | return offsets
85 |
86 |
87 | def _resolve_entities(xml):
88 | """Resolve XML entities and provide an alignment from output string to input string."""
89 | named = {"&amp;": "&", "&apos;": "'", "&gt;": ">", "&lt;": "<", "&quot;": '"'}
90 | outstring = ""
91 | alignment = []
92 | xml_lower = xml.lower()
93 | i = 0
94 | for m in _xml_entity.finditer(xml_lower):
95 | start, end = m.span()
96 | if xml_lower[start + 2] == "x":
97 | char = chr(int(xml[start + 3:end - 1], base=16))
98 | elif xml_lower[start + 1] == "#":
99 | char = chr(int(xml[start + 2:end - 1]))
100 | else:
101 | char = named[xml_lower[start:end]]
102 | outstring += xml[i:start] + char
103 | for j in range(i, start):
104 | alignment.append((j, j + 1))
105 | alignment.append((start, end))
106 | i = end
107 | outstring += xml[i:len(xml)]
108 | for j in range(i, len(xml)):
109 | alignment.append((j, j + 1))
110 | return outstring, alignment
111 |
112 |
113 | def token_offsets(token_list, raw, position, xml_input, tokens):
114 | """Determine character offsets for tokens."""
115 | if xml_input:
116 | chunk_offsets = [(t.character_offset[0] - position, t.character_offset[1] - position) for t in token_list]
117 | raw, align_to_entities = _resolve_entities(raw)
118 | align_from_entities = {i: char_i for char_i, (start, end) in enumerate(align_to_entities) for i in range(start, end)}
119 | chunks = [raw[align_from_entities[start]:align_from_entities[end - 1] + 1] for start, end in chunk_offsets]
120 | chunks_nfc = [unicodedata.normalize("NFC", c) for c in chunks]
121 | alignments = [_align_nfc(chunk_nfc, chunk) for chunk, chunk_nfc in zip(chunks, chunks_nfc)]
122 | align_to_raw = alignments[0]
123 | for i in range(1, len(alignments)):
124 | o1 = sum(len(c) for c in chunks_nfc[:i])
125 | o2 = sum(len(c) for c in chunks[:i])
126 | align_to_raw.update({(k[0] + o1, k[1] + o1): (v[0] + o2, v[1] + o2) for k, v in alignments[i].items()})
127 | raw_nfc = "".join(chunks_nfc)
128 | else:
129 | raw_nfc = unicodedata.normalize("NFC", raw)
130 | align_to_raw = _align_nfc(raw_nfc, raw)
131 | align_from_raw = {i: k for k, v in align_to_raw.items() for i in range(v[0], v[1])}
132 | align_to_starts = {i: v[0] for k, v in align_to_raw.items() for i in range(k[0], k[1])}
133 | align_to_ends = {i: v[1] for k, v in align_to_raw.items() for i in range(k[0], k[1])}
134 | # adjust character offsets for markup tokens
135 | if xml_input:
136 | for i in range(len(tokens)):
137 | if tokens[i].markup:
138 | s, e = tokens[i].character_offset
139 | tokens[i].character_offset = (
140 | align_from_raw[align_from_entities[s - position]][0] + position,
141 | align_from_raw[align_from_entities[e - position - 1]][1] + position
142 | )
143 | offsets = _determine_offsets(tokens, raw_nfc, position)
144 | assert len(tokens) == len(offsets), f"Not as many tokens as offsets: {len(tokens)} != {len(offsets)}"
145 | offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets]
146 | if xml_input:
147 | offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets]
148 | offsets = [(s + position, e + position) for s, e in offsets]
149 | return offsets
150 |
151 |
152 | def xml_chunk_offset(token, raw):
153 | """Determine character offset for an XML chunk created by `utils._xml_chunk_generator`."""
154 | raw, align_to_raw = _resolve_entities(raw)
155 | raw = re.sub(r"\s", " ", raw)
156 | text = token.text
157 | text = re.sub(r"\s", " ", text)
158 | if token.markup:
159 | text, align_to_text = _resolve_entities(text)
160 | text = text.replace("'", '"')
161 | if raw.startswith(text):
162 | start = 0
163 | end = len(text)
164 | else:
165 | pattern = "(" + re.escape(text) + ")"
166 | pattern = pattern.replace(r"\ ", r"\s+")
167 | pattern = pattern.replace("=", r"\s*=\s*")
168 | if not text.startswith("</"):
169 | pattern = pattern[:-2] + r"\s*/?\s*" + pattern[-2:]
170 | local_raw = raw.replace("'", '"')
171 | m = re.match(pattern, local_raw)
172 | if text.startswith("</") and not m:
173 | start, end = 0, 0
174 | else:
175 | assert m, f"'{text}' not found in '{local_raw}'"
176 | start, end = m.span(1)
177 | else:
178 | assert raw.startswith(text), f"'{raw}' does not start with '{text}'"
179 | start = 0
180 | end = len(text)
181 | if start == end:
182 | return (align_to_raw[start][0], align_to_raw[start][0])
183 | else:
184 | return (align_to_raw[start][0], align_to_raw[end - 1][1])
185 |
--------------------------------------------------------------------------------
/utils/evaluate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import collections
5 | import os
6 |
7 |
8 | Character = collections.namedtuple("Character", ("char", "token_boundary", "sentence_boundary"))
9 |
10 |
11 | def arguments():
12 | """"""
13 | parser = argparse.ArgumentParser(description="Evaluate tokenization and sentence splitting")
14 | group = parser.add_mutually_exclusive_group(required=True)
15 | group.add_argument("-f", "--files", action="store_true", help="SYSTEM and GOLD are files")
16 | group.add_argument("-d", "--directories", action="store_true", help="SYSTEM and GOLD are directories (filenames have to match)")
17 | parser.add_argument("--ignore-xml", action="store_true", help="Ignore XML tags for evaluation")
18 | parser.add_argument("-s", "--sentences", action="store_true", help="Also evaluate sentence boundaries")
19 | parser.add_argument("-e", "--errors", type=os.path.abspath, help="Write errors to file")
20 | parser.add_argument("SYSTEM", type=os.path.abspath, help="System output")
21 | parser.add_argument("GOLD", type=os.path.abspath, help="Gold data")
22 | args = parser.parse_args()
23 | return args
24 |
25 |
26 | def read_characters(f, ignore_xml, sentences):
27 | characters = []
28 | for line in f:
29 | line = line.rstrip()
30 | if sentences and line == "":
31 | characters[-1] = Character(characters[-1].char, True, True)
32 | continue
33 | if ignore_xml and line.startswith("<") and line.endswith(">"):
34 | continue
35 | for char in line:
36 | characters.append(Character(char, False, False))
37 | characters[-1] = Character(characters[-1].char, True, False)
38 | return characters
39 |
40 |
41 | def char_to_str(system, gold, focus=False):
42 | """"""
43 | string = system.char
44 | if focus:
45 | # sentence fp
46 | if system.sentence_boundary and (not gold.sentence_boundary):
47 | string += "■ "
48 | # sentence fn
49 | elif (not system.sentence_boundary) and gold.sentence_boundary:
50 | string += "□ "
51 | # token fp
52 | elif system.token_boundary and (not gold.token_boundary):
53 | string += "● "
54 | # token fn
55 | elif (not system.token_boundary) and gold.token_boundary:
56 | string += "○ "
57 | # any tp
58 | elif (system.sentence_boundary and gold.sentence_boundary) or (system.token_boundary and gold.token_boundary):
59 | string += " "
60 | else:
61 | if system.sentence_boundary or system.token_boundary:
62 | string += " "
63 | return string
64 |
65 |
66 | def precision_recall_f1(tp, fp, fn):
67 | """"""
68 | precision = tp / (tp + fp)
69 | recall = tp / (tp + fn)
70 | f1 = (2 * precision * recall) / (precision + recall)
71 | return precision, recall, f1
72 |
73 |
74 | def evaluate_file(system_path, gold_path, ignore_xml, sentences, error_file):
75 | """"""
76 | print("%s ⇔ %s" % (system_path, gold_path))
77 | if error_file:
78 | with open(error_file, mode="a", encoding="utf-8") as e:
79 | e.write("%s ⇔ %s\n" % (system_path, gold_path))
80 | with open(system_path, encoding="utf-8") as system, open(gold_path, encoding="utf-8") as gold:
81 | sys_chars = read_characters(system, ignore_xml, sentences)
82 | gold_chars = read_characters(gold, ignore_xml, sentences)
83 | window = collections.deque([""] * 20)
84 | for s, g in zip(sys_chars, gold_chars):
85 | window.append(g.char)
86 | window.popleft()
87 | if s.char != g.char:
88 | print("'" + "".join(window) + "'")
89 | print("'%s' != '%s'" % (s.char, g.char))
90 | break
91 | assert len(sys_chars) == len(gold_chars)
92 | assert all((s.char == g.char for s, g in zip(sys_chars, gold_chars)))
93 | token_precision, token_recall, token_f1, sentence_precision, sentence_recall, sentence_f1 = 0, 0, 0, 0, 0, 0
94 | token_tp, token_fp, token_fn, sentence_tp, sentence_fp, sentence_fn = 0, 0, 0, 0, 0, 0
95 | if error_file:
96 | with open(error_file, mode="a", encoding="utf-8") as e:
97 | sys_window = collections.deque([Character("", False, False)] * 41)
98 | gold_window = collections.deque([Character("", False, False)] * 41)
99 | for s, g in zip(sys_chars + [Character("", False, False)] * 20, gold_chars + [Character("", False, False)] * 20):
100 | sys_window.append(s)
101 | sys_window.popleft()
102 | gold_window.append(g)
103 | gold_window.popleft()
104 | if sys_window[20] != gold_window[20]:
105 | e.write("%s%s%s\n" % ("".join(char_to_str(x, y) for x, y in zip(list(sys_window)[:20], list(gold_window)[:20]))[-20:],
106 | char_to_str(sys_window[20], gold_window[20], focus=True),
107 | "".join(char_to_str(x, y) for x, y in zip(list(sys_window)[21:], list(gold_window)[21:]))[:20]))
108 | token_tp = len([s for s, g in zip(sys_chars, gold_chars) if g.token_boundary and s.token_boundary])
109 | token_fp = len([s for s, g in zip(sys_chars, gold_chars) if (not g.token_boundary) and s.token_boundary])
110 | token_fn = len([s for s, g in zip(sys_chars, gold_chars) if g.token_boundary and (not s.token_boundary)])
111 | token_precision, token_recall, token_f1 = precision_recall_f1(token_tp, token_fp, token_fn)
112 | print("Tokenization:")
113 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (token_precision * 100, token_recall * 100, token_f1 * 100))
114 | print("%d false positives, %d false negatives" % (token_fp, token_fn))
115 | if sentences:
116 | sentence_tp = len([s for s, g in zip(sys_chars, gold_chars) if g.sentence_boundary and s.sentence_boundary])
117 | sentence_fp = len([s for s, g in zip(sys_chars, gold_chars) if (not g.sentence_boundary) and s.sentence_boundary])
118 | sentence_fn = len([s for s, g in zip(sys_chars, gold_chars) if g.sentence_boundary and (not s.sentence_boundary)])
119 | sentence_precision, sentence_recall, sentence_f1 = precision_recall_f1(sentence_tp, sentence_fp, sentence_fn)
120 | print("Sentence splitting:")
121 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (sentence_precision * 100, sentence_recall * 100, sentence_f1 * 100))
122 | print("%d false positives, %d false negatives" % (sentence_fp, sentence_fn))
123 | print()
124 | return token_tp, token_fp, token_fn, token_precision, token_recall, token_f1, sentence_tp, sentence_fp, sentence_fn, sentence_precision, sentence_recall, sentence_f1
125 |
126 |
127 | def main():
128 | """"""
129 | args = arguments()
130 | if args.errors:
131 | with open(args.errors, mode="w", encoding="utf-8") as e:
132 | pass
133 | if args.files:
134 | evaluate_file(args.SYSTEM, args.GOLD, args.ignore_xml, args.sentences, args.errors)
135 | elif args.directories:
136 | n_tokens, token_precision, token_recall, token_f1, n_sentences, sentence_precision, sentence_recall, sentence_f1 = 0, 0, 0, 0, 0, 0, 0, 0
137 | token_tp, token_fp, token_fn, sentence_tp, sentence_fp, sentence_fn = 0, 0, 0, 0, 0, 0
138 | system_files = sorted(os.listdir(args.SYSTEM))
139 | gold_files = sorted(os.listdir(args.GOLD))
140 | assert len(system_files) == len(gold_files)
141 | assert all((s == g for s, g in zip(system_files, gold_files)))
142 | for system_file, gold_file in zip(system_files, gold_files):
143 | ttp, tfp, tfn, tp, tr, tf, stp, sfp, sfn, sp, sr, sf = evaluate_file(os.path.join(args.SYSTEM, system_file), os.path.join(args.GOLD, gold_file), args.ignore_xml, args.sentences, args.errors)
144 | nt = ttp + tfn
145 | ns = stp + sfn  # number of gold sentences (tp + fn), analogous to nt
146 | token_tp += ttp
147 | token_fp += tfp
148 | token_fn += tfn
149 | sentence_tp += stp
150 | sentence_fp += sfp
151 | sentence_fn += sfn
152 | n_tokens += nt
153 | token_precision += nt * tp
154 | token_recall += nt * tr
155 | token_f1 += nt * tf
156 | n_sentences += ns
157 | sentence_precision += ns * sp
158 | sentence_recall += ns * sr
159 | sentence_f1 += ns * sf
160 | print("TOTAL")
161 | print("Tokenization (weighted average on %d tokens):" % n_tokens)
162 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (token_precision / n_tokens * 100, token_recall / n_tokens * 100, token_f1 / n_tokens * 100))
163 | print("%d false positives, %d false negatives" % (token_fp, token_fn))
164 | if args.sentences:
165 | print("Sentence splitting (weighted average on %d sentences):" % n_sentences)
166 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (sentence_precision / n_sentences * 100, sentence_recall / n_sentences * 100, sentence_f1 / n_sentences * 100))
167 | print("%d false positives, %d false negatives" % (sentence_fp, sentence_fn))
168 |
169 |
170 | if __name__ == "__main__":
171 | main()
172 |
--------------------------------------------------------------------------------
/tests/test_alignment.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import itertools
4 | import unicodedata
5 | import unittest
6 |
7 | import somajo.alignment
8 | from somajo.doubly_linked_list import DLL
9 | from somajo.token import Token
10 | from somajo.somajo import Tokenizer
11 | from somajo import utils
12 |
13 |
14 | class TestNfcAlignment(unittest.TestCase):
15 | def test_nfc_01(self):
16 | """Singleton: Angstrom sign"""
17 | orig = "xÅx"
18 | nfc = unicodedata.normalize("NFC", orig)
19 | alignment = {(0, 1): (0, 1), (1, 2): (1, 2), (2, 3): (2, 3)}
20 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment)
21 |
22 | def test_nfc_02(self):
23 | """Single combining mark"""
24 | orig = "xA\u0308x"
25 | nfc = unicodedata.normalize("NFC", orig)
26 | alignment = {(0, 1): (0, 1), (1, 2): (1, 3), (2, 3): (3, 4)}
27 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment)
28 |
29 | def test_nfc_03(self):
30 | """Multiple combining marks"""
31 | orig = "xs\u0323\u0307x"
32 | nfc = unicodedata.normalize("NFC", orig)
33 | alignment = {(0, 1): (0, 1), (1, 2): (1, 4), (2, 3): (4, 5)}
34 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment)
35 |
36 | def test_nfc_04(self):
37 | """Multiple combining marks"""
38 | orig = "xs\u0307\u0323x"
39 | nfc = unicodedata.normalize("NFC", orig)
40 | alignment = {(0, 1): (0, 1), (1, 2): (1, 4), (2, 3): (4, 5)}
41 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment)
42 |
43 | def test_nfc_05(self):
44 | """Multiple combining marks"""
45 | orig = "x\u1e0b\u0323x"
46 | nfc = unicodedata.normalize("NFC", orig)
47 | alignment = {(0, 1): (0, 1), (1, 3): (1, 3), (3, 4): (3, 4)}
48 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment)
49 |
50 | def test_nfc_06(self):
51 | """Multiple combining marks"""
52 | orig = "q\u0307\u0323x"
53 | nfc = unicodedata.normalize("NFC", orig)
54 | alignment = {(0, 3): (0, 3), (3, 4): (3, 4)}
55 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment)
56 |
57 | def test_nfc_07(self):
58 | """Empty string"""
59 | orig = ""
60 | nfc = unicodedata.normalize("NFC", orig)
61 | alignment = {}
62 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment)
63 |
64 |
65 | class TestResolveEntities(unittest.TestCase):
66 | def test_entities_01(self):
67 | xml = 'foo <bar> baz'
68 | resolved = 'foo baz'
69 | alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
70 | (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12),
71 | (12, 13), (13, 14), (14, 15), (15, 21), (21, 22),
72 | (22, 23), (23, 24), (24, 30), (30, 31), (31, 32),
73 | (32, 33), (33, 34), (34, 35), (35, 36), (36, 37),
74 | (37, 38), (38, 39), (39, 40), (40, 44), (44, 45),
75 | (45, 46), (46, 47), (47, 51), (51, 52), (52, 53),
76 | (53, 54), (54, 55), (55, 56), (56, 57), (57, 58),
77 | (58, 59), (59, 60), (60, 61)]
78 | res, al = somajo.alignment._resolve_entities(xml)
79 | self.assertEqual(res, resolved)
80 | self.assertEqual(al, alignment)
81 |
82 | def test_entities_02(self):
83 | xml = "Test"
84 | resolved = "Test"
85 | alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
86 | (6, 14), (14, 15), (15, 16), (16, 17), (17, 18),
87 | (18, 19), (19, 20), (20, 21), (21, 22)]
88 | res, al = somajo.alignment._resolve_entities(xml)
89 | self.assertEqual(res, resolved)
90 | self.assertEqual(al, alignment)
91 |
92 |
93 | class TestDetermineOffsets(unittest.TestCase):
94 | def setUp(self):
95 | """Necessary preparations"""
96 | self.tokenizer = Tokenizer(split_camel_case=True, language="de_CMC")
97 |
98 | def _equal(self, raw, tokenized):
99 | raw = unicodedata.normalize("NFC", raw)
100 | if isinstance(tokenized, str):
101 | tokenized = tokenized.split()
102 | dll = DLL([Token(raw, first_in_sentence=True, last_in_sentence=True)])
103 | tokens = self.tokenizer._tokenize(dll)
104 | offsets = somajo.alignment._determine_offsets(tokens, raw, position=0)
105 | self.assertEqual([raw[s:e] for s, e in offsets], tokenized)
106 |
107 | def test_token_alignment_01(self):
108 | self._equal("Ein simpler Test.", "Ein simpler Test .")
109 |
110 | def test_token_alignment_02(self):
111 | self._equal("bla \u1e0d\u0307amit.", "bla \u1e0d\u0307amit .")
112 |
113 | def test_token_alignment_03(self):
114 | self._equal("foo (bar) baz?", "foo ( bar ) baz ?")
115 |
116 | def test_token_alignment_03a(self):
117 | self._equal("foo:\n) bar", ["foo", ":\n)", "bar"])
118 |
119 | def test_token_alignment_04(self):
120 | self._equal(
121 | "foobar foobar foo\ufeffbar foobarbazquxalphabetagamma foobarbaz foobarbaz foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta",
122 | ["foobar", "foobar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foobarbaz", "foobarbaz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"]
123 | )
124 |
125 |
126 | class TestTokenOffsets(unittest.TestCase):
127 | def setUp(self):
128 | """Necessary preparations"""
129 | self.tokenizer = Tokenizer(split_camel_case=True, language="de_CMC")
130 |
131 | def _equal_xml(self, raw, tokenized):
132 | raw = unicodedata.normalize("NFC", raw)
133 | if isinstance(tokenized, str):
134 | tokenized = tokenized.split()
135 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
136 | eos_tags = set(eos_tags)
137 | chunk_info = utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, character_offsets=True)
138 | chunk_lists = [ci[0] for ci in chunk_info]
139 | token_dlls = map(DLL, chunk_lists)
140 | chunks = map(self.tokenizer._tokenize, token_dlls)
141 | complete = list(itertools.chain.from_iterable(chunks))
142 | offsets = somajo.alignment.token_offsets(list(itertools.chain.from_iterable(chunk_lists)), raw, 0, True, complete)
143 | self.assertEqual([raw[s:e] for s, e in offsets], tokenized)
144 |
145 | def test_token_alignment_05(self):
146 | self._equal_xml(
147 | "der beste Betreuer? - >ProfSmith! : )",
148 | ["", "der", "beste", "Betreuer", "?", "- >", "Prof", "Smith", "!", ": )", ""]
149 | )
150 |
151 | def test_token_alignment_06(self):
152 | self._equal_xml("das steht auf S.5", " das steht auf S. 5 ")
153 |
154 | def test_token_alignment_07(self):
155 | self._equal_xml("na so was -> bla", " na so was - > bla ")
156 |
157 | def test_token_alignment_08(self):
158 | self._equal_xml("Test", " Test ")
159 |
160 | def test_token_alignment_09(self):
161 | self._equal_xml("3 < 5", " 3 < 5 ")
162 |
163 | def test_token_alignment_10(self):
164 | self._equal_xml("Testfall", " Testfall ")
165 |
166 | def test_token_alignment_11(self):
167 | self._equal_xml("Testfall", " Testfall ")
168 |
169 | def test_token_alignment_12(self):
170 | """Single combining mark"""
171 | self._equal_xml("foo xÄx foo", " foo xÄx foo ")
172 |
173 | def test_token_alignment_13(self):
174 | """Multiple combining marks"""
175 | self._equal_xml("foo xṩx foo", " foo xṩx foo ")
176 |
177 | def test_token_alignment_14(self):
178 | """Multiple combining marks"""
179 | self._equal_xml("foo xṩx foo", " foo xṩx foo ")
180 |
181 | def test_token_alignment_15(self):
182 | """Multiple combining marks"""
183 | self._equal_xml("foo xsḍ̇x foo", " foo xsḍ̇x foo ")
184 |
185 | def test_token_alignment_16(self):
186 | """Multiple combining marks"""
187 | self._equal_xml("foo xq̣̇x foo", " foo xq̣̇x foo ")
188 |
189 | def test_token_alignment_17(self):
190 | self._equal_xml("Foo", ["", "Foo", ""])
191 |
192 | def test_token_alignment_18(self):
193 | self._equal_xml("Foo", ["", "Foo", ""])
194 |
195 | def test_token_alignment_19(self):
196 | self._equal_xml(" Foo ", ["", "Foo", ""])
197 |
198 | def test_token_alignment_20(self):
199 | self._equal_xml("Foo \"Bar\" 'Baz'", ["", "Foo", '"', "Bar", '"', "'", "Baz", "'", ""])
200 |
201 | def test_token_alignment_21(self):
202 | self._equal_xml('\n Foo\n', ['', "Foo", ""])
203 |
204 | def test_token_alignment_22(self):
205 | self._equal_xml("Hallo
Tschüß", ["", "Hallo", "
", "", "Tschüß", ""])
206 |
207 | def test_token_alignment_23(self):
208 | self._equal_xml("Hallo
Tschüß", ["", "Hallo", "
", "", "Tschüß", ""])
209 |
210 | def test_token_alignment_24(self):
211 | self._equal_xml("\u0303foo", ["", "\u0303foo", ""])
212 |
213 | def test_token_alignment_25(self):
214 | self._equal_xml("foobar
", ["", "foo", "", "bar", "
", ""])
215 |
216 | def test_token_alignment_26(self):
217 | self._equal_xml("bar
baz
", ["", "", "bar", "
", "", "baz", "
", ""])
218 |
--------------------------------------------------------------------------------
/tests/test_somajo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import io
4 | import unittest
5 |
6 | from somajo.somajo import SoMaJo
7 |
8 |
9 | class TestSoMaJo(unittest.TestCase):
10 | def setUp(self):
11 | """Necessary preparations"""
12 | self.tokenizer = SoMaJo("de_CMC")
13 |
14 | def _equal_text(self, paragraphs, tokenized_sentences, parallel=1):
15 | sentences = self.tokenizer.tokenize_text(paragraphs, parallel=parallel)
16 | sentences = [[t.text for t in s] for s in sentences]
17 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
18 |
19 | def _equal_text_file_single_newlines(self, paragraphs, tokenized_sentences, parallel=1):
20 | pseudofile = io.StringIO("\n".join(paragraphs))
21 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="single_newlines", parallel=parallel)
22 | sentences = [[t.text for t in s] for s in sentences]
23 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
24 |
25 | def _equal_text_file_empty_lines(self, paragraphs, tokenized_sentences, parallel=1):
26 | pseudofile = io.StringIO("\n\n".join(paragraphs))
27 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="empty_lines", parallel=parallel)
28 | sentences = [[t.text for t in s] for s in sentences]
29 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
30 |
31 | def _equal_xml(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None):
32 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
33 | sentences = self.tokenizer.tokenize_xml(xml, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags)
34 | sentences = [[t.text for t in s] for s in sentences]
35 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
36 |
37 | def _equal_xml_file(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None):
38 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
39 | pseudofile = io.StringIO(xml)
40 | sentences = self.tokenizer.tokenize_xml_file(pseudofile, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags)
41 | sentences = [[t.text for t in s] for s in sentences]
42 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
43 |
44 |
45 | class TestSoMaJoNoSent(TestSoMaJo):
46 | def setUp(self):
47 | """Necessary preparations"""
48 | self.tokenizer = SoMaJo("de_CMC", split_sentences=False)
49 |
50 |
51 | class TestText(TestSoMaJo):
52 | def test_text_01(self):
53 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"])
54 |
55 | def test_text_02(self):
56 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"])
57 |
58 | def test_text_03(self):
59 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"])
60 |
61 | def test_text_04(self):
62 | self.assertRaises(TypeError, self.tokenizer.tokenize_text, "Foo bar. Baz qux")
63 |
64 |
65 | class TestTextXMLSent(TestSoMaJo):
66 | def setUp(self):
67 | """Necessary preparations"""
68 | self.tokenizer = SoMaJo("de_CMC", xml_sentences="s")
69 |
70 | def test_text_01(self):
71 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], [" Foo bar . ", " Baz qux ", " alpha . ", " Beta gamma "])
72 |
73 | def test_text_02(self):
74 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], [" Foo bar . ", " Baz qux ", " alpha . ", " Beta gamma "])
75 |
76 |
77 | class TestTextParallel(TestSoMaJo):
78 | def test_text_01(self):
79 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2)
80 |
81 | def test_text_02(self):
82 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2)
83 |
84 | def test_text_03(self):
85 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2)
86 |
87 |
88 | class TestTextNoSent(TestSoMaJoNoSent):
89 | def test_text_01(self):
90 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"])
91 |
92 | def test_text_02(self):
93 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"])
94 |
95 | def test_text_03(self):
96 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"])
97 |
98 |
99 | class TestTextNoSentParallel(TestSoMaJoNoSent):
100 | def test_text_01(self):
101 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2)
102 |
103 | def test_text_02(self):
104 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2)
105 |
106 | def test_text_03(self):
107 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2)
108 |
109 |
110 | class TestXML(TestSoMaJo):
111 | def test_xml_01(self):
112 | self._equal_xml("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar .", "Baz qux
", " alpha .", "Beta gamma
"])
113 |
114 | def test_xml_02(self):
115 | self._equal_xml_file("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar .", "Baz qux
", " alpha .", "Beta gamma
"])
116 |
117 |
118 | class TestXMLParallel(TestSoMaJo):
119 | def test_xml_01(self):
120 | self._equal_xml("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar .", "Baz qux
", " alpha .", "Beta gamma
"], parallel=2)
121 |
122 | def test_xml_02(self):
123 | self._equal_xml_file("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar .", "Baz qux
", " alpha .", "Beta gamma
"], parallel=2)
124 |
125 |
126 | class TestXMLNoSent(TestSoMaJoNoSent):
127 | def test_xml_01(self):
128 | self._equal_xml("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar . Baz qux
", " alpha . Beta gamma
"])
129 |
130 | def test_xml_02(self):
131 | self._equal_xml_file("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar . Baz qux
", " alpha . Beta gamma
"])
132 |
133 |
134 | class TestXMLNoSentParallel(TestSoMaJoNoSent):
135 | def test_xml_01(self):
136 | self._equal_xml("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar . Baz qux
", " alpha . Beta gamma
"], parallel=2)
137 |
138 | def test_xml_02(self):
139 | self._equal_xml_file("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar . Baz qux
", " alpha . Beta gamma
"], parallel=2)
140 |
141 |
142 | class TestXMLStripTags(TestSoMaJo):
143 | def test_xml_01(self):
144 | self._equal_xml("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], strip_tags=True)
145 |
146 | def test_xml_02(self):
147 | self._equal_xml_file("\n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], strip_tags=True)
148 |
149 |
150 | class TestXMLPruneTags(TestSoMaJo):
151 | def test_xml_01(self):
152 | self._equal_xml("\n \n Spam\n \n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar .", "Baz qux
", " alpha .", "Beta gamma
"], prune_tags=["head"])
153 |
154 | def test_xml_02(self):
155 | self._equal_xml_file("\n \n Spam\n \n \n Foo bar. Baz qux
\n alpha. Beta gamma
\n \n", [" Foo bar .", "Baz qux
", " alpha .", "Beta gamma
"], prune_tags=["head"])
156 |
157 |
158 | class TestCharacterOffsets(TestSoMaJo):
159 | def setUp(self):
160 | """Necessary preparations"""
161 | self.tokenizer = SoMaJo("de_CMC", character_offsets=True)
162 |
163 | def _equal_offsets_text_file(self, paragraphs, tokenized_sentences, parallel=1):
164 | raw = "\n\n".join(paragraphs)
165 | pseudofile = io.StringIO(raw)
166 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="empty_lines", parallel=parallel)
167 | sentences = list(sentences)
168 | tokens = [[t.text for t in s] for s in sentences]
169 | self.assertEqual(tokens, [ts.split() for ts in tokenized_sentences])
170 | offsets = [[t.character_offset for t in s] for s in sentences]
171 | extracted = [[raw[s:e] for s, e in sent] for sent in offsets]
172 | self.assertEqual(tokens, extracted)
173 |
174 | def _equal_offsets_xml(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None):
175 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
176 | sentences = self.tokenizer.tokenize_xml(xml, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags)
177 | sentences = list(sentences)
178 | tokens = [[t.text for t in s] for s in sentences]
179 | self.assertEqual(tokens, [ts.split() for ts in tokenized_sentences])
180 | offsets = [[t.character_offset for t in s] for s in sentences]
181 | extracted = [[xml[s:e] for s, e in sent] for sent in offsets]
182 | self.assertEqual(tokens, extracted)
183 |
184 | def test_text_offsets_01(self):
185 | self._equal_offsets_text_file(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"])
186 |
187 | def test_xml_offsets_01(self):
188 | self._equal_offsets_xml("bar
baz
", [" bar
", " baz
"])
189 |
190 | def test_xml_offsets_02(self):
191 | self._equal_offsets_xml("\n\nbar\n
\n\nbaz\n
\n", [" bar
", " baz
"])
192 |
--------------------------------------------------------------------------------
/utils/errors_test.txt:
--------------------------------------------------------------------------------
1 | __________________________________________________________________________________________________
2 | tmp/web_test_009.txt ..._standard/test_web/tokenized/web_test_009.txt
3 |
4 | False Positive (linebreak inserted left):
5 | 169: Pops 169: Pops
6 | 170: 170:
7 | 171: * 1. 171: * 1.1.
8 | 172: * 1. 172: Kuchen
9 | 173: Kuchen 173: für
10 | 174: für 174: Cake
11 |
12 | False Positive (linebreak inserted left):
13 | 311: . 310: .
14 | 312: 311:
15 | 313: * 1. 312: * 1.2.
16 | 314: * 2. 313: Kuchen
17 | 315: Kuchen 314: für
18 | 316: für 315: Cake
19 |
20 | False Positive (linebreak inserted left):
21 | 448: . 446: .
22 | 449: 447:
23 | 450: * 1. 448: * 1.3.
24 | 451: * 3. 449: Kekse
25 | 452: Kekse 450: für
26 | 453: für 451: Cake
27 |
28 | __________________________________________________________________________________________________
29 | tmp/web_test_011.txt ..._standard/test_web/tokenized/web_test_011.txt
30 |
31 | False Negative (linebreak inserted right):
32 | 238: man 238: man
33 | 239: sie 239: sie
34 | 240: * 1829/30 240: * 1829
35 | 241: in 241: * /
36 | 242: das 242: 30
37 | 243: Herrschaftliche 243: in
38 |
39 | False Negative (linebreak inserted right):
40 | 238: man 239: sie
41 | 239: sie 240: 1829
42 | 240: * 1829/30 241: * /
43 | 241: in 242: * 30
44 | 242: das 243: in
45 | 243: Herrschaftliche 244: das
46 |
47 | False Negative (linebreak inserted right):
48 | 411: Garten 413: Garten
49 | 412: musste 414: musste
50 | 413: * 1864/65 415: * 1864
51 | 414: dem 416: * /
52 | 415: Bau 417: 65
53 | 416: des 418: dem
54 |
55 | False Negative (linebreak inserted right):
56 | 411: Garten 414: musste
57 | 412: musste 415: 1864
58 | 413: * 1864/65 416: * /
59 | 414: dem 417: * 65
60 | 415: Bau 418: dem
61 | 416: des 419: Bau
62 |
63 | __________________________________________________________________________________________________
64 | tmp/web_test_002.txt ..._standard/test_web/tokenized/web_test_002.txt
65 |
66 | False Negative (linebreak inserted right):
67 | 99: der 99: der
68 | 100: Saison 100: Saison
69 | 101: * 2009/2010 101: * 2009
70 | 102: sind 102: * /
71 | 103: laut 103: 2010
72 | 104: einer 104: sind
73 |
74 | False Negative (linebreak inserted right):
75 | 99: der 100: Saison
76 | 100: Saison 101: 2009
77 | 101: * 2009/2010 102: * /
78 | 102: sind 103: * 2010
79 | 103: laut 104: sind
80 | 104: einer 105: laut
81 |
82 | __________________________________________________________________________________________________
83 | tmp/web_test_012.txt ..._standard/test_web/tokenized/web_test_012.txt
84 |
85 | False Positive (linebreak inserted left):
86 | 660: Backlinks 660: Backlinks
87 | 661: : 661: :
88 | 662: * [ 662: * [[
89 | 663: * [ 663: security
90 | 664: security 664: :
91 | 665: : 665: verschlüsselung
92 |
93 | False Positive (linebreak inserted left):
94 | 665: : 664: :
95 | 666: verschlüsselung 665: verschlüsselung
96 | 667: * ] 666: * ]]
97 | 668: * ] 667:
98 | 669: 668: Navigation
99 | 670: Navigation 669: Passwort-
100 |
101 | __________________________________________________________________________________________________
102 | tmp/web_test_004.txt ..._standard/test_web/tokenized/web_test_004.txt
103 |
104 | False Positive (linebreak inserted left):
105 | 141: Telekommunikationsgeheimnis 141: Telekommunikationsgeheimnis
106 | 142: ( 142: (
107 | 143: * Art 143: * Art.
108 | 144: * . 144: 10
109 | 145: 10 145: GG
110 | 146: GG 146: ,
111 |
112 | False Positive (linebreak inserted left):
113 | 146: GG 145: GG
114 | 147: , 146: ,
115 | 148: * Art 147: * Art.
116 | 149: * . 148: 8
117 | 150: 8 149: Abs.
118 | 151: Abs. 150: 1
119 |
120 | False Positive (linebreak inserted left):
121 | 153: EMRK 151: EMRK
122 | 154: , 152: ,
123 | 155: * Art 153: * Art.
124 | 156: * . 154: 7
125 | 157: 7 155: EU-GrCh
126 | 158: EU-GrCh 156: )
127 |
128 | False Positive (linebreak inserted left):
129 | 173: gewährleistet 170: gewährleistet
130 | 174: . 171: .
131 | 175: * Art 172: * Art.
132 | 176: * . 173: 10
133 | 177: 10 174: GG
134 | 178: GG 175: sagt
135 |
136 | False Positive (linebreak inserted left):
137 | 270: Fernmeldegeheimnisses 266: Fernmeldegeheimnisses
138 | 271: in 267: in
139 | 272: * Art 268: * Art.
140 | 273: * . 269: 8
141 | 274: 8 270: EMRK
142 | 275: EMRK 271: und
143 |
144 | False Positive (linebreak inserted left):
145 | 275: EMRK 270: EMRK
146 | 276: und 271: und
147 | 277: * Art 272: * Art.
148 | 278: * . 273: 7
149 | 279: 7 274: EU-GrCh
150 | 280: EU-GrCh 275: :
151 |
152 | False Positive (linebreak inserted left):
153 | 448: Widerstandsrechts 442: Widerstandsrechts
154 | 449: ( 443: (
155 | 450: * Art 444: * Art.
156 | 451: * . 445: 20
157 | 452: 20 446: Abs.
158 | 453: Abs. 447: 4
159 |
160 |
--------------------------------------------------------------------------------
/CHANGES.txt:
--------------------------------------------------------------------------------
1 | # CHANGELOG #
2 |
3 | ## Version 2.4.3, 2024-08-05 ##
4 |
5 | - Move non-abbreviation tokens that should not be split from
6 | `single_token_abbreviations_<lang>.txt` to
7 | `single_tokens_<lang>.txt` and add cellular network generations
8 | (issue #32).
9 |
10 | ## Version 2.4.2, 2024-02-10 ##
11 |
12 | - Fix issues #28 and #29 (markdown links with trailing symbols after
13 | URL part).
14 |
15 | ## Version 2.4.1, 2024-02-09 ##
16 |
17 | - Fix issue #27 (URLs in angle brackets).
18 |
19 | ## Version 2.4.0, 2023-12-23 ##
20 |
21 | - New feature: SoMaJo can output character offsets for tokens,
22 | allowing for stand-off tokenization. Pass `character_offsets=True`
23 | to the constructor or use the option `--character-offsets` on the
24 | command line to enable the feature. The character offsets are
25 | determined by aligning the tokenized output with the input,
26 | therefore activating the feature incurs a noticeable increase in
27 | processing time.
28 |
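A minimal usage sketch for this feature, based on the constructor argument
and the `character_offset` attribute as exercised in tests/test_somajo.py
(the example text and variable names are illustrative):

```python
from somajo.somajo import SoMaJo

tokenizer = SoMaJo("de_CMC", character_offsets=True)
paragraph = "Der Termin ist am 3.4., oder?"
# tokenize_text expects an iterable of paragraphs, not a plain string.
for sentence in tokenizer.tokenize_text([paragraph]):
    for token in sentence:
        # Assumption: the offsets index into the paragraph string passed in.
        start, end = token.character_offset
        print(token.text, (start, end), paragraph[start:end])
```
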
29 | ## Version 2.3.1, 2023-09-23 ##
30 |
31 | - Fix issue #26 (markdown links that contain a URL in the link text).
32 |
33 | ## Version 2.3.0, 2023-08-14 ##
34 |
35 | - **Potentially breaking change:** The somajo-tokenizer script is
36 | automatically created upon installation and bin/somajo-tokenizer is
37 | removed. For most users, this does not make a difference. If you
38 | used to run your own modified version of SoMaJo directly via
39 | bin/somajo-tokenizer, consider installing the project in editable
40 | mode (see Development section in README.md).
41 | - Switch from setup.py to pyproject.toml and restructure the project
42 | (source in src, tests in tests).
43 | - When creating a Token object, only known token classes can be
44 | passed.
45 | - Fix issue #25 (dates at the end of sentences).
46 |
47 | ## Version 2.2.4, 2023-06-23 ##
48 |
49 | - Improvements to tokenization of words containing numbers (e.g.
50 | COVID-19-Pandemie, FFP2-Maske).
51 |
52 | ## Version 2.2.3, 2023-02-02 ##
53 |
54 | - Improvements to tokenization: Roman ordinals, abbreviation “Art.”
55 | preceding a number, certain units of measurement at the end of a
56 | sentence (e.g. km/h).
57 |
58 | ## Version 2.2.2, 2022-09-12 ##
59 |
60 | - Bugfix: Command-line option --sentence_tag implies option --split_sentences.
61 |
62 | ## Version 2.2.1, 2022-03-08 ##
63 |
64 | - Bugfix: Command-line option --strip-tags implies option --xml.
65 |
66 | ## Version 2.2.0, 2022-01-18 ##
67 |
68 | - New feature: Prune XML tags and their contents from the input before
69 | tokenization (via the command line option --prune TAGNAME1 --prune
70 | TAGNAME2 … or by passing prune_tags=["TAGNAME1", "TAGNAME2", …] to
71 | tokenize_xml or tokenize_xml_file). This can be useful when
72 | processing HTML files, e.g. for removing any