├── tests ├── __init__.py ├── test_token.py ├── test_doubly_linked_list.py ├── test_utils.py ├── test_alignment.py ├── test_somajo.py └── test_tokenizer_internal.py ├── requirements_dev.txt ├── doc ├── source │ ├── modules.rst │ ├── somajo.rst │ ├── index.rst │ └── conf.py ├── Makefile ├── make.bat └── build │ └── markdown │ └── somajo.md ├── utils ├── run_tests.sh ├── evaluate_on_gum.sh ├── annotate_cmc.sh ├── annotate_web.sh ├── evaluate_on_konvens.sh ├── evaluate_on_test_cmc.sh ├── evaluate_on_test_web.sh ├── evaluate_on_ewt.sh ├── baseline.sh ├── errors_trial.txt ├── errors_train.txt ├── evaluate.py └── errors_test.txt ├── .gitignore ├── src └── somajo │ ├── data │ ├── non-breaking_hyphenated_words_en.txt │ ├── single_tokens_en.txt │ ├── single_token_abbreviations_en.txt │ ├── single_tokens_de.txt │ ├── eos_abbreviations.txt │ ├── non-breaking_suffixes_en.txt │ ├── units.txt │ ├── single_token_abbreviations_de.txt │ ├── non-breaking_prefixes_en.txt │ ├── tokens_with_plus_or_ampersand.txt │ ├── abbreviations_en.txt │ └── abbreviations_de.txt │ ├── __init__.py │ ├── doubly_linked_list.py │ ├── token.py │ ├── cli.py │ ├── alignment.py │ └── sentence_splitter.py ├── .github └── workflows │ └── test.yml ├── README.rst ├── pyproject.toml ├── CHANGES.txt └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | build 2 | sphinx 3 | sphinx-markdown-builder 4 | twine 5 | -------------------------------------------------------------------------------- /doc/source/modules.rst: -------------------------------------------------------------------------------- 1 | somajo 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | somajo 8 | -------------------------------------------------------------------------------- /utils/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR/.. 5 | 6 | # Test Discovery 7 | python3 -m unittest discover 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /data/ 2 | /dist/ 3 | /doc/build/markdown/index.md 4 | /doc/build/markdown/modules.md 5 | /doc/build/doctrees/ 6 | /src/SoMaJo.egg-info/ 7 | /venv/ 8 | __pycache__/ 9 | *~ 10 | *.pyc 11 | -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_hyphenated_words_en.txt: -------------------------------------------------------------------------------- 1 | # Hyphenated words in the following list are not split into multiple tokens. 2 | 3 | mm-hm 4 | mm-mm 5 | o-kay 6 | uh-huh 7 | uh-oh 8 | x-ray 9 | x-rayed 10 | x-rays 11 | -------------------------------------------------------------------------------- /src/somajo/data/single_tokens_en.txt: -------------------------------------------------------------------------------- 1 | # A list of tokens that should not be split. 2 | # 3 | # Lines starting with “#” are treated as comments and will be ignored. 
4 | 5 | tl;dr 6 | 7 | # mobile telephony 8 | 3G 9 | 4G 10 | 5G 11 | -------------------------------------------------------------------------------- /src/somajo/data/single_token_abbreviations_en.txt: -------------------------------------------------------------------------------- 1 | # A list of multi-dot abbreviations that represent single tokens and 2 | # should not be split. 3 | # 4 | # Lines starting with “#” are treated as comments and will be ignored. 5 | 6 | e.g. 7 | i.e. 8 | a.m. 9 | p.m. 10 | P.S. 11 | T.V. 12 | -------------------------------------------------------------------------------- /src/somajo/data/single_tokens_de.txt: -------------------------------------------------------------------------------- 1 | # A list of tokens that should not be split. 2 | # 3 | # Lines starting with “#” are treated as comments and will be ignored. 4 | 5 | .Net 6 | /rant 7 | /s 8 | E/E 9 | tl;dr 10 | zl;ng 11 | 12 | # SAP Versions 13 | S/4 14 | R/3 15 | 16 | # mobile telephony 17 | 3G 18 | 4G 19 | 5G 20 | -------------------------------------------------------------------------------- /src/somajo/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | 3 | from . import ( 4 | sentence_splitter, 5 | somajo, 6 | tokenizer 7 | ) 8 | 9 | __version__ = importlib.metadata.version(__package__ or __name__) 10 | 11 | Tokenizer = tokenizer.Tokenizer 12 | SentenceSplitter = sentence_splitter.SentenceSplitter 13 | SoMaJo = somajo.SoMaJo 14 | -------------------------------------------------------------------------------- /doc/source/somajo.rst: -------------------------------------------------------------------------------- 1 | somajo package 2 | ============== 3 | 4 | somajo.somajo module 5 | -------------------- 6 | 7 | .. automodule:: somajo.somajo 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | somajo.token module 13 | ------------------- 14 | 15 | .. automodule:: somajo.token 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /utils/evaluate_on_gum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/GUM/text/* 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer -l en $f > tmp/$filename 11 | done 12 | perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_gum.txt tmp ../data/GUM/tokenized 13 | rm -r tmp/ 14 | -------------------------------------------------------------------------------- /src/somajo/data/eos_abbreviations.txt: -------------------------------------------------------------------------------- 1 | # A list of abbreviations that frequently occur at the end of a 2 | # sentence. If such an abbreviation is followed by a potential 3 | # sentence start, e.g. by a capital letter, it will be interpreted as 4 | # the end of a sentence. 5 | # 6 | # Lines starting with “#” are treated as comments and will be ignored. 7 | 8 | usw. 9 | usf. 10 | etc. 11 | uvam. 
12 | -------------------------------------------------------------------------------- /utils/annotate_cmc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | for f in ../../data/empirist_test_tok_cmc/raw/* 7 | do 8 | filename=$(basename $f) 9 | ../bin/tokenizer --split_camel_case $f > ../../data/cmc_tok_SoMaJo/$filename 10 | # ../bin/tokenizer $f > ../../data/cmc_tok_SoMaJo/$filename 11 | done 12 | perl ../../data/empirist_test_tok_cmc/tools/validate_tokenization.perl -x ../../data/cmc_tok_SoMaJo/ ../../data/empirist_test_tok_cmc/raw/ 13 | -------------------------------------------------------------------------------- /utils/annotate_web.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | for f in ../../data/empirist_test_tok_web/raw/* 7 | do 8 | filename=$(basename $f) 9 | ../bin/tokenizer --split_camel_case $f > ../../data/web_tok_SoMaJo/$filename 10 | # ../bin/tokenizer $f > ../../data/web_tok_SoMaJo/$filename 11 | done 12 | perl ../../data/empirist_test_tok_web/tools/validate_tokenization.perl -x ../../data/web_tok_SoMaJo/ ../../data/empirist_test_tok_web/raw/ 13 | -------------------------------------------------------------------------------- /utils/evaluate_on_konvens.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/Ortmann_et_al/txt/*.txt 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer --split-sentences $f > tmp/$filename 11 | done 12 | perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/Ortmann_et_al/tokens 13 | ./evaluate.py -d --sentences -e errors.txt tmp/ ../data/Ortmann_et_al/tokens/ 14 | rm -r tmp/ 15 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. SoMaJo documentation master file, created by 2 | sphinx-quickstart on Thu Dec 19 08:01:21 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to SoMaJo's documentation! 7 | ================================== 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /utils/evaluate_on_test_cmc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/empirist_gold_standard/test_cmc/raw/* 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer --split_camel_case $f > tmp/$filename 11 | done 12 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/empirist_gold_standard/test_cmc/tokenized 13 | ./evaluate.py -d -e errors.txt --ignore-xml tmp/ ../data/empirist_gold_standard/test_cmc/tokenized/ 14 | rm -r tmp/ 15 | -------------------------------------------------------------------------------- /utils/evaluate_on_test_web.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/empirist_gold_standard/test_web/raw/* 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer --split_camel_case $f > tmp/$filename 11 | done 12 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/empirist_gold_standard/test_web/tokenized 13 | ./evaluate.py -d -e errors.txt --ignore-xml tmp/ ../data/empirist_gold_standard/test_web/tokenized/ 14 | rm -r tmp/ 15 | -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_suffixes_en.txt: -------------------------------------------------------------------------------- 1 | # Hyphenated suffixes in the following list are not split into multiple tokens. 2 | # Euro-centric and element-wise are both single tokens. 3 | 4 | able 5 | ahol 6 | aholic 7 | ation 8 | centric 9 | cracy 10 | crat 11 | dom 12 | er 13 | ery 14 | esque 15 | ette 16 | fest 17 | fold 18 | ful 19 | gate 20 | gon 21 | hood 22 | ian 23 | ible 24 | ing 25 | isation 26 | ise 27 | ising 28 | ism 29 | ist 30 | itis 31 | ization 32 | ize 33 | izing 34 | less 35 | logist 36 | logy 37 | ly 38 | most 39 | o-torium 40 | rama 41 | wise 42 | -------------------------------------------------------------------------------- /src/somajo/data/units.txt: -------------------------------------------------------------------------------- 1 | # A list of units preceded by numbers. The list is case-insensitive. 2 | # 3 | # Lines starting with “#” are treated as comments and will be ignored. 
4 | 5 | bit 6 | cent 7 | cm 8 | cm2 9 | cm3 10 | cm^2 11 | cm^3 12 | cm² 13 | cm³ 14 | dm 15 | dm2 16 | dm3 17 | dm^2 18 | dm^3 19 | dm² 20 | dm³ 21 | eur 22 | f 23 | ft 24 | g 25 | gbit/s 26 | ghz 27 | h 28 | hz 29 | kg 30 | km 31 | km/h 32 | km2 33 | km3 34 | km^2 35 | km^3 36 | km² 37 | km³ 38 | l 39 | lb 40 | m 41 | m2 42 | m3 43 | m^2 44 | m^3 45 | mbit/s 46 | min 47 | ml 48 | mm 49 | mm2 50 | mm3 51 | mm^2 52 | mm^3 53 | mm² 54 | mm³ 55 | m² 56 | m³ 57 | qm 58 | s 59 | sek 60 | -------------------------------------------------------------------------------- /utils/evaluate_on_ewt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/English_Web_Treebank/en-ud-*.txt 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer -l en_PTB $f > tmp/$filename 11 | done 12 | echo "GOLD" 13 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_ewt.txt tmp ../data/English_Web_Treebank/gold 14 | ./evaluate.py -d -e errors.txt tmp/ ../data/English_Web_Treebank/gold/ 15 | # echo "" 16 | # echo "SEMIGOLD" 17 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_ewt_semi.txt tmp ../data/English_Web_Treebank/semigold 18 | rm -r tmp/ 19 | -------------------------------------------------------------------------------- /tests/test_token.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | 5 | from somajo.token import Token 6 | 7 | 8 | class TestToken(unittest.TestCase): 9 | def test_token_01(self): 10 | text = "FooBar" 11 | t = Token(text) 12 | self.assertEqual(str(t), text) 13 | 14 | def test_token_02(self): 15 | t = Token("FooBar", space_after=False, original_spelling="Foo Bar") 16 | self.assertEqual(t.extra_info, 'SpaceAfter=No, OriginalSpelling="Foo Bar"') 17 | 18 | def test_token_03(self): 19 | t = Token("

", markup=True, markup_class="start", markup_eos=True) 20 | self.assertEqual(t.markup_class, "start") 21 | self.assertTrue(t.markup_eos) 22 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /src/somajo/data/single_token_abbreviations_de.txt: -------------------------------------------------------------------------------- 1 | # A list of multi-dot abbreviations that represent single tokens and 2 | # should not be split. 3 | # 4 | # Lines starting with “#” are treated as comments and will be ignored. 5 | 6 | ak.mas 7 | Art.-Nr. 8 | At.-Gew. 9 | Best.-Nr. 10 | BT-Drs. 11 | Dipl.-Ing. 12 | E.ON 13 | Forsch.frage 14 | GV.NRW. 15 | H.-I. 16 | H.-Qu. 17 | IT.NRW 18 | klass.-lat. 19 | Komm.formen 20 | Krim.-Ob.-Insp. 21 | Kto.-Nr. 22 | L.-Abg. 23 | M.-Schr. 24 | Mat.-Nr. 25 | MBl.NRW. 26 | o.k. 27 | Pan.do/ra 28 | Priv.-Doz. 29 | prov.-fr. 30 | Proz.-Bev. 31 | r.-k. 32 | Reg.-Bez. 33 | Rg.-Präs. 34 | röm.-kath. 35 | Sat.1 36 | SMBl.NRW. 37 | soz.-päd. 38 | SP.ON 39 | T.V. 40 | Uni-Kl. 41 | USt-IdNr. 42 | Zeitschr.titel 43 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 11 | 12 | steps: 13 | - name: Checkout sources 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install . 25 | 26 | - name: Test 27 | run: | 28 | python -m unittest discover 29 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | SoMaJo 2 | ====== 3 | 4 | SoMaJo is a rule-based tokenizer and sentence splitter that implements 5 | tokenization guidelines for German and English. 
It has a strong focus 6 | on web and social media texts (it was originally created as the 7 | winning submission to the `EmpiriST 2015 shared task 8 | `_ on automatic 9 | linguistic annotation of computer-mediated communication / social 10 | media) and is particularly well-suited to perform tokenization on all 11 | kinds of written discourse, for example chats, forums, wiki talk 12 | pages, tweets, blog comments, social networks, SMS and WhatsApp 13 | dialogues. Of course it also works on more formal texts. 14 | 15 | More detailed documentation is available `here 16 | `_. 17 | -------------------------------------------------------------------------------- /utils/baseline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../../data/all_test/raw/* 8 | # for f in ../../data/empirist_test_pos_cmc/raw/* 9 | # for f in ../../data/empirist_test_pos_web/raw/* 10 | do 11 | filename=$(basename $f) 12 | sed -re "/^<[^>]+>$/! { s/([.!?,;:+*()\"'–])/ \1 /g; s/\s+/\n/g }" $f > tmp/$filename 13 | done 14 | perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -x -e errors_baseline_test.txt tmp ../../data/all_test/tokenized 15 | # perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -e errors_test.txt tmp ../../data/empirist_test_pos_cmc/tokenized 16 | # perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -e errors_test.txt tmp ../../data/empirist_test_pos_web/tokenized 17 | rm -r tmp/ 18 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_prefixes_en.txt: -------------------------------------------------------------------------------- 1 | # Hyphenated prefixes in the following list are not split into multiple tokens. 2 | # E-mail and re-evaluation are both single tokens. 
3 | 4 | a 5 | adeno 6 | agro 7 | ambi 8 | ante 9 | anti 10 | aorto 11 | arch 12 | axio 13 | be 14 | bi 15 | bio 16 | broncho 17 | centi 18 | circum 19 | cis 20 | co 21 | colo 22 | contra 23 | cortico 24 | counter 25 | cran 26 | cross 27 | crypto 28 | cyber 29 | de 30 | deca 31 | demi 32 | dis 33 | e 34 | eco 35 | electro 36 | ennea 37 | ex 38 | extra 39 | ferro 40 | gastro 41 | giga 42 | hemi 43 | hepta 44 | hexa 45 | hypo 46 | ideo 47 | idio 48 | in 49 | infra 50 | inter 51 | intra 52 | iso 53 | judeo 54 | macro 55 | medi 56 | mega 57 | micro 58 | mid 59 | milli 60 | mini 61 | mono 62 | multi 63 | musculo 64 | neo 65 | neuro 66 | nitro 67 | non 68 | novem 69 | octa 70 | octo 71 | ortho 72 | over 73 | paleo 74 | pan 75 | para 76 | pelvi 77 | penta 78 | peri 79 | pheno 80 | phospho 81 | pica 82 | pneumo 83 | poly 84 | post 85 | pre 86 | preter 87 | pro 88 | pseudo 89 | quadri 90 | quasi 91 | quinque 92 | re 93 | recto 94 | salpingo 95 | semi 96 | sept 97 | sero 98 | soci 99 | sub 100 | super 101 | supra 102 | sur 103 | tele 104 | tera 105 | tetra 106 | tri 107 | u 108 | uber 109 | ultra 110 | un 111 | uni 112 | veno 113 | ventriculo 114 | vice 115 | x 116 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # 1. Build distribution files: 2 | # python3 -m build 3 | # 4 | # 2. Upload to PyPI: 5 | # twine upload dist/* 6 | # 7 | # 3. Check if everything looks all right: 8 | # https://pypi.python.org/pypi/SoMaJo 9 | # 10 | # 4. Go to https://github.com/tsproisl/SoMaJo/releases/new and create 11 | # a new release 12 | [project] 13 | name = "SoMaJo" 14 | version = "2.4.3" 15 | description = "A tokenizer and sentence splitter for German and English web and social media texts." 
16 | readme = "README.md" 17 | requires-python = ">=3.8" 18 | license = {file = "LICENSE.txt"} 19 | keywords = ["tokenizer", "sentence-splitter"] 20 | authors = [ 21 | {name = "Thomas Proisl, Peter Uhrig", email = "thomas.proisl@fau.de"} 22 | ] 23 | maintainers = [ 24 | {name = "Thomas Proisl", email = "thomas.proisl@fau.de"} 25 | ] 26 | classifiers = [ 27 | "Development Status :: 5 - Production/Stable", 28 | "Environment :: Console", 29 | "Intended Audience :: Developers", 30 | "Intended Audience :: Science/Research", 31 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 32 | "Natural Language :: German", 33 | "Natural Language :: English", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 3", 36 | "Topic :: Text Processing :: Linguistic", 37 | ] 38 | 39 | dependencies = [ 40 | "regex>=2019.02.18", 41 | ] 42 | 43 | [project.urls] 44 | "Homepage" = "https://github.com/tsproisl/SoMaJo" 45 | "API documentation" = "https://github.com/tsproisl/SoMaJo/blob/master/doc/build/markdown/somajo.md" 46 | 47 | [project.scripts] 48 | somajo-tokenizer = "somajo.cli:main" 49 | 50 | [build-system] 51 | requires = ["setuptools>=61.0"] 52 | build-backend = "setuptools.build_meta" 53 | 54 | [tool.setuptools.packages.find] 55 | where = ["src"] 56 | 57 | [tool.setuptools.package-data] 58 | "somajo.data" = ["*.txt"] 59 | -------------------------------------------------------------------------------- /utils/errors_trial.txt: -------------------------------------------------------------------------------- 1 | __________________________________________________________________________________________________ 2 | tmp/blog_comments.txt ../../data/all_trial/tokenized/blog_comments.txt 3 | 4 | False Positive (linebreak inserted left): 5 | 145: WIE 145: WIE 6 | 146: ICH 146: ICH 7 | 147: * WEI 147: * WEI? 8 | 148: * ? 148: HABT 9 | 149: HABT 149: IHR 10 | 150: IHR 150: BEIDE 11 | 12 | __________________________________________________________________________________________________ 13 | tmp/social_chat.txt ../../data/all_trial/tokenized/social_chat.txt 14 | 15 | False Positive (linebreak inserted left): 16 | 157: marc 157: marc 17 | 158: . 158: . 18 | 159: * . 159: * .))) 19 | 160: * ))) 160: 20 | 161: 161: 21 | 162: 162: ups 22 | 23 | __________________________________________________________________________________________________ 24 | tmp/wikipedia_talk_pages.txt .../all_trial/tokenized/wikipedia_talk_pages.txt 25 | 26 | False Negative (linebreak inserted right): 27 | 495: meine 495: meine 28 | 496: ich 496: ich 29 | 497: * ;O)) 497: * ;O) 30 | 498: . 498: * ) 31 | 499: . 32 | 33 | 34 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # pip install sphinx sphinx-markdown-builder 2 | # mkdir doc 3 | # cd doc/ 4 | # sphinx-quickstart --sep -p SoMaJo -a "Thomas Proisl, Peter Uhrig" -v "2.0.0" --ext-autodoc --extensions sphinx.ext.napoleon 5 | # # edit source/conf.py: 6 | # # import os 7 | # # import sys 8 | # # sys.path.insert(0, os.path.abspath('../..')) 9 | # cd .. 10 | # sphinx-apidoc -f -o doc/source/ somajo 11 | # cd doc 12 | # make markdown 13 | 14 | 15 | # Configuration file for the Sphinx documentation builder. 16 | # 17 | # This file only contains a selection of the most common options. 
For a full 18 | # list see the documentation: 19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 20 | 21 | # -- Path setup -------------------------------------------------------------- 22 | 23 | # If extensions (or modules to document with autodoc) are in another directory, 24 | # add these directories to sys.path here. If the directory is relative to the 25 | # documentation root, use os.path.abspath to make it absolute, like shown here. 26 | # 27 | import os 28 | import sys 29 | sys.path.insert(0, os.path.abspath('../..')) 30 | 31 | 32 | # -- Project information ----------------------------------------------------- 33 | 34 | project = 'SoMaJo' 35 | copyright = '2019, Thomas Proisl, Peter Uhrig' 36 | author = 'Thomas Proisl, Peter Uhrig' 37 | 38 | # The short X.Y version 39 | version = '2.0.0' 40 | 41 | # The full version, including alpha/beta/rc tags 42 | release = '2.0.0' 43 | 44 | 45 | # -- General configuration --------------------------------------------------- 46 | 47 | # Add any Sphinx extension module names here, as strings. They can be 48 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 49 | # ones. 50 | extensions = [ 51 | 'sphinx.ext.autodoc', 52 | 'sphinx.ext.napoleon', 53 | 'sphinx_markdown_builder', 54 | ] 55 | 56 | # Add any paths that contain templates here, relative to this directory. 57 | templates_path = ['_templates'] 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This pattern also affects html_static_path and html_extra_path. 62 | exclude_patterns = [] 63 | 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 69 | # 70 | html_theme = 'alabaster' 71 | 72 | # Add any paths that contain custom static files (such as style sheets) here, 73 | # relative to this directory. They are copied after the builtin static files, 74 | # so a file named "default.css" will overwrite the builtin "default.css". 
75 | html_static_path = ['_static'] 76 | 77 | 78 | # -- Extension configuration ------------------------------------------------- 79 | -------------------------------------------------------------------------------- /src/somajo/doubly_linked_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import operator 4 | 5 | 6 | class DLLElement: 7 | def __init__(self, val=None, prv=None, nxt=None, lst=None): 8 | if isinstance(val, DLLElement): 9 | val = val.value 10 | self.prev = prv 11 | self.next = nxt 12 | self.value = val 13 | self.list = lst 14 | if prv is not None: 15 | prv.next = self 16 | if nxt is not None: 17 | nxt.prev = self 18 | 19 | 20 | class DLL: 21 | def __init__(self, iterable=None): 22 | self.first = None 23 | self.last = None 24 | self.size = 0 25 | if iterable is not None: 26 | self.extend(iterable) 27 | 28 | def __iter__(self, start=None): 29 | current = self.first 30 | if start is not None: 31 | current = start 32 | while current is not None: 33 | yield current 34 | current = current.next 35 | 36 | def __reversed__(self, start=None): 37 | current = self.last 38 | if start is not None: 39 | current = start 40 | while current is not None: 41 | yield current 42 | current = current.prev 43 | 44 | def __len__(self): 45 | return self.size 46 | 47 | def __str__(self): 48 | return str(self.to_list()) 49 | 50 | def _find_matching_element(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None, forward=True): 51 | current = item 52 | direction = operator.attrgetter("next") 53 | if not forward: 54 | direction = operator.attrgetter("prev") 55 | while direction(current) is not None: 56 | current = direction(current) 57 | if ignore_attrgetter is not None: 58 | if ignore_attrgetter(current) == ignore_value: 59 | continue 60 | if attrgetter(current) == value: 61 | return current 62 | return None 63 | 64 | def append(self, item): 65 | element = DLLElement(item, self.last, None, self) 66 | if self.first is None: 67 | self.first = element 68 | self.last = element 69 | self.size += 1 70 | 71 | def append_left(self, item): 72 | element = DLLElement(item, None, self.first, self) 73 | if self.last is None: 74 | self.last = element 75 | self.first = element 76 | self.size += 1 77 | 78 | def extend(self, iterable): 79 | for item in iterable: 80 | self.append(item) 81 | 82 | def insert_left(self, item, ref_element): 83 | element = DLLElement(item, ref_element.prev, ref_element, self) 84 | ref_element.prev = element 85 | if self.first is ref_element: 86 | self.first = element 87 | self.size += 1 88 | 89 | def insert_right(self, item, ref_element): 90 | element = DLLElement(item, ref_element, ref_element.next, self) 91 | ref_element.next = element 92 | if self.last is ref_element: 93 | self.last = element 94 | self.size += 1 95 | 96 | def is_left_of(self, element, ref_element): 97 | current = ref_element 98 | while current is not self.first: 99 | current = current.prev 100 | if current is element: 101 | return True 102 | return False 103 | 104 | def is_right_of(self, element, ref_element): 105 | return self.is_left_of(ref_element, element) 106 | 107 | def next_matching(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None): 108 | return self._find_matching_element(item, attrgetter, value, ignore_attrgetter, ignore_value, forward=True) 109 | 110 | def pop(self): 111 | if self.size == 0: 112 | raise IndexError 113 | element = self.last 114 | self.remove(element) 115 | return element.value 116 | 
117 | def previous_matching(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None): 118 | return self._find_matching_element(item, attrgetter, value, ignore_attrgetter, ignore_value, forward=False) 119 | 120 | def remove(self, element): 121 | if self.first is element: 122 | self.first = element.next 123 | if self.last is element: 124 | self.last = element.prev 125 | if element.prev is not None: 126 | element.prev.next = element.next 127 | if element.next is not None: 128 | element.next.prev = element.prev 129 | self.size -= 1 130 | 131 | def to_list(self): 132 | return [e.value for e in self] 133 | -------------------------------------------------------------------------------- /src/somajo/token.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | class Token: 5 | """Token objects store a piece of text (in the end a single token) with additional information. 6 | 7 | Parameters 8 | ---------- 9 | text : str 10 | The text that makes up the token object 11 | markup : bool, (default=False) 12 | Is the token a markup token? 13 | markup_class : {'start', 'end'}, optional (default=None) 14 | If `markup=True`, then `markup_class` must be either "start" or "end". 15 | markup_eos : bool, optional (default=None) 16 | Is the markup token a sentence boundary? 17 | locked : bool, (default=False) 18 | Mark the token as locked. 19 | token_class : {'URL', 'XML_entity', 'XML_tag', 'abbreviation', 'action_word', 'amount', 'date', 'email_address', 'emoticon', 'hashtag', 'measurement', 'mention', 'number', 'ordinal', 'regular', 'semester', 'symbol', 'time'}, optional (default=None) 20 | The class of the token, e.g. "regular", "emoticon", "URL", etc. 21 | space_after : bool, (default=True) 22 | Was there a space after the token in the original data? 23 | original_spelling : str, optional (default=None) 24 | The original spelling of the token, if it is different from the one in `text`. 25 | first_in_sentence : bool, (default=False) 26 | Is it the first token of a sentence? 27 | last_in_sentence : bool, (default=False) 28 | Is it the last token of a sentence? 29 | character_offset : tuple, (default=None) 30 | Character offset of the token in the input as tuple `(start, end)` 31 | such that `input[start:end] == text` (if there are no changes to 32 | the token text during tokenization) 33 | 34 | """ 35 | 36 | token_classes = { 37 | "URL", 38 | "XML_entity", 39 | "XML_tag", 40 | "abbreviation", 41 | "action_word", 42 | "amount", 43 | "date", 44 | "email_address", 45 | "emoticon", 46 | "hashtag", 47 | "measurement", 48 | "mention", 49 | "number", 50 | "ordinal", 51 | "regular", 52 | "semester", 53 | "symbol", 54 | "time", 55 | } 56 | 57 | def __init__( 58 | self, 59 | text, 60 | *, 61 | markup=False, 62 | markup_class=None, 63 | markup_eos=None, 64 | locked=False, 65 | token_class=None, 66 | space_after=True, 67 | original_spelling=None, 68 | first_in_sentence=False, 69 | last_in_sentence=False, 70 | character_offset=None 71 | ): 72 | self.text = text 73 | if markup: 74 | assert markup_class is not None, "You need to specify a `markup_class` for markup tokens." 75 | assert markup_eos is not None, "You need to provide a value for `markup_eos` for markup tokens." 76 | if markup_class is not None: 77 | assert markup, "You can only specify a `markup_class` for markup tokens." 78 | assert markup_class == "start" or markup_class == "end", f"'{markup_class}' is not a recognized markup class." 
79 | if markup_eos is not None: 80 | assert markup, "You can only use `markup_eos` for markup tokens." 81 | assert isinstance(markup_eos, bool), f"'{markup_eos}' is not a Boolean value." 82 | if token_class is not None: 83 | assert token_class in self.token_classes, f"'{token_class}' is not a recognized token class." 84 | self.markup = markup 85 | self.markup_class = markup_class 86 | self.markup_eos = markup_eos 87 | self._locked = locked 88 | self.token_class = token_class 89 | self.space_after = space_after 90 | self.original_spelling = original_spelling 91 | self.first_in_sentence = first_in_sentence 92 | self.last_in_sentence = last_in_sentence 93 | self.character_offset = character_offset 94 | 95 | def __str__(self): 96 | return self.text 97 | 98 | @property 99 | def extra_info(self): 100 | """String representation of extra information. 101 | 102 | Returns 103 | ------- 104 | str 105 | A string representation of the `space_after` and `original_spelling` attributes. 106 | 107 | Examples 108 | -------- 109 | >>> tok = Token(":)", token_class="regular", space_after=False, original_spelling=": )") 110 | >>> print(tok.text) 111 | :) 112 | >>> print(tok.extra_info) 113 | SpaceAfter=No, OriginalSpelling=": )" 114 | 115 | """ 116 | info = [] 117 | if not self.space_after: 118 | info.append("SpaceAfter=No") 119 | if self.original_spelling is not None: 120 | info.append("OriginalSpelling=\"%s\"" % self.original_spelling) 121 | return ", ".join(info) 122 | -------------------------------------------------------------------------------- /tests/test_doubly_linked_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import operator 4 | import unittest 5 | 6 | from somajo.doubly_linked_list import DLL 7 | 8 | 9 | class TestDLL(unittest.TestCase): 10 | def test_dll_01(self): 11 | lst = ["Foo", "", 0, -1, False, True, None] 12 | dll = DLL(lst) 13 | self.assertEqual(dll.to_list(), lst) 14 | 15 | def test_dll_02(self): 16 | lst = ["Foo", "", 0, -1, False, True, None] 17 | dll = DLL(lst) 18 | self.assertEqual(DLL(reversed(dll)).to_list(), list(reversed(lst))) 19 | 20 | def test_dll_03(self): 21 | lst = ["Foo", "", 0, -1, False, True, None] 22 | dll = DLL(lst) 23 | self.assertEqual(len(dll), len(lst)) 24 | 25 | def test_dll_04(self): 26 | lst = ["Foo", "", 0, -1, False, True, None] 27 | dll = DLL(["Foo", "", 0, -1, False, True, None]) 28 | self.assertEqual(str(dll), str(lst)) 29 | 30 | def test_dll_05(self): 31 | dll = DLL([4, 5, 6]) 32 | dll.append_left(3) 33 | self.assertEqual(dll.to_list(), [3, 4, 5, 6]) 34 | 35 | def test_dll_06(self): 36 | dll = DLL([4, 5, 6]) 37 | dll.append(7) 38 | self.assertEqual(dll.to_list(), [4, 5, 6, 7]) 39 | 40 | def test_dll_07(self): 41 | dll = DLL([4, 5, 6]) 42 | dll.extend([7, 8, 9]) 43 | self.assertEqual(dll.to_list(), [4, 5, 6, 7, 8, 9]) 44 | 45 | def test_dll_08(self): 46 | dll = DLL([4, 5, 6, 7]) 47 | last = dll.pop() 48 | self.assertEqual(last, 7) 49 | self.assertEqual(len(dll), 3) 50 | self.assertEqual(dll.to_list(), [4, 5, 6]) 51 | 52 | def test_dll_09(self): 53 | dll = DLL([]) 54 | self.assertEqual(len(dll), 0) 55 | self.assertEqual(dll.to_list(), []) 56 | 57 | def test_dll_10(self): 58 | dll = DLL([4]) 59 | last = dll.pop() 60 | self.assertEqual(last, 4) 61 | self.assertEqual(len(dll), 0) 62 | self.assertEqual(dll.to_list(), []) 63 | 64 | def test_dll_11(self): 65 | dll = DLL([4]) 66 | last = dll.pop() 67 | self.assertEqual(last, 4) 68 | self.assertRaises(IndexError, dll.pop) 69 | 70 | def 
test_dll_12(self): 71 | dll = DLL([]) 72 | dll.append_left(1) 73 | self.assertEqual(dll.to_list(), [1]) 74 | 75 | def test_dll_13(self): 76 | dll = DLL([1, 2, 3, 4]) 77 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2) 78 | self.assertEqual(x.value, 2) 79 | self.assertEqual([e.value for e in dll.__iter__(start=x)], [2, 3, 4]) 80 | 81 | def test_dll_14(self): 82 | dll = DLL([1, 2, 3, 4]) 83 | x = dll.previous_matching(dll.last, operator.attrgetter("value"), 3) 84 | self.assertEqual(x.value, 3) 85 | self.assertEqual([e.value for e in dll.__reversed__(start=x)], [3, 2, 1]) 86 | 87 | def test_dll_15(self): 88 | dll = DLL([1, 2, 3, 4]) 89 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 4, operator.attrgetter("value"), 3) 90 | self.assertEqual(x.value, 4) 91 | 92 | def test_dll_16(self): 93 | dll = DLL([1, 2, 3, 4]) 94 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 7) 95 | self.assertIs(x, None) 96 | 97 | def test_dll_17(self): 98 | dll = DLL([1, 2, 3]) 99 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2) 100 | dll.insert_left(7, x) 101 | self.assertEqual(dll.to_list(), [1, 7, 2, 3]) 102 | 103 | def test_dll_18(self): 104 | dll = DLL([1, 2, 3]) 105 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2) 106 | dll.insert_right(7, x) 107 | self.assertEqual(dll.to_list(), [1, 2, 7, 3]) 108 | 109 | def test_dll_19(self): 110 | dll = DLL([1, 2, 3]) 111 | self.assertTrue(dll.is_left_of(dll.first, dll.last)) 112 | 113 | def test_dll_20(self): 114 | dll = DLL([1, 2, 3]) 115 | self.assertTrue(dll.is_right_of(dll.last, dll.first)) 116 | 117 | def test_dll_21(self): 118 | dll = DLL([1, 2, 3]) 119 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2) 120 | dll.remove(x) 121 | self.assertEqual(dll.to_list(), [1, 3]) 122 | 123 | def test_dll_22(self): 124 | dll = DLL([1, 2, 3]) 125 | dll.remove(dll.first) 126 | self.assertEqual(dll.to_list(), [2, 3]) 127 | 128 | def test_dll_23(self): 129 | dll = DLL([1, 2, 3]) 130 | dll.remove(dll.last) 131 | self.assertEqual(dll.to_list(), [1, 2]) 132 | 133 | def test_dll_24(self): 134 | dll = DLL([1, 2, 3]) 135 | dll.insert_left(0, dll.first) 136 | self.assertEqual(dll.to_list(), [0, 1, 2, 3]) 137 | 138 | def test_dll_25(self): 139 | dll = DLL([1, 2, 3]) 140 | dll.insert_right(4, dll.last) 141 | self.assertEqual(dll.to_list(), [1, 2, 3, 4]) 142 | 143 | def test_dll_26(self): 144 | dll = DLL([1, 2, 3]) 145 | self.assertFalse(dll.is_left_of(dll.last, dll.first)) 146 | 147 | def test_dll_27(self): 148 | dll = DLL([1]) 149 | dll.remove(dll.last) 150 | self.assertEqual(dll.to_list(), []) 151 | 152 | def test_dll_28(self): 153 | dll = DLL([1]) 154 | dll.remove(dll.first) 155 | self.assertEqual(dll.to_list(), []) 156 | -------------------------------------------------------------------------------- /src/somajo/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import logging 5 | import time 6 | 7 | from . import ( 8 | SoMaJo, 9 | __version__ 10 | ) 11 | 12 | logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO) 13 | 14 | 15 | def arguments(): 16 | """""" 17 | parser = argparse.ArgumentParser(description="A tokenizer and sentence splitter for German and English texts. 
Currently, two tokenization guidelines are implemented: The EmpiriST guidelines for German web and social media texts (de_CMC) and the \"new\" Penn Treebank conventions for English texts (en_PTB).") 18 |     parser.add_argument("-l", "--language", choices=SoMaJo.supported_languages, default=SoMaJo._default_language, help="Choose a language. Currently supported are German EmpiriST-style tokenization (de_CMC) and English Penn-Treebank-style tokenization (en_PTB). (Default: de_CMC)") 19 |     parser.add_argument("-s", "--paragraph_separator", choices=SoMaJo.paragraph_separators, default=SoMaJo._default_parsep, help="How are paragraphs separated in the input text? Will be ignored if option -x/--xml is used. (Default: empty_lines)") 20 |     parser.add_argument("-x", "--xml", action="store_true", help="The input is an XML file. You can specify tags that always constitute a sentence break (e.g. HTML p tags) via the --tag option.") 21 |     parser.add_argument("--tag", action="append", help="Start and end tags of this type constitute sentence breaks, i.e. they do not occur in the middle of a sentence. Can be used multiple times to specify multiple tags, e.g. --tag p --tag br. Implies option -x/--xml. (Default: --tag title --tag h1 --tag h2 --tag h3 --tag h4 --tag h5 --tag h6 --tag p --tag br --tag hr --tag div --tag ol --tag ul --tag dl --tag table)") 22 |     parser.add_argument("--prune", action="append", help="Tags of this type will be removed from the input before tokenization. Can be used multiple times to specify multiple tags, e.g. --prune script --prune style. Implies option -x/--xml. By default, no tags are pruned.") 23 |     parser.add_argument("--strip-tags", action="store_true", help="Suppresses output of XML tags. Implies option -x/--xml.") 24 |     parser.add_argument("-c", "--split_camel_case", action="store_true", help="Split items written in camelCase (excluding established names and terms).") 25 |     parser.add_argument("--split_sentences", "--split-sentences", action="store_true", help="Also split the input into sentences.") 26 |     parser.add_argument("--sentence_tag", "--sentence-tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence_tag s). If this option is specified, sentences will be delimited by XML tags (e.g. <s>) instead of empty lines. This option implies --split_sentences.") 27 |     parser.add_argument("-t", "--token_classes", action="store_true", help="Output the token classes (number, XML tag, abbreviation, etc.)
in addition to the tokens.") 28 | parser.add_argument("-e", "--extra_info", action="store_true", help='Output additional information for each token: SpaceAfter=No if the token was not followed by a space and OriginalSpelling="…" if the token contained whitespace.') 29 | parser.add_argument("--character-offsets", action="store_true", help='Output character offsets in the input for each token.') 30 | parser.add_argument("--parallel", type=int, default=1, metavar="N", help="Run N worker processes (up to the number of CPUs) to speed up tokenization.") 31 | parser.add_argument("-v", "--version", action="version", version="SoMaJo %s" % __version__, help="Output version information and exit.") 32 | parser.add_argument("FILE", type=argparse.FileType("r", encoding="utf-8"), help="The input file (UTF-8-encoded) or \"-\" to read from STDIN.") 33 | args = parser.parse_args() 34 | return args 35 | 36 | 37 | def main(): 38 | args = arguments() 39 | n_tokens = 0 40 | n_sentences = 0 41 | t0 = time.perf_counter() 42 | is_xml = False 43 | if args.xml or args.strip_tags or (args.tag is not None) or (args.prune is not None): 44 | is_xml = True 45 | if args.sentence_tag: 46 | args.split_sentences = True 47 | tokenizer = SoMaJo( 48 | args.language, 49 | split_camel_case=args.split_camel_case, 50 | split_sentences=args.split_sentences, 51 | xml_sentences=args.sentence_tag, 52 | character_offsets=args.character_offsets 53 | ) 54 | if is_xml: 55 | eos_tags = args.tag 56 | if eos_tags is None: 57 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 58 | chunks = tokenizer.tokenize_xml_file(args.FILE, eos_tags, strip_tags=args.strip_tags, parallel=args.parallel, prune_tags=args.prune) 59 | else: 60 | chunks = tokenizer.tokenize_text_file(args.FILE, args.paragraph_separator, parallel=args.parallel) 61 | for chunk in chunks: 62 | n_sentences += 1 63 | for token in chunk: 64 | output = token.text 65 | if not token.markup: 66 | n_tokens += 1 67 | if args.token_classes: 68 | output += "\t" + token.token_class 69 | if args.extra_info: 70 | output += "\t" + token.extra_info 71 | if args.character_offsets: 72 | output += f"\t{token.character_offset[0]}, {token.character_offset[1]}" 73 | print(output) 74 | if args.split_sentences and args.sentence_tag is None: 75 | print() 76 | t1 = time.perf_counter() 77 | if args.split_sentences: 78 | logging.info("Tokenized %d tokens (%d sentences) in %d seconds (%d tokens/s)" % (n_tokens, n_sentences, t1 - t0, n_tokens / (t1 - t0))) 79 | else: 80 | logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" % (n_tokens, t1 - t0, n_tokens / (t1 - t0))) 81 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | 5 | from somajo import utils 6 | 7 | 8 | class TestXmlChunkGenerator(unittest.TestCase): 9 | def _equal(self, raw, chunks, prune_tags=None): 10 | eos_tags = set(["p"]) 11 | if prune_tags is not None: 12 | prune_tags = set(prune_tags) 13 | chunk_info = list(utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, prune_tags=prune_tags)) 14 | chunk_lists = (ci[0] for ci in chunk_info) 15 | chunk_lists = [[t.text for t in gc] for gc in chunk_lists] 16 | self.assertEqual(chunk_lists, chunks) 17 | 18 | def _equal_offsets(self, raw, chunks, prune_tags=None): 19 | eos_tags = set(["p"]) 20 | if prune_tags is not None: 21 | prune_tags = set(prune_tags) 22 | chunk_info = 
list(utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, prune_tags=prune_tags, character_offsets=True)) 23 | chunk_lists, raws, positions = zip(*chunk_info) 24 | offsets = [[t.character_offset for t in cl] for cl in chunk_lists] 25 | extracted_chunks = [[raw[s:e] for s, e in o] for o in offsets] 26 | self.assertEqual(extracted_chunks, chunks) 27 | 28 | def test_xml_chunk_generator_01(self): 29 | self._equal("foo bar", [["", "foo bar", ""]]) 30 | 31 | def test_xml_chunk_generator_02(self): 32 | self._equal("

foo

bar

", [["", "

", "foo", "

"], ["

", "bar", "

", "
"]]) 33 | 34 | def test_xml_chunk_generator_03(self): 35 | self._equal("\n

\nfoo\n

\n

\nbar\n

\n
", [["", "\n", "

", "\nfoo\n", "

"], ["\n", "

", "\nbar\n", "

", "\n", "
"]]) 36 | 37 | def test_xml_chunk_generator_04(self): 38 | self._equal( 39 | "\n

\n foo\n

\n

\n bar\n

\n
", 40 | [["", "\n ", "

", "\n foo\n ", "

"], ["\n ", "

", "\n bar\n ", "

", "\n", "
"]] 41 | ) 42 | 43 | def test_xml_chunk_generator_05(self): 44 | self._equal( 45 | "

foo

baz

bar

baz
", 46 | [["", "

", "foo", "

"], ["", "baz", ""], ["

", "bar", "

"], ["", "baz", "", "
"]] 47 | ) 48 | 49 | def test_xml_chunk_generator_06(self): 50 | self._equal( 51 | "

foo


bar


", 52 | [["", "

", "foo", "

"], ["
", "
", "

", "bar", "

"], ["
", "
", "
"]] 53 | ) 54 | 55 | def test_xml_chunk_generator_07(self): 56 | self._equal("foobar", [["", "", "bar", "", ""]], prune_tags=["del"]) 57 | 58 | def test_xml_chunk_generator_08(self): 59 | self._equal("foo

bar

", [["", "

", "bar", "

", "
"]], prune_tags=["del"]) 60 | 61 | def test_xml_chunk_generator_09(self): 62 | self._equal("bar\n foo\nbaz", [["", "bar\n \nbaz", ""]], prune_tags=["del"]) 63 | 64 | def test_xml_chunk_offsets_01(self): 65 | self._equal_offsets("Test", [["", "Test", ""]]) 66 | 67 | def test_xml_chunk_offsets_02(self): 68 | self._equal_offsets("3 < 5", [["", "3 < 5", ""]]) 69 | 70 | def test_xml_chunk_offsets_03(self): 71 | self._equal_offsets("Test­fall", [["", "Test­fall", ""]]) 72 | 73 | def test_xml_chunk_offsets_04(self): 74 | self._equal_offsets("Test­fall", [["", "Test­fall", ""]]) 75 | 76 | def test_xml_chunk_offsets_05(self): 77 | """Single combining mark""" 78 | self._equal_offsets("foo xÄx foo", [["", "foo xÄx foo", ""]]) 79 | 80 | def test_xml_chunk_offsets_06(self): 81 | """Multiple combining marks""" 82 | self._equal_offsets("foo xṩx foo", [["", "foo xṩx foo", ""]]) 83 | 84 | def test_xml_chunk_offsets_07(self): 85 | """Multiple combining marks""" 86 | self._equal_offsets("foo xṩx foo", [["", "foo xṩx foo", ""]]) 87 | 88 | def test_xml_chunk_offsets_08(self): 89 | """Multiple combining marks""" 90 | self._equal_offsets("foo xsḍ̇x foo", [["", "foo xsḍ̇x foo", ""]]) 91 | 92 | def test_xml_chunk_offsets_09(self): 93 | """Multiple combining marks""" 94 | self._equal_offsets("foo xq̣̇x foo", [["", "foo xq̣̇x foo", ""]]) 95 | 96 | def test_xml_chunk_offsets_10(self): 97 | self._equal_offsets("Foo", [["", "Foo", ""]]) 98 | 99 | def test_xml_chunk_offsets_11(self): 100 | self._equal_offsets("Foo", [["", "Foo", ""]]) 101 | 102 | def test_xml_chunk_offsets_12(self): 103 | self._equal_offsets(" Foo ", [["", " Foo ", ""]]) 104 | 105 | def test_xml_chunk_offsets_13(self): 106 | self._equal_offsets("Foo \"Bar\" 'Baz'", [["", "Foo \"Bar\" 'Baz'", ""]]) 107 | 108 | def test_xml_chunk_offsets_14(self): 109 | self._equal_offsets('\n Foo\n', [['', "\n Foo\n", ""]]) 110 | 111 | def test_xml_chunk_offsets_15(self): 112 | self._equal_offsets("Hallo
Tschüß
", [["", "Hallo", "
", "", "Tschüß", "
"]]) 113 | 114 | def test_xml_chunk_offsets_16(self): 115 | self._equal_offsets("Hallo
Tschüß
", [["", "Hallo", "
", "", "Tschüß", "
"]]) 116 | 117 | def test_xml_chunk_offsets_17(self): 118 | self._equal_offsets("\u0303foo", [["", "\u0303foo", ""]]) 119 | 120 | def test_xml_chunk_offsets_18(self): 121 | self._equal_offsets("foo

bar

", [["", "foo"], ["

", "bar", "

", "
"]]) 122 | 123 | @unittest.expectedFailure 124 | def test_xml_chunk_offsets_19(self): 125 | self._equal_offsets("bar futsch baz", [["", "bar baz", ""]], prune_tags=["del"]) 126 | -------------------------------------------------------------------------------- /utils/errors_train.txt: -------------------------------------------------------------------------------- 1 | __________________________________________________________________________________________________ 2 | tmp/cmc_train_twitter_2.txt ...a/all_train/tokenized/cmc_train_twitter_2.txt 3 | 4 | False Negative (linebreak inserted right): 5 | 238: 238: 6 | 239: @aPfeL4321 239: @aPfeL4321 7 | 240: * DasTB 240: * Das 8 | 241: sollte 241: * TB 9 | 242: allerdings 242: sollte 10 | 243: gut 243: allerdings 11 | 12 | False Negative (linebreak inserted right): 13 | 654: Vernachlässigung 655: Vernachlässigung 14 | 655: ? 656: ? 15 | 656: * Wenn2 657: * Wenn 16 | 657: : 658: * 2 17 | 658: warum 659: : 18 | 659: ? 660: warum 19 | 20 | __________________________________________________________________________________________________ 21 | tmp/cmc_train_blog_comment.txt ...ll_train/tokenized/cmc_train_blog_comment.txt 22 | 23 | False Positive (linebreak inserted left): 24 | 145: WIE 145: WIE 25 | 146: ICH 146: ICH 26 | 147: * WEI 147: * WEI? 27 | 148: * ? 148: HABT 28 | 149: HABT 149: IHR 29 | 150: IHR 150: BEIDE 30 | 31 | __________________________________________________________________________________________________ 32 | tmp/cmc_train_social_chat.txt ...all_train/tokenized/cmc_train_social_chat.txt 33 | 34 | False Positive (linebreak inserted left): 35 | 158: marc 157: marc 36 | 159: . 158: . 37 | 160: * . 159: * .))) 38 | 161: * ))) 160: 39 | 162: 161: 40 | 163: 162: ups 41 | 42 | False Positive (linebreak inserted left): 43 | 652: 650: 44 | 653: 651: 45 | 654: * 8 652: * 8:) 46 | 655: * :) 653: 47 | 656: 654: 1014: 52 | 1018: * 1015: * 53 | 1019: * 51cm 1016: * 51 54 | 1020: * 1017: * cm 55 | 1021: 1018: * 56 | 1022: 1019: 57 | 58 | False Negative (linebreak inserted right): 59 | 1340: 1338: 60 | 1341: 1339: 61 | 1342: * bochum-münster 1340: * bochum 62 | 1343: ohne 1341: * - 63 | 1344: küche 1342: münster 64 | 1345: 3500 1343: ohne 65 | 66 | False Negative (linebreak inserted right): 67 | 1340: 1339: 68 | 1341: 1340: bochum 69 | 1342: * bochum-münster 1341: * - 70 | 1343: ohne 1342: * münster 71 | 1344: küche 1343: ohne 72 | 1345: 3500 1344: küche 73 | 74 | __________________________________________________________________________________________________ 75 | tmp/cmc_train_professional_chat.txt ...ain/tokenized/cmc_train_professional_chat.txt 76 | 77 | False Negative (linebreak inserted right): 78 | 898: im 898: im 79 | 899: Pott 899: Pott 80 | 900: * :-)) 900: * :-) 81 | 901: ? 901: * ) 82 | 902: 902: ? 
83 | 903: 0): 36 | nfc_j += 1 37 | orig_j = orig_i + 1 38 | while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0): 39 | orig_j += 1 40 | assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j]), f"'{nfc[nfc_i:nfc_j]}' != unicodedata.normalize('NFC', '{orig[orig_i:orig_j]}')" 41 | alignment[(nfc_i, nfc_j)] = (orig_i, orig_j) 42 | nfc_i = nfc_j 43 | orig_i = orig_j 44 | assert orig_j == len(orig), f"{orig_j} != {len(orig)}; nfc: '{nfc}', orig: '{orig}'" 45 | return alignment 46 | 47 | 48 | def _determine_offsets(tokens, raw, position): 49 | """Determine start and end positions of tokens in the original raw (NFC) input.""" 50 | offsets = [] 51 | raw_i = 0 52 | raw = re.sub(r"\s", " ", raw) 53 | for token in tokens: 54 | if token.markup: 55 | start, end = token.character_offset 56 | start -= position 57 | end -= position 58 | else: 59 | text = token.text 60 | if token.original_spelling is not None: 61 | text = token.original_spelling 62 | text = re.sub(r"\s", " ", text) 63 | if raw[raw_i:].startswith(text): 64 | start = raw_i 65 | end = start + len(text) 66 | elif raw[raw_i:].startswith(" " + text): 67 | start = raw_i + 1 68 | end = start + len(text) 69 | else: 70 | raw_start = raw_i 71 | for i, char in enumerate(text): 72 | for j in range(raw_start, len(raw)): 73 | if raw[j] == char: 74 | if i == 0: 75 | start = j 76 | if i == len(text) - 1: 77 | end = j + 1 78 | break 79 | else: 80 | assert raw[j] in _skipable_characters, f"'{raw[j]}' ({hex(ord(raw[j]))}) is not a skipable character; token: '{text}', raw: '{raw[raw_i:]}'" 81 | raw_start = j + 1 82 | offsets.append((start, end)) 83 | raw_i = end 84 | return offsets 85 | 86 | 87 | def _resolve_entities(xml): 88 | """Resolve XML entities and provide an alignment from output string to input string.""" 89 | named = {"&": "&", "'": "'", ">": ">", "<": "<", """: '"'} 90 | outstring = "" 91 | alignment = [] 92 | xml_lower = xml.lower() 93 | i = 0 94 | for m in _xml_entity.finditer(xml_lower): 95 | start, end = m.span() 96 | if xml_lower[start + 2] == "x": 97 | char = chr(int(xml[start + 3:end - 1], base=16)) 98 | elif xml_lower[start + 1] == "#": 99 | char = chr(int(xml[start + 2:end - 1])) 100 | else: 101 | char = named[xml_lower[start:end]] 102 | outstring += xml[i:start] + char 103 | for j in range(i, start): 104 | alignment.append((j, j + 1)) 105 | alignment.append((start, end)) 106 | i = end 107 | outstring += xml[i:len(xml)] 108 | for j in range(i, len(xml)): 109 | alignment.append((j, j + 1)) 110 | return outstring, alignment 111 | 112 | 113 | def token_offsets(token_list, raw, position, xml_input, tokens): 114 | """Determine character offsets for tokens.""" 115 | if xml_input: 116 | chunk_offsets = [(t.character_offset[0] - position, t.character_offset[1] - position) for t in token_list] 117 | raw, align_to_entities = _resolve_entities(raw) 118 | align_from_entities = {i: char_i for char_i, (start, end) in enumerate(align_to_entities) for i in range(start, end)} 119 | chunks = [raw[align_from_entities[start]:align_from_entities[end - 1] + 1] for start, end in chunk_offsets] 120 | chunks_nfc = [unicodedata.normalize("NFC", c) for c in chunks] 121 | alignments = [_align_nfc(chunk_nfc, chunk) for chunk, chunk_nfc in zip(chunks, chunks_nfc)] 122 | align_to_raw = alignments[0] 123 | for i in range(1, len(alignments)): 124 | o1 = sum(len(c) for c in chunks_nfc[:i]) 125 | o2 = sum(len(c) for c in chunks[:i]) 126 | align_to_raw.update({(k[0] + o1, k[1] + o1): (v[0] + o2, v[1] + o2) for k, v in 
alignments[i].items()}) 127 | raw_nfc = "".join(chunks_nfc) 128 | else: 129 | raw_nfc = unicodedata.normalize("NFC", raw) 130 | align_to_raw = _align_nfc(raw_nfc, raw) 131 | align_from_raw = {i: k for k, v in align_to_raw.items() for i in range(v[0], v[1])} 132 | align_to_starts = {i: v[0] for k, v in align_to_raw.items() for i in range(k[0], k[1])} 133 | align_to_ends = {i: v[1] for k, v in align_to_raw.items() for i in range(k[0], k[1])} 134 | # adjust character offsets for markup tokens 135 | if xml_input: 136 | for i in range(len(tokens)): 137 | if tokens[i].markup: 138 | s, e = tokens[i].character_offset 139 | tokens[i].character_offset = ( 140 | align_from_raw[align_from_entities[s - position]][0] + position, 141 | align_from_raw[align_from_entities[e - position - 1]][1] + position 142 | ) 143 | offsets = _determine_offsets(tokens, raw_nfc, position) 144 | assert len(tokens) == len(offsets), f"Not as many tokens as offsets: {len(tokens)} != {len(offsets)}" 145 | offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets] 146 | if xml_input: 147 | offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets] 148 | offsets = [(s + position, e + position) for s, e in offsets] 149 | return offsets 150 | 151 | 152 | def xml_chunk_offset(token, raw): 153 | """Determine character offset for an XML chunk created by `utils._xml_chunk_generator`.""" 154 | raw, align_to_raw = _resolve_entities(raw) 155 | raw = re.sub(r"\s", " ", raw) 156 | text = token.text 157 | text = re.sub(r"\s", " ", text) 158 | if token.markup: 159 | text, align_to_text = _resolve_entities(text) 160 | text = text.replace("'", '"') 161 | if raw.startswith(text): 162 | start = 0 163 | end = len(text) 164 | else: 165 | pattern = "(" + re.escape(text) + ")" 166 | pattern = pattern.replace(r"\ ", r"\s+") 167 | pattern = pattern.replace("=", r"\s*=\s*") 168 | if not text.startswith(""): 34 | continue 35 | for char in line: 36 | characters.append(Character(char, False, False)) 37 | characters[-1] = Character(characters[-1].char, True, False) 38 | return characters 39 | 40 | 41 | def char_to_str(system, gold, focus=False): 42 | """""" 43 | string = system.char 44 | if focus: 45 | # sentence fp 46 | if system.sentence_boundary and (not gold.sentence_boundary): 47 | string += "■ " 48 | # sentence fn 49 | elif (not system.sentence_boundary) and gold.sentence_boundary: 50 | string += "□ " 51 | # token fp 52 | elif system.token_boundary and (not gold.token_boundary): 53 | string += "● " 54 | # token fn 55 | elif (not system.token_boundary) and gold.token_boundary: 56 | string += "○ " 57 | # any tp 58 | elif (system.sentence_boundary and gold.sentence_boundary) or (system.token_boundary and gold.token_boundary): 59 | string += " " 60 | else: 61 | if system.sentence_boundary or system.token_boundary: 62 | string += " " 63 | return string 64 | 65 | 66 | def precision_recall_f1(tp, fp, fn): 67 | """""" 68 | precision = tp / (tp + fp) 69 | recall = tp / (tp + fn) 70 | f1 = (2 * precision * recall) / (precision + recall) 71 | return precision, recall, f1 72 | 73 | 74 | def evaluate_file(system_path, gold_path, ignore_xml, sentences, error_file): 75 | """""" 76 | print("%s ⇔ %s" % (system_path, gold_path)) 77 | if error_file: 78 | with open(error_file, mode="a", encoding="utf-8") as e: 79 | e.write("%s ⇔ %s\n" % (system_path, gold_path)) 80 | with open(system_path, encoding="utf-8") as system, open(gold_path, encoding="utf-8") as gold: 81 | sys_chars = read_characters(system, ignore_xml, sentences) 82 
| gold_chars = read_characters(gold, ignore_xml, sentences) 83 | window = collections.deque([""] * 20) 84 | for s, g in zip(sys_chars, gold_chars): 85 | window.append(g.char) 86 | window.popleft() 87 | if s.char != g.char: 88 | print("'" + "".join(window) + "'") 89 | print("'%s' != '%s'" % (s.char, g.char)) 90 | break 91 | assert len(sys_chars) == len(gold_chars) 92 | assert all((s.char == g.char for s, g in zip(sys_chars, gold_chars))) 93 | token_precision, token_recall, token_f1, sentence_precision, sentence_recall, sentence_f1 = 0, 0, 0, 0, 0, 0 94 | token_tp, token_fp, token_fn, sentence_tp, sentence_fp, sentence_fn = 0, 0, 0, 0, 0, 0 95 | if error_file: 96 | with open(error_file, mode="a", encoding="utf-8") as e: 97 | sys_window = collections.deque([Character("", False, False)] * 41) 98 | gold_window = collections.deque([Character("", False, False)] * 41) 99 | for s, g in zip(sys_chars + [Character("", False, False)] * 20, gold_chars + [Character("", False, False)] * 20): 100 | sys_window.append(s) 101 | sys_window.popleft() 102 | gold_window.append(g) 103 | gold_window.popleft() 104 | if sys_window[20] != gold_window[20]: 105 | e.write("%s%s%s\n" % ("".join(char_to_str(x, y) for x, y in zip(list(sys_window)[:20], list(gold_window)[:20]))[-20:], 106 | char_to_str(sys_window[20], gold_window[20], focus=True), 107 | "".join(char_to_str(x, y) for x, y in zip(list(sys_window)[21:], list(gold_window)[21:]))[:20])) 108 | token_tp = len([s for s, g in zip(sys_chars, gold_chars) if g.token_boundary and s.token_boundary]) 109 | token_fp = len([s for s, g in zip(sys_chars, gold_chars) if (not g.token_boundary) and s.token_boundary]) 110 | token_fn = len([s for s, g in zip(sys_chars, gold_chars) if g.token_boundary and (not s.token_boundary)]) 111 | token_precision, token_recall, token_f1 = precision_recall_f1(token_tp, token_fp, token_fn) 112 | print("Tokenization:") 113 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (token_precision * 100, token_recall * 100, token_f1 * 100)) 114 | print("%d false positives, %d false negatives" % (token_fp, token_fn)) 115 | if sentences: 116 | sentence_tp = len([s for s, g in zip(sys_chars, gold_chars) if g.sentence_boundary and s.sentence_boundary]) 117 | sentence_fp = len([s for s, g in zip(sys_chars, gold_chars) if (not g.sentence_boundary) and s.sentence_boundary]) 118 | sentence_fn = len([s for s, g in zip(sys_chars, gold_chars) if g.sentence_boundary and (not s.sentence_boundary)]) 119 | sentence_precision, sentence_recall, sentence_f1 = precision_recall_f1(sentence_tp, sentence_fp, sentence_fn) 120 | print("Sentence splitting:") 121 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (sentence_precision * 100, sentence_recall * 100, sentence_f1 * 100)) 122 | print("%d false positives, %d false negatives" % (sentence_fp, sentence_fn)) 123 | print() 124 | return token_tp, token_fp, token_fn, token_precision, token_recall, token_f1, sentence_tp, sentence_fp, sentence_fn, sentence_precision, sentence_recall, sentence_f1 125 | 126 | 127 | def main(): 128 | """""" 129 | args = arguments() 130 | if args.errors: 131 | with open(args.errors, mode="w", encoding="utf-8") as e: 132 | pass 133 | if args.files: 134 | evaluate_file(args.SYSTEM, args.GOLD, args.ignore_xml, args.sentences, args.errors) 135 | elif args.directories: 136 | n_tokens, token_precision, token_recall, token_f1, n_sentences, sentence_precision, sentence_recall, sentence_f1 = 0, 0, 0, 0, 0, 0, 0, 0 137 | token_tp, token_fp, token_fn, sentence_tp, sentence_fp, sentence_fn = 0, 0, 0, 0, 0, 0 138 | 
system_files = sorted(os.listdir(args.SYSTEM)) 139 | gold_files = sorted(os.listdir(args.GOLD)) 140 | assert len(system_files) == len(gold_files) 141 | assert all((s == g for s, g in zip(system_files, gold_files))) 142 | for system_file, gold_file in zip(system_files, gold_files): 143 | ttp, tfp, tfn, tp, tr, tf, stp, sfp, sfn, sp, sr, sf = evaluate_file(os.path.join(args.SYSTEM, system_file), os.path.join(args.GOLD, gold_file), args.ignore_xml, args.sentences, args.errors) 144 | nt = ttp + tfn 145 | ns = stp + sfp 146 | token_tp += ttp 147 | token_fp += tfp 148 | token_fn += tfn 149 | sentence_tp += stp 150 | sentence_fp += sfp 151 | sentence_fn += sfn 152 | n_tokens += nt 153 | token_precision += nt * tp 154 | token_recall += nt * tr 155 | token_f1 += nt * tf 156 | n_sentences += ns 157 | sentence_precision += ns * sp 158 | sentence_recall += ns * sr 159 | sentence_f1 += ns * sf 160 | print("TOTAL") 161 | print("Tokenization (weighted average on %d tokens):" % n_tokens) 162 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (token_precision / n_tokens * 100, token_recall / n_tokens * 100, token_f1 / n_tokens * 100)) 163 | print("%d false positives, %d false negatives" % (token_fp, token_fn)) 164 | if args.sentences: 165 | print("Sentence splitting (weighted average on %d sentences):" % n_sentences) 166 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (sentence_precision / n_sentences * 100, sentence_recall / n_sentences * 100, sentence_f1 / n_sentences * 100)) 167 | print("%d false positives, %d false negatives" % (sentence_fp, sentence_fn)) 168 | 169 | 170 | if __name__ == "__main__": 171 | main() 172 | -------------------------------------------------------------------------------- /tests/test_alignment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import itertools 4 | import unicodedata 5 | import unittest 6 | 7 | import somajo.alignment 8 | from somajo.doubly_linked_list import DLL 9 | from somajo.token import Token 10 | from somajo.somajo import Tokenizer 11 | from somajo import utils 12 | 13 | 14 | class TestNfcAlignment(unittest.TestCase): 15 | def test_nfc_01(self): 16 | """Singleton: Angstrom sign""" 17 | orig = "xÅx" 18 | nfc = unicodedata.normalize("NFC", orig) 19 | alignment = {(0, 1): (0, 1), (1, 2): (1, 2), (2, 3): (2, 3)} 20 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 21 | 22 | def test_nfc_02(self): 23 | """Single combining mark""" 24 | orig = "xA\u0308x" 25 | nfc = unicodedata.normalize("NFC", orig) 26 | alignment = {(0, 1): (0, 1), (1, 2): (1, 3), (2, 3): (3, 4)} 27 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 28 | 29 | def test_nfc_03(self): 30 | """Multiple combining marks""" 31 | orig = "xs\u0323\u0307x" 32 | nfc = unicodedata.normalize("NFC", orig) 33 | alignment = {(0, 1): (0, 1), (1, 2): (1, 4), (2, 3): (4, 5)} 34 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 35 | 36 | def test_nfc_04(self): 37 | """Multiple combining marks""" 38 | orig = "xs\u0307\u0323x" 39 | nfc = unicodedata.normalize("NFC", orig) 40 | alignment = {(0, 1): (0, 1), (1, 2): (1, 4), (2, 3): (4, 5)} 41 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 42 | 43 | def test_nfc_05(self): 44 | """Multiple combining marks""" 45 | orig = "x\u1e0b\u0323x" 46 | nfc = unicodedata.normalize("NFC", orig) 47 | alignment = {(0, 1): (0, 1), (1, 3): (1, 3), (3, 4): (3, 4)} 48 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 49 | 
50 | def test_nfc_06(self): 51 | """Multiple combining marks""" 52 | orig = "q\u0307\u0323x" 53 | nfc = unicodedata.normalize("NFC", orig) 54 | alignment = {(0, 3): (0, 3), (3, 4): (3, 4)} 55 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 56 | 57 | def test_nfc_07(self): 58 | """Empty string""" 59 | orig = "" 60 | nfc = unicodedata.normalize("NFC", orig) 61 | alignment = {} 62 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 63 | 64 | 65 | class TestResolveEntities(unittest.TestCase): 66 | def test_entitites_01(self): 67 | xml = 'foo <bar> baz' 68 | resolved = 'foo baz' 69 | alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), 70 | (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), 71 | (12, 13), (13, 14), (14, 15), (15, 21), (21, 22), 72 | (22, 23), (23, 24), (24, 30), (30, 31), (31, 32), 73 | (32, 33), (33, 34), (34, 35), (35, 36), (36, 37), 74 | (37, 38), (38, 39), (39, 40), (40, 44), (44, 45), 75 | (45, 46), (46, 47), (47, 51), (51, 52), (52, 53), 76 | (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), 77 | (58, 59), (59, 60), (60, 61)] 78 | res, al = somajo.alignment._resolve_entities(xml) 79 | self.assertEqual(res, resolved) 80 | self.assertEqual(al, alignment) 81 | 82 | def test_entities_02(self): 83 | xml = "Test" 84 | resolved = "Test" 85 | alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), 86 | (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), 87 | (18, 19), (19, 20), (20, 21), (21, 22)] 88 | res, al = somajo.alignment._resolve_entities(xml) 89 | self.assertEqual(res, resolved) 90 | self.assertEqual(al, alignment) 91 | 92 | 93 | class TestDetermineOffsets(unittest.TestCase): 94 | def setUp(self): 95 | """Necessary preparations""" 96 | self.tokenizer = Tokenizer(split_camel_case=True, language="de_CMC") 97 | 98 | def _equal(self, raw, tokenized): 99 | raw = unicodedata.normalize("NFC", raw) 100 | if isinstance(tokenized, str): 101 | tokenized = tokenized.split() 102 | dll = DLL([Token(raw, first_in_sentence=True, last_in_sentence=True)]) 103 | tokens = self.tokenizer._tokenize(dll) 104 | offsets = somajo.alignment._determine_offsets(tokens, raw, position=0) 105 | self.assertEqual([raw[s:e] for s, e in offsets], tokenized) 106 | 107 | def test_token_alignment_01(self): 108 | self._equal("Ein simpler Test.", "Ein simpler Test .") 109 | 110 | def test_token_alignment_02(self): 111 | self._equal("bla \u1e0d\u0307amit.", "bla \u1e0d\u0307amit .") 112 | 113 | def test_token_alignment_03(self): 114 | self._equal("foo (bar) baz?", "foo ( bar ) baz ?") 115 | 116 | def test_token_alignment_03a(self): 117 | self._equal("foo:\n) bar", ["foo", ":\n)", "bar"]) 118 | 119 | def test_token_alignment_04(self): 120 | self._equal( 121 | "foo​bar foo­bar foo\ufeffbar foobarbazquxalphabetagamma foo‌bar‍baz foo‏bar‎baz foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta", 122 | ["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"] 123 | ) 124 | 125 | 126 | class TestTokenOffsets(unittest.TestCase): 127 | def setUp(self): 128 | """Necessary preparations""" 129 | self.tokenizer = Tokenizer(split_camel_case=True, language="de_CMC") 130 | 131 | def _equal_xml(self, raw, tokenized): 132 | raw = unicodedata.normalize("NFC", raw) 133 | if isinstance(tokenized, str): 134 | tokenized = tokenized.split() 135 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 136 | eos_tags = set(eos_tags) 137 | chunk_info = 
utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, character_offsets=True) 138 | chunk_lists = [ci[0] for ci in chunk_info] 139 | token_dlls = map(DLL, chunk_lists) 140 | chunks = map(self.tokenizer._tokenize, token_dlls) 141 | complete = list(itertools.chain.from_iterable(chunks)) 142 | offsets = somajo.alignment.token_offsets(list(itertools.chain.from_iterable(chunk_lists)), raw, 0, True, complete) 143 | self.assertEqual([raw[s:e] for s, e in offsets], tokenized) 144 | 145 | def test_token_alignment_05(self): 146 | self._equal_xml( 147 | "der beste Betreuer? - >ProfSmith! : )", 148 | ["", "der", "beste", "Betreuer", "?", "- >", "Prof", "Smith", "!", ": )", ""] 149 | ) 150 | 151 | def test_token_alignment_06(self): 152 | self._equal_xml("das steht auf S.­5", " das steht auf S. 5 ") 153 | 154 | def test_token_alignment_07(self): 155 | self._equal_xml("na so was -​> bla", " na so was - > bla ") 156 | 157 | def test_token_alignment_08(self): 158 | self._equal_xml("Test", " Test ") 159 | 160 | def test_token_alignment_09(self): 161 | self._equal_xml("3 < 5", " 3 < 5 ") 162 | 163 | def test_token_alignment_10(self): 164 | self._equal_xml("Test­fall", " Test­fall ") 165 | 166 | def test_token_alignment_11(self): 167 | self._equal_xml("Test­fall", " Test­fall ") 168 | 169 | def test_token_alignment_12(self): 170 | """Single combining mark""" 171 | self._equal_xml("foo xÄx foo", " foo xÄx foo ") 172 | 173 | def test_token_alignment_13(self): 174 | """Multiple combining marks""" 175 | self._equal_xml("foo xṩx foo", " foo xṩx foo ") 176 | 177 | def test_token_alignment_14(self): 178 | """Multiple combining marks""" 179 | self._equal_xml("foo xṩx foo", " foo xṩx foo ") 180 | 181 | def test_token_alignment_15(self): 182 | """Multiple combining marks""" 183 | self._equal_xml("foo xsḍ̇x foo", " foo xsḍ̇x foo ") 184 | 185 | def test_token_alignment_16(self): 186 | """Multiple combining marks""" 187 | self._equal_xml("foo xq̣̇x foo", " foo xq̣̇x foo ") 188 | 189 | def test_token_alignment_17(self): 190 | self._equal_xml("Foo", ["", "Foo", ""]) 191 | 192 | def test_token_alignment_18(self): 193 | self._equal_xml("Foo", ["", "Foo", ""]) 194 | 195 | def test_token_alignment_19(self): 196 | self._equal_xml(" Foo ", ["", "Foo", ""]) 197 | 198 | def test_token_alignment_20(self): 199 | self._equal_xml("Foo \"Bar\" 'Baz'", ["", "Foo", '"', "Bar", '"', "'", "Baz", "'", ""]) 200 | 201 | def test_token_alignment_21(self): 202 | self._equal_xml('\n Foo\n', ['', "Foo", ""]) 203 | 204 | def test_token_alignment_22(self): 205 | self._equal_xml("Hallo
Tschüß
", ["", "Hallo", "
", "", "Tschüß", "
"]) 206 | 207 | def test_token_alignment_23(self): 208 | self._equal_xml("Hallo
Tschüß
", ["", "Hallo", "
", "", "Tschüß", "
"]) 209 | 210 | def test_token_alignment_24(self): 211 | self._equal_xml("\u0303foo", ["", "\u0303foo", ""]) 212 | 213 | def test_token_alignment_25(self): 214 | self._equal_xml("foo

bar

", ["", "foo", "

", "bar", "

", "
"]) 215 | 216 | def test_token_alignment_26(self): 217 | self._equal_xml("

bar

baz

", ["", "

", "bar", "

", "

", "baz", "

", "
"]) 218 | -------------------------------------------------------------------------------- /tests/test_somajo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import io 4 | import unittest 5 | 6 | from somajo.somajo import SoMaJo 7 | 8 | 9 | class TestSoMaJo(unittest.TestCase): 10 | def setUp(self): 11 | """Necessary preparations""" 12 | self.tokenizer = SoMaJo("de_CMC") 13 | 14 | def _equal_text(self, paragraphs, tokenized_sentences, parallel=1): 15 | sentences = self.tokenizer.tokenize_text(paragraphs, parallel=parallel) 16 | sentences = [[t.text for t in s] for s in sentences] 17 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 18 | 19 | def _equal_text_file_single_newlines(self, paragraphs, tokenized_sentences, parallel=1): 20 | pseudofile = io.StringIO("\n".join(paragraphs)) 21 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="single_newlines", parallel=parallel) 22 | sentences = [[t.text for t in s] for s in sentences] 23 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 24 | 25 | def _equal_text_file_empty_lines(self, paragraphs, tokenized_sentences, parallel=1): 26 | pseudofile = io.StringIO("\n\n".join(paragraphs)) 27 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="empty_lines", parallel=parallel) 28 | sentences = [[t.text for t in s] for s in sentences] 29 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 30 | 31 | def _equal_xml(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None): 32 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 33 | sentences = self.tokenizer.tokenize_xml(xml, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags) 34 | sentences = [[t.text for t in s] for s in sentences] 35 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 36 | 37 | def _equal_xml_file(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None): 38 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 39 | pseudofile = io.StringIO(xml) 40 | sentences = self.tokenizer.tokenize_xml_file(pseudofile, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags) 41 | sentences = [[t.text for t in s] for s in sentences] 42 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 43 | 44 | 45 | class TestSoMaJoNoSent(TestSoMaJo): 46 | def setUp(self): 47 | """Necessary preparations""" 48 | self.tokenizer = SoMaJo("de_CMC", split_sentences=False) 49 | 50 | 51 | class TestText(TestSoMaJo): 52 | def test_text_01(self): 53 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"]) 54 | 55 | def test_text_02(self): 56 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"]) 57 | 58 | def test_text_03(self): 59 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"]) 60 | 61 | def test_text_04(self): 62 | self.assertRaises(TypeError, self.tokenizer.tokenize_text, "Foo bar. Baz qux") 63 | 64 | 65 | class TestTextXMLSent(TestSoMaJo): 66 | def setUp(self): 67 | """Necessary preparations""" 68 | self.tokenizer = SoMaJo("de_CMC", xml_sentences="s") 69 | 70 | def test_text_01(self): 71 | self._equal_text(["Foo bar. Baz qux", "alpha. 
Beta gamma"], [" Foo bar . ", " Baz qux ", " alpha . ", " Beta gamma "]) 72 | 73 | def test_text_02(self): 74 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], [" Foo bar . ", " Baz qux ", " alpha . ", " Beta gamma "]) 75 | 76 | 77 | class TestTextParallel(TestSoMaJo): 78 | def test_text_01(self): 79 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2) 80 | 81 | def test_text_02(self): 82 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2) 83 | 84 | def test_text_03(self): 85 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2) 86 | 87 | 88 | class TestTextNoSent(TestSoMaJoNoSent): 89 | def test_text_01(self): 90 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"]) 91 | 92 | def test_text_02(self): 93 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"]) 94 | 95 | def test_text_03(self): 96 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"]) 97 | 98 | 99 | class TestTextNoSentParallel(TestSoMaJoNoSent): 100 | def test_text_01(self): 101 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2) 102 | 103 | def test_text_02(self): 104 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2) 105 | 106 | def test_text_03(self): 107 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2) 108 | 109 | 110 | class TestXML(TestSoMaJo): 111 | def test_xml_01(self): 112 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"]) 113 | 114 | def test_xml_02(self): 115 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"]) 116 | 117 | 118 | class TestXMLParallel(TestSoMaJo): 119 | def test_xml_01(self): 120 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"], parallel=2) 121 | 122 | def test_xml_02(self): 123 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"], parallel=2) 124 | 125 | 126 | class TestXMLNoSent(TestSoMaJoNoSent): 127 | def test_xml_01(self): 128 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar . Baz qux

", "

alpha . Beta gamma

"]) 129 | 130 | def test_xml_02(self): 131 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar . Baz qux

", "

alpha . Beta gamma

"]) 132 | 133 | 134 | class TestXMLNoSentParallel(TestSoMaJoNoSent): 135 | def test_xml_01(self): 136 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar . Baz qux

", "

alpha . Beta gamma

"], parallel=2) 137 | 138 | def test_xml_02(self): 139 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar . Baz qux

", "

alpha . Beta gamma

"], parallel=2) 140 | 141 | 142 | class TestXMLStripTags(TestSoMaJo): 143 | def test_xml_01(self): 144 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], strip_tags=True) 145 | 146 | def test_xml_02(self): 147 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], strip_tags=True) 148 | 149 | 150 | class TestXMLPruneTags(TestSoMaJo): 151 | def test_xml_01(self): 152 | self._equal_xml("\n \n Spam\n \n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"], prune_tags=["head"]) 153 | 154 | def test_xml_02(self): 155 | self._equal_xml_file("\n \n Spam\n \n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"], prune_tags=["head"]) 156 | 157 | 158 | class TestCharacterOffsets(TestSoMaJo): 159 | def setUp(self): 160 | """Necessary preparations""" 161 | self.tokenizer = SoMaJo("de_CMC", character_offsets=True) 162 | 163 | def _equal_offsets_text_file(self, paragraphs, tokenized_sentences, parallel=1): 164 | raw = "\n\n".join(paragraphs) 165 | pseudofile = io.StringIO(raw) 166 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="empty_lines", parallel=parallel) 167 | sentences = list(sentences) 168 | tokens = [[t.text for t in s] for s in sentences] 169 | self.assertEqual(tokens, [ts.split() for ts in tokenized_sentences]) 170 | offsets = [[t.character_offset for t in s] for s in sentences] 171 | extracted = [[raw[s:e] for s, e in sent] for sent in offsets] 172 | self.assertEqual(tokens, extracted) 173 | 174 | def _equal_offsets_xml(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None): 175 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 176 | sentences = self.tokenizer.tokenize_xml(xml, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags) 177 | sentences = list(sentences) 178 | tokens = [[t.text for t in s] for s in sentences] 179 | self.assertEqual(tokens, [ts.split() for ts in tokenized_sentences]) 180 | offsets = [[t.character_offset for t in s] for s in sentences] 181 | extracted = [[xml[s:e] for s, e in sent] for sent in offsets] 182 | self.assertEqual(tokens, extracted) 183 | 184 | def test_text_offsets_01(self): 185 | self._equal_offsets_text_file(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"]) 186 | 187 | def test_xml_offsets_01(self): 188 | self._equal_offsets_xml("

bar

baz

", ["

bar

", "

baz

"]) 189 | 190 | def test_xml_offsets_02(self): 191 | self._equal_offsets_xml("\n

\nbar\n

\n

\nbaz\n

\n
", ["

bar

", "

baz

"]) 192 | -------------------------------------------------------------------------------- /utils/errors_test.txt: -------------------------------------------------------------------------------- 1 | __________________________________________________________________________________________________ 2 | tmp/web_test_009.txt ..._standard/test_web/tokenized/web_test_009.txt 3 | 4 | False Positive (linebreak inserted left): 5 | 169: Pops 169: Pops 6 | 170: 170: 7 | 171: * 1. 171: * 1.1. 8 | 172: * 1. 172: Kuchen 9 | 173: Kuchen 173: für 10 | 174: für 174: Cake 11 | 12 | False Positive (linebreak inserted left): 13 | 311: . 310: . 14 | 312: 311: 15 | 313: * 1. 312: * 1.2. 16 | 314: * 2. 313: Kuchen 17 | 315: Kuchen 314: für 18 | 316: für 315: Cake 19 | 20 | False Positive (linebreak inserted left): 21 | 448: . 446: . 22 | 449: 447: 23 | 450: * 1. 448: * 1.3. 24 | 451: * 3. 449: Kekse 25 | 452: Kekse 450: für 26 | 453: für 451: Cake 27 | 28 | __________________________________________________________________________________________________ 29 | tmp/web_test_011.txt ..._standard/test_web/tokenized/web_test_011.txt 30 | 31 | False Negative (linebreak inserted right): 32 | 238: man 238: man 33 | 239: sie 239: sie 34 | 240: * 1829/30 240: * 1829 35 | 241: in 241: * / 36 | 242: das 242: 30 37 | 243: Herrschaftliche 243: in 38 | 39 | False Negative (linebreak inserted right): 40 | 238: man 239: sie 41 | 239: sie 240: 1829 42 | 240: * 1829/30 241: * / 43 | 241: in 242: * 30 44 | 242: das 243: in 45 | 243: Herrschaftliche 244: das 46 | 47 | False Negative (linebreak inserted right): 48 | 411: Garten 413: Garten 49 | 412: musste 414: musste 50 | 413: * 1864/65 415: * 1864 51 | 414: dem 416: * / 52 | 415: Bau 417: 65 53 | 416: des 418: dem 54 | 55 | False Negative (linebreak inserted right): 56 | 411: Garten 414: musste 57 | 412: musste 415: 1864 58 | 413: * 1864/65 416: * / 59 | 414: dem 417: * 65 60 | 415: Bau 418: dem 61 | 416: des 419: Bau 62 | 63 | __________________________________________________________________________________________________ 64 | tmp/web_test_002.txt ..._standard/test_web/tokenized/web_test_002.txt 65 | 66 | False Negative (linebreak inserted right): 67 | 99: der 99: der 68 | 100: Saison 100: Saison 69 | 101: * 2009/2010 101: * 2009 70 | 102: sind 102: * / 71 | 103: laut 103: 2010 72 | 104: einer 104: sind 73 | 74 | False Negative (linebreak inserted right): 75 | 99: der 100: Saison 76 | 100: Saison 101: 2009 77 | 101: * 2009/2010 102: * / 78 | 102: sind 103: * 2010 79 | 103: laut 104: sind 80 | 104: einer 105: laut 81 | 82 | __________________________________________________________________________________________________ 83 | tmp/web_test_012.txt ..._standard/test_web/tokenized/web_test_012.txt 84 | 85 | False Positive (linebreak inserted left): 86 | 660: Backlinks 660: Backlinks 87 | 661: : 661: : 88 | 662: * [ 662: * [[ 89 | 663: * [ 663: security 90 | 664: security 664: : 91 | 665: : 665: verschlüsselung 92 | 93 | False Positive (linebreak inserted left): 94 | 665: : 664: : 95 | 666: verschlüsselung 665: verschlüsselung 96 | 667: * ] 666: * ]] 97 | 668: * ] 667: 98 | 669: 668: Navigation 99 | 670: Navigation 669: Passwort- 100 | 101 | __________________________________________________________________________________________________ 102 | tmp/web_test_004.txt ..._standard/test_web/tokenized/web_test_004.txt 103 | 104 | False Positive (linebreak inserted left): 105 | 141: Telekommunikationsgeheimnis 141: Telekommunikationsgeheimnis 106 | 142: ( 142: ( 107 | 143: * Art 143: * 
Art. 108 | 144: * . 144: 10 109 | 145: 10 145: GG 110 | 146: GG 146: , 111 | 112 | False Positive (linebreak inserted left): 113 | 146: GG 145: GG 114 | 147: , 146: , 115 | 148: * Art 147: * Art. 116 | 149: * . 148: 8 117 | 150: 8 149: Abs. 118 | 151: Abs. 150: 1 119 | 120 | False Positive (linebreak inserted left): 121 | 153: EMRK 151: EMRK 122 | 154: , 152: , 123 | 155: * Art 153: * Art. 124 | 156: * . 154: 7 125 | 157: 7 155: EU-GrCh 126 | 158: EU-GrCh 156: ) 127 | 128 | False Positive (linebreak inserted left): 129 | 173: gewährleistet 170: gewährleistet 130 | 174: . 171: . 131 | 175: * Art 172: * Art. 132 | 176: * . 173: 10 133 | 177: 10 174: GG 134 | 178: GG 175: sagt 135 | 136 | False Positive (linebreak inserted left): 137 | 270: Fernmeldegeheimnisses 266: Fernmeldegeheimnisses 138 | 271: in 267: in 139 | 272: * Art 268: * Art. 140 | 273: * . 269: 8 141 | 274: 8 270: EMRK 142 | 275: EMRK 271: und 143 | 144 | False Positive (linebreak inserted left): 145 | 275: EMRK 270: EMRK 146 | 276: und 271: und 147 | 277: * Art 272: * Art. 148 | 278: * . 273: 7 149 | 279: 7 274: EU-GrCh 150 | 280: EU-GrCh 275: : 151 | 152 | False Positive (linebreak inserted left): 153 | 448: Widerstandsrechts 442: Widerstandsrechts 154 | 449: ( 443: ( 155 | 450: * Art 444: * Art. 156 | 451: * . 445: 20 157 | 452: 20 446: Abs. 158 | 453: Abs. 447: 4 159 | 160 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | # CHANGELOG # 2 | 3 | ## Version 2.4.3, 2024-08-05 ## 4 | 5 | - Move non-abbreviation tokens that should not be split from 6 | `single_token_abbreviations_.txt` to 7 | `single_tokens_.txt` and add cellular networks generations 8 | (issue #32). 9 | 10 | ## Version 2.4.2, 2024-02-10 ## 11 | 12 | - Fix issues #28 and #29 (markdown links with trailing symbols after 13 | URL part). 14 | 15 | ## Version 2.4.1, 2024-02-09 ## 16 | 17 | - Fix issue #27 (URLs in angle brackets). 18 | 19 | ## Version 2.4.0, 2023-12-23 ## 20 | 21 | - New feature: SoMaJo can output character offsets for tokens, 22 | allowing for stand-off tokenization. Pass `character_offsets=True` 23 | to the constructor or use the option `--character-offsets` on the 24 | command line to enable the feature. The character offsets are 25 | determined by aligning the tokenized output with the input, 26 | therefore activating the feature incurs a noticeable increase in 27 | processing time. 28 | 29 | ## Version 2.3.1, 2023-09-23 ## 30 | 31 | - Fix issue #26 (markdown links that contain a URL in the link text). 32 | 33 | ## Version 2.3.0, 2023-08-14 ## 34 | 35 | - **Potentially breaking change:** The somajo-tokenizer script is 36 | automatically created upon installation and bin/somajo-tokenizer is 37 | removed. For most users, this does not make a difference. If you 38 | used to run your own modified version of SoMaJo directly via 39 | bin/somajo-tokenizer, consider installing the project in editable 40 | mode (see Development section in README.md). 41 | - Switch from setup.py to pyconfig.toml and restructure the project 42 | (source in src, tests in tests). 43 | - When creating a Token object, only known token classes can be 44 | passed. 45 | - Fix issue #25 (dates at the end of sentences) 46 | 47 | ## Version 2.2.4, 2023-06-23 ## 48 | 49 | - Improvements to tokenization of words containing numbers (e.g. 50 | COVID-19-Pandemie, FFP2-Maske). 
51 | 52 | ## Version 2.2.3, 2023-02-02 ## 53 | 54 | - Improvements to tokenization: Roman ordinals, abbreviation “Art.” 55 | preceding a number, certain units of measurement at the end of a 56 | sentence (e.g. km/h). 57 | 58 | ## Version 2.2.2, 2022-09-12 ## 59 | 60 | - Bugfix: Command-line option --sentence_tag implies option --split_sentences. 61 | 62 | ## Version 2.2.1, 2022-03-08 ## 63 | 64 | - Bugfix: Command-line option --strip-tags implies option --xml. 65 | 66 | ## Version 2.2.0, 2022-01-18 ## 67 | 68 | - New feature: Prune XML tags and their contents from the input before 69 | tokenization (via the command line option --prune TAGNAME1 --prune 70 | TAGNAME2 … or by passing prune_tags=["TAGNAME1", "TAGNAME2", …] to 71 | tokenize_xml or tokenize_xml_file). This can be useful when 72 | processing HTML files, e.g. for removing any
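
The prune-tags and character-offset features described in the changelog entries above correspond to the public API exercised in tests/test_somajo.py earlier in this listing. The following sketch shows how the two options can be combined; it assumes only the calls visible in those tests (`SoMaJo("de_CMC", character_offsets=True)`, `tokenize_xml(data, eos_tags, prune_tags=...)`, and the `Token.text` / `Token.character_offset` attributes). The sample XML string, the chosen tag lists, and the variable names are purely illustrative, not taken from the repository.

```python
from somajo import SoMaJo

# Illustrative input only; any XML fragment with sentence-ending elements works.
xml = "<html><body><p>Foo bar. Baz qux</p><script>var x = 1;</script></body></html>"

tokenizer = SoMaJo("de_CMC", character_offsets=True)
sentences = tokenizer.tokenize_xml(
    xml,
    "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split(),  # eos_tags, as in the tests
    prune_tags=["script"],  # drop <script> elements and their contents before tokenizing
)
for sentence in sentences:
    for token in sentence:
        start, end = token.character_offset
        # xml[start:end] is the span of the untokenized input that yielded this token
        print(token.text, start, end, repr(xml[start:end]))
```

As the offset tests above suggest, markup tokens are reported alongside word tokens, and slicing the original input with each token's character offsets reproduces the token's surface form, which is what makes stand-off annotation on the unmodified input possible.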