├── tests ├── __init__.py ├── test_token.py ├── test_doubly_linked_list.py ├── test_utils.py ├── test_alignment.py ├── test_somajo.py └── test_tokenizer_internal.py ├── requirements_dev.txt ├── doc ├── source │ ├── modules.rst │ ├── somajo.rst │ ├── index.rst │ └── conf.py ├── Makefile ├── make.bat └── build │ └── markdown │ └── somajo.md ├── utils ├── run_tests.sh ├── evaluate_on_gum.sh ├── annotate_cmc.sh ├── annotate_web.sh ├── evaluate_on_konvens.sh ├── evaluate_on_test_cmc.sh ├── evaluate_on_test_web.sh ├── evaluate_on_ewt.sh ├── baseline.sh ├── errors_trial.txt ├── errors_train.txt ├── evaluate.py └── errors_test.txt ├── .gitignore ├── src └── somajo │ ├── data │ ├── non-breaking_hyphenated_words_en.txt │ ├── single_tokens_en.txt │ ├── single_token_abbreviations_en.txt │ ├── single_tokens_de.txt │ ├── eos_abbreviations.txt │ ├── non-breaking_suffixes_en.txt │ ├── units.txt │ ├── single_token_abbreviations_de.txt │ ├── non-breaking_prefixes_en.txt │ ├── tokens_with_plus_or_ampersand.txt │ ├── abbreviations_en.txt │ └── abbreviations_de.txt │ ├── __init__.py │ ├── doubly_linked_list.py │ ├── token.py │ ├── cli.py │ ├── alignment.py │ └── sentence_splitter.py ├── .github └── workflows │ └── test.yml ├── README.rst ├── pyproject.toml ├── CHANGES.txt └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | build 2 | sphinx 3 | sphinx-markdown-builder 4 | twine 5 | -------------------------------------------------------------------------------- /doc/source/modules.rst: -------------------------------------------------------------------------------- 1 | somajo 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | somajo 8 | -------------------------------------------------------------------------------- /utils/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR/.. 5 | 6 | # Test Discovery 7 | python3 -m unittest discover 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /data/ 2 | /dist/ 3 | /doc/build/markdown/index.md 4 | /doc/build/markdown/modules.md 5 | /doc/build/doctrees/ 6 | /src/SoMaJo.egg-info/ 7 | /venv/ 8 | __pycache__/ 9 | *~ 10 | *.pyc 11 | -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_hyphenated_words_en.txt: -------------------------------------------------------------------------------- 1 | # Hyphenated words in the following list are not split into multiple tokens. 2 | 3 | mm-hm 4 | mm-mm 5 | o-kay 6 | uh-huh 7 | uh-oh 8 | x-ray 9 | x-rayed 10 | x-rays 11 | -------------------------------------------------------------------------------- /src/somajo/data/single_tokens_en.txt: -------------------------------------------------------------------------------- 1 | # A list of tokens that should not be split. 2 | # 3 | # Lines starting with “#” are treated as comments and will be ignored. 
4 | 5 | tl;dr 6 | 7 | # mobile telephony 8 | 3G 9 | 4G 10 | 5G 11 | -------------------------------------------------------------------------------- /src/somajo/data/single_token_abbreviations_en.txt: -------------------------------------------------------------------------------- 1 | # A list of multi-dot abbreviations that represent single tokens and 2 | # should not be split. 3 | # 4 | # Lines starting with “#” are treated as comments and will be ignored. 5 | 6 | e.g. 7 | i.e. 8 | a.m. 9 | p.m. 10 | P.S. 11 | T.V. 12 | -------------------------------------------------------------------------------- /src/somajo/data/single_tokens_de.txt: -------------------------------------------------------------------------------- 1 | # A list of tokens that should not be split. 2 | # 3 | # Lines starting with “#” are treated as comments and will be ignored. 4 | 5 | .Net 6 | /rant 7 | /s 8 | E/E 9 | tl;dr 10 | zl;ng 11 | 12 | # SAP Versions 13 | S/4 14 | R/3 15 | 16 | # mobile telephony 17 | 3G 18 | 4G 19 | 5G 20 | -------------------------------------------------------------------------------- /src/somajo/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | 3 | from . import ( 4 | sentence_splitter, 5 | somajo, 6 | tokenizer 7 | ) 8 | 9 | __version__ = importlib.metadata.version(__package__ or __name__) 10 | 11 | Tokenizer = tokenizer.Tokenizer 12 | SentenceSplitter = sentence_splitter.SentenceSplitter 13 | SoMaJo = somajo.SoMaJo 14 | -------------------------------------------------------------------------------- /doc/source/somajo.rst: -------------------------------------------------------------------------------- 1 | somajo package 2 | ============== 3 | 4 | somajo.somajo module 5 | -------------------- 6 | 7 | .. automodule:: somajo.somajo 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | somajo.token module 13 | ------------------- 14 | 15 | .. automodule:: somajo.token 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /utils/evaluate_on_gum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/GUM/text/* 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer -l en $f > tmp/$filename 11 | done 12 | perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_gum.txt tmp ../data/GUM/tokenized 13 | rm -r tmp/ 14 | -------------------------------------------------------------------------------- /src/somajo/data/eos_abbreviations.txt: -------------------------------------------------------------------------------- 1 | # A list of abbreviations that frequently occur at the end of a 2 | # sentence. If such an abbreviation is followed by a potential 3 | # sentence start, e.g. by a capital letter, it will be interpreted as 4 | # the end of a sentence. 5 | # 6 | # Lines starting with “#” are treated as comments and will be ignored. 7 | 8 | usw. 9 | usf. 10 | etc. 11 | uvam. 
12 | -------------------------------------------------------------------------------- /utils/annotate_cmc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | for f in ../../data/empirist_test_tok_cmc/raw/* 7 | do 8 | filename=$(basename $f) 9 | ../bin/tokenizer --split_camel_case $f > ../../data/cmc_tok_SoMaJo/$filename 10 | # ../bin/tokenizer $f > ../../data/cmc_tok_SoMaJo/$filename 11 | done 12 | perl ../../data/empirist_test_tok_cmc/tools/validate_tokenization.perl -x ../../data/cmc_tok_SoMaJo/ ../../data/empirist_test_tok_cmc/raw/ 13 | -------------------------------------------------------------------------------- /utils/annotate_web.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | for f in ../../data/empirist_test_tok_web/raw/* 7 | do 8 | filename=$(basename $f) 9 | ../bin/tokenizer --split_camel_case $f > ../../data/web_tok_SoMaJo/$filename 10 | # ../bin/tokenizer $f > ../../data/web_tok_SoMaJo/$filename 11 | done 12 | perl ../../data/empirist_test_tok_web/tools/validate_tokenization.perl -x ../../data/web_tok_SoMaJo/ ../../data/empirist_test_tok_web/raw/ 13 | -------------------------------------------------------------------------------- /utils/evaluate_on_konvens.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/Ortmann_et_al/txt/*.txt 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer --split-sentences $f > tmp/$filename 11 | done 12 | perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/Ortmann_et_al/tokens 13 | ./evaluate.py -d --sentences -e errors.txt tmp/ ../data/Ortmann_et_al/tokens/ 14 | rm -r tmp/ 15 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. SoMaJo documentation master file, created by 2 | sphinx-quickstart on Thu Dec 19 08:01:21 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to SoMaJo's documentation! 7 | ================================== 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /utils/evaluate_on_test_cmc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/empirist_gold_standard/test_cmc/raw/* 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer --split_camel_case $f > tmp/$filename 11 | done 12 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/empirist_gold_standard/test_cmc/tokenized 13 | ./evaluate.py -d -e errors.txt --ignore-xml tmp/ ../data/empirist_gold_standard/test_cmc/tokenized/ 14 | rm -r tmp/ 15 | -------------------------------------------------------------------------------- /utils/evaluate_on_test_web.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/empirist_gold_standard/test_web/raw/* 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer --split_camel_case $f > tmp/$filename 11 | done 12 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_test.txt tmp ../data/empirist_gold_standard/test_web/tokenized 13 | ./evaluate.py -d -e errors.txt --ignore-xml tmp/ ../data/empirist_gold_standard/test_web/tokenized/ 14 | rm -r tmp/ 15 | -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_suffixes_en.txt: -------------------------------------------------------------------------------- 1 | # Hyphenated suffixes in the following list are not split into multiple tokens. 2 | # Euro-centric and element-wise are both single tokens. 3 | 4 | able 5 | ahol 6 | aholic 7 | ation 8 | centric 9 | cracy 10 | crat 11 | dom 12 | er 13 | ery 14 | esque 15 | ette 16 | fest 17 | fold 18 | ful 19 | gate 20 | gon 21 | hood 22 | ian 23 | ible 24 | ing 25 | isation 26 | ise 27 | ising 28 | ism 29 | ist 30 | itis 31 | ization 32 | ize 33 | izing 34 | less 35 | logist 36 | logy 37 | ly 38 | most 39 | o-torium 40 | rama 41 | wise 42 | -------------------------------------------------------------------------------- /src/somajo/data/units.txt: -------------------------------------------------------------------------------- 1 | # A list of units preceded by numbers. The list is case-insensitive. 2 | # 3 | # Lines starting with “#” are treated as comments and will be ignored. 
4 | 5 | bit 6 | cent 7 | cm 8 | cm2 9 | cm3 10 | cm^2 11 | cm^3 12 | cm² 13 | cm³ 14 | dm 15 | dm2 16 | dm3 17 | dm^2 18 | dm^3 19 | dm² 20 | dm³ 21 | eur 22 | f 23 | ft 24 | g 25 | gbit/s 26 | ghz 27 | h 28 | hz 29 | kg 30 | km 31 | km/h 32 | km2 33 | km3 34 | km^2 35 | km^3 36 | km² 37 | km³ 38 | l 39 | lb 40 | m 41 | m2 42 | m3 43 | m^2 44 | m^3 45 | mbit/s 46 | min 47 | ml 48 | mm 49 | mm2 50 | mm3 51 | mm^2 52 | mm^3 53 | mm² 54 | mm³ 55 | m² 56 | m³ 57 | qm 58 | s 59 | sek 60 | -------------------------------------------------------------------------------- /utils/evaluate_on_ewt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../data/English_Web_Treebank/en-ud-*.txt 8 | do 9 | filename=$(basename $f) 10 | somajo-tokenizer -l en_PTB $f > tmp/$filename 11 | done 12 | echo "GOLD" 13 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_ewt.txt tmp ../data/English_Web_Treebank/gold 14 | ./evaluate.py -d -e errors.txt tmp/ ../data/English_Web_Treebank/gold/ 15 | # echo "" 16 | # echo "SEMIGOLD" 17 | # perl ../data/empirist_gold_standard/tools/compare_tokenization.perl -e errors_ewt_semi.txt tmp ../data/English_Web_Treebank/semigold 18 | rm -r tmp/ 19 | -------------------------------------------------------------------------------- /tests/test_token.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | 5 | from somajo.token import Token 6 | 7 | 8 | class TestToken(unittest.TestCase): 9 | def test_token_01(self): 10 | text = "FooBar" 11 | t = Token(text) 12 | self.assertEqual(str(t), text) 13 | 14 | def test_token_02(self): 15 | t = Token("FooBar", space_after=False, original_spelling="Foo Bar") 16 | self.assertEqual(t.extra_info, 'SpaceAfter=No, OriginalSpelling="Foo Bar"') 17 | 18 | def test_token_03(self): 19 | t = Token("

", markup=True, markup_class="start", markup_eos=True) 20 | self.assertEqual(t.markup_class, "start") 21 | self.assertTrue(t.markup_eos) 22 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /src/somajo/data/single_token_abbreviations_de.txt: -------------------------------------------------------------------------------- 1 | # A list of multi-dot abbreviations that represent single tokens and 2 | # should not be split. 3 | # 4 | # Lines starting with “#” are treated as comments and will be ignored. 5 | 6 | ak.mas 7 | Art.-Nr. 8 | At.-Gew. 9 | Best.-Nr. 10 | BT-Drs. 11 | Dipl.-Ing. 12 | E.ON 13 | Forsch.frage 14 | GV.NRW. 15 | H.-I. 16 | H.-Qu. 17 | IT.NRW 18 | klass.-lat. 19 | Komm.formen 20 | Krim.-Ob.-Insp. 21 | Kto.-Nr. 22 | L.-Abg. 23 | M.-Schr. 24 | Mat.-Nr. 25 | MBl.NRW. 26 | o.k. 27 | Pan.do/ra 28 | Priv.-Doz. 29 | prov.-fr. 30 | Proz.-Bev. 31 | r.-k. 32 | Reg.-Bez. 33 | Rg.-Präs. 34 | röm.-kath. 35 | Sat.1 36 | SMBl.NRW. 37 | soz.-päd. 38 | SP.ON 39 | T.V. 40 | Uni-Kl. 41 | USt-IdNr. 42 | Zeitschr.titel 43 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 11 | 12 | steps: 13 | - name: Checkout sources 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install . 25 | 26 | - name: Test 27 | run: | 28 | python -m unittest discover 29 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | SoMaJo 2 | ====== 3 | 4 | SoMaJo is a rule-based tokenizer and sentence splitter that implements 5 | tokenization guidelines for German and English. 
It has a strong focus 6 | on web and social media texts (it was originally created as the 7 | winning submission to the `EmpiriST 2015 shared task 8 | `_ on automatic 9 | linguistic annotation of computer-mediated communication / social 10 | media) and is particularly well-suited to perform tokenization on all 11 | kinds of written discourse, for example chats, forums, wiki talk 12 | pages, tweets, blog comments, social networks, SMS and WhatsApp 13 | dialogues. Of course it also works on more formal texts. 14 | 15 | More detailed documentation is available `here 16 | `_. 17 | -------------------------------------------------------------------------------- /utils/baseline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTDIR=$(dirname $BASH_SOURCE) 4 | cd $SCRIPTDIR 5 | 6 | mkdir tmp 7 | for f in ../../data/all_test/raw/* 8 | # for f in ../../data/empirist_test_pos_cmc/raw/* 9 | # for f in ../../data/empirist_test_pos_web/raw/* 10 | do 11 | filename=$(basename $f) 12 | sed -re "/^<[^>]+>$/! { s/([.!?,;:+*()\"'–])/ \1 /g; s/\s+/\n/g }" $f > tmp/$filename 13 | done 14 | perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -x -e errors_baseline_test.txt tmp ../../data/all_test/tokenized 15 | # perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -e errors_test.txt tmp ../../data/empirist_test_pos_cmc/tokenized 16 | # perl ../../data/empirist_test_pos_web/tools/compare_tokenization.perl -e errors_test.txt tmp ../../data/empirist_test_pos_web/tokenized 17 | rm -r tmp/ 18 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_prefixes_en.txt: -------------------------------------------------------------------------------- 1 | # Hyphenated prefixes in the following list are not split into multiple tokens. 2 | # E-mail and re-evaluation are both single tokens. 
3 | 4 | a 5 | adeno 6 | agro 7 | ambi 8 | ante 9 | anti 10 | aorto 11 | arch 12 | axio 13 | be 14 | bi 15 | bio 16 | broncho 17 | centi 18 | circum 19 | cis 20 | co 21 | colo 22 | contra 23 | cortico 24 | counter 25 | cran 26 | cross 27 | crypto 28 | cyber 29 | de 30 | deca 31 | demi 32 | dis 33 | e 34 | eco 35 | electro 36 | ennea 37 | ex 38 | extra 39 | ferro 40 | gastro 41 | giga 42 | hemi 43 | hepta 44 | hexa 45 | hypo 46 | ideo 47 | idio 48 | in 49 | infra 50 | inter 51 | intra 52 | iso 53 | judeo 54 | macro 55 | medi 56 | mega 57 | micro 58 | mid 59 | milli 60 | mini 61 | mono 62 | multi 63 | musculo 64 | neo 65 | neuro 66 | nitro 67 | non 68 | novem 69 | octa 70 | octo 71 | ortho 72 | over 73 | paleo 74 | pan 75 | para 76 | pelvi 77 | penta 78 | peri 79 | pheno 80 | phospho 81 | pica 82 | pneumo 83 | poly 84 | post 85 | pre 86 | preter 87 | pro 88 | pseudo 89 | quadri 90 | quasi 91 | quinque 92 | re 93 | recto 94 | salpingo 95 | semi 96 | sept 97 | sero 98 | soci 99 | sub 100 | super 101 | supra 102 | sur 103 | tele 104 | tera 105 | tetra 106 | tri 107 | u 108 | uber 109 | ultra 110 | un 111 | uni 112 | veno 113 | ventriculo 114 | vice 115 | x 116 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # 1. Build distribution files: 2 | # python3 -m build 3 | # 4 | # 2. Upload to PyPI: 5 | # twine upload dist/* 6 | # 7 | # 3. Check if everything looks all right: 8 | # https://pypi.python.org/pypi/SoMaJo 9 | # 10 | # 4. Go to https://github.com/tsproisl/SoMaJo/releases/new and create 11 | # a new release 12 | [project] 13 | name = "SoMaJo" 14 | version = "2.4.3" 15 | description = "A tokenizer and sentence splitter for German and English web and social media texts." 
16 | readme = "README.md" 17 | requires-python = ">=3.8" 18 | license = {file = "LICENSE.txt"} 19 | keywords = ["tokenizer", "sentence-splitter"] 20 | authors = [ 21 | {name = "Thomas Proisl, Peter Uhrig", email = "thomas.proisl@fau.de"} 22 | ] 23 | maintainers = [ 24 | {name = "Thomas Proisl", email = "thomas.proisl@fau.de"} 25 | ] 26 | classifiers = [ 27 | "Development Status :: 5 - Production/Stable", 28 | "Environment :: Console", 29 | "Intended Audience :: Developers", 30 | "Intended Audience :: Science/Research", 31 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 32 | "Natural Language :: German", 33 | "Natural Language :: English", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 3", 36 | "Topic :: Text Processing :: Linguistic", 37 | ] 38 | 39 | dependencies = [ 40 | "regex>=2019.02.18", 41 | ] 42 | 43 | [project.urls] 44 | "Homepage" = "https://github.com/tsproisl/SoMaJo" 45 | "API documentation" = "https://github.com/tsproisl/SoMaJo/blob/master/doc/build/markdown/somajo.md" 46 | 47 | [project.scripts] 48 | somajo-tokenizer = "somajo.cli:main" 49 | 50 | [build-system] 51 | requires = ["setuptools>=61.0"] 52 | build-backend = "setuptools.build_meta" 53 | 54 | [tool.setuptools.packages.find] 55 | where = ["src"] 56 | 57 | [tool.setuptools.package-data] 58 | "somajo.data" = ["*.txt"] 59 | -------------------------------------------------------------------------------- /utils/errors_trial.txt: -------------------------------------------------------------------------------- 1 | __________________________________________________________________________________________________ 2 | tmp/blog_comments.txt ../../data/all_trial/tokenized/blog_comments.txt 3 | 4 | False Positive (linebreak inserted left): 5 | 145: WIE 145: WIE 6 | 146: ICH 146: ICH 7 | 147: * WEI 147: * WEI? 8 | 148: * ? 148: HABT 9 | 149: HABT 149: IHR 10 | 150: IHR 150: BEIDE 11 | 12 | __________________________________________________________________________________________________ 13 | tmp/social_chat.txt ../../data/all_trial/tokenized/social_chat.txt 14 | 15 | False Positive (linebreak inserted left): 16 | 157: marc 157: marc 17 | 158: . 158: . 18 | 159: * . 159: * .))) 19 | 160: * ))) 160: 20 | 161: 161: 21 | 162: 162: ups 22 | 23 | __________________________________________________________________________________________________ 24 | tmp/wikipedia_talk_pages.txt .../all_trial/tokenized/wikipedia_talk_pages.txt 25 | 26 | False Negative (linebreak inserted right): 27 | 495: meine 495: meine 28 | 496: ich 496: ich 29 | 497: * ;O)) 497: * ;O) 30 | 498: . 498: * ) 31 | 499: . 32 | 33 | 34 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # pip install sphinx sphinx-markdown-builder 2 | # mkdir doc 3 | # cd doc/ 4 | # sphinx-quickstart --sep -p SoMaJo -a "Thomas Proisl, Peter Uhrig" -v "2.0.0" --ext-autodoc --extensions sphinx.ext.napoleon 5 | # # edit source/conf.py: 6 | # # import os 7 | # # import sys 8 | # # sys.path.insert(0, os.path.abspath('../..')) 9 | # cd .. 10 | # sphinx-apidoc -f -o doc/source/ somajo 11 | # cd doc 12 | # make markdown 13 | 14 | 15 | # Configuration file for the Sphinx documentation builder. 16 | # 17 | # This file only contains a selection of the most common options. 
For a full 18 | # list see the documentation: 19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 20 | 21 | # -- Path setup -------------------------------------------------------------- 22 | 23 | # If extensions (or modules to document with autodoc) are in another directory, 24 | # add these directories to sys.path here. If the directory is relative to the 25 | # documentation root, use os.path.abspath to make it absolute, like shown here. 26 | # 27 | import os 28 | import sys 29 | sys.path.insert(0, os.path.abspath('../..')) 30 | 31 | 32 | # -- Project information ----------------------------------------------------- 33 | 34 | project = 'SoMaJo' 35 | copyright = '2019, Thomas Proisl, Peter Uhrig' 36 | author = 'Thomas Proisl, Peter Uhrig' 37 | 38 | # The short X.Y version 39 | version = '2.0.0' 40 | 41 | # The full version, including alpha/beta/rc tags 42 | release = '2.0.0' 43 | 44 | 45 | # -- General configuration --------------------------------------------------- 46 | 47 | # Add any Sphinx extension module names here, as strings. They can be 48 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 49 | # ones. 50 | extensions = [ 51 | 'sphinx.ext.autodoc', 52 | 'sphinx.ext.napoleon', 53 | 'sphinx_markdown_builder', 54 | ] 55 | 56 | # Add any paths that contain templates here, relative to this directory. 57 | templates_path = ['_templates'] 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This pattern also affects html_static_path and html_extra_path. 62 | exclude_patterns = [] 63 | 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 69 | # 70 | html_theme = 'alabaster' 71 | 72 | # Add any paths that contain custom static files (such as style sheets) here, 73 | # relative to this directory. They are copied after the builtin static files, 74 | # so a file named "default.css" will overwrite the builtin "default.css". 
75 | html_static_path = ['_static'] 76 | 77 | 78 | # -- Extension configuration ------------------------------------------------- 79 | -------------------------------------------------------------------------------- /src/somajo/doubly_linked_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import operator 4 | 5 | 6 | class DLLElement: 7 | def __init__(self, val=None, prv=None, nxt=None, lst=None): 8 | if isinstance(val, DLLElement): 9 | val = val.value 10 | self.prev = prv 11 | self.next = nxt 12 | self.value = val 13 | self.list = lst 14 | if prv is not None: 15 | prv.next = self 16 | if nxt is not None: 17 | nxt.prev = self 18 | 19 | 20 | class DLL: 21 | def __init__(self, iterable=None): 22 | self.first = None 23 | self.last = None 24 | self.size = 0 25 | if iterable is not None: 26 | self.extend(iterable) 27 | 28 | def __iter__(self, start=None): 29 | current = self.first 30 | if start is not None: 31 | current = start 32 | while current is not None: 33 | yield current 34 | current = current.next 35 | 36 | def __reversed__(self, start=None): 37 | current = self.last 38 | if start is not None: 39 | current = start 40 | while current is not None: 41 | yield current 42 | current = current.prev 43 | 44 | def __len__(self): 45 | return self.size 46 | 47 | def __str__(self): 48 | return str(self.to_list()) 49 | 50 | def _find_matching_element(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None, forward=True): 51 | current = item 52 | direction = operator.attrgetter("next") 53 | if not forward: 54 | direction = operator.attrgetter("prev") 55 | while direction(current) is not None: 56 | current = direction(current) 57 | if ignore_attrgetter is not None: 58 | if ignore_attrgetter(current) == ignore_value: 59 | continue 60 | if attrgetter(current) == value: 61 | return current 62 | return None 63 | 64 | def append(self, item): 65 | element = DLLElement(item, self.last, None, self) 66 | if self.first is None: 67 | self.first = element 68 | self.last = element 69 | self.size += 1 70 | 71 | def append_left(self, item): 72 | element = DLLElement(item, None, self.first, self) 73 | if self.last is None: 74 | self.last = element 75 | self.first = element 76 | self.size += 1 77 | 78 | def extend(self, iterable): 79 | for item in iterable: 80 | self.append(item) 81 | 82 | def insert_left(self, item, ref_element): 83 | element = DLLElement(item, ref_element.prev, ref_element, self) 84 | ref_element.prev = element 85 | if self.first is ref_element: 86 | self.first = element 87 | self.size += 1 88 | 89 | def insert_right(self, item, ref_element): 90 | element = DLLElement(item, ref_element, ref_element.next, self) 91 | ref_element.next = element 92 | if self.last is ref_element: 93 | self.last = element 94 | self.size += 1 95 | 96 | def is_left_of(self, element, ref_element): 97 | current = ref_element 98 | while current is not self.first: 99 | current = current.prev 100 | if current is element: 101 | return True 102 | return False 103 | 104 | def is_right_of(self, element, ref_element): 105 | return self.is_left_of(ref_element, element) 106 | 107 | def next_matching(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None): 108 | return self._find_matching_element(item, attrgetter, value, ignore_attrgetter, ignore_value, forward=True) 109 | 110 | def pop(self): 111 | if self.size == 0: 112 | raise IndexError 113 | element = self.last 114 | self.remove(element) 115 | return element.value 116 | 
117 | def previous_matching(self, item, attrgetter, value, ignore_attrgetter=None, ignore_value=None): 118 | return self._find_matching_element(item, attrgetter, value, ignore_attrgetter, ignore_value, forward=False) 119 | 120 | def remove(self, element): 121 | if self.first is element: 122 | self.first = element.next 123 | if self.last is element: 124 | self.last = element.prev 125 | if element.prev is not None: 126 | element.prev.next = element.next 127 | if element.next is not None: 128 | element.next.prev = element.prev 129 | self.size -= 1 130 | 131 | def to_list(self): 132 | return [e.value for e in self] 133 | -------------------------------------------------------------------------------- /src/somajo/token.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | class Token: 5 | """Token objects store a piece of text (in the end a single token) with additional information. 6 | 7 | Parameters 8 | ---------- 9 | text : str 10 | The text that makes up the token object 11 | markup : bool, (default=False) 12 | Is the token a markup token? 13 | markup_class : {'start', 'end'}, optional (default=None) 14 | If `markup=True`, then `markup_class` must be either "start" or "end". 15 | markup_eos : bool, optional (default=None) 16 | Is the markup token a sentence boundary? 17 | locked : bool, (default=False) 18 | Mark the token as locked. 19 | token_class : {'URL', 'XML_entity', 'XML_tag', 'abbreviation', 'action_word', 'amount', 'date', 'email_address', 'emoticon', 'hashtag', 'measurement', 'mention', 'number', 'ordinal', 'regular', 'semester', 'symbol', 'time'}, optional (default=None) 20 | The class of the token, e.g. "regular", "emoticon", "URL", etc. 21 | space_after : bool, (default=True) 22 | Was there a space after the token in the original data? 23 | original_spelling : str, optional (default=None) 24 | The original spelling of the token, if it is different from the one in `text`. 25 | first_in_sentence : bool, (default=False) 26 | Is it the first token of a sentence? 27 | last_in_sentence : bool, (default=False) 28 | Is it the last token of a sentence? 29 | character_offset : tuple, (default=None) 30 | Character offset of the token in the input as tuple `(start, end)` 31 | such that `input[start:end] == text` (if there are no changes to 32 | the token text during tokenization) 33 | 34 | """ 35 | 36 | token_classes = { 37 | "URL", 38 | "XML_entity", 39 | "XML_tag", 40 | "abbreviation", 41 | "action_word", 42 | "amount", 43 | "date", 44 | "email_address", 45 | "emoticon", 46 | "hashtag", 47 | "measurement", 48 | "mention", 49 | "number", 50 | "ordinal", 51 | "regular", 52 | "semester", 53 | "symbol", 54 | "time", 55 | } 56 | 57 | def __init__( 58 | self, 59 | text, 60 | *, 61 | markup=False, 62 | markup_class=None, 63 | markup_eos=None, 64 | locked=False, 65 | token_class=None, 66 | space_after=True, 67 | original_spelling=None, 68 | first_in_sentence=False, 69 | last_in_sentence=False, 70 | character_offset=None 71 | ): 72 | self.text = text 73 | if markup: 74 | assert markup_class is not None, "You need to specify a `markup_class` for markup tokens." 75 | assert markup_eos is not None, "You need to provide a value for `markup_eos` for markup tokens." 76 | if markup_class is not None: 77 | assert markup, "You can only specify a `markup_class` for markup tokens." 78 | assert markup_class == "start" or markup_class == "end", f"'{markup_class}' is not a recognized markup class." 
79 | if markup_eos is not None: 80 | assert markup, "You can only use `markup_eos` for markup tokens." 81 | assert isinstance(markup_eos, bool), f"'{markup_eos}' is not a Boolean value." 82 | if token_class is not None: 83 | assert token_class in self.token_classes, f"'{token_class}' is not a recognized token class." 84 | self.markup = markup 85 | self.markup_class = markup_class 86 | self.markup_eos = markup_eos 87 | self._locked = locked 88 | self.token_class = token_class 89 | self.space_after = space_after 90 | self.original_spelling = original_spelling 91 | self.first_in_sentence = first_in_sentence 92 | self.last_in_sentence = last_in_sentence 93 | self.character_offset = character_offset 94 | 95 | def __str__(self): 96 | return self.text 97 | 98 | @property 99 | def extra_info(self): 100 | """String representation of extra information. 101 | 102 | Returns 103 | ------- 104 | str 105 | A string representation of the `space_after` and `original_spelling` attributes. 106 | 107 | Examples 108 | -------- 109 | >>> tok = Token(":)", token_class="regular", space_after=False, original_spelling=": )") 110 | >>> print(tok.text) 111 | :) 112 | >>> print(tok.extra_info) 113 | SpaceAfter=No, OriginalSpelling=": )" 114 | 115 | """ 116 | info = [] 117 | if not self.space_after: 118 | info.append("SpaceAfter=No") 119 | if self.original_spelling is not None: 120 | info.append("OriginalSpelling=\"%s\"" % self.original_spelling) 121 | return ", ".join(info) 122 | -------------------------------------------------------------------------------- /tests/test_doubly_linked_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import operator 4 | import unittest 5 | 6 | from somajo.doubly_linked_list import DLL 7 | 8 | 9 | class TestDLL(unittest.TestCase): 10 | def test_dll_01(self): 11 | lst = ["Foo", "", 0, -1, False, True, None] 12 | dll = DLL(lst) 13 | self.assertEqual(dll.to_list(), lst) 14 | 15 | def test_dll_02(self): 16 | lst = ["Foo", "", 0, -1, False, True, None] 17 | dll = DLL(lst) 18 | self.assertEqual(DLL(reversed(dll)).to_list(), list(reversed(lst))) 19 | 20 | def test_dll_03(self): 21 | lst = ["Foo", "", 0, -1, False, True, None] 22 | dll = DLL(lst) 23 | self.assertEqual(len(dll), len(lst)) 24 | 25 | def test_dll_04(self): 26 | lst = ["Foo", "", 0, -1, False, True, None] 27 | dll = DLL(["Foo", "", 0, -1, False, True, None]) 28 | self.assertEqual(str(dll), str(lst)) 29 | 30 | def test_dll_05(self): 31 | dll = DLL([4, 5, 6]) 32 | dll.append_left(3) 33 | self.assertEqual(dll.to_list(), [3, 4, 5, 6]) 34 | 35 | def test_dll_06(self): 36 | dll = DLL([4, 5, 6]) 37 | dll.append(7) 38 | self.assertEqual(dll.to_list(), [4, 5, 6, 7]) 39 | 40 | def test_dll_07(self): 41 | dll = DLL([4, 5, 6]) 42 | dll.extend([7, 8, 9]) 43 | self.assertEqual(dll.to_list(), [4, 5, 6, 7, 8, 9]) 44 | 45 | def test_dll_08(self): 46 | dll = DLL([4, 5, 6, 7]) 47 | last = dll.pop() 48 | self.assertEqual(last, 7) 49 | self.assertEqual(len(dll), 3) 50 | self.assertEqual(dll.to_list(), [4, 5, 6]) 51 | 52 | def test_dll_09(self): 53 | dll = DLL([]) 54 | self.assertEqual(len(dll), 0) 55 | self.assertEqual(dll.to_list(), []) 56 | 57 | def test_dll_10(self): 58 | dll = DLL([4]) 59 | last = dll.pop() 60 | self.assertEqual(last, 4) 61 | self.assertEqual(len(dll), 0) 62 | self.assertEqual(dll.to_list(), []) 63 | 64 | def test_dll_11(self): 65 | dll = DLL([4]) 66 | last = dll.pop() 67 | self.assertEqual(last, 4) 68 | self.assertRaises(IndexError, dll.pop) 69 | 70 | def 
test_dll_12(self): 71 | dll = DLL([]) 72 | dll.append_left(1) 73 | self.assertEqual(dll.to_list(), [1]) 74 | 75 | def test_dll_13(self): 76 | dll = DLL([1, 2, 3, 4]) 77 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2) 78 | self.assertEqual(x.value, 2) 79 | self.assertEqual([e.value for e in dll.__iter__(start=x)], [2, 3, 4]) 80 | 81 | def test_dll_14(self): 82 | dll = DLL([1, 2, 3, 4]) 83 | x = dll.previous_matching(dll.last, operator.attrgetter("value"), 3) 84 | self.assertEqual(x.value, 3) 85 | self.assertEqual([e.value for e in dll.__reversed__(start=x)], [3, 2, 1]) 86 | 87 | def test_dll_15(self): 88 | dll = DLL([1, 2, 3, 4]) 89 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 4, operator.attrgetter("value"), 3) 90 | self.assertEqual(x.value, 4) 91 | 92 | def test_dll_16(self): 93 | dll = DLL([1, 2, 3, 4]) 94 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 7) 95 | self.assertIs(x, None) 96 | 97 | def test_dll_17(self): 98 | dll = DLL([1, 2, 3]) 99 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2) 100 | dll.insert_left(7, x) 101 | self.assertEqual(dll.to_list(), [1, 7, 2, 3]) 102 | 103 | def test_dll_18(self): 104 | dll = DLL([1, 2, 3]) 105 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2) 106 | dll.insert_right(7, x) 107 | self.assertEqual(dll.to_list(), [1, 2, 7, 3]) 108 | 109 | def test_dll_19(self): 110 | dll = DLL([1, 2, 3]) 111 | self.assertTrue(dll.is_left_of(dll.first, dll.last)) 112 | 113 | def test_dll_20(self): 114 | dll = DLL([1, 2, 3]) 115 | self.assertTrue(dll.is_right_of(dll.last, dll.first)) 116 | 117 | def test_dll_21(self): 118 | dll = DLL([1, 2, 3]) 119 | x = dll.next_matching(dll.first, operator.attrgetter("value"), 2) 120 | dll.remove(x) 121 | self.assertEqual(dll.to_list(), [1, 3]) 122 | 123 | def test_dll_22(self): 124 | dll = DLL([1, 2, 3]) 125 | dll.remove(dll.first) 126 | self.assertEqual(dll.to_list(), [2, 3]) 127 | 128 | def test_dll_23(self): 129 | dll = DLL([1, 2, 3]) 130 | dll.remove(dll.last) 131 | self.assertEqual(dll.to_list(), [1, 2]) 132 | 133 | def test_dll_24(self): 134 | dll = DLL([1, 2, 3]) 135 | dll.insert_left(0, dll.first) 136 | self.assertEqual(dll.to_list(), [0, 1, 2, 3]) 137 | 138 | def test_dll_25(self): 139 | dll = DLL([1, 2, 3]) 140 | dll.insert_right(4, dll.last) 141 | self.assertEqual(dll.to_list(), [1, 2, 3, 4]) 142 | 143 | def test_dll_26(self): 144 | dll = DLL([1, 2, 3]) 145 | self.assertFalse(dll.is_left_of(dll.last, dll.first)) 146 | 147 | def test_dll_27(self): 148 | dll = DLL([1]) 149 | dll.remove(dll.last) 150 | self.assertEqual(dll.to_list(), []) 151 | 152 | def test_dll_28(self): 153 | dll = DLL([1]) 154 | dll.remove(dll.first) 155 | self.assertEqual(dll.to_list(), []) 156 | -------------------------------------------------------------------------------- /src/somajo/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import logging 5 | import time 6 | 7 | from . import ( 8 | SoMaJo, 9 | __version__ 10 | ) 11 | 12 | logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO) 13 | 14 | 15 | def arguments(): 16 | """""" 17 | parser = argparse.ArgumentParser(description="A tokenizer and sentence splitter for German and English texts. 
Currently, two tokenization guidelines are implemented: The EmpiriST guidelines for German web and social media texts (de_CMC) and the \"new\" Penn Treebank conventions for English texts (en_PTB).") 18 |     parser.add_argument("-l", "--language", choices=SoMaJo.supported_languages, default=SoMaJo._default_language, help="Choose a language. Currently supported are German EmpiriST-style tokenization (de_CMC) and English Penn-Treebank-style tokenization (en_PTB). (Default: de_CMC)") 19 |     parser.add_argument("-s", "--paragraph_separator", choices=SoMaJo.paragraph_separators, default=SoMaJo._default_parsep, help="How are paragraphs separated in the input text? Will be ignored if option -x/--xml is used. (Default: empty_lines)") 20 |     parser.add_argument("-x", "--xml", action="store_true", help="The input is an XML file. You can specify tags that always constitute a sentence break (e.g. HTML p tags) via the --tag option.") 21 |     parser.add_argument("--tag", action="append", help="Start and end tags of this type constitute sentence breaks, i.e. they do not occur in the middle of a sentence. Can be used multiple times to specify multiple tags, e.g. --tag p --tag br. Implies option -x/--xml. (Default: --tag title --tag h1 --tag h2 --tag h3 --tag h4 --tag h5 --tag h6 --tag p --tag br --tag hr --tag div --tag ol --tag ul --tag dl --tag table)") 22 |     parser.add_argument("--prune", action="append", help="Tags of this type will be removed from the input before tokenization. Can be used multiple times to specify multiple tags, e.g. --prune script --prune style. Implies option -x/--xml. By default, no tags are pruned.") 23 |     parser.add_argument("--strip-tags", action="store_true", help="Suppresses output of XML tags. Implies option -x/--xml.") 24 |     parser.add_argument("-c", "--split_camel_case", action="store_true", help="Split items written in camelCase (excluding established names and terms).") 25 |     parser.add_argument("--split_sentences", "--split-sentences", action="store_true", help="Also split the input into sentences.") 26 |     parser.add_argument("--sentence_tag", "--sentence-tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence_tag s). If this option is specified, sentences will be delimited by XML tags (e.g. <s>) instead of empty lines. This option implies --split_sentences.") 27 |     parser.add_argument("-t", "--token_classes", action="store_true", help="Output the token classes (number, XML tag, abbreviation, etc.)
in addition to the tokens.") 28 | parser.add_argument("-e", "--extra_info", action="store_true", help='Output additional information for each token: SpaceAfter=No if the token was not followed by a space and OriginalSpelling="…" if the token contained whitespace.') 29 | parser.add_argument("--character-offsets", action="store_true", help='Output character offsets in the input for each token.') 30 | parser.add_argument("--parallel", type=int, default=1, metavar="N", help="Run N worker processes (up to the number of CPUs) to speed up tokenization.") 31 | parser.add_argument("-v", "--version", action="version", version="SoMaJo %s" % __version__, help="Output version information and exit.") 32 | parser.add_argument("FILE", type=argparse.FileType("r", encoding="utf-8"), help="The input file (UTF-8-encoded) or \"-\" to read from STDIN.") 33 | args = parser.parse_args() 34 | return args 35 | 36 | 37 | def main(): 38 | args = arguments() 39 | n_tokens = 0 40 | n_sentences = 0 41 | t0 = time.perf_counter() 42 | is_xml = False 43 | if args.xml or args.strip_tags or (args.tag is not None) or (args.prune is not None): 44 | is_xml = True 45 | if args.sentence_tag: 46 | args.split_sentences = True 47 | tokenizer = SoMaJo( 48 | args.language, 49 | split_camel_case=args.split_camel_case, 50 | split_sentences=args.split_sentences, 51 | xml_sentences=args.sentence_tag, 52 | character_offsets=args.character_offsets 53 | ) 54 | if is_xml: 55 | eos_tags = args.tag 56 | if eos_tags is None: 57 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 58 | chunks = tokenizer.tokenize_xml_file(args.FILE, eos_tags, strip_tags=args.strip_tags, parallel=args.parallel, prune_tags=args.prune) 59 | else: 60 | chunks = tokenizer.tokenize_text_file(args.FILE, args.paragraph_separator, parallel=args.parallel) 61 | for chunk in chunks: 62 | n_sentences += 1 63 | for token in chunk: 64 | output = token.text 65 | if not token.markup: 66 | n_tokens += 1 67 | if args.token_classes: 68 | output += "\t" + token.token_class 69 | if args.extra_info: 70 | output += "\t" + token.extra_info 71 | if args.character_offsets: 72 | output += f"\t{token.character_offset[0]}, {token.character_offset[1]}" 73 | print(output) 74 | if args.split_sentences and args.sentence_tag is None: 75 | print() 76 | t1 = time.perf_counter() 77 | if args.split_sentences: 78 | logging.info("Tokenized %d tokens (%d sentences) in %d seconds (%d tokens/s)" % (n_tokens, n_sentences, t1 - t0, n_tokens / (t1 - t0))) 79 | else: 80 | logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" % (n_tokens, t1 - t0, n_tokens / (t1 - t0))) 81 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | 5 | from somajo import utils 6 | 7 | 8 | class TestXmlChunkGenerator(unittest.TestCase): 9 | def _equal(self, raw, chunks, prune_tags=None): 10 | eos_tags = set(["p"]) 11 | if prune_tags is not None: 12 | prune_tags = set(prune_tags) 13 | chunk_info = list(utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, prune_tags=prune_tags)) 14 | chunk_lists = (ci[0] for ci in chunk_info) 15 | chunk_lists = [[t.text for t in gc] for gc in chunk_lists] 16 | self.assertEqual(chunk_lists, chunks) 17 | 18 | def _equal_offsets(self, raw, chunks, prune_tags=None): 19 | eos_tags = set(["p"]) 20 | if prune_tags is not None: 21 | prune_tags = set(prune_tags) 22 | chunk_info = 
list(utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, prune_tags=prune_tags, character_offsets=True)) 23 | chunk_lists, raws, positions = zip(*chunk_info) 24 | offsets = [[t.character_offset for t in cl] for cl in chunk_lists] 25 | extracted_chunks = [[raw[s:e] for s, e in o] for o in offsets] 26 | self.assertEqual(extracted_chunks, chunks) 27 | 28 | def test_xml_chunk_generator_01(self): 29 | self._equal("foo bar", [["", "foo bar", ""]]) 30 | 31 | def test_xml_chunk_generator_02(self): 32 | self._equal("

foo

bar

", [["", "

", "foo", "

"], ["

", "bar", "

", "
"]]) 33 | 34 | def test_xml_chunk_generator_03(self): 35 | self._equal("\n

\nfoo\n

\n

\nbar\n

\n
", [["", "\n", "

", "\nfoo\n", "

"], ["\n", "

", "\nbar\n", "

", "\n", "
"]]) 36 | 37 | def test_xml_chunk_generator_04(self): 38 | self._equal( 39 | "\n

\n foo\n

\n

\n bar\n

\n
", 40 | [["", "\n ", "

", "\n foo\n ", "

"], ["\n ", "

", "\n bar\n ", "

", "\n", "
"]] 41 | ) 42 | 43 | def test_xml_chunk_generator_05(self): 44 | self._equal( 45 | "

foo

baz

bar

baz
", 46 | [["", "

", "foo", "

"], ["", "baz", ""], ["

", "bar", "

"], ["", "baz", "", "
"]] 47 | ) 48 | 49 | def test_xml_chunk_generator_06(self): 50 | self._equal( 51 | "

foo


bar


", 52 | [["", "

", "foo", "

"], ["
", "
", "

", "bar", "

"], ["
", "
", "
"]] 53 | ) 54 | 55 | def test_xml_chunk_generator_07(self): 56 | self._equal("foobar", [["", "", "bar", "", ""]], prune_tags=["del"]) 57 | 58 | def test_xml_chunk_generator_08(self): 59 | self._equal("foo

bar

", [["", "

", "bar", "

", "
"]], prune_tags=["del"]) 60 | 61 | def test_xml_chunk_generator_09(self): 62 | self._equal("bar\n foo\nbaz", [["", "bar\n \nbaz", ""]], prune_tags=["del"]) 63 | 64 | def test_xml_chunk_offsets_01(self): 65 | self._equal_offsets("Test", [["", "Test", ""]]) 66 | 67 | def test_xml_chunk_offsets_02(self): 68 | self._equal_offsets("3 < 5", [["", "3 < 5", ""]]) 69 | 70 | def test_xml_chunk_offsets_03(self): 71 | self._equal_offsets("Test­fall", [["", "Test­fall", ""]]) 72 | 73 | def test_xml_chunk_offsets_04(self): 74 | self._equal_offsets("Test­fall", [["", "Test­fall", ""]]) 75 | 76 | def test_xml_chunk_offsets_05(self): 77 | """Single combining mark""" 78 | self._equal_offsets("foo xÄx foo", [["", "foo xÄx foo", ""]]) 79 | 80 | def test_xml_chunk_offsets_06(self): 81 | """Multiple combining marks""" 82 | self._equal_offsets("foo xṩx foo", [["", "foo xṩx foo", ""]]) 83 | 84 | def test_xml_chunk_offsets_07(self): 85 | """Multiple combining marks""" 86 | self._equal_offsets("foo xṩx foo", [["", "foo xṩx foo", ""]]) 87 | 88 | def test_xml_chunk_offsets_08(self): 89 | """Multiple combining marks""" 90 | self._equal_offsets("foo xsḍ̇x foo", [["", "foo xsḍ̇x foo", ""]]) 91 | 92 | def test_xml_chunk_offsets_09(self): 93 | """Multiple combining marks""" 94 | self._equal_offsets("foo xq̣̇x foo", [["", "foo xq̣̇x foo", ""]]) 95 | 96 | def test_xml_chunk_offsets_10(self): 97 | self._equal_offsets("Foo", [["", "Foo", ""]]) 98 | 99 | def test_xml_chunk_offsets_11(self): 100 | self._equal_offsets("Foo", [["", "Foo", ""]]) 101 | 102 | def test_xml_chunk_offsets_12(self): 103 | self._equal_offsets(" Foo ", [["", " Foo ", ""]]) 104 | 105 | def test_xml_chunk_offsets_13(self): 106 | self._equal_offsets("Foo \"Bar\" 'Baz'", [["", "Foo \"Bar\" 'Baz'", ""]]) 107 | 108 | def test_xml_chunk_offsets_14(self): 109 | self._equal_offsets('\n Foo\n', [['', "\n Foo\n", ""]]) 110 | 111 | def test_xml_chunk_offsets_15(self): 112 | self._equal_offsets("Hallo
Tschüß
", [["", "Hallo", "
", "", "Tschüß", "
"]]) 113 | 114 | def test_xml_chunk_offsets_16(self): 115 | self._equal_offsets("Hallo
Tschüß
", [["", "Hallo", "
", "", "Tschüß", "
"]]) 116 | 117 | def test_xml_chunk_offsets_17(self): 118 | self._equal_offsets("\u0303foo", [["", "\u0303foo", ""]]) 119 | 120 | def test_xml_chunk_offsets_18(self): 121 | self._equal_offsets("foo

bar

", [["", "foo"], ["

", "bar", "

", "
"]]) 122 | 123 | @unittest.expectedFailure 124 | def test_xml_chunk_offsets_19(self): 125 | self._equal_offsets("bar futsch baz", [["", "bar baz", ""]], prune_tags=["del"]) 126 | -------------------------------------------------------------------------------- /utils/errors_train.txt: -------------------------------------------------------------------------------- 1 | __________________________________________________________________________________________________ 2 | tmp/cmc_train_twitter_2.txt ...a/all_train/tokenized/cmc_train_twitter_2.txt 3 | 4 | False Negative (linebreak inserted right): 5 | 238: 238: 6 | 239: @aPfeL4321 239: @aPfeL4321 7 | 240: * DasTB 240: * Das 8 | 241: sollte 241: * TB 9 | 242: allerdings 242: sollte 10 | 243: gut 243: allerdings 11 | 12 | False Negative (linebreak inserted right): 13 | 654: Vernachlässigung 655: Vernachlässigung 14 | 655: ? 656: ? 15 | 656: * Wenn2 657: * Wenn 16 | 657: : 658: * 2 17 | 658: warum 659: : 18 | 659: ? 660: warum 19 | 20 | __________________________________________________________________________________________________ 21 | tmp/cmc_train_blog_comment.txt ...ll_train/tokenized/cmc_train_blog_comment.txt 22 | 23 | False Positive (linebreak inserted left): 24 | 145: WIE 145: WIE 25 | 146: ICH 146: ICH 26 | 147: * WEI 147: * WEI? 27 | 148: * ? 148: HABT 28 | 149: HABT 149: IHR 29 | 150: IHR 150: BEIDE 30 | 31 | __________________________________________________________________________________________________ 32 | tmp/cmc_train_social_chat.txt ...all_train/tokenized/cmc_train_social_chat.txt 33 | 34 | False Positive (linebreak inserted left): 35 | 158: marc 157: marc 36 | 159: . 158: . 37 | 160: * . 159: * .))) 38 | 161: * ))) 160: 39 | 162: 161: 40 | 163: 162: ups 41 | 42 | False Positive (linebreak inserted left): 43 | 652: 650: 44 | 653: 651: 45 | 654: * 8 652: * 8:) 46 | 655: * :) 653: 47 | 656: 654: 1014: 52 | 1018: * 1015: * 53 | 1019: * 51cm 1016: * 51 54 | 1020: * 1017: * cm 55 | 1021: 1018: * 56 | 1022: 1019: 57 | 58 | False Negative (linebreak inserted right): 59 | 1340: 1338: 60 | 1341: 1339: 61 | 1342: * bochum-münster 1340: * bochum 62 | 1343: ohne 1341: * - 63 | 1344: küche 1342: münster 64 | 1345: 3500 1343: ohne 65 | 66 | False Negative (linebreak inserted right): 67 | 1340: 1339: 68 | 1341: 1340: bochum 69 | 1342: * bochum-münster 1341: * - 70 | 1343: ohne 1342: * münster 71 | 1344: küche 1343: ohne 72 | 1345: 3500 1344: küche 73 | 74 | __________________________________________________________________________________________________ 75 | tmp/cmc_train_professional_chat.txt ...ain/tokenized/cmc_train_professional_chat.txt 76 | 77 | False Negative (linebreak inserted right): 78 | 898: im 898: im 79 | 899: Pott 899: Pott 80 | 900: * :-)) 900: * :-) 81 | 901: ? 901: * ) 82 | 902: 902: ? 
83 | 903: 0): 36 | nfc_j += 1 37 | orig_j = orig_i + 1 38 | while (orig_j < len(orig)) and (unicodedata.combining(orig[orig_j]) > 0): 39 | orig_j += 1 40 | assert nfc[nfc_i:nfc_j] == unicodedata.normalize("NFC", orig[orig_i:orig_j]), f"'{nfc[nfc_i:nfc_j]}' != unicodedata.normalize('NFC', '{orig[orig_i:orig_j]}')" 41 | alignment[(nfc_i, nfc_j)] = (orig_i, orig_j) 42 | nfc_i = nfc_j 43 | orig_i = orig_j 44 | assert orig_j == len(orig), f"{orig_j} != {len(orig)}; nfc: '{nfc}', orig: '{orig}'" 45 | return alignment 46 | 47 | 48 | def _determine_offsets(tokens, raw, position): 49 | """Determine start and end positions of tokens in the original raw (NFC) input.""" 50 | offsets = [] 51 | raw_i = 0 52 | raw = re.sub(r"\s", " ", raw) 53 | for token in tokens: 54 | if token.markup: 55 | start, end = token.character_offset 56 | start -= position 57 | end -= position 58 | else: 59 | text = token.text 60 | if token.original_spelling is not None: 61 | text = token.original_spelling 62 | text = re.sub(r"\s", " ", text) 63 | if raw[raw_i:].startswith(text): 64 | start = raw_i 65 | end = start + len(text) 66 | elif raw[raw_i:].startswith(" " + text): 67 | start = raw_i + 1 68 | end = start + len(text) 69 | else: 70 | raw_start = raw_i 71 | for i, char in enumerate(text): 72 | for j in range(raw_start, len(raw)): 73 | if raw[j] == char: 74 | if i == 0: 75 | start = j 76 | if i == len(text) - 1: 77 | end = j + 1 78 | break 79 | else: 80 | assert raw[j] in _skipable_characters, f"'{raw[j]}' ({hex(ord(raw[j]))}) is not a skipable character; token: '{text}', raw: '{raw[raw_i:]}'" 81 | raw_start = j + 1 82 | offsets.append((start, end)) 83 | raw_i = end 84 | return offsets 85 | 86 | 87 | def _resolve_entities(xml): 88 | """Resolve XML entities and provide an alignment from output string to input string.""" 89 | named = {"&": "&", "'": "'", ">": ">", "<": "<", """: '"'} 90 | outstring = "" 91 | alignment = [] 92 | xml_lower = xml.lower() 93 | i = 0 94 | for m in _xml_entity.finditer(xml_lower): 95 | start, end = m.span() 96 | if xml_lower[start + 2] == "x": 97 | char = chr(int(xml[start + 3:end - 1], base=16)) 98 | elif xml_lower[start + 1] == "#": 99 | char = chr(int(xml[start + 2:end - 1])) 100 | else: 101 | char = named[xml_lower[start:end]] 102 | outstring += xml[i:start] + char 103 | for j in range(i, start): 104 | alignment.append((j, j + 1)) 105 | alignment.append((start, end)) 106 | i = end 107 | outstring += xml[i:len(xml)] 108 | for j in range(i, len(xml)): 109 | alignment.append((j, j + 1)) 110 | return outstring, alignment 111 | 112 | 113 | def token_offsets(token_list, raw, position, xml_input, tokens): 114 | """Determine character offsets for tokens.""" 115 | if xml_input: 116 | chunk_offsets = [(t.character_offset[0] - position, t.character_offset[1] - position) for t in token_list] 117 | raw, align_to_entities = _resolve_entities(raw) 118 | align_from_entities = {i: char_i for char_i, (start, end) in enumerate(align_to_entities) for i in range(start, end)} 119 | chunks = [raw[align_from_entities[start]:align_from_entities[end - 1] + 1] for start, end in chunk_offsets] 120 | chunks_nfc = [unicodedata.normalize("NFC", c) for c in chunks] 121 | alignments = [_align_nfc(chunk_nfc, chunk) for chunk, chunk_nfc in zip(chunks, chunks_nfc)] 122 | align_to_raw = alignments[0] 123 | for i in range(1, len(alignments)): 124 | o1 = sum(len(c) for c in chunks_nfc[:i]) 125 | o2 = sum(len(c) for c in chunks[:i]) 126 | align_to_raw.update({(k[0] + o1, k[1] + o1): (v[0] + o2, v[1] + o2) for k, v in 
alignments[i].items()}) 127 | raw_nfc = "".join(chunks_nfc) 128 | else: 129 | raw_nfc = unicodedata.normalize("NFC", raw) 130 | align_to_raw = _align_nfc(raw_nfc, raw) 131 | align_from_raw = {i: k for k, v in align_to_raw.items() for i in range(v[0], v[1])} 132 | align_to_starts = {i: v[0] for k, v in align_to_raw.items() for i in range(k[0], k[1])} 133 | align_to_ends = {i: v[1] for k, v in align_to_raw.items() for i in range(k[0], k[1])} 134 | # adjust character offsets for markup tokens 135 | if xml_input: 136 | for i in range(len(tokens)): 137 | if tokens[i].markup: 138 | s, e = tokens[i].character_offset 139 | tokens[i].character_offset = ( 140 | align_from_raw[align_from_entities[s - position]][0] + position, 141 | align_from_raw[align_from_entities[e - position - 1]][1] + position 142 | ) 143 | offsets = _determine_offsets(tokens, raw_nfc, position) 144 | assert len(tokens) == len(offsets), f"Not as many tokens as offsets: {len(tokens)} != {len(offsets)}" 145 | offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets] 146 | if xml_input: 147 | offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets] 148 | offsets = [(s + position, e + position) for s, e in offsets] 149 | return offsets 150 | 151 | 152 | def xml_chunk_offset(token, raw): 153 | """Determine character offset for an XML chunk created by `utils._xml_chunk_generator`.""" 154 | raw, align_to_raw = _resolve_entities(raw) 155 | raw = re.sub(r"\s", " ", raw) 156 | text = token.text 157 | text = re.sub(r"\s", " ", text) 158 | if token.markup: 159 | text, align_to_text = _resolve_entities(text) 160 | text = text.replace("'", '"') 161 | if raw.startswith(text): 162 | start = 0 163 | end = len(text) 164 | else: 165 | pattern = "(" + re.escape(text) + ")" 166 | pattern = pattern.replace(r"\ ", r"\s+") 167 | pattern = pattern.replace("=", r"\s*=\s*") 168 | if not text.startswith(""): 34 | continue 35 | for char in line: 36 | characters.append(Character(char, False, False)) 37 | characters[-1] = Character(characters[-1].char, True, False) 38 | return characters 39 | 40 | 41 | def char_to_str(system, gold, focus=False): 42 | """""" 43 | string = system.char 44 | if focus: 45 | # sentence fp 46 | if system.sentence_boundary and (not gold.sentence_boundary): 47 | string += "■ " 48 | # sentence fn 49 | elif (not system.sentence_boundary) and gold.sentence_boundary: 50 | string += "□ " 51 | # token fp 52 | elif system.token_boundary and (not gold.token_boundary): 53 | string += "● " 54 | # token fn 55 | elif (not system.token_boundary) and gold.token_boundary: 56 | string += "○ " 57 | # any tp 58 | elif (system.sentence_boundary and gold.sentence_boundary) or (system.token_boundary and gold.token_boundary): 59 | string += " " 60 | else: 61 | if system.sentence_boundary or system.token_boundary: 62 | string += " " 63 | return string 64 | 65 | 66 | def precision_recall_f1(tp, fp, fn): 67 | """""" 68 | precision = tp / (tp + fp) 69 | recall = tp / (tp + fn) 70 | f1 = (2 * precision * recall) / (precision + recall) 71 | return precision, recall, f1 72 | 73 | 74 | def evaluate_file(system_path, gold_path, ignore_xml, sentences, error_file): 75 | """""" 76 | print("%s ⇔ %s" % (system_path, gold_path)) 77 | if error_file: 78 | with open(error_file, mode="a", encoding="utf-8") as e: 79 | e.write("%s ⇔ %s\n" % (system_path, gold_path)) 80 | with open(system_path, encoding="utf-8") as system, open(gold_path, encoding="utf-8") as gold: 81 | sys_chars = read_characters(system, ignore_xml, sentences) 82 
| gold_chars = read_characters(gold, ignore_xml, sentences) 83 | window = collections.deque([""] * 20) 84 | for s, g in zip(sys_chars, gold_chars): 85 | window.append(g.char) 86 | window.popleft() 87 | if s.char != g.char: 88 | print("'" + "".join(window) + "'") 89 | print("'%s' != '%s'" % (s.char, g.char)) 90 | break 91 | assert len(sys_chars) == len(gold_chars) 92 | assert all((s.char == g.char for s, g in zip(sys_chars, gold_chars))) 93 | token_precision, token_recall, token_f1, sentence_precision, sentence_recall, sentence_f1 = 0, 0, 0, 0, 0, 0 94 | token_tp, token_fp, token_fn, sentence_tp, sentence_fp, sentence_fn = 0, 0, 0, 0, 0, 0 95 | if error_file: 96 | with open(error_file, mode="a", encoding="utf-8") as e: 97 | sys_window = collections.deque([Character("", False, False)] * 41) 98 | gold_window = collections.deque([Character("", False, False)] * 41) 99 | for s, g in zip(sys_chars + [Character("", False, False)] * 20, gold_chars + [Character("", False, False)] * 20): 100 | sys_window.append(s) 101 | sys_window.popleft() 102 | gold_window.append(g) 103 | gold_window.popleft() 104 | if sys_window[20] != gold_window[20]: 105 | e.write("%s%s%s\n" % ("".join(char_to_str(x, y) for x, y in zip(list(sys_window)[:20], list(gold_window)[:20]))[-20:], 106 | char_to_str(sys_window[20], gold_window[20], focus=True), 107 | "".join(char_to_str(x, y) for x, y in zip(list(sys_window)[21:], list(gold_window)[21:]))[:20])) 108 | token_tp = len([s for s, g in zip(sys_chars, gold_chars) if g.token_boundary and s.token_boundary]) 109 | token_fp = len([s for s, g in zip(sys_chars, gold_chars) if (not g.token_boundary) and s.token_boundary]) 110 | token_fn = len([s for s, g in zip(sys_chars, gold_chars) if g.token_boundary and (not s.token_boundary)]) 111 | token_precision, token_recall, token_f1 = precision_recall_f1(token_tp, token_fp, token_fn) 112 | print("Tokenization:") 113 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (token_precision * 100, token_recall * 100, token_f1 * 100)) 114 | print("%d false positives, %d false negatives" % (token_fp, token_fn)) 115 | if sentences: 116 | sentence_tp = len([s for s, g in zip(sys_chars, gold_chars) if g.sentence_boundary and s.sentence_boundary]) 117 | sentence_fp = len([s for s, g in zip(sys_chars, gold_chars) if (not g.sentence_boundary) and s.sentence_boundary]) 118 | sentence_fn = len([s for s, g in zip(sys_chars, gold_chars) if g.sentence_boundary and (not s.sentence_boundary)]) 119 | sentence_precision, sentence_recall, sentence_f1 = precision_recall_f1(sentence_tp, sentence_fp, sentence_fn) 120 | print("Sentence splitting:") 121 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (sentence_precision * 100, sentence_recall * 100, sentence_f1 * 100)) 122 | print("%d false positives, %d false negatives" % (sentence_fp, sentence_fn)) 123 | print() 124 | return token_tp, token_fp, token_fn, token_precision, token_recall, token_f1, sentence_tp, sentence_fp, sentence_fn, sentence_precision, sentence_recall, sentence_f1 125 | 126 | 127 | def main(): 128 | """""" 129 | args = arguments() 130 | if args.errors: 131 | with open(args.errors, mode="w", encoding="utf-8") as e: 132 | pass 133 | if args.files: 134 | evaluate_file(args.SYSTEM, args.GOLD, args.ignore_xml, args.sentences, args.errors) 135 | elif args.directories: 136 | n_tokens, token_precision, token_recall, token_f1, n_sentences, sentence_precision, sentence_recall, sentence_f1 = 0, 0, 0, 0, 0, 0, 0, 0 137 | token_tp, token_fp, token_fn, sentence_tp, sentence_fp, sentence_fn = 0, 0, 0, 0, 0, 0 138 | 
system_files = sorted(os.listdir(args.SYSTEM)) 139 | gold_files = sorted(os.listdir(args.GOLD)) 140 | assert len(system_files) == len(gold_files) 141 | assert all((s == g for s, g in zip(system_files, gold_files))) 142 | for system_file, gold_file in zip(system_files, gold_files): 143 | ttp, tfp, tfn, tp, tr, tf, stp, sfp, sfn, sp, sr, sf = evaluate_file(os.path.join(args.SYSTEM, system_file), os.path.join(args.GOLD, gold_file), args.ignore_xml, args.sentences, args.errors) 144 | nt = ttp + tfn 145 | ns = stp + sfp 146 | token_tp += ttp 147 | token_fp += tfp 148 | token_fn += tfn 149 | sentence_tp += stp 150 | sentence_fp += sfp 151 | sentence_fn += sfn 152 | n_tokens += nt 153 | token_precision += nt * tp 154 | token_recall += nt * tr 155 | token_f1 += nt * tf 156 | n_sentences += ns 157 | sentence_precision += ns * sp 158 | sentence_recall += ns * sr 159 | sentence_f1 += ns * sf 160 | print("TOTAL") 161 | print("Tokenization (weighted average on %d tokens):" % n_tokens) 162 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (token_precision / n_tokens * 100, token_recall / n_tokens * 100, token_f1 / n_tokens * 100)) 163 | print("%d false positives, %d false negatives" % (token_fp, token_fn)) 164 | if args.sentences: 165 | print("Sentence splitting (weighted average on %d sentences):" % n_sentences) 166 | print("P = %6.2f%% R = %6.2f%% F = %6.2f%%" % (sentence_precision / n_sentences * 100, sentence_recall / n_sentences * 100, sentence_f1 / n_sentences * 100)) 167 | print("%d false positives, %d false negatives" % (sentence_fp, sentence_fn)) 168 | 169 | 170 | if __name__ == "__main__": 171 | main() 172 | -------------------------------------------------------------------------------- /tests/test_alignment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import itertools 4 | import unicodedata 5 | import unittest 6 | 7 | import somajo.alignment 8 | from somajo.doubly_linked_list import DLL 9 | from somajo.token import Token 10 | from somajo.somajo import Tokenizer 11 | from somajo import utils 12 | 13 | 14 | class TestNfcAlignment(unittest.TestCase): 15 | def test_nfc_01(self): 16 | """Singleton: Angstrom sign""" 17 | orig = "xÅx" 18 | nfc = unicodedata.normalize("NFC", orig) 19 | alignment = {(0, 1): (0, 1), (1, 2): (1, 2), (2, 3): (2, 3)} 20 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 21 | 22 | def test_nfc_02(self): 23 | """Single combining mark""" 24 | orig = "xA\u0308x" 25 | nfc = unicodedata.normalize("NFC", orig) 26 | alignment = {(0, 1): (0, 1), (1, 2): (1, 3), (2, 3): (3, 4)} 27 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 28 | 29 | def test_nfc_03(self): 30 | """Multiple combining marks""" 31 | orig = "xs\u0323\u0307x" 32 | nfc = unicodedata.normalize("NFC", orig) 33 | alignment = {(0, 1): (0, 1), (1, 2): (1, 4), (2, 3): (4, 5)} 34 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 35 | 36 | def test_nfc_04(self): 37 | """Multiple combining marks""" 38 | orig = "xs\u0307\u0323x" 39 | nfc = unicodedata.normalize("NFC", orig) 40 | alignment = {(0, 1): (0, 1), (1, 2): (1, 4), (2, 3): (4, 5)} 41 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 42 | 43 | def test_nfc_05(self): 44 | """Multiple combining marks""" 45 | orig = "x\u1e0b\u0323x" 46 | nfc = unicodedata.normalize("NFC", orig) 47 | alignment = {(0, 1): (0, 1), (1, 3): (1, 3), (3, 4): (3, 4)} 48 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 49 | 
50 | def test_nfc_06(self): 51 | """Multiple combining marks""" 52 | orig = "q\u0307\u0323x" 53 | nfc = unicodedata.normalize("NFC", orig) 54 | alignment = {(0, 3): (0, 3), (3, 4): (3, 4)} 55 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 56 | 57 | def test_nfc_07(self): 58 | """Empty string""" 59 | orig = "" 60 | nfc = unicodedata.normalize("NFC", orig) 61 | alignment = {} 62 | self.assertEqual(somajo.alignment._align_nfc(nfc, orig), alignment) 63 | 64 | 65 | class TestResolveEntities(unittest.TestCase): 66 | def test_entitites_01(self): 67 | xml = 'foo <bar> baz' 68 | resolved = 'foo baz' 69 | alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), 70 | (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), 71 | (12, 13), (13, 14), (14, 15), (15, 21), (21, 22), 72 | (22, 23), (23, 24), (24, 30), (30, 31), (31, 32), 73 | (32, 33), (33, 34), (34, 35), (35, 36), (36, 37), 74 | (37, 38), (38, 39), (39, 40), (40, 44), (44, 45), 75 | (45, 46), (46, 47), (47, 51), (51, 52), (52, 53), 76 | (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), 77 | (58, 59), (59, 60), (60, 61)] 78 | res, al = somajo.alignment._resolve_entities(xml) 79 | self.assertEqual(res, resolved) 80 | self.assertEqual(al, alignment) 81 | 82 | def test_entities_02(self): 83 | xml = "Test" 84 | resolved = "Test" 85 | alignment = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), 86 | (6, 14), (14, 15), (15, 16), (16, 17), (17, 18), 87 | (18, 19), (19, 20), (20, 21), (21, 22)] 88 | res, al = somajo.alignment._resolve_entities(xml) 89 | self.assertEqual(res, resolved) 90 | self.assertEqual(al, alignment) 91 | 92 | 93 | class TestDetermineOffsets(unittest.TestCase): 94 | def setUp(self): 95 | """Necessary preparations""" 96 | self.tokenizer = Tokenizer(split_camel_case=True, language="de_CMC") 97 | 98 | def _equal(self, raw, tokenized): 99 | raw = unicodedata.normalize("NFC", raw) 100 | if isinstance(tokenized, str): 101 | tokenized = tokenized.split() 102 | dll = DLL([Token(raw, first_in_sentence=True, last_in_sentence=True)]) 103 | tokens = self.tokenizer._tokenize(dll) 104 | offsets = somajo.alignment._determine_offsets(tokens, raw, position=0) 105 | self.assertEqual([raw[s:e] for s, e in offsets], tokenized) 106 | 107 | def test_token_alignment_01(self): 108 | self._equal("Ein simpler Test.", "Ein simpler Test .") 109 | 110 | def test_token_alignment_02(self): 111 | self._equal("bla \u1e0d\u0307amit.", "bla \u1e0d\u0307amit .") 112 | 113 | def test_token_alignment_03(self): 114 | self._equal("foo (bar) baz?", "foo ( bar ) baz ?") 115 | 116 | def test_token_alignment_03a(self): 117 | self._equal("foo:\n) bar", ["foo", ":\n)", "bar"]) 118 | 119 | def test_token_alignment_04(self): 120 | self._equal( 121 | "foo​bar foo­bar foo\ufeffbar foobarbazquxalphabetagamma foo‌bar‍baz foo‏bar‎baz foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta", 122 | ["foo​bar", "foo­bar", "foo\ufeffbar", "foobarbazquxalphabetagamma", "foo‌bar‍baz", "foo‏bar‎baz", "foo\u202bbar\u202abaz\u202cqux\u202ealpha\u202dbeta"] 123 | ) 124 | 125 | 126 | class TestTokenOffsets(unittest.TestCase): 127 | def setUp(self): 128 | """Necessary preparations""" 129 | self.tokenizer = Tokenizer(split_camel_case=True, language="de_CMC") 130 | 131 | def _equal_xml(self, raw, tokenized): 132 | raw = unicodedata.normalize("NFC", raw) 133 | if isinstance(tokenized, str): 134 | tokenized = tokenized.split() 135 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 136 | eos_tags = set(eos_tags) 137 | chunk_info = 
utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags, character_offsets=True) 138 | chunk_lists = [ci[0] for ci in chunk_info] 139 | token_dlls = map(DLL, chunk_lists) 140 | chunks = map(self.tokenizer._tokenize, token_dlls) 141 | complete = list(itertools.chain.from_iterable(chunks)) 142 | offsets = somajo.alignment.token_offsets(list(itertools.chain.from_iterable(chunk_lists)), raw, 0, True, complete) 143 | self.assertEqual([raw[s:e] for s, e in offsets], tokenized) 144 | 145 | def test_token_alignment_05(self): 146 | self._equal_xml( 147 | "der beste Betreuer? - >ProfSmith! : )", 148 | ["", "der", "beste", "Betreuer", "?", "- >", "Prof", "Smith", "!", ": )", ""] 149 | ) 150 | 151 | def test_token_alignment_06(self): 152 | self._equal_xml("das steht auf S.­5", " das steht auf S. 5 ") 153 | 154 | def test_token_alignment_07(self): 155 | self._equal_xml("na so was -​> bla", " na so was - > bla ") 156 | 157 | def test_token_alignment_08(self): 158 | self._equal_xml("Test", " Test ") 159 | 160 | def test_token_alignment_09(self): 161 | self._equal_xml("3 < 5", " 3 < 5 ") 162 | 163 | def test_token_alignment_10(self): 164 | self._equal_xml("Test­fall", " Test­fall ") 165 | 166 | def test_token_alignment_11(self): 167 | self._equal_xml("Test­fall", " Test­fall ") 168 | 169 | def test_token_alignment_12(self): 170 | """Single combining mark""" 171 | self._equal_xml("foo xÄx foo", " foo xÄx foo ") 172 | 173 | def test_token_alignment_13(self): 174 | """Multiple combining marks""" 175 | self._equal_xml("foo xṩx foo", " foo xṩx foo ") 176 | 177 | def test_token_alignment_14(self): 178 | """Multiple combining marks""" 179 | self._equal_xml("foo xṩx foo", " foo xṩx foo ") 180 | 181 | def test_token_alignment_15(self): 182 | """Multiple combining marks""" 183 | self._equal_xml("foo xsḍ̇x foo", " foo xsḍ̇x foo ") 184 | 185 | def test_token_alignment_16(self): 186 | """Multiple combining marks""" 187 | self._equal_xml("foo xq̣̇x foo", " foo xq̣̇x foo ") 188 | 189 | def test_token_alignment_17(self): 190 | self._equal_xml("Foo", ["", "Foo", ""]) 191 | 192 | def test_token_alignment_18(self): 193 | self._equal_xml("Foo", ["", "Foo", ""]) 194 | 195 | def test_token_alignment_19(self): 196 | self._equal_xml(" Foo ", ["", "Foo", ""]) 197 | 198 | def test_token_alignment_20(self): 199 | self._equal_xml("Foo \"Bar\" 'Baz'", ["", "Foo", '"', "Bar", '"', "'", "Baz", "'", ""]) 200 | 201 | def test_token_alignment_21(self): 202 | self._equal_xml('\n Foo\n', ['', "Foo", ""]) 203 | 204 | def test_token_alignment_22(self): 205 | self._equal_xml("Hallo
Tschüß
", ["", "Hallo", "
", "", "Tschüß", "
"]) 206 | 207 | def test_token_alignment_23(self): 208 | self._equal_xml("Hallo
Tschüß
", ["", "Hallo", "
", "", "Tschüß", "
"]) 209 | 210 | def test_token_alignment_24(self): 211 | self._equal_xml("\u0303foo", ["", "\u0303foo", ""]) 212 | 213 | def test_token_alignment_25(self): 214 | self._equal_xml("foo

bar

", ["", "foo", "

", "bar", "

", "
"]) 215 | 216 | def test_token_alignment_26(self): 217 | self._equal_xml("

bar

baz

", ["", "

", "bar", "

", "

", "baz", "

", "
"]) 218 | -------------------------------------------------------------------------------- /tests/test_somajo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import io 4 | import unittest 5 | 6 | from somajo.somajo import SoMaJo 7 | 8 | 9 | class TestSoMaJo(unittest.TestCase): 10 | def setUp(self): 11 | """Necessary preparations""" 12 | self.tokenizer = SoMaJo("de_CMC") 13 | 14 | def _equal_text(self, paragraphs, tokenized_sentences, parallel=1): 15 | sentences = self.tokenizer.tokenize_text(paragraphs, parallel=parallel) 16 | sentences = [[t.text for t in s] for s in sentences] 17 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 18 | 19 | def _equal_text_file_single_newlines(self, paragraphs, tokenized_sentences, parallel=1): 20 | pseudofile = io.StringIO("\n".join(paragraphs)) 21 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="single_newlines", parallel=parallel) 22 | sentences = [[t.text for t in s] for s in sentences] 23 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 24 | 25 | def _equal_text_file_empty_lines(self, paragraphs, tokenized_sentences, parallel=1): 26 | pseudofile = io.StringIO("\n\n".join(paragraphs)) 27 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="empty_lines", parallel=parallel) 28 | sentences = [[t.text for t in s] for s in sentences] 29 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 30 | 31 | def _equal_xml(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None): 32 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 33 | sentences = self.tokenizer.tokenize_xml(xml, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags) 34 | sentences = [[t.text for t in s] for s in sentences] 35 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 36 | 37 | def _equal_xml_file(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None): 38 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 39 | pseudofile = io.StringIO(xml) 40 | sentences = self.tokenizer.tokenize_xml_file(pseudofile, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags) 41 | sentences = [[t.text for t in s] for s in sentences] 42 | self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences]) 43 | 44 | 45 | class TestSoMaJoNoSent(TestSoMaJo): 46 | def setUp(self): 47 | """Necessary preparations""" 48 | self.tokenizer = SoMaJo("de_CMC", split_sentences=False) 49 | 50 | 51 | class TestText(TestSoMaJo): 52 | def test_text_01(self): 53 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"]) 54 | 55 | def test_text_02(self): 56 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"]) 57 | 58 | def test_text_03(self): 59 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"]) 60 | 61 | def test_text_04(self): 62 | self.assertRaises(TypeError, self.tokenizer.tokenize_text, "Foo bar. Baz qux") 63 | 64 | 65 | class TestTextXMLSent(TestSoMaJo): 66 | def setUp(self): 67 | """Necessary preparations""" 68 | self.tokenizer = SoMaJo("de_CMC", xml_sentences="s") 69 | 70 | def test_text_01(self): 71 | self._equal_text(["Foo bar. Baz qux", "alpha. 
Beta gamma"], [" Foo bar . ", " Baz qux ", " alpha . ", " Beta gamma "]) 72 | 73 | def test_text_02(self): 74 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], [" Foo bar . ", " Baz qux ", " alpha . ", " Beta gamma "]) 75 | 76 | 77 | class TestTextParallel(TestSoMaJo): 78 | def test_text_01(self): 79 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2) 80 | 81 | def test_text_02(self): 82 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2) 83 | 84 | def test_text_03(self): 85 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], parallel=2) 86 | 87 | 88 | class TestTextNoSent(TestSoMaJoNoSent): 89 | def test_text_01(self): 90 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"]) 91 | 92 | def test_text_02(self): 93 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"]) 94 | 95 | def test_text_03(self): 96 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"]) 97 | 98 | 99 | class TestTextNoSentParallel(TestSoMaJoNoSent): 100 | def test_text_01(self): 101 | self._equal_text(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2) 102 | 103 | def test_text_02(self): 104 | self._equal_text_file_empty_lines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2) 105 | 106 | def test_text_03(self): 107 | self._equal_text_file_single_newlines(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar . Baz qux", "alpha . Beta gamma"], parallel=2) 108 | 109 | 110 | class TestXML(TestSoMaJo): 111 | def test_xml_01(self): 112 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"]) 113 | 114 | def test_xml_02(self): 115 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"]) 116 | 117 | 118 | class TestXMLParallel(TestSoMaJo): 119 | def test_xml_01(self): 120 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"], parallel=2) 121 | 122 | def test_xml_02(self): 123 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"], parallel=2) 124 | 125 | 126 | class TestXMLNoSent(TestSoMaJoNoSent): 127 | def test_xml_01(self): 128 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar . Baz qux

", "

alpha . Beta gamma

"]) 129 | 130 | def test_xml_02(self): 131 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar . Baz qux

", "

alpha . Beta gamma

"]) 132 | 133 | 134 | class TestXMLNoSentParallel(TestSoMaJoNoSent): 135 | def test_xml_01(self): 136 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar . Baz qux

", "

alpha . Beta gamma

"], parallel=2) 137 | 138 | def test_xml_02(self): 139 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar . Baz qux

", "

alpha . Beta gamma

"], parallel=2) 140 | 141 | 142 | class TestXMLStripTags(TestSoMaJo): 143 | def test_xml_01(self): 144 | self._equal_xml("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], strip_tags=True) 145 | 146 | def test_xml_02(self): 147 | self._equal_xml_file("\n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"], strip_tags=True) 148 | 149 | 150 | class TestXMLPruneTags(TestSoMaJo): 151 | def test_xml_01(self): 152 | self._equal_xml("\n \n Spam\n \n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"], prune_tags=["head"]) 153 | 154 | def test_xml_02(self): 155 | self._equal_xml_file("\n \n Spam\n \n \n

Foo bar. Baz qux

\n

alpha. Beta gamma

\n \n", ["

Foo bar .", "Baz qux

", "

alpha .", "Beta gamma

"], prune_tags=["head"]) 156 | 157 | 158 | class TestCharacterOffsets(TestSoMaJo): 159 | def setUp(self): 160 | """Necessary preparations""" 161 | self.tokenizer = SoMaJo("de_CMC", character_offsets=True) 162 | 163 | def _equal_offsets_text_file(self, paragraphs, tokenized_sentences, parallel=1): 164 | raw = "\n\n".join(paragraphs) 165 | pseudofile = io.StringIO(raw) 166 | sentences = self.tokenizer.tokenize_text_file(pseudofile, paragraph_separator="empty_lines", parallel=parallel) 167 | sentences = list(sentences) 168 | tokens = [[t.text for t in s] for s in sentences] 169 | self.assertEqual(tokens, [ts.split() for ts in tokenized_sentences]) 170 | offsets = [[t.character_offset for t in s] for s in sentences] 171 | extracted = [[raw[s:e] for s, e in sent] for sent in offsets] 172 | self.assertEqual(tokens, extracted) 173 | 174 | def _equal_offsets_xml(self, xml, tokenized_sentences, strip_tags=False, parallel=1, prune_tags=None): 175 | eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split() 176 | sentences = self.tokenizer.tokenize_xml(xml, eos_tags, strip_tags=strip_tags, parallel=parallel, prune_tags=prune_tags) 177 | sentences = list(sentences) 178 | tokens = [[t.text for t in s] for s in sentences] 179 | self.assertEqual(tokens, [ts.split() for ts in tokenized_sentences]) 180 | offsets = [[t.character_offset for t in s] for s in sentences] 181 | extracted = [[xml[s:e] for s, e in sent] for sent in offsets] 182 | self.assertEqual(tokens, extracted) 183 | 184 | def test_text_offsets_01(self): 185 | self._equal_offsets_text_file(["Foo bar. Baz qux", "alpha. Beta gamma"], ["Foo bar .", "Baz qux", "alpha .", "Beta gamma"]) 186 | 187 | def test_xml_offsets_01(self): 188 | self._equal_offsets_xml("

bar

baz

", ["

bar

", "

baz

"]) 189 | 190 | def test_xml_offsets_02(self): 191 | self._equal_offsets_xml("\n

\nbar\n

\n

\nbaz\n

\n
", ["

bar

", "

baz

"]) 192 | -------------------------------------------------------------------------------- /utils/errors_test.txt: -------------------------------------------------------------------------------- 1 | __________________________________________________________________________________________________ 2 | tmp/web_test_009.txt ..._standard/test_web/tokenized/web_test_009.txt 3 | 4 | False Positive (linebreak inserted left): 5 | 169: Pops 169: Pops 6 | 170: 170: 7 | 171: * 1. 171: * 1.1. 8 | 172: * 1. 172: Kuchen 9 | 173: Kuchen 173: für 10 | 174: für 174: Cake 11 | 12 | False Positive (linebreak inserted left): 13 | 311: . 310: . 14 | 312: 311: 15 | 313: * 1. 312: * 1.2. 16 | 314: * 2. 313: Kuchen 17 | 315: Kuchen 314: für 18 | 316: für 315: Cake 19 | 20 | False Positive (linebreak inserted left): 21 | 448: . 446: . 22 | 449: 447: 23 | 450: * 1. 448: * 1.3. 24 | 451: * 3. 449: Kekse 25 | 452: Kekse 450: für 26 | 453: für 451: Cake 27 | 28 | __________________________________________________________________________________________________ 29 | tmp/web_test_011.txt ..._standard/test_web/tokenized/web_test_011.txt 30 | 31 | False Negative (linebreak inserted right): 32 | 238: man 238: man 33 | 239: sie 239: sie 34 | 240: * 1829/30 240: * 1829 35 | 241: in 241: * / 36 | 242: das 242: 30 37 | 243: Herrschaftliche 243: in 38 | 39 | False Negative (linebreak inserted right): 40 | 238: man 239: sie 41 | 239: sie 240: 1829 42 | 240: * 1829/30 241: * / 43 | 241: in 242: * 30 44 | 242: das 243: in 45 | 243: Herrschaftliche 244: das 46 | 47 | False Negative (linebreak inserted right): 48 | 411: Garten 413: Garten 49 | 412: musste 414: musste 50 | 413: * 1864/65 415: * 1864 51 | 414: dem 416: * / 52 | 415: Bau 417: 65 53 | 416: des 418: dem 54 | 55 | False Negative (linebreak inserted right): 56 | 411: Garten 414: musste 57 | 412: musste 415: 1864 58 | 413: * 1864/65 416: * / 59 | 414: dem 417: * 65 60 | 415: Bau 418: dem 61 | 416: des 419: Bau 62 | 63 | __________________________________________________________________________________________________ 64 | tmp/web_test_002.txt ..._standard/test_web/tokenized/web_test_002.txt 65 | 66 | False Negative (linebreak inserted right): 67 | 99: der 99: der 68 | 100: Saison 100: Saison 69 | 101: * 2009/2010 101: * 2009 70 | 102: sind 102: * / 71 | 103: laut 103: 2010 72 | 104: einer 104: sind 73 | 74 | False Negative (linebreak inserted right): 75 | 99: der 100: Saison 76 | 100: Saison 101: 2009 77 | 101: * 2009/2010 102: * / 78 | 102: sind 103: * 2010 79 | 103: laut 104: sind 80 | 104: einer 105: laut 81 | 82 | __________________________________________________________________________________________________ 83 | tmp/web_test_012.txt ..._standard/test_web/tokenized/web_test_012.txt 84 | 85 | False Positive (linebreak inserted left): 86 | 660: Backlinks 660: Backlinks 87 | 661: : 661: : 88 | 662: * [ 662: * [[ 89 | 663: * [ 663: security 90 | 664: security 664: : 91 | 665: : 665: verschlüsselung 92 | 93 | False Positive (linebreak inserted left): 94 | 665: : 664: : 95 | 666: verschlüsselung 665: verschlüsselung 96 | 667: * ] 666: * ]] 97 | 668: * ] 667: 98 | 669: 668: Navigation 99 | 670: Navigation 669: Passwort- 100 | 101 | __________________________________________________________________________________________________ 102 | tmp/web_test_004.txt ..._standard/test_web/tokenized/web_test_004.txt 103 | 104 | False Positive (linebreak inserted left): 105 | 141: Telekommunikationsgeheimnis 141: Telekommunikationsgeheimnis 106 | 142: ( 142: ( 107 | 143: * Art 143: * 
Art. 108 | 144: * . 144: 10 109 | 145: 10 145: GG 110 | 146: GG 146: , 111 | 112 | False Positive (linebreak inserted left): 113 | 146: GG 145: GG 114 | 147: , 146: , 115 | 148: * Art 147: * Art. 116 | 149: * . 148: 8 117 | 150: 8 149: Abs. 118 | 151: Abs. 150: 1 119 | 120 | False Positive (linebreak inserted left): 121 | 153: EMRK 151: EMRK 122 | 154: , 152: , 123 | 155: * Art 153: * Art. 124 | 156: * . 154: 7 125 | 157: 7 155: EU-GrCh 126 | 158: EU-GrCh 156: ) 127 | 128 | False Positive (linebreak inserted left): 129 | 173: gewährleistet 170: gewährleistet 130 | 174: . 171: . 131 | 175: * Art 172: * Art. 132 | 176: * . 173: 10 133 | 177: 10 174: GG 134 | 178: GG 175: sagt 135 | 136 | False Positive (linebreak inserted left): 137 | 270: Fernmeldegeheimnisses 266: Fernmeldegeheimnisses 138 | 271: in 267: in 139 | 272: * Art 268: * Art. 140 | 273: * . 269: 8 141 | 274: 8 270: EMRK 142 | 275: EMRK 271: und 143 | 144 | False Positive (linebreak inserted left): 145 | 275: EMRK 270: EMRK 146 | 276: und 271: und 147 | 277: * Art 272: * Art. 148 | 278: * . 273: 7 149 | 279: 7 274: EU-GrCh 150 | 280: EU-GrCh 275: : 151 | 152 | False Positive (linebreak inserted left): 153 | 448: Widerstandsrechts 442: Widerstandsrechts 154 | 449: ( 443: ( 155 | 450: * Art 444: * Art. 156 | 451: * . 445: 20 157 | 452: 20 446: Abs. 158 | 453: Abs. 447: 4 159 | 160 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | # CHANGELOG # 2 | 3 | ## Version 2.4.3, 2024-08-05 ## 4 | 5 | - Move non-abbreviation tokens that should not be split from 6 | `single_token_abbreviations_.txt` to 7 | `single_tokens_.txt` and add cellular networks generations 8 | (issue #32). 9 | 10 | ## Version 2.4.2, 2024-02-10 ## 11 | 12 | - Fix issues #28 and #29 (markdown links with trailing symbols after 13 | URL part). 14 | 15 | ## Version 2.4.1, 2024-02-09 ## 16 | 17 | - Fix issue #27 (URLs in angle brackets). 18 | 19 | ## Version 2.4.0, 2023-12-23 ## 20 | 21 | - New feature: SoMaJo can output character offsets for tokens, 22 | allowing for stand-off tokenization. Pass `character_offsets=True` 23 | to the constructor or use the option `--character-offsets` on the 24 | command line to enable the feature. The character offsets are 25 | determined by aligning the tokenized output with the input, 26 | therefore activating the feature incurs a noticeable increase in 27 | processing time. 28 | 29 | ## Version 2.3.1, 2023-09-23 ## 30 | 31 | - Fix issue #26 (markdown links that contain a URL in the link text). 32 | 33 | ## Version 2.3.0, 2023-08-14 ## 34 | 35 | - **Potentially breaking change:** The somajo-tokenizer script is 36 | automatically created upon installation and bin/somajo-tokenizer is 37 | removed. For most users, this does not make a difference. If you 38 | used to run your own modified version of SoMaJo directly via 39 | bin/somajo-tokenizer, consider installing the project in editable 40 | mode (see Development section in README.md). 41 | - Switch from setup.py to pyconfig.toml and restructure the project 42 | (source in src, tests in tests). 43 | - When creating a Token object, only known token classes can be 44 | passed. 45 | - Fix issue #25 (dates at the end of sentences) 46 | 47 | ## Version 2.2.4, 2023-06-23 ## 48 | 49 | - Improvements to tokenization of words containing numbers (e.g. 50 | COVID-19-Pandemie, FFP2-Maske). 
51 | 52 | ## Version 2.2.3, 2023-02-02 ## 53 | 54 | - Improvements to tokenization: Roman ordinals, abbreviation “Art.” 55 | preceding a number, certain units of measurement at the end of a 56 | sentence (e.g. km/h). 57 | 58 | ## Version 2.2.2, 2022-09-12 ## 59 | 60 | - Bugfix: Command-line option --sentence_tag implies option --split_sentences. 61 | 62 | ## Version 2.2.1, 2022-03-08 ## 63 | 64 | - Bugfix: Command-line option --strip-tags implies option --xml. 65 | 66 | ## Version 2.2.0, 2022-01-18 ## 67 | 68 | - New feature: Prune XML tags and their contents from the input before 69 | tokenization (via the command line option --prune TAGNAME1 --prune 70 | TAGNAME2 … or by passing prune_tags=["TAGNAME1", "TAGNAME2", …] to 71 | tokenize_xml or tokenize_xml_file). This can be useful when 72 | processing HTML files, e.g. for removing any
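
The prune-tags and character-offset features described in the changelog entries above correspond to the public API exercised in tests/test_somajo.py earlier in this listing. The following sketch shows how the two options can be combined; it assumes only the calls visible in those tests (`SoMaJo("de_CMC", character_offsets=True)`, `tokenize_xml(data, eos_tags, prune_tags=...)`, and the `Token.text` / `Token.character_offset` attributes). The sample XML string, the chosen tag lists, and the variable names are purely illustrative, not taken from the repository.

```python
from somajo import SoMaJo

# Illustrative input only; any XML fragment with sentence-ending elements works.
xml = "<html><body><p>Foo bar. Baz qux</p><script>var x = 1;</script></body></html>"

tokenizer = SoMaJo("de_CMC", character_offsets=True)
sentences = tokenizer.tokenize_xml(
    xml,
    "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split(),  # eos_tags, as in the tests
    prune_tags=["script"],  # drop <script> elements and their contents before tokenizing
)
for sentence in sentences:
    for token in sentence:
        start, end = token.character_offset
        # xml[start:end] is the span of the untokenized input that yielded this token
        print(token.text, start, end, repr(xml[start:end]))
```

As the offset tests above suggest, markup tokens are reported alongside word tokens, and slicing the original input with each token's character offsets reproduces the token's surface form, which is what makes stand-off annotation on the unmodified input possible.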