├── .github └── workflows │ └── test.yml ├── .gitignore ├── CHANGES.txt ├── LICENSE.txt ├── README.md ├── README.rst ├── doc ├── Makefile ├── build │ └── markdown │ │ └── somajo.md ├── make.bat └── source │ ├── conf.py │ ├── index.rst │ ├── modules.rst │ └── somajo.rst ├── pyproject.toml ├── requirements_dev.txt ├── src └── somajo │ ├── __init__.py │ ├── alignment.py │ ├── cli.py │ ├── data │ ├── abbreviations_de.txt │ ├── abbreviations_en.txt │ ├── camel_case_tokens.txt │ ├── eos_abbreviations.txt │ ├── non-breaking_hyphenated_words_en.txt │ ├── non-breaking_prefixes_en.txt │ ├── non-breaking_suffixes_en.txt │ ├── single_token_abbreviations_de.txt │ ├── single_token_abbreviations_en.txt │ ├── single_tokens_de.txt │ ├── single_tokens_en.txt │ ├── tokens_with_plus_or_ampersand.txt │ └── units.txt │ ├── doubly_linked_list.py │ ├── sentence_splitter.py │ ├── somajo.py │ ├── token.py │ ├── tokenizer.py │ └── utils.py ├── tests ├── __init__.py ├── test_alignment.py ├── test_doubly_linked_list.py ├── test_sentence_splitter.py ├── test_somajo.py ├── test_token.py ├── test_tokenizer.py ├── test_tokenizer_internal.py └── test_utils.py └── utils ├── annotate_cmc.sh ├── annotate_web.sh ├── baseline.sh ├── errors_baseline_test.txt ├── errors_test.txt ├── errors_train.txt ├── errors_trial.txt ├── evaluate.py ├── evaluate_on_ewt.sh ├── evaluate_on_gum.sh ├── evaluate_on_konvens.sh ├── evaluate_on_test_cmc.sh ├── evaluate_on_test_web.sh └── run_tests.sh /.github/workflows/test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/.github/workflows/test.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/.gitignore -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/CHANGES.txt -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/LICENSE.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/README.md -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/README.rst -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/doc/Makefile -------------------------------------------------------------------------------- /doc/build/markdown/somajo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/doc/build/markdown/somajo.md -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/doc/make.bat -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/doc/source/conf.py -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/doc/source/index.rst -------------------------------------------------------------------------------- /doc/source/modules.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/doc/source/modules.rst -------------------------------------------------------------------------------- /doc/source/somajo.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/doc/source/somajo.rst -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/requirements_dev.txt -------------------------------------------------------------------------------- /src/somajo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/__init__.py -------------------------------------------------------------------------------- /src/somajo/alignment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/alignment.py -------------------------------------------------------------------------------- /src/somajo/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/cli.py -------------------------------------------------------------------------------- /src/somajo/data/abbreviations_de.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/abbreviations_de.txt -------------------------------------------------------------------------------- /src/somajo/data/abbreviations_en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/abbreviations_en.txt -------------------------------------------------------------------------------- /src/somajo/data/camel_case_tokens.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/camel_case_tokens.txt -------------------------------------------------------------------------------- /src/somajo/data/eos_abbreviations.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/eos_abbreviations.txt -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_hyphenated_words_en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/non-breaking_hyphenated_words_en.txt -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_prefixes_en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/non-breaking_prefixes_en.txt -------------------------------------------------------------------------------- /src/somajo/data/non-breaking_suffixes_en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/non-breaking_suffixes_en.txt -------------------------------------------------------------------------------- /src/somajo/data/single_token_abbreviations_de.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/single_token_abbreviations_de.txt -------------------------------------------------------------------------------- /src/somajo/data/single_token_abbreviations_en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/single_token_abbreviations_en.txt -------------------------------------------------------------------------------- /src/somajo/data/single_tokens_de.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/single_tokens_de.txt -------------------------------------------------------------------------------- /src/somajo/data/single_tokens_en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/single_tokens_en.txt -------------------------------------------------------------------------------- /src/somajo/data/tokens_with_plus_or_ampersand.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/tokens_with_plus_or_ampersand.txt -------------------------------------------------------------------------------- /src/somajo/data/units.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/data/units.txt -------------------------------------------------------------------------------- /src/somajo/doubly_linked_list.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/doubly_linked_list.py -------------------------------------------------------------------------------- /src/somajo/sentence_splitter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/sentence_splitter.py -------------------------------------------------------------------------------- /src/somajo/somajo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/somajo.py -------------------------------------------------------------------------------- /src/somajo/token.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/token.py -------------------------------------------------------------------------------- /src/somajo/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/tokenizer.py -------------------------------------------------------------------------------- /src/somajo/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/src/somajo/utils.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_alignment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/tests/test_alignment.py -------------------------------------------------------------------------------- /tests/test_doubly_linked_list.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/tests/test_doubly_linked_list.py -------------------------------------------------------------------------------- /tests/test_sentence_splitter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/tests/test_sentence_splitter.py -------------------------------------------------------------------------------- /tests/test_somajo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/tests/test_somajo.py -------------------------------------------------------------------------------- /tests/test_token.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/tests/test_token.py -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/tests/test_tokenizer.py -------------------------------------------------------------------------------- /tests/test_tokenizer_internal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/tests/test_tokenizer_internal.py -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/tests/test_utils.py -------------------------------------------------------------------------------- /utils/annotate_cmc.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/annotate_cmc.sh -------------------------------------------------------------------------------- /utils/annotate_web.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/annotate_web.sh -------------------------------------------------------------------------------- /utils/baseline.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/baseline.sh -------------------------------------------------------------------------------- /utils/errors_baseline_test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/errors_baseline_test.txt -------------------------------------------------------------------------------- /utils/errors_test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/errors_test.txt -------------------------------------------------------------------------------- /utils/errors_train.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/errors_train.txt -------------------------------------------------------------------------------- /utils/errors_trial.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/errors_trial.txt -------------------------------------------------------------------------------- /utils/evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/evaluate.py -------------------------------------------------------------------------------- /utils/evaluate_on_ewt.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/evaluate_on_ewt.sh -------------------------------------------------------------------------------- /utils/evaluate_on_gum.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/evaluate_on_gum.sh -------------------------------------------------------------------------------- /utils/evaluate_on_konvens.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/evaluate_on_konvens.sh -------------------------------------------------------------------------------- /utils/evaluate_on_test_cmc.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/evaluate_on_test_cmc.sh -------------------------------------------------------------------------------- /utils/evaluate_on_test_web.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/evaluate_on_test_web.sh -------------------------------------------------------------------------------- /utils/run_tests.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsproisl/SoMaJo/HEAD/utils/run_tests.sh --------------------------------------------------------------------------------