├── .gitignore ├── .pylintrc ├── .python-version ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── data ├── general_abbreviations.json ├── legal_abbreviations.json ├── test.jsonl.gz └── train2.jsonl.gz ├── docs ├── algorithm.md ├── api-reference.md ├── getting-started.md └── training-guide.md ├── mypy.ini ├── nupunkt ├── __init__.py ├── _version.py ├── cli.py ├── cli_model_utils.py ├── core │ ├── __init__.py │ ├── base.py │ ├── constants.py │ ├── language_vars.py │ ├── parameters.py │ └── tokens.py ├── evaluation │ ├── __init__.py │ ├── dataset.py │ ├── evaluator.py │ └── metrics.py ├── hybrid │ ├── README.md │ ├── __init__.py │ ├── adaptive_tokenizer.py │ ├── confidence_tokenizer.py │ └── confidence_tokenizer_fixed.py ├── models │ ├── __init__.py │ └── default_model.json.gz ├── optimization │ ├── __init__.py │ ├── discovery.py │ └── hyperparameter.py ├── py.typed ├── tokenizers │ ├── __init__.py │ ├── paragraph_tokenizer.py │ └── sentence_tokenizer.py ├── trainers │ ├── __init__.py │ └── base_trainer.py ├── training │ ├── __init__.py │ ├── core.py │ ├── hyperparameters.py │ └── optimizer.py ├── utils │ ├── __init__.py │ ├── compression.py │ ├── iteration.py │ ├── paths.py │ └── statistics.py └── workflows │ ├── __init__.py │ └── automated_training.py ├── pyproject.toml ├── scripts ├── prepare_mixed_training_data.py └── profiling │ ├── __init__.py │ ├── profile_comparison.py │ ├── profile_sent_tokenize.py │ ├── profile_sent_tokenize_adaptive.py │ └── profiling_utils.py └── tests ├── __init__.py ├── conftest.py ├── test_adaptive_api.py ├── test_adaptive_spans.py ├── test_evaluation.py ├── test_evaluation_dataset.py ├── test_evaluation_metrics.py ├── test_hybrid_tokenizer.py ├── test_hyperparameters.py ├── test_language_vars.py ├── test_model_loading.py ├── test_model_persistence.py ├── test_paragraph_tokenizer.py ├── test_parameters.py ├── test_paths.py ├── test_sentence_spans.py ├── test_sentence_tokenizer.py ├── test_span_consistency.py ├── test_tokens.py ├── test_trainer.py ├── test_training_integration.py └── test_utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/.gitignore -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/.pylintrc -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.13 -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/CHANGELOG.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/README.md -------------------------------------------------------------------------------- /data/general_abbreviations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/data/general_abbreviations.json -------------------------------------------------------------------------------- /data/legal_abbreviations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/data/legal_abbreviations.json -------------------------------------------------------------------------------- /data/test.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/data/test.jsonl.gz -------------------------------------------------------------------------------- /data/train2.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/data/train2.jsonl.gz -------------------------------------------------------------------------------- /docs/algorithm.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/docs/algorithm.md -------------------------------------------------------------------------------- /docs/api-reference.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/docs/api-reference.md -------------------------------------------------------------------------------- /docs/getting-started.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/docs/getting-started.md -------------------------------------------------------------------------------- /docs/training-guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/docs/training-guide.md -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/mypy.ini -------------------------------------------------------------------------------- /nupunkt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/__init__.py -------------------------------------------------------------------------------- /nupunkt/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.0" 2 | -------------------------------------------------------------------------------- /nupunkt/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/cli.py -------------------------------------------------------------------------------- /nupunkt/cli_model_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/cli_model_utils.py -------------------------------------------------------------------------------- /nupunkt/core/__init__.py: -------------------------------------------------------------------------------- 1 | """Core components for nupunkt.""" 2 | -------------------------------------------------------------------------------- /nupunkt/core/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/core/base.py -------------------------------------------------------------------------------- /nupunkt/core/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/core/constants.py -------------------------------------------------------------------------------- /nupunkt/core/language_vars.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/core/language_vars.py -------------------------------------------------------------------------------- /nupunkt/core/parameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/core/parameters.py -------------------------------------------------------------------------------- /nupunkt/core/tokens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/core/tokens.py -------------------------------------------------------------------------------- /nupunkt/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/evaluation/__init__.py -------------------------------------------------------------------------------- /nupunkt/evaluation/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/evaluation/dataset.py -------------------------------------------------------------------------------- /nupunkt/evaluation/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/evaluation/evaluator.py -------------------------------------------------------------------------------- /nupunkt/evaluation/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/evaluation/metrics.py -------------------------------------------------------------------------------- /nupunkt/hybrid/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/hybrid/README.md -------------------------------------------------------------------------------- /nupunkt/hybrid/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/hybrid/__init__.py -------------------------------------------------------------------------------- /nupunkt/hybrid/adaptive_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/hybrid/adaptive_tokenizer.py -------------------------------------------------------------------------------- /nupunkt/hybrid/confidence_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/hybrid/confidence_tokenizer.py -------------------------------------------------------------------------------- /nupunkt/hybrid/confidence_tokenizer_fixed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/hybrid/confidence_tokenizer_fixed.py -------------------------------------------------------------------------------- /nupunkt/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/models/__init__.py -------------------------------------------------------------------------------- /nupunkt/models/default_model.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/models/default_model.json.gz -------------------------------------------------------------------------------- /nupunkt/optimization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/optimization/__init__.py -------------------------------------------------------------------------------- /nupunkt/optimization/discovery.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/optimization/discovery.py -------------------------------------------------------------------------------- /nupunkt/optimization/hyperparameter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/optimization/hyperparameter.py -------------------------------------------------------------------------------- /nupunkt/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nupunkt/tokenizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/tokenizers/__init__.py -------------------------------------------------------------------------------- /nupunkt/tokenizers/paragraph_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/tokenizers/paragraph_tokenizer.py -------------------------------------------------------------------------------- /nupunkt/tokenizers/sentence_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/tokenizers/sentence_tokenizer.py -------------------------------------------------------------------------------- /nupunkt/trainers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/trainers/__init__.py -------------------------------------------------------------------------------- /nupunkt/trainers/base_trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/trainers/base_trainer.py -------------------------------------------------------------------------------- /nupunkt/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/training/__init__.py -------------------------------------------------------------------------------- /nupunkt/training/core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/training/core.py -------------------------------------------------------------------------------- /nupunkt/training/hyperparameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/training/hyperparameters.py -------------------------------------------------------------------------------- /nupunkt/training/optimizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/training/optimizer.py -------------------------------------------------------------------------------- /nupunkt/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/utils/__init__.py -------------------------------------------------------------------------------- /nupunkt/utils/compression.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/utils/compression.py -------------------------------------------------------------------------------- /nupunkt/utils/iteration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/utils/iteration.py -------------------------------------------------------------------------------- /nupunkt/utils/paths.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/utils/paths.py -------------------------------------------------------------------------------- /nupunkt/utils/statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/utils/statistics.py -------------------------------------------------------------------------------- /nupunkt/workflows/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/workflows/__init__.py -------------------------------------------------------------------------------- /nupunkt/workflows/automated_training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/nupunkt/workflows/automated_training.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/pyproject.toml -------------------------------------------------------------------------------- /scripts/prepare_mixed_training_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/scripts/prepare_mixed_training_data.py -------------------------------------------------------------------------------- /scripts/profiling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Profiling scripts for nupunkt performance analysis. 3 | """ -------------------------------------------------------------------------------- /scripts/profiling/profile_comparison.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/scripts/profiling/profile_comparison.py -------------------------------------------------------------------------------- /scripts/profiling/profile_sent_tokenize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/scripts/profiling/profile_sent_tokenize.py -------------------------------------------------------------------------------- /scripts/profiling/profile_sent_tokenize_adaptive.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/scripts/profiling/profile_sent_tokenize_adaptive.py -------------------------------------------------------------------------------- /scripts/profiling/profiling_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/scripts/profiling/profiling_utils.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/conftest.py -------------------------------------------------------------------------------- /tests/test_adaptive_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_adaptive_api.py -------------------------------------------------------------------------------- /tests/test_adaptive_spans.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_adaptive_spans.py -------------------------------------------------------------------------------- /tests/test_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_evaluation.py -------------------------------------------------------------------------------- /tests/test_evaluation_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_evaluation_dataset.py -------------------------------------------------------------------------------- /tests/test_evaluation_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_evaluation_metrics.py -------------------------------------------------------------------------------- /tests/test_hybrid_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_hybrid_tokenizer.py -------------------------------------------------------------------------------- /tests/test_hyperparameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_hyperparameters.py -------------------------------------------------------------------------------- /tests/test_language_vars.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_language_vars.py -------------------------------------------------------------------------------- /tests/test_model_loading.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_model_loading.py -------------------------------------------------------------------------------- /tests/test_model_persistence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_model_persistence.py -------------------------------------------------------------------------------- /tests/test_paragraph_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_paragraph_tokenizer.py -------------------------------------------------------------------------------- /tests/test_parameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_parameters.py -------------------------------------------------------------------------------- /tests/test_paths.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_paths.py -------------------------------------------------------------------------------- /tests/test_sentence_spans.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_sentence_spans.py -------------------------------------------------------------------------------- /tests/test_sentence_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_sentence_tokenizer.py -------------------------------------------------------------------------------- /tests/test_span_consistency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_span_consistency.py -------------------------------------------------------------------------------- /tests/test_tokens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_tokens.py -------------------------------------------------------------------------------- /tests/test_trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_trainer.py -------------------------------------------------------------------------------- /tests/test_training_integration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_training_integration.py -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alea-institute/nupunkt/HEAD/tests/test_utils.py --------------------------------------------------------------------------------