├── .github └── workflows │ ├── benchmark.yaml │ └── release.yaml ├── .gitignore ├── LICENSE ├── README.md ├── contrib ├── benchmark.py ├── test-data │ ├── clean │ ├── clean.enzh.10 │ ├── clean.zhen.10 │ ├── dirty │ ├── medium │ ├── test_enzh_config.expected.out │ ├── test_enzh_config_plain_expected.log │ ├── test_enzh_tags_advanced_config.expected.out │ ├── test_enzh_tags_stage_config.expected.out │ ├── test_zhen_config.expected.out │ ├── test_zhen_config_prefix.expected.out │ └── vocab.zhen.spm ├── test_enzh_config.yml ├── test_enzh_config_plain.yml ├── test_enzh_noise_config.yml ├── test_enzh_tags_advanced_config.yml ├── test_enzh_tags_stage_config.yml ├── test_full_config.yml ├── test_zhen_config.yml ├── test_zhen_prefix_config.yml └── train_config.yml ├── pyproject.toml ├── requirements.txt ├── run-tests.sh ├── src └── opustrainer │ ├── __init__.py │ ├── __main__.py │ ├── alignments.py │ ├── logger.py │ ├── modifiers │ ├── __init__.py │ ├── merge.py │ ├── noise.py │ ├── placeholders.py │ ├── pool.py │ ├── prefix.py │ ├── punctuation.py │ ├── retokenize.py │ ├── surface.py │ └── typos.py │ ├── shuffle.py │ ├── tokenizers.py │ ├── trainer.py │ └── types.py └── tests ├── test_endtoend.py ├── test_logger.py ├── test_merge.py ├── test_noise.py ├── test_placeholders.py ├── test_prefixes.py ├── test_punctuation.py ├── test_retokenizer.py ├── test_tokenizers.py ├── test_trainer.py ├── test_trainer_cli.py └── test_typos.py /.github/workflows/benchmark.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/.github/workflows/benchmark.yaml -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/.github/workflows/release.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/README.md -------------------------------------------------------------------------------- /contrib/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/benchmark.py -------------------------------------------------------------------------------- /contrib/test-data/clean: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/clean -------------------------------------------------------------------------------- /contrib/test-data/clean.enzh.10: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/clean.enzh.10 -------------------------------------------------------------------------------- /contrib/test-data/clean.zhen.10: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/clean.zhen.10 -------------------------------------------------------------------------------- /contrib/test-data/dirty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/dirty -------------------------------------------------------------------------------- /contrib/test-data/medium: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/medium -------------------------------------------------------------------------------- /contrib/test-data/test_enzh_config.expected.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/test_enzh_config.expected.out -------------------------------------------------------------------------------- /contrib/test-data/test_enzh_config_plain_expected.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/test_enzh_config_plain_expected.log -------------------------------------------------------------------------------- /contrib/test-data/test_enzh_tags_advanced_config.expected.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/test_enzh_tags_advanced_config.expected.out -------------------------------------------------------------------------------- /contrib/test-data/test_enzh_tags_stage_config.expected.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/test_enzh_tags_stage_config.expected.out -------------------------------------------------------------------------------- /contrib/test-data/test_zhen_config.expected.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/test_zhen_config.expected.out -------------------------------------------------------------------------------- /contrib/test-data/test_zhen_config_prefix.expected.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/test_zhen_config_prefix.expected.out -------------------------------------------------------------------------------- /contrib/test-data/vocab.zhen.spm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test-data/vocab.zhen.spm -------------------------------------------------------------------------------- /contrib/test_enzh_config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test_enzh_config.yml -------------------------------------------------------------------------------- /contrib/test_enzh_config_plain.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test_enzh_config_plain.yml -------------------------------------------------------------------------------- /contrib/test_enzh_noise_config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test_enzh_noise_config.yml -------------------------------------------------------------------------------- /contrib/test_enzh_tags_advanced_config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test_enzh_tags_advanced_config.yml -------------------------------------------------------------------------------- /contrib/test_enzh_tags_stage_config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test_enzh_tags_stage_config.yml -------------------------------------------------------------------------------- /contrib/test_full_config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test_full_config.yml -------------------------------------------------------------------------------- /contrib/test_zhen_config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test_zhen_config.yml -------------------------------------------------------------------------------- /contrib/test_zhen_prefix_config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/test_zhen_prefix_config.yml -------------------------------------------------------------------------------- /contrib/train_config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/contrib/train_config.yml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/requirements.txt -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/run-tests.sh -------------------------------------------------------------------------------- /src/opustrainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/opustrainer/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/__main__.py -------------------------------------------------------------------------------- /src/opustrainer/alignments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/alignments.py -------------------------------------------------------------------------------- /src/opustrainer/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/logger.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/__init__.py: -------------------------------------------------------------------------------- 1 | from opustrainer.types import Modifier -------------------------------------------------------------------------------- /src/opustrainer/modifiers/merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/merge.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/noise.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/noise.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/placeholders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/placeholders.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/pool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/pool.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/prefix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/prefix.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/punctuation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/punctuation.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/retokenize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/retokenize.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/surface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/surface.py -------------------------------------------------------------------------------- /src/opustrainer/modifiers/typos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/modifiers/typos.py -------------------------------------------------------------------------------- /src/opustrainer/shuffle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/shuffle.py -------------------------------------------------------------------------------- /src/opustrainer/tokenizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/tokenizers.py -------------------------------------------------------------------------------- /src/opustrainer/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/trainer.py -------------------------------------------------------------------------------- /src/opustrainer/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/src/opustrainer/types.py -------------------------------------------------------------------------------- /tests/test_endtoend.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_endtoend.py -------------------------------------------------------------------------------- /tests/test_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_logger.py -------------------------------------------------------------------------------- /tests/test_merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_merge.py -------------------------------------------------------------------------------- /tests/test_noise.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_noise.py -------------------------------------------------------------------------------- /tests/test_placeholders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_placeholders.py -------------------------------------------------------------------------------- /tests/test_prefixes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_prefixes.py -------------------------------------------------------------------------------- /tests/test_punctuation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_punctuation.py -------------------------------------------------------------------------------- /tests/test_retokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_retokenizer.py -------------------------------------------------------------------------------- /tests/test_tokenizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_tokenizers.py -------------------------------------------------------------------------------- /tests/test_trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_trainer.py -------------------------------------------------------------------------------- /tests/test_trainer_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_trainer_cli.py -------------------------------------------------------------------------------- /tests/test_typos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusTrainer/HEAD/tests/test_typos.py --------------------------------------------------------------------------------