├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── api.md ├── changelog.md ├── contributing.md ├── index.md └── installation.md ├── makefile ├── mkdocs.yml ├── notebooks ├── compute_mapping_tfa.ipynb ├── create_wechsel_mapping.ipynb ├── download_oscar.ipynb ├── export │ └── alignments │ │ ├── translation-table.en-nl.tsv │ │ ├── translation-table[part1].en-tt.tsv │ │ └── translation-table[part2].en-tt.tsv ├── remap_embeddings_v3.py ├── train_spm_tokenizer.py └── weschel_dutch.ipynb ├── pyproject.toml ├── requirements.txt ├── scripts ├── calculate_ppl.py ├── calculate_ppl_MaLA.py └── create_llama3_model.py ├── setup.cfg ├── tests ├── __init__.py └── test_tiktotok.py └── transtokenizers ├── __init__.py └── transtokenizers.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/.gitignore -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.1.0 (2024-01-23) 4 | 5 | * First release on PyPI. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/README.md -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | ::: transtokenizers 2 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../CHANGELOG.md" 3 | %} 4 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../CONTRIBUTING.md" 3 | %} 4 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../README.md" 3 | %} 4 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/docs/installation.md -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/makefile -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/mkdocs.yml -------------------------------------------------------------------------------- /notebooks/compute_mapping_tfa.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/compute_mapping_tfa.ipynb -------------------------------------------------------------------------------- /notebooks/create_wechsel_mapping.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/create_wechsel_mapping.ipynb -------------------------------------------------------------------------------- /notebooks/download_oscar.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/download_oscar.ipynb -------------------------------------------------------------------------------- /notebooks/export/alignments/translation-table.en-nl.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/export/alignments/translation-table.en-nl.tsv -------------------------------------------------------------------------------- /notebooks/export/alignments/translation-table[part1].en-tt.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/export/alignments/translation-table[part1].en-tt.tsv -------------------------------------------------------------------------------- /notebooks/export/alignments/translation-table[part2].en-tt.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/export/alignments/translation-table[part2].en-tt.tsv -------------------------------------------------------------------------------- /notebooks/remap_embeddings_v3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/remap_embeddings_v3.py -------------------------------------------------------------------------------- /notebooks/train_spm_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/train_spm_tokenizer.py -------------------------------------------------------------------------------- /notebooks/weschel_dutch.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/notebooks/weschel_dutch.ipynb -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/calculate_ppl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/scripts/calculate_ppl.py -------------------------------------------------------------------------------- /scripts/calculate_ppl_MaLA.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/scripts/calculate_ppl_MaLA.py -------------------------------------------------------------------------------- /scripts/create_llama3_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/scripts/create_llama3_model.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/setup.cfg -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit test package for transtokenizers.""" 2 | -------------------------------------------------------------------------------- /tests/test_tiktotok.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/tests/test_tiktotok.py -------------------------------------------------------------------------------- /transtokenizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/transtokenizers/__init__.py -------------------------------------------------------------------------------- /transtokenizers/transtokenizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAGoM-NLP/transtokenizer/HEAD/transtokenizers/transtokenizers.py --------------------------------------------------------------------------------