├── .gitattributes ├── .github ├── FUNDING.yml └── workflows │ ├── python-publish.yaml │ └── test.yaml ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── evaluation ├── README.md ├── install_jumanpp.sh ├── jglue │ ├── README.md │ ├── requirements.txt │ └── transformers-4.26.1_jglue-1.1.0_chitra-0.1.8.patch ├── pytorch │ ├── classification_utils.py │ ├── convert_dataset.py │ ├── multiple_choice_utils.py │ ├── qa_utils.py │ ├── requirements.txt │ ├── run_all.sh │ ├── run_evaluation.py │ └── tokenizer_utils.py ├── summary_results.py └── tensorflow │ ├── classification_utils.py │ ├── convert_dataset.py │ ├── multiple_choice_utils.py │ ├── qa_utils.py │ ├── requirements.txt │ ├── run_all.sh │ ├── run_evaluation.py │ └── tokenizer_utils.py ├── misc └── license-header.txt ├── pretraining └── bert │ ├── README.md │ ├── __init__.py │ ├── convert_original_tf2_checkpoint_to_pytorch_nvidia.py │ ├── corpus_preprocessing │ ├── __init__.py │ ├── filter │ │ ├── __init__.py │ │ ├── document_filter │ │ │ ├── __init__.py │ │ │ ├── document_filter.py │ │ │ └── document_filter_name.py │ │ └── sentence_filter │ │ │ ├── __init__.py │ │ │ ├── sentence_filter.py │ │ │ └── sentence_filter_name.py │ └── normalizer │ │ ├── __init__.py │ │ ├── document_normalizer │ │ ├── __init__.py │ │ ├── document_normalizer.py │ │ └── document_normalizer_name.py │ │ └── sentence_normalizer │ │ ├── __init__.py │ │ ├── sentence_normalizer.py │ │ └── sentence_normalizer_name.py │ ├── prepare_dataset.py │ ├── preprocess_dataset.py │ ├── requirements.txt │ ├── resources │ └── ng_words.txt │ ├── run_create_pretraining_data.sh │ ├── run_prepare_dataset.sh │ ├── split_dataset.py │ ├── train_pos_substitution_tokenizer.py │ └── train_wordpiece_tokenizer.py ├── requirements.txt ├── setup.py ├── sudachitra ├── __init__.py ├── conjugation_preserving_normalizer.py ├── input_string_normalizer.py ├── pretokenizer │ ├── __init__.py │ ├── japanese_bert_wordpiece_tokenizer.py │ ├── pos_substitution_tokenizer.py │ └── sudachipy_pretokenizer.py ├── resources │ ├── conjugation_type_table.json │ └── inflection_table.json ├── sudachipy_word_tokenizer.py ├── tokenization_bert_sudachipy.py ├── tokenization_electra_sudachipy.py └── word_formatter.py └── tests ├── __init__.py ├── test_japanese_bert_wordpiece_tokenizer.py └── test_tokenization_bert_sudachipy.py /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/.gitattributes -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/.github/FUNDING.yml -------------------------------------------------------------------------------- /.github/workflows/python-publish.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/.github/workflows/python-publish.yaml -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/.github/workflows/test.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/.gitmodules -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/CHANGELOG.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE requirements.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/README.md -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/README.md -------------------------------------------------------------------------------- /evaluation/install_jumanpp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/install_jumanpp.sh -------------------------------------------------------------------------------- /evaluation/jglue/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/jglue/README.md -------------------------------------------------------------------------------- /evaluation/jglue/requirements.txt: -------------------------------------------------------------------------------- 1 | sudachitra 2 | pytextspan 3 | -------------------------------------------------------------------------------- /evaluation/jglue/transformers-4.26.1_jglue-1.1.0_chitra-0.1.8.patch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/jglue/transformers-4.26.1_jglue-1.1.0_chitra-0.1.8.patch -------------------------------------------------------------------------------- /evaluation/pytorch/classification_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/pytorch/classification_utils.py -------------------------------------------------------------------------------- /evaluation/pytorch/convert_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/pytorch/convert_dataset.py -------------------------------------------------------------------------------- /evaluation/pytorch/multiple_choice_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/pytorch/multiple_choice_utils.py -------------------------------------------------------------------------------- /evaluation/pytorch/qa_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/pytorch/qa_utils.py -------------------------------------------------------------------------------- /evaluation/pytorch/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/pytorch/requirements.txt -------------------------------------------------------------------------------- /evaluation/pytorch/run_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/pytorch/run_all.sh -------------------------------------------------------------------------------- /evaluation/pytorch/run_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/pytorch/run_evaluation.py -------------------------------------------------------------------------------- /evaluation/pytorch/tokenizer_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/pytorch/tokenizer_utils.py -------------------------------------------------------------------------------- /evaluation/summary_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/summary_results.py -------------------------------------------------------------------------------- /evaluation/tensorflow/classification_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/tensorflow/classification_utils.py -------------------------------------------------------------------------------- /evaluation/tensorflow/convert_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/tensorflow/convert_dataset.py -------------------------------------------------------------------------------- /evaluation/tensorflow/multiple_choice_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/tensorflow/multiple_choice_utils.py -------------------------------------------------------------------------------- /evaluation/tensorflow/qa_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/tensorflow/qa_utils.py -------------------------------------------------------------------------------- /evaluation/tensorflow/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/tensorflow/requirements.txt -------------------------------------------------------------------------------- /evaluation/tensorflow/run_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/tensorflow/run_all.sh -------------------------------------------------------------------------------- /evaluation/tensorflow/run_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/tensorflow/run_evaluation.py -------------------------------------------------------------------------------- /evaluation/tensorflow/tokenizer_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/evaluation/tensorflow/tokenizer_utils.py -------------------------------------------------------------------------------- /misc/license-header.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/misc/license-header.txt -------------------------------------------------------------------------------- /pretraining/bert/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/README.md -------------------------------------------------------------------------------- /pretraining/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/__init__.py -------------------------------------------------------------------------------- /pretraining/bert/convert_original_tf2_checkpoint_to_pytorch_nvidia.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/convert_original_tf2_checkpoint_to_pytorch_nvidia.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/__init__.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/filter/__init__.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/filter/document_filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/filter/document_filter/__init__.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/filter/document_filter/document_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/filter/document_filter/document_filter.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/filter/document_filter/document_filter_name.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/filter/document_filter/document_filter_name.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/filter/sentence_filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/filter/sentence_filter/__init__.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/filter/sentence_filter/sentence_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/filter/sentence_filter/sentence_filter.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/filter/sentence_filter/sentence_filter_name.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/filter/sentence_filter/sentence_filter_name.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/normalizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/normalizer/__init__.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/normalizer/document_normalizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/normalizer/document_normalizer/__init__.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/normalizer/document_normalizer/document_normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/normalizer/document_normalizer/document_normalizer.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/normalizer/document_normalizer/document_normalizer_name.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/normalizer/document_normalizer/document_normalizer_name.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/normalizer/sentence_normalizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/normalizer/sentence_normalizer/__init__.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/normalizer/sentence_normalizer/sentence_normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/normalizer/sentence_normalizer/sentence_normalizer.py -------------------------------------------------------------------------------- /pretraining/bert/corpus_preprocessing/normalizer/sentence_normalizer/sentence_normalizer_name.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/corpus_preprocessing/normalizer/sentence_normalizer/sentence_normalizer_name.py -------------------------------------------------------------------------------- /pretraining/bert/prepare_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/prepare_dataset.py -------------------------------------------------------------------------------- /pretraining/bert/preprocess_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/preprocess_dataset.py -------------------------------------------------------------------------------- /pretraining/bert/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/requirements.txt -------------------------------------------------------------------------------- /pretraining/bert/resources/ng_words.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/resources/ng_words.txt -------------------------------------------------------------------------------- /pretraining/bert/run_create_pretraining_data.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/run_create_pretraining_data.sh -------------------------------------------------------------------------------- /pretraining/bert/run_prepare_dataset.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/run_prepare_dataset.sh -------------------------------------------------------------------------------- /pretraining/bert/split_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/split_dataset.py -------------------------------------------------------------------------------- /pretraining/bert/train_pos_substitution_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/train_pos_substitution_tokenizer.py -------------------------------------------------------------------------------- /pretraining/bert/train_wordpiece_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/pretraining/bert/train_wordpiece_tokenizer.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/setup.py -------------------------------------------------------------------------------- /sudachitra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/__init__.py -------------------------------------------------------------------------------- /sudachitra/conjugation_preserving_normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/conjugation_preserving_normalizer.py -------------------------------------------------------------------------------- /sudachitra/input_string_normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/input_string_normalizer.py -------------------------------------------------------------------------------- /sudachitra/pretokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/pretokenizer/__init__.py -------------------------------------------------------------------------------- /sudachitra/pretokenizer/japanese_bert_wordpiece_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/pretokenizer/japanese_bert_wordpiece_tokenizer.py -------------------------------------------------------------------------------- /sudachitra/pretokenizer/pos_substitution_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/pretokenizer/pos_substitution_tokenizer.py -------------------------------------------------------------------------------- /sudachitra/pretokenizer/sudachipy_pretokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/pretokenizer/sudachipy_pretokenizer.py -------------------------------------------------------------------------------- /sudachitra/resources/conjugation_type_table.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/resources/conjugation_type_table.json -------------------------------------------------------------------------------- /sudachitra/resources/inflection_table.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/resources/inflection_table.json -------------------------------------------------------------------------------- /sudachitra/sudachipy_word_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/sudachipy_word_tokenizer.py -------------------------------------------------------------------------------- /sudachitra/tokenization_bert_sudachipy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/tokenization_bert_sudachipy.py -------------------------------------------------------------------------------- /sudachitra/tokenization_electra_sudachipy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/tokenization_electra_sudachipy.py -------------------------------------------------------------------------------- /sudachitra/word_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/sudachitra/word_formatter.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_japanese_bert_wordpiece_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/tests/test_japanese_bert_wordpiece_tokenizer.py -------------------------------------------------------------------------------- /tests/test_tokenization_bert_sudachipy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/SudachiTra/HEAD/tests/test_tokenization_bert_sudachipy.py --------------------------------------------------------------------------------