├── .eggs ├── README.txt ├── pytest_runner-6.0.1-py3.10.egg │ ├── EGG-INFO │ │ ├── LICENSE │ │ ├── PKG-INFO │ │ ├── RECORD │ │ ├── WHEEL │ │ ├── entry_points.txt │ │ ├── requires.txt │ │ └── top_level.txt │ └── ptr │ │ └── __init__.py ├── pytest_runner-6.0.1-py3.11.egg │ ├── EGG-INFO │ │ ├── LICENSE │ │ ├── PKG-INFO │ │ ├── RECORD │ │ ├── WHEEL │ │ ├── entry_points.txt │ │ ├── requires.txt │ │ └── top_level.txt │ └── ptr │ │ └── __init__.py ├── pytest_runner-6.0.1-py3.12.egg │ ├── EGG-INFO │ │ ├── LICENSE │ │ ├── PKG-INFO │ │ ├── RECORD │ │ ├── WHEEL │ │ ├── entry_points.txt │ │ ├── requires.txt │ │ └── top_level.txt │ └── ptr │ │ └── __init__.py └── pytest_runner-6.0.1-py3.7.egg │ ├── EGG-INFO │ ├── LICENSE │ ├── PKG-INFO │ ├── RECORD │ ├── WHEEL │ ├── entry_points.txt │ ├── requires.txt │ └── top_level.txt │ └── ptr │ └── __init__.py ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── LICENSE ├── MANIFEST.in ├── PKG-INFO ├── README.md ├── README.rst ├── SinaTools.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── entry_points.txt ├── not-zip-safe ├── requires.txt └── top_level.txt ├── build └── lib │ └── sinatools │ ├── CLI │ ├── DataDownload │ │ └── download_files.py │ ├── morphology │ │ ├── ALMA_multi_word.py │ │ └── morph_analyzer.py │ ├── ner │ │ ├── corpus_entity_extractor.py │ │ └── entity_extractor.py │ └── utils │ │ ├── __init__.py │ │ ├── arStrip.py │ │ ├── corpus_tokenizer.py │ │ ├── implication.py │ │ ├── jaccard.py │ │ ├── remove_latin.py │ │ ├── remove_punctuation.py │ │ ├── sentence_tokenizer.py │ │ ├── text_dublication_detector.py │ │ └── text_transliteration.py │ ├── DataDownload │ ├── __init__.py │ └── downloader.py │ ├── VERSION │ ├── __init__.py │ ├── arabert │ ├── __init__.py │ ├── arabert │ │ ├── __init__.py │ │ ├── create_classification_data.py │ │ ├── create_pretraining_data.py │ │ ├── extract_features.py │ │ ├── lamb_optimizer.py │ │ ├── modeling.py │ │ ├── optimization.py │ │ ├── run_classifier.py │ │ ├── run_pretraining.py │ │ ├── 
run_squad.py │ │ └── tokenization.py │ ├── araelectra │ │ ├── __init__.py │ │ ├── build_openwebtext_pretraining_dataset.py │ │ ├── build_pretraining_dataset.py │ │ ├── build_pretraining_dataset_single_file.py │ │ ├── configure_finetuning.py │ │ ├── configure_pretraining.py │ │ ├── finetune │ │ │ ├── __init__.py │ │ │ ├── feature_spec.py │ │ │ ├── preprocessing.py │ │ │ ├── scorer.py │ │ │ ├── task.py │ │ │ └── task_builder.py │ │ ├── flops_computation.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── modeling.py │ │ │ ├── optimization.py │ │ │ └── tokenization.py │ │ ├── pretrain │ │ │ ├── __init__.py │ │ │ ├── pretrain_data.py │ │ │ └── pretrain_helpers.py │ │ ├── run_finetuning.py │ │ ├── run_pretraining.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── training_utils.py │ │ │ └── utils.py │ ├── aragpt2 │ │ ├── __init__.py │ │ ├── create_pretraining_data.py │ │ ├── gpt2 │ │ │ ├── __init__.py │ │ │ ├── lamb_optimizer.py │ │ │ ├── optimization.py │ │ │ └── run_pretraining.py │ │ ├── grover │ │ │ ├── __init__.py │ │ │ ├── dataloader.py │ │ │ ├── modeling.py │ │ │ ├── modeling_gpt2.py │ │ │ ├── optimization_adafactor.py │ │ │ ├── train_tpu.py │ │ │ └── utils.py │ │ └── train_bpe_tokenizer.py │ └── preprocess.py │ ├── environment.yml │ ├── install_env.py │ ├── morphology │ ├── ALMA_multi_word.py │ ├── __init__.py │ └── morph_analyzer.py │ ├── ner │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── datasets.py │ │ └── transforms.py │ ├── data_format.py │ ├── datasets.py │ ├── entity_extractor.py │ ├── helpers.py │ ├── metrics.py │ ├── nn │ │ ├── BaseModel.py │ │ ├── BertNestedTagger.py │ │ ├── BertSeqTagger.py │ │ └── __init__.py │ ├── trainers │ │ ├── BaseTrainer.py │ │ ├── BertNestedTrainer.py │ │ ├── BertTrainer.py │ │ └── __init__.py │ └── transforms.py │ ├── relations │ ├── __init__.py │ └── relation_extractor.py │ ├── semantic_relatedness │ ├── __init__.py │ └── compute_relatedness.py │ ├── sinatools.py │ ├── synonyms │ ├── __init__.py │ └── 
synonyms_generator.py │ ├── utils │ ├── __init__.py │ ├── charsets.py │ ├── parser.py │ ├── readfile.py │ ├── similarity.py │ ├── text_dublication_detector.py │ ├── text_transliteration.py │ ├── tokenizer.py │ ├── tokenizers_words.py │ └── word_compare.py │ └── wsd │ ├── __init__.py │ ├── disambiguator.py │ ├── settings.py │ └── wsd.py ├── dist ├── SinaTools-0.1.41-py2.py3-none-any.whl └── sinatools-0.1.41.tar.gz ├── docs ├── Makefile ├── build │ ├── _images │ │ └── download.png │ ├── _static │ │ ├── download.png │ │ ├── file.png │ │ ├── minus.png │ │ └── plus.png │ ├── doctrees │ │ ├── License.doctree │ │ ├── Overview.doctree │ │ ├── about.doctree │ │ ├── api.doctree │ │ └── api │ │ │ ├── DataDownload.doctree │ │ │ ├── DataDownload │ │ │ └── downloader.doctree │ │ │ └── arabiner.doctree │ └── html │ │ ├── _images │ │ ├── SinaLogo.jpg │ │ └── download.png │ │ └── _static │ │ ├── SinaLogo.jpg │ │ ├── download.png │ │ ├── file.png │ │ ├── minus.png │ │ └── plus.png ├── make.bat └── source │ ├── License.rst │ ├── Overview.rst │ ├── _static │ ├── SinaLogo.jpg │ └── download.png │ ├── about.rst │ ├── api.rst │ ├── api │ ├── DataDownload.rst │ ├── DataDownload │ │ └── downloader.rst │ ├── arabiner.rst │ ├── arabiner │ │ └── bin │ │ │ └── infer.rst │ ├── morphology.rst │ ├── morphology │ │ └── morph_analyzer.rst │ ├── salma.rst │ ├── salma │ │ └── views.rst │ ├── utils.rst │ └── utils │ │ ├── corpus_tokenizer.rst │ │ ├── implication.rst │ │ ├── jaccard.rst │ │ ├── parser.rst │ │ ├── sentence_tokenizer.rst │ │ └── text_transliteration.rst │ ├── authors.rst │ ├── cli_tools.rst │ ├── cli_tools │ ├── DataDownload.rst │ ├── DataDownload │ │ ├── download_files.rst │ │ └── get_appdatadir.rst │ ├── arabiner.rst │ ├── arabiner │ │ └── infer.rst │ ├── morphology.rst │ ├── morphology │ │ ├── ALMA_multi_word.rst │ │ └── morph_analyzer.rst │ ├── salma.rst │ ├── salma │ │ └── salma_tools.rst │ ├── utils.rst │ └── utils │ │ ├── arStrip.rst │ │ ├── corpus_tokenizer.rst │ │ ├── 
implication.rst │ │ ├── jaccard.rst │ │ ├── latin_remove.rst │ │ ├── remove_punc.rst │ │ ├── sentence_tokenizer.rst │ │ └── text_transliteration.rst │ ├── conf.py │ ├── index.rst │ ├── installation.rst │ └── readme.rst ├── setup.cfg ├── setup.py └── sinatools ├── CLI ├── DataDownload │ ├── __pycache__ │ │ ├── download_files.cpython-310.pyc │ │ └── download_files.cpython-38.pyc │ └── download_files.py ├── morphology │ ├── ALMA_multi_word.py │ ├── __pycache__ │ │ ├── morph_analyzer.cpython-310.pyc │ │ └── morph_analyzer.cpython-38.pyc │ └── morph_analyzer.py ├── ner │ ├── __pycache__ │ │ ├── corpus_entity_extractor.cpython-38.pyc │ │ ├── entity_extractor.cpython-310.pyc │ │ ├── entity_extractor.cpython-311.pyc │ │ └── entity_extractor.cpython-38.pyc │ ├── corpus_entity_extractor.py │ └── entity_extractor.py ├── relations │ └── relation_extractor.py ├── semantic_relatedness │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── compute_relatedness.cpython-38.pyc │ │ └── settings.cpython-38.pyc │ └── compute_relatedness.py ├── synonyms │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── synonyms_generator.cpython-38.pyc │ ├── evaluate_synonyms.py │ └── extend_synonyms.py ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── arStrip.cpython-38.pyc │ │ ├── corpus_tokenizer.cpython-38.pyc │ │ ├── implication.cpython-38.pyc │ │ ├── jaccard.cpython-38.pyc │ │ ├── remove_latin.cpython-38.pyc │ │ ├── remove_punctuation.cpython-38.pyc │ │ ├── sentence_tokenizer.cpython-38.pyc │ │ └── text_transliteration.cpython-38.pyc │ ├── arStrip.py │ ├── corpus_tokenizer.py │ ├── implication.py │ ├── jaccard.py │ ├── remove_latin.py │ ├── remove_punctuation.py │ ├── sentence_tokenizer.py │ ├── text_dublication_detector.py │ └── text_transliteration.py └── wsd │ ├── __pycache__ │ └── disambiguator.cpython-38.pyc │ └── disambiguator.py ├── DataDownload ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── 
__init__.cpython-38.pyc │ ├── downloader.cpython-310.pyc │ ├── downloader.cpython-311.pyc │ └── downloader.cpython-38.pyc └── downloader.py ├── VERSION ├── __init__.py ├── __pycache__ ├── __init__.cpython-310.pyc ├── __init__.cpython-311.pyc ├── __init__.cpython-38.pyc └── sinatools.cpython-38.pyc ├── arabert ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── preprocess.cpython-310.pyc │ ├── preprocess.cpython-311.pyc │ └── preprocess.cpython-38.pyc └── preprocess.py ├── environment.yml ├── install_env.py ├── morphology ├── ALMA_multi_word.py ├── __init__.py ├── __pycache__ │ ├── ALMA_multi_word.cpython-310.pyc │ ├── ALMA_multi_word.cpython-311.pyc │ ├── ALMA_multi_word.cpython-38.pyc │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── charsets.cpython-310.pyc │ ├── morph_analyzer.cpython-310.pyc │ ├── morph_analyzer.cpython-311.pyc │ ├── morph_analyzer.cpython-38.pyc │ ├── settings.cpython-310.pyc │ └── tokenizers_words.cpython-310.pyc └── morph_analyzer.py ├── ner ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── data.cpython-310.pyc │ ├── data.cpython-38.pyc │ ├── data_format.cpython-310.pyc │ ├── data_format.cpython-311.pyc │ ├── data_format.cpython-38.pyc │ ├── datasets.cpython-310.pyc │ ├── datasets.cpython-38.pyc │ ├── entity_extractor.cpython-310.pyc │ ├── entity_extractor.cpython-311.pyc │ ├── entity_extractor.cpython-38.pyc │ ├── helpers.cpython-310.pyc │ ├── helpers.cpython-311.pyc │ ├── helpers.cpython-38.pyc │ ├── metrics.cpython-310.pyc │ ├── metrics.cpython-311.pyc │ ├── metrics.cpython-38.pyc │ ├── relation_extractor.cpython-38.pyc │ ├── transforms.cpython-310.pyc │ └── transforms.cpython-38.pyc ├── data │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── 
datasets.cpython-310.pyc │ │ ├── datasets.cpython-311.pyc │ │ ├── datasets.cpython-38.pyc │ │ ├── transforms.cpython-310.pyc │ │ ├── transforms.cpython-311.pyc │ │ └── transforms.cpython-38.pyc │ ├── datasets.py │ └── transforms.py ├── data_format.py ├── datasets.py ├── entity_extractor.py ├── helpers.py ├── metrics.py ├── nn │ ├── BaseModel.py │ ├── BertNestedTagger.py │ ├── BertSeqTagger.py │ ├── __init__.py │ └── __pycache__ │ │ ├── BaseModel.cpython-310.pyc │ │ ├── BaseModel.cpython-311.pyc │ │ ├── BaseModel.cpython-38.pyc │ │ ├── BertNestedTagger.cpython-310.pyc │ │ ├── BertNestedTagger.cpython-311.pyc │ │ ├── BertNestedTagger.cpython-38.pyc │ │ ├── BertSeqTagger.cpython-310.pyc │ │ ├── BertSeqTagger.cpython-311.pyc │ │ ├── BertSeqTagger.cpython-38.pyc │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ └── __init__.cpython-38.pyc ├── trainers │ ├── BaseTrainer.py │ ├── BertNestedTrainer.py │ ├── BertTrainer.py │ ├── __init__.py │ └── __pycache__ │ │ ├── BaseTrainer.cpython-310.pyc │ │ ├── BaseTrainer.cpython-311.pyc │ │ ├── BaseTrainer.cpython-38.pyc │ │ ├── BertNestedTrainer.cpython-310.pyc │ │ ├── BertNestedTrainer.cpython-311.pyc │ │ ├── BertNestedTrainer.cpython-38.pyc │ │ ├── BertTrainer.cpython-310.pyc │ │ ├── BertTrainer.cpython-311.pyc │ │ ├── BertTrainer.cpython-38.pyc │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ └── __init__.cpython-38.pyc └── transforms.py ├── relations ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── relation_extractor.cpython-311.pyc │ └── relation_extractor.cpython-38.pyc └── relation_extractor.py ├── semantic_relatedness ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── compute_relatedness.cpython-311.pyc │ ├── compute_relatedness.cpython-38.pyc │ └── settings.cpython-38.pyc └── compute_relatedness.py ├── sinatools.py ├── synonyms ├── __init__.py ├── __pycache__ │ ├── 
__init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── synonyms_generator.cpython-311.pyc │ └── synonyms_generator.cpython-38.pyc └── synonyms_generator.py ├── utils ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── charsets.cpython-310.pyc │ ├── charsets.cpython-311.pyc │ ├── charsets.cpython-38.pyc │ ├── implication.cpython-310.pyc │ ├── implication.cpython-38.pyc │ ├── jaccard.cpython-310.pyc │ ├── jaccard.cpython-38.pyc │ ├── parser.cpython-310.pyc │ ├── parser.cpython-311.pyc │ ├── parser.cpython-38.pyc │ ├── readfile.cpython-310.pyc │ ├── readfile.cpython-38.pyc │ ├── similarity.cpython-311.pyc │ ├── text_dublication_detector.cpython-38.pyc │ ├── text_transliteration.cpython-38.pyc │ ├── tokenizer.cpython-310.pyc │ ├── tokenizer.cpython-311.pyc │ ├── tokenizer.cpython-38.pyc │ ├── tokenizers_words.cpython-310.pyc │ ├── tokenizers_words.cpython-311.pyc │ ├── tokenizers_words.cpython-38.pyc │ ├── word_compare.cpython-311.pyc │ └── word_compare.cpython-38.pyc ├── charsets.py ├── parser.py ├── readfile.py ├── similarity.py ├── text_dublication_detector.py ├── text_transliteration.py ├── tokenizer.py ├── tokenizers_words.py └── word_compare.py └── wsd ├── __init__.py ├── __pycache__ ├── __init__.cpython-310.pyc ├── __init__.cpython-311.pyc ├── __init__.cpython-38.pyc ├── disambiguator.cpython-310.pyc ├── disambiguator.cpython-311.pyc ├── disambiguator.cpython-38.pyc ├── settings.cpython-310.pyc ├── settings.cpython-311.pyc ├── settings.cpython-38.pyc ├── views.cpython-38.pyc ├── wsd.cpython-310.pyc ├── wsd.cpython-311.pyc └── wsd.cpython-38.pyc ├── disambiguator.py ├── settings.py └── wsd.py /.eggs/README.txt: -------------------------------------------------------------------------------- 1 | This directory contains eggs that were downloaded by setuptools to build, test, and run plug-ins. 2 | 3 | This directory caches those eggs to prevent repeated downloads. 
4 | 5 | However, it is safe to delete this directory. 6 | 7 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Jason R. Coombs 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/RECORD: -------------------------------------------------------------------------------- 1 | ptr/__init__.py,sha256=0UfzhCooVgCNTBwVEOPOVGEPck4pnl_6PTfsC-QzNGM,6730 2 | pytest_runner-6.0.1.dist-info/LICENSE,sha256=2z8CRrH5J48VhFuZ_sR4uLUG63ZIeZNyL4xuJUKF-vg,1050 3 | pytest_runner-6.0.1.dist-info/METADATA,sha256=Ho3FvAFjFHeY5OQ64WFzkLigFaIpuNr4G3uSmOk3nho,7319 4 | pytest_runner-6.0.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92 5 | pytest_runner-6.0.1.dist-info/entry_points.txt,sha256=BqezBqeO63XyzSYmHYE58gKEFIjJUd-XdsRQkXHy2ig,58 6 | pytest_runner-6.0.1.dist-info/top_level.txt,sha256=DPzHbWlKG8yq8EOD5UgEvVNDWeJRPyimrwfShwV6Iuw,4 7 | pytest_runner-6.0.1.dist-info/RECORD,, 8 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/WHEEL: -------------------------------------------------------------------------------- 1 | Wheel-Version: 1.0 2 | Generator: bdist_wheel (0.42.0) 3 | Root-Is-Purelib: true 4 | Tag: py3-none-any 5 | 6 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/entry_points.txt: -------------------------------------------------------------------------------- 1 | [distutils.commands] 2 | ptr = ptr:PyTest 3 | pytest = ptr:PyTest 4 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/requires.txt: -------------------------------------------------------------------------------- 1 | 2 | [docs] 3 | sphinx 4 | jaraco.packaging>=9 5 | rst.linker>=1.9 6 | jaraco.tidelift>=1.4 7 | 8 | [testing] 9 | pytest>=6 10 | pytest-checkdocs>=2.4 11 | pytest-flake8 12 | pytest-cov 13 | pytest-enabler>=1.0.1 14 | pytest-virtualenv 15 | types-setuptools 16 | pytest-black>=0.3.7 17 | 
pytest-mypy>=0.9.1 18 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/top_level.txt: -------------------------------------------------------------------------------- 1 | ptr 2 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Jason R. Coombs 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/RECORD: -------------------------------------------------------------------------------- 1 | ptr/__init__.py,sha256=0UfzhCooVgCNTBwVEOPOVGEPck4pnl_6PTfsC-QzNGM,6730 2 | pytest_runner-6.0.1.dist-info/LICENSE,sha256=2z8CRrH5J48VhFuZ_sR4uLUG63ZIeZNyL4xuJUKF-vg,1050 3 | pytest_runner-6.0.1.dist-info/METADATA,sha256=Ho3FvAFjFHeY5OQ64WFzkLigFaIpuNr4G3uSmOk3nho,7319 4 | pytest_runner-6.0.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92 5 | pytest_runner-6.0.1.dist-info/entry_points.txt,sha256=BqezBqeO63XyzSYmHYE58gKEFIjJUd-XdsRQkXHy2ig,58 6 | pytest_runner-6.0.1.dist-info/top_level.txt,sha256=DPzHbWlKG8yq8EOD5UgEvVNDWeJRPyimrwfShwV6Iuw,4 7 | pytest_runner-6.0.1.dist-info/RECORD,, 8 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/WHEEL: -------------------------------------------------------------------------------- 1 | Wheel-Version: 1.0 2 | Generator: bdist_wheel (0.42.0) 3 | Root-Is-Purelib: true 4 | Tag: py3-none-any 5 | 6 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/entry_points.txt: -------------------------------------------------------------------------------- 1 | [distutils.commands] 2 | ptr = ptr:PyTest 3 | pytest = ptr:PyTest 4 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/requires.txt: -------------------------------------------------------------------------------- 1 | 2 | [docs] 3 | sphinx 4 | jaraco.packaging>=9 5 | rst.linker>=1.9 6 | jaraco.tidelift>=1.4 7 | 8 | [testing] 9 | pytest>=6 10 | pytest-checkdocs>=2.4 11 | pytest-flake8 12 | pytest-cov 13 | pytest-enabler>=1.0.1 14 | pytest-virtualenv 15 | types-setuptools 16 | pytest-black>=0.3.7 17 | 
pytest-mypy>=0.9.1 18 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/top_level.txt: -------------------------------------------------------------------------------- 1 | ptr 2 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Jason R. Coombs 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/RECORD: -------------------------------------------------------------------------------- 1 | ptr/__init__.py,sha256=0UfzhCooVgCNTBwVEOPOVGEPck4pnl_6PTfsC-QzNGM,6730 2 | pytest_runner-6.0.1.dist-info/LICENSE,sha256=2z8CRrH5J48VhFuZ_sR4uLUG63ZIeZNyL4xuJUKF-vg,1050 3 | pytest_runner-6.0.1.dist-info/METADATA,sha256=Ho3FvAFjFHeY5OQ64WFzkLigFaIpuNr4G3uSmOk3nho,7319 4 | pytest_runner-6.0.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92 5 | pytest_runner-6.0.1.dist-info/entry_points.txt,sha256=BqezBqeO63XyzSYmHYE58gKEFIjJUd-XdsRQkXHy2ig,58 6 | pytest_runner-6.0.1.dist-info/top_level.txt,sha256=DPzHbWlKG8yq8EOD5UgEvVNDWeJRPyimrwfShwV6Iuw,4 7 | pytest_runner-6.0.1.dist-info/RECORD,, 8 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/WHEEL: -------------------------------------------------------------------------------- 1 | Wheel-Version: 1.0 2 | Generator: bdist_wheel (0.42.0) 3 | Root-Is-Purelib: true 4 | Tag: py3-none-any 5 | 6 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/entry_points.txt: -------------------------------------------------------------------------------- 1 | [distutils.commands] 2 | ptr = ptr:PyTest 3 | pytest = ptr:PyTest 4 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/requires.txt: -------------------------------------------------------------------------------- 1 | 2 | [docs] 3 | sphinx 4 | jaraco.packaging>=9 5 | rst.linker>=1.9 6 | jaraco.tidelift>=1.4 7 | 8 | [testing] 9 | pytest>=6 10 | pytest-checkdocs>=2.4 11 | pytest-flake8 12 | pytest-cov 13 | pytest-enabler>=1.0.1 14 | pytest-virtualenv 15 | types-setuptools 16 | pytest-black>=0.3.7 17 | 
pytest-mypy>=0.9.1 18 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/top_level.txt: -------------------------------------------------------------------------------- 1 | ptr 2 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Jason R. Coombs 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/RECORD: -------------------------------------------------------------------------------- 1 | ptr/__init__.py,sha256=0UfzhCooVgCNTBwVEOPOVGEPck4pnl_6PTfsC-QzNGM,6730 2 | pytest_runner-6.0.1.dist-info/LICENSE,sha256=2z8CRrH5J48VhFuZ_sR4uLUG63ZIeZNyL4xuJUKF-vg,1050 3 | pytest_runner-6.0.1.dist-info/METADATA,sha256=Ho3FvAFjFHeY5OQ64WFzkLigFaIpuNr4G3uSmOk3nho,7319 4 | pytest_runner-6.0.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92 5 | pytest_runner-6.0.1.dist-info/entry_points.txt,sha256=BqezBqeO63XyzSYmHYE58gKEFIjJUd-XdsRQkXHy2ig,58 6 | pytest_runner-6.0.1.dist-info/top_level.txt,sha256=DPzHbWlKG8yq8EOD5UgEvVNDWeJRPyimrwfShwV6Iuw,4 7 | pytest_runner-6.0.1.dist-info/RECORD,, 8 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/WHEEL: -------------------------------------------------------------------------------- 1 | Wheel-Version: 1.0 2 | Generator: bdist_wheel (0.42.0) 3 | Root-Is-Purelib: true 4 | Tag: py3-none-any 5 | 6 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/entry_points.txt: -------------------------------------------------------------------------------- 1 | [distutils.commands] 2 | ptr = ptr:PyTest 3 | pytest = ptr:PyTest 4 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/requires.txt: -------------------------------------------------------------------------------- 1 | 2 | [docs] 3 | jaraco.packaging>=9 4 | jaraco.tidelift>=1.4 5 | rst.linker>=1.9 6 | sphinx 7 | 8 | [testing] 9 | pytest-black>=0.3.7 10 | pytest-checkdocs>=2.4 11 | pytest-cov 12 | pytest-enabler>=1.0.1 13 | pytest-flake8 14 | pytest-mypy>=0.9.1 15 | pytest-virtualenv 16 | pytest>=6 17 | 
types-setuptools 18 | -------------------------------------------------------------------------------- /.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/top_level.txt: -------------------------------------------------------------------------------- 1 | ptr 2 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * SinaLab 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023, SinaLab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 12 | 13 | 14 | include setup.py 15 | include sinatools/VERSION 16 | include sinatools/utils/*.py 17 | include sinatools/ner/*.py 18 | include sinatools/arabert/*.py 19 | include sinatools/DataDownload/*.py 20 | include sinatools/morphology/*.py 21 | include sinatools/salma/*.py 22 | include sinatools/CLI/ner/*.py 23 | include sinatools/CLI/morphology/*.py 24 | include sinatools/CLI/salma/*.py 25 | include sinatools/CLI/utils/*.py 26 | include sinatools/CLI/DataDownload/*.py 27 | include tests/*.py 28 | global-exclude *~ -------------------------------------------------------------------------------- /PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: SinaTools 3 | Version: 0.1.1 4 | Summary: UNKNOWN 5 | Home-page: https://github.com/SinaLab/sinatools 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: MIT license 9 | Description: ======== 10 | sinatools 11 | ======== 12 | 13 | 14 | .. image:: https://img.shields.io/pypi/v/sinatools.svg 15 | :target: https://pypi.python.org/pypi/SinaTools 16 | 17 | .. image:: https://img.shields.io/travis/sina_institute/sinatools.svg 18 | :target: https://travis-ci.com/sina_institute/SinaTools 19 | 20 | .. 
image:: https://readthedocs.org/projects/sinatools/badge/?version=latest 21 | :target: https://SinaTools.readthedocs.io/en/latest/?version=latest 22 | :alt: Documentation Status 23 | 24 | 25 | 26 | 27 | Python Boilerplate contains all the boilerplate you need to create a Python package. 28 | 29 | 30 | * Free software: MIT license 31 | * Documentation: https://sina.birzeit.edu/sinatools/ 32 | 33 | 34 | Credits 35 | ------- 36 | 37 | This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template. 38 | 39 | .. _Cookiecutter: https://github.com/audreyr/cookiecutter 40 | .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage 41 | 42 | 43 | ======= 44 | History 45 | ======= 46 | 47 | 0.1.2 (2024-06-04) 48 | ------------------ 49 | 50 | 51 | Keywords: sinatools 52 | Platform: UNKNOWN 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SinaTools 2 | ====================== 3 | Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos. 
4 | 5 | See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarly, parser, tokenizers, corpora processing, transliteration, etc). 6 | 7 | See [Demo Pages](https://sina.birzeit.edu/sinatools/). 8 | 9 | See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits. 10 | 11 | Installation 12 | -------- 13 | To install SinaTools, ensure you are using Python version 3.11.11, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository. 14 | 15 | Alternatively, you can execute the following command: 16 | 17 | ```bash 18 | pip install sinatools 19 | ``` 20 | 21 | Installing Models and Data Files 22 | -------- 23 | Some modules in SinaTools require some data files and fine-tuned models to be downloaded. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html). 24 | 25 | Documentation 26 | -------- 27 | For information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online domuementation](https://sina.birzeit.edu/sinatools/documentation). 
28 | 29 | Citation 30 | ------- 31 | Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. ELSEVIER. 32 | 33 | License 34 | -------- 35 | SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information. 36 | 37 | Reporting Issues 38 | -------- 39 | To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues). 40 | 41 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | SinaTools 2 | ====================== 3 | Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos. 4 | 5 | See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarly, parser, tokenizers, corpora processing, transliteration, etc). 6 | 7 | See [Demo Pages](https://sina.birzeit.edu/sinatools/). 
8 | 9 | See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits. 10 | 11 | Installation 12 | -------- 13 | To install SinaTools, ensure you are using Python version 3.10.8, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository. 14 | 15 | Alternatively, you can execute the following command: 16 | 17 | ```bash 18 | pip install sinatools 19 | ``` 20 | 21 | Installing Models and Data Files 22 | -------- 23 | Some modules in SinaTools require some data files and fine-tuned models to be downloaded. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html). 24 | 25 | Documentation 26 | -------- 27 | For information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online domuementation](https://sina.birzeit.edu/sinatools/documentation). 28 | 29 | Citation 30 | ------- 31 | Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. ELSEVIER. 32 | 33 | License 34 | -------- 35 | SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information. 36 | 37 | Reporting Issues 38 | -------- 39 | To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues). 
40 | -------------------------------------------------------------------------------- /SinaTools.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /SinaTools.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | alma_multi_word = sinatools.CLI.morphology.ALMA_multi_word:main 3 | appdatadir = sinatools.CLI.DataDownload.get_appdatadir:main 4 | arStrip = sinatools.CLI.utils.arStrip:main 5 | corpus_entity_extractor = sinatools.CLI.ner.corpus_entity_extractor:main 6 | corpus_tokenizer = sinatools.CLI.utils.corpus_tokenizer:main 7 | download_files = sinatools.CLI.DataDownload.download_files:main 8 | entity_extractor = sinatools.CLI.ner.entity_extractor:main 9 | evaluate_synonyms = sinatools.CLI.synonyms.evaluate_synonyms:main 10 | extend_synonyms = sinatools.CLI.synonyms.extend_synonyms:main 11 | implication = sinatools.CLI.utils.implication:main 12 | install_env = sinatools.install_env:main 13 | jaccard_similarity = sinatools.CLI.utils.jaccard:main 14 | morphology_analyzer = sinatools.CLI.morphology.morph_analyzer:main 15 | relation_extractor = sinatools.CLI.relations.relation_extractor:main 16 | remove_latin = sinatools.CLI.utils.remove_latin:main 17 | remove_punctuation = sinatools.CLI.utils.remove_punctuation:main 18 | semantic_relatedness = sinatools.CLI.semantic_relatedness.compute_relatedness:main 19 | sentence_tokenizer = sinatools.CLI.utils.sentence_tokenizer:main 20 | text_dublication_detector = sinatools.CLI.utils.text_dublication_detector:main 21 | transliterate = sinatools.CLI.utils.text_transliteration:main 22 | wsd = sinatools.CLI.wsd.disambiguator:main 23 | -------------------------------------------------------------------------------- /SinaTools.egg-info/not-zip-safe: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /SinaTools.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | six 2 | farasapy 3 | tqdm 4 | requests 5 | pathlib 6 | transformers==4.47.1 7 | torchvision==0.20.1 8 | seqeval==1.2.2 9 | natsort==7.1.1 10 | -------------------------------------------------------------------------------- /SinaTools.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | sinatools 2 | -------------------------------------------------------------------------------- /build/lib/sinatools/CLI/DataDownload/download_files.py: -------------------------------------------------------------------------------- 1 | """ 2 | About: 3 | ------ 4 | 5 | The download_files command, allows users to select specific files and models to download and use it within SinaTools modules. Additionally, it automatically manages the extraction of compressed files, including zip and tar.gz formats. 6 | 7 | Usage: 8 | ------ 9 | 10 | Below is the usage information that can be generated by running download_files --help. 11 | 12 | .. code-block:: none 13 | 14 | Usage: 15 | download_files [OPTIONS] 16 | 17 | .. code-block:: none 18 | 19 | Options: 20 | -f, --files FILES 21 | Names of the files to download. Available files are: ner, morph, wsd, synonyms. 22 | If no file is specified, all files will be downloaded. 23 | 24 | Examples: 25 | --------- 26 | 27 | .. code-block:: none 28 | 29 | download_files -f morph ner 30 | This command will download only the `morph` and `ner` files to the default directory. 
31 | """ 32 | 33 | import argparse 34 | from sinatools.DataDownload.downloader import download_file 35 | from sinatools.DataDownload.downloader import download_files 36 | from sinatools.DataDownload.downloader import get_appdatadir 37 | from sinatools.DataDownload.downloader import download_folder_from_hf 38 | from sinatools.DataDownload.downloader import urls 39 | 40 | 41 | def main(): 42 | parser = argparse.ArgumentParser(description="Download files from specified URLs.") 43 | parser.add_argument('-f', '--files', nargs="*", 44 | help="Names of the files to download. Available files are: " 45 | f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.") 46 | 47 | get_appdatadir() 48 | 49 | args = parser.parse_args() 50 | 51 | if args.files: 52 | for file in args.files: 53 | print("file: ", file) 54 | if file == "wsd": 55 | download_file(urls["morph"]) 56 | download_file(urls["ner"]) 57 | #download_file(urls["wsd_model"]) 58 | #download_file(urls["wsd_tokenizer"]) 59 | download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01") 60 | download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02") 61 | download_file(urls["one_gram"]) 62 | download_file(urls["five_grams"]) 63 | download_file(urls["four_grams"]) 64 | download_file(urls["three_grams"]) 65 | download_file(urls["two_grams"]) 66 | elif file == "synonyms": 67 | download_file(urls["graph_l2"]) 68 | download_file(urls["graph_l3"]) 69 | else: 70 | url = urls[file] 71 | download_file(url) 72 | else: 73 | download_files() 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /build/lib/sinatools/CLI/morphology/ALMA_multi_word.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sinatools.morphology.ALMA_multi_word import ALMA_multi_word 3 | import json 4 | from sinatools.utils.readfile 
import argparse
import json

from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
from sinatools.utils.readfile import read_file


def main():
    """Run multi-word morphological analysis on text given via the CLI."""
    cli = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools')

    # The text to analyze comes either directly or from a file.
    cli.add_argument('--multi_word', type=str, help='Multi-word text to be analyzed')
    cli.add_argument('--file', type=str, help='File containing the multi-word text to be analyzed')

    args = cli.parse_args()

    # At least one input source is required.
    if args.multi_word is None and args.file is None:
        print("Error: Either --multi_word or --file argument must be provided.")
        return

    if args.multi_word:
        text = args.multi_word
    else:
        text = " ".join(read_file(args.file))

    analysis = ALMA_multi_word(text)

    # Emit the analysis as pretty-printed JSON, keeping Arabic readable.
    print(json.dumps(analysis, ensure_ascii=False, indent=4))


if __name__ == '__main__':
    main()

# alma_multi_word --multi_word "Your multi-word text here"
# alma_multi_word --file "path/to/your/file.txt"
"""
About:
------
The corpus_tokenizer command offers functionality to tokenize a corpus and
write the results to a CSV file. It recursively searches through a specified
directory for text files, tokenizes the content, and outputs the results,
including various metadata, to a specified CSV file.

Usage:
-------
Below is the usage information that can be generated by running
corpus_tokenizer --help.

.. code-block:: none

    Usage:
        corpus_tokenizer dir_path output_csv

.. code-block:: none

    dir_path
        The path to the directory containing the text files.

    output_csv
        The path to the output CSV file.

Examples:
---------
.. code-block:: none

    corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv "outputFile.csv"
"""

import argparse

from sinatools.utils.tokenizer import corpus_tokenizer


def _build_parser():
    """Create the argument parser for the corpus_tokenizer command."""
    parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
    parser.add_argument('--dir_path', type=str, help='The path to the directory containing the text files.')
    parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.')
    return parser


def main():
    """Tokenize every text file under --dir_path and write rows to --output_csv."""
    args = _build_parser().parse_args()
    corpus_tokenizer(args.dir_path, args.output_csv)


if __name__ == '__main__':
    main()
"""
About:
------
The implication tool evaluates the implication between two words using the
functionalities provided by the `Implication` class of SinaTools. This tool
can be utilized to determine the relationship between two words and
understand if one implies the other.

Usage:
------
Below is the usage information that can be generated by running
implication --help.

.. code-block:: none

    Usage:
        implication --inputWord1=WORD1 --inputWord2=WORD2

        implication --file1=FILE1 --file2=FILE2

.. code-block:: none

    Options:
      --inputWord1 WORD1
            First input word.

      --inputWord2 WORD2
            Second input word.

      --file1 FILE1
            File containing the first word to evaluate the implication.

      --file2 FILE2
            File containing the second word to evaluate the implication.

Examples:
---------

.. code-block:: none

    implication --inputWord1 "word1" --inputWord2 "word2"

    implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"

"""
import argparse


def read_file(file_path):
    """Return the stripped first line of *file_path*.

    Raises:
        ValueError: if the first line is empty or missing.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        word = file.readline().strip()
    if word:
        return word
    raise ValueError(f"File {file_path} must contain at least one word.")


def main():
    """Evaluate the implication relation between two words and print the verdict."""
    # Imported lazily so this module (and its read_file helper) can be
    # imported without the heavy SinaTools dependencies being installed.
    from sinatools.utils.word_compare import Implication

    parser = argparse.ArgumentParser(description='Evaluate Implication between two words using SinaTools')

    # Words can be given directly or each read from the first line of a file.
    parser.add_argument('--inputWord1', type=str, help='First input word')
    parser.add_argument('--inputWord2', type=str, help='Second input word')
    parser.add_argument('--file1', type=str, help='File containing the first word to evaluate implication')
    parser.add_argument('--file2', type=str, help='File containing the second word to evaluate implication')

    args = parser.parse_args()

    if args.file1 and args.file2:
        word1 = read_file(args.file1)
        word2 = read_file(args.file2)
    elif args.inputWord1 and args.inputWord2:
        word1, word2 = args.inputWord1, args.inputWord2
    else:
        print("Either --file1 and --file2 arguments or both --inputWord1 and --inputWord2 arguments must be provided.")
        return

    implication_obj = Implication(word1, word2)
    result = implication_obj.get_verdict()
    print(result)


if __name__ == '__main__':
    main()
# implication --inputWord1 "word1" --inputWord2 "word2"
# implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt"
"""
About:
------
The remove_latin command deletes Latin characters from the input text.

Usage:
------
Below is the usage information that can be generated by running remove_latin --help.

.. code-block:: none

    remove_latin --text=TEXT
    remove_latin --file "path/to/your/file.txt"

Examples:
---------
.. code-block:: none

    remove_latin --text "123test"
    remove_latin --file "path/to/your/file.txt"
"""

import argparse

from sinatools.utils.parser import remove_latin


def main():
    """Read text from --text or --file and print it with Latin characters removed."""
    parser = argparse.ArgumentParser(description='remove latin characters from the text')

    # FIX: --file was documented in the module docstring but never implemented
    # (and --text was required=True, contradicting the docs). Accept either
    # source of input, exactly as the sibling CLI commands do.
    parser.add_argument('--text', type=str, help='The input text')
    parser.add_argument('--file', type=str, help='File containing the input text')
    args = parser.parse_args()

    if args.text is None and args.file is None:
        print("Either --text or --file argument must be provided.")
        return

    if args.text is not None:
        text = args.text
    else:
        with open(args.file, 'r', encoding='utf-8') as fh:
            text = fh.read()

    result = remove_latin(text)
    print(result)


if __name__ == '__main__':
    main()
"""
About:
------
The remove_punctuation command deletes punctuation marks from the input text.

Usage:
------
Below is the usage information that can be generated by running
remove_punctuation --help.

.. code-block:: none

    Usage:
        remove_punctuation --text=TEXT

Examples:
---------
.. code-block:: none

    remove_punctuation --text "te%s@t...!!?"
"""

import argparse

from sinatools.utils.parser import remove_punctuation


def main():
    """Parse the --text argument and print it with punctuation removed."""
    arg_parser = argparse.ArgumentParser(description='remove punctuation marks from the text')

    arg_parser.add_argument('--text', required=True, help="input text")
    parsed = arg_parser.parse_args()

    result = remove_punctuation(parsed.text)
    print(result)


if __name__ == '__main__':
    main()
"""
About:
------

The sentence_tokenizer command allows you to tokenize text into sentences
using the SinaTools utility. It provides flexibility in tokenizing at
different punctuation marks, including dots, question marks, and exclamation
marks. It also allows tokenization at new lines.

Usage:
------
Below is the usage information that can be generated by running
sentence_tokenizer --help.

.. code-block:: none

    Usage:
        sentence_tokenizer --text=TEXT [options]
        sentence_tokenizer --file=FILE [options]

.. code-block:: none

    Options:
      --text TEXT
            Text to be tokenized into sentences.
      --file FILE
            File containing the text to be tokenized into sentences.
      --dot
            Tokenize at dots.
      --new_line
            Tokenize at new lines.
      --question_mark
            Tokenize at question marks.
      --exclamation_mark
            Tokenize at exclamation marks.

Examples:
---------

.. code-block:: none

    sentence_tokenizer --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
    sentence_tokenizer --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark

"""
import argparse

from sinatools.utils.tokenizer import sentence_tokenizer
from sinatools.utils.readfile import read_file


def main():
    """Tokenize text from --text or --file into sentences, one per output line."""
    parser = argparse.ArgumentParser(description='Sentence Tokenization using SinaTools')

    parser.add_argument('--text', type=str, help='Text to be tokenized into sentences')
    parser.add_argument('--file', type=str, help='File containing the text to be tokenized into sentences')
    parser.add_argument('--dot', action='store_true', help='Tokenize at dots')
    parser.add_argument('--new_line', action='store_true', help='Tokenize at new lines')
    parser.add_argument('--question_mark', action='store_true', help='Tokenize at question marks')
    parser.add_argument('--exclamation_mark', action='store_true', help='Tokenize at exclamation marks')

    args = parser.parse_args()

    # Check if either text or file is provided
    if args.text is None and args.file is None:
        print("Either --text or --file argument must be provided.")
        return

    # BUG FIX: the old code applied " ".join() to the final string, so a
    # --text argument was exploded into space-separated characters
    # (" ".join("abc") == "a b c"). Join only the list of lines returned by
    # read_file(), exactly like the sibling CLI commands do.
    text_content = args.text if args.text else " ".join(read_file(args.file))

    sentences = sentence_tokenizer(text_content, dot=args.dot, new_line=args.new_line,
                                   question_mark=args.question_mark,
                                   exclamation_mark=args.exclamation_mark)

    # Print each sentence on its own line.
    for sentence in sentences:
        print(sentence)


if __name__ == '__main__':
    main()
import argparse

from sinatools.utils.text_dublication_detector import removal


def main():
    """CLI entry point: remove near-duplicate sentences from a CSV file."""
    parser = argparse.ArgumentParser(description='Processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold and cosine similarity. It saves the filtered results and the identified duplicates to separate files.')

    parser.add_argument('--csv_file', type=str, help='The path to the input CSV file that will be processed.')
    parser.add_argument('--column_name', type=str, help='The name of the column from which duplicates will be removed.')
    parser.add_argument('--final_file_name', type=str, help='The name of the output file that will contain the deduplicated results.')
    parser.add_argument('--deleted_file_name', type=str, help='The name of the output file that will contain the records that were identified as duplicates and removed.')
    parser.add_argument('--similarity_threshold', type=float, default=0.8, help='The similarity threshold for determining duplicates. Records with a similarity score above this value will be considered duplicates (default is 0.8).')

    args = parser.parse_args()

    # BUG FIX: the original check used `and`, so supplying only one of the two
    # required values slipped past validation and crashed inside removal().
    # Both the input file and the column name are needed.
    if args.csv_file is None or args.column_name is None:
        print("Both --csv_file and --column_name arguments must be provided.")
        return

    removal(args.csv_file, args.column_name, args.final_file_name, args.deleted_file_name, args.similarity_threshold)


if __name__ == '__main__':
    main()

# text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8
"""
About:
------

The transliterate tool allows you to transliterate text using the SinaTools
utility. This command-line utility takes in a text and a desired schema, and
outputs the transliterated text.

Usage:
------

.. code-block:: none

    transliterate --text=TEXT --schema=SCHEMA

    transliterate --file=FILE --schema=SCHEMA

Options:
--------

.. code-block:: none

    --text TEXT
        Text to be transliterated.
    --file FILE
        File containing the text to be transliterated.
    --schema SCHEMA
        Transliteration schema to be used, which is bw2ar or ar2bw.

Examples:
---------

.. code-block:: none

    transliterate --text "klmp" --schema "bw2ar"
    transliterate --file "path/to/your/file.txt" --schema "ar2bw"

"""
import argparse

from sinatools.utils.text_transliteration import perform_transliteration
from sinatools.utils.readfile import read_file


def main():
    """Transliterate text given on the command line or read from a file."""
    cli = argparse.ArgumentParser(description='Perform text transliteration using SinaTools')

    cli.add_argument('--text', type=str, help='Text to be transliterated')
    cli.add_argument('--file', type=str, help='File containing the text to be transliterated')
    cli.add_argument('--schema', type=str, required=True, help='Transliteration schema to be used')

    args = cli.parse_args()

    # Exactly one source of input text must be supplied.
    if args.text is None and args.file is None:
        print("Either --text or --file argument must be provided.")
        return

    if args.text is None:
        source_text = " ".join(read_file(args.file))
    else:
        source_text = args.text

    result = perform_transliteration(source_text, args.schema)
    print(result)


if __name__ == '__main__':
    main()

# transliterate --text "example text" --schema "bw2ar"
# transliterate --file "path/to/your/file.txt" --schema "bw2ar"
--schema "bw2ar" 71 | -------------------------------------------------------------------------------- /build/lib/sinatools/DataDownload/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/build/lib/sinatools/DataDownload/__init__.py -------------------------------------------------------------------------------- /build/lib/sinatools/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.41 -------------------------------------------------------------------------------- /build/lib/sinatools/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for sinatools.""" 2 | 3 | __author__ = """SinaLab""" 4 | __email__ = 'sina.institute.bzu@gmail.com' 5 | __version__ = '0.8.5' -------------------------------------------------------------------------------- /build/lib/sinatools/arabert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /build/lib/sinatools/arabert/arabert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. -------------------------------------------------------------------------------- /build/lib/sinatools/arabert/araelectra/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /build/lib/sinatools/arabert/araelectra/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. -------------------------------------------------------------------------------- /build/lib/sinatools/arabert/araelectra/finetune/feature_spec.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Defines the inputs used when fine-tuning a model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

import configure_finetuning


def get_shared_feature_specs(config: configure_finetuning.FinetuningConfig):
  """Model inputs shared by every fine-tuning task."""
  max_len = config.max_seq_length
  names_and_shapes = [
      ("input_ids", [max_len]),
      ("input_mask", [max_len]),
      ("segment_ids", [max_len]),
      ("task_id", []),
  ]
  return [FeatureSpec(name, shape) for name, shape in names_and_shapes]


class FeatureSpec(object):
  """Defines a feature passed as input to the model."""

  def __init__(self, name, shape, default_value_fn=None, is_int_feature=True):
    self.name = name
    self.shape = shape
    # Optional callable mapping shape -> default values; zeros when absent.
    self.default_value_fn = default_value_fn
    self.is_int_feature = is_int_feature

  def get_parsing_spec(self):
    """Fixed-length tf.Example parsing spec for this feature."""
    tf_dtype = tf.int64 if self.is_int_feature else tf.float32
    return tf.io.FixedLenFeature(self.shape, tf_dtype)

  def get_default_values(self):
    """Default values used when the feature is absent from an example."""
    if self.default_value_fn:
      return self.default_value_fn(self.shape)
    np_dtype = np.int64 if self.is_int_feature else np.float32
    return np.zeros(self.shape, np_dtype)
class Scorer(metaclass=abc.ABCMeta):
  """Abstract base class for computing evaluation metrics.

  Subclasses feed model outputs in through update() and report metrics via
  get_results()/results_str(). Computed results are cached so repeated reads
  after a single update() do not recompute them.

  Fix: the original declared ``__metaclass__ = abc.ABCMeta`` (Python-2
  syntax), which Python 3 treats as an ordinary class attribute — abstract
  enforcement was silently disabled. Using ``metaclass=`` restores it.
  """

  def __init__(self):
    self._updated = False  # True iff update() ran since the last _get_results()
    self._cached_results = {}

  @abc.abstractmethod
  def update(self, results):
    """Accumulate a batch of results; overrides must call super().update()."""
    self._updated = True

  @abc.abstractmethod
  def get_loss(self):
    """Return the evaluation loss."""
    pass

  @abc.abstractmethod
  def _get_results(self):
    """Compute and return metrics as (name, value) pairs."""
    return []

  def get_results(self, prefix=""):
    """Return (possibly cached) metrics with each name prefixed by `prefix`."""
    results = self._get_results() if self._updated else self._cached_results
    self._cached_results = results
    self._updated = False
    return [(prefix + k, v) for k, v in results]

  def results_str(self):
    """Human-readable "name: value" summary of all metrics."""
    return " - ".join(["{:}: {:.2f}".format(k, v)
                       for k, v in self.get_results()])
class Example(metaclass=abc.ABCMeta):
  """Base class for a single example of a fine-tuning task.

  Fix: the original declared ``__metaclass__ = abc.ABCMeta`` (Python-2
  syntax), which is inert in Python 3; ``metaclass=`` restores the intent.
  """

  def __init__(self, task_name):
    self.task_name = task_name


class Task(metaclass=abc.ABCMeta):
  """Override this class to add a new fine-tuning task.

  Fix: same Python-2 ``__metaclass__`` issue as Example — with ``metaclass=``
  the @abc.abstractmethod markers below are actually enforced on subclasses.
  """

  def __init__(self, config: configure_finetuning.FinetuningConfig, name):
    self.config = config
    self.name = name

  def get_test_splits(self):
    """Dataset splits to predict on; override for tasks with several."""
    return ["test"]

  @abc.abstractmethod
  def get_examples(self, split):
    """Load the examples of the given dataset split."""
    pass

  @abc.abstractmethod
  def get_scorer(self) -> scorer.Scorer:
    """Return the Scorer used to evaluate this task."""
    pass

  @abc.abstractmethod
  def get_feature_specs(self) -> List[feature_spec.FeatureSpec]:
    """Return the task-specific model input features."""
    pass

  @abc.abstractmethod
  def featurize(self, example: Example, is_training: bool,
                log: bool=False):
    """Convert an Example into model input features."""
    pass

  @abc.abstractmethod
  def get_prediction_module(
      self, bert_model: modeling.BertModel, features: dict, is_training: bool,
      percent_done: float) -> Tuple:
    """Build the task-specific prediction head on top of the BERT encoder."""
    pass

  def __repr__(self):
    return "Task(" + self.name + ")"
def get_tasks(config: configure_finetuning.FinetuningConfig):
  """Build one task instance for every name in config.task_names."""
  tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
                                         do_lower_case=config.do_lower_case)
  return [get_task(config, name, tokenizer) for name in config.task_names]


def get_task(config: configure_finetuning.FinetuningConfig, task_name,
             tokenizer):
  """Get an instance of a task based on its name."""
  # Dispatch table replaces the long if/elif chain; every entry is
  # constructed as TaskClass(config, tokenizer).
  task_classes = {
      "cola": classification_tasks.CoLA,
      "mrpc": classification_tasks.MRPC,
      "mnli": classification_tasks.MNLI,
      "sst": classification_tasks.SST,
      "rte": classification_tasks.RTE,
      "qnli": classification_tasks.QNLI,
      "qqp": classification_tasks.QQP,
      "sts": classification_tasks.STS,
      "squad": qa_tasks.SQuAD,
      "squadv1": qa_tasks.SQuADv1,
      "newsqa": qa_tasks.NewsQA,
      "naturalqs": qa_tasks.NaturalQuestions,
      "triviaqa": qa_tasks.TriviaQA,
      "searchqa": qa_tasks.SearchQA,
      "chunk": tagging_tasks.Chunking,
  }
  if task_name not in task_classes:
    raise ValueError("Unknown task " + task_name)
  return task_classes[task_name](config, tokenizer)
-------------------------------------------------------------------------------- /build/lib/sinatools/arabert/araelectra/util/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. -------------------------------------------------------------------------------- /build/lib/sinatools/arabert/araelectra/util/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def load_json(path):
  """Read and parse the JSON file at `path` (GCS paths work via tf.gfile)."""
  with tf.io.gfile.GFile(path, "r") as f:
    return json.loads(f.read())


def _ensure_parent_dir(path):
  """Create the parent directory of `path` when the path contains one."""
  if "/" in path:
    tf.io.gfile.makedirs(path.rsplit("/", 1)[0])


def write_json(o, path):
  """Serialize `o` as JSON to `path`, creating parent directories first."""
  _ensure_parent_dir(path)
  with tf.io.gfile.GFile(path, "w") as f:
    f.write(json.dumps(o))


def load_pickle(path):
  """Unpickle and return the object stored at `path`."""
  with tf.io.gfile.GFile(path, "rb") as f:
    return pickle.load(f)


def write_pickle(o, path):
  """Pickle `o` to `path` (highest protocol), creating parent dirs first."""
  _ensure_parent_dir(path)
  with tf.io.gfile.GFile(path, "wb") as f:
    pickle.dump(o, f, pickle.HIGHEST_PROTOCOL)


def mkdir(path):
  """Create `path` (and any missing parents) if it does not exist."""
  if not tf.io.gfile.exists(path):
    tf.io.gfile.makedirs(path)


def rmrf(path):
  """Recursively delete `path` if it exists."""
  if tf.io.gfile.exists(path):
    tf.io.gfile.rmtree(path)


def rmkdir(path):
  """Delete `path` if present, then recreate it empty."""
  rmrf(path)
  mkdir(path)


def log(*args):
  """Write the space-joined arguments to stdout and flush immediately."""
  print(" ".join(str(a) for a in args), flush=True)


def log_config(config):
  """Log every attribute of `config`, sorted by name, then a blank line."""
  for key, value in sorted(config.__dict__.items()):
    log(key, value)
  log()


def heading(*args):
  """Log the arguments framed above and below by lines of '=' characters."""
  bar = "=" * 80
  log(bar)
  log(*args)
  log(bar)


def nest_dict(d, prefixes, delim="_"):
  """Go from {prefix_key: value} to {prefix: {key: value}}.

  NOTE(review): with several prefixes, a key matching one prefix is ALSO
  copied flat into the result while the other prefixes are checked —
  presumably unintended, but kept for compatibility.
  """
  nested = {}
  for key, value in d.items():
    for prefix in prefixes:
      if key.startswith(prefix + delim):
        nested.setdefault(prefix, {})[key.split(delim, 1)[1]] = value
      else:
        nested[key] = value
  return nested


def flatten_dict(d, delim="_"):
  """Go from {prefix: {key: value}} to {prefix_key: value}."""
  flattened = {}
  for key, value in d.items():
    if isinstance(value, dict):
      flattened.update({key + delim + sub: v for sub, v in value.items()})
    else:
      flattened[key] = value
  return flattened
def main():
    """Create the package's conda environment from its bundled environment.yml."""
    # environment.yml is shipped next to this module inside the package.
    env_file = os.path.join(os.path.dirname(__file__), 'environment.yml')
    subprocess.call(["conda", "env", "create", "-f", env_file])

if __name__ == "__main__":
    main()
def ALMA_multi_word(multi_word, n):
    """Look up an n-word expression in the n-gram lemma dictionaries.

    Returns a one-element list holding a dict with the input expression,
    and — when the stripped form is found — its POS tag and lemma ids.
    """
    # Strip diacritics, small diacritics, shaddah, alif variants and special
    # characters before the dictionary lookup (digits are kept).
    undiac = arStrip(multi_word, True, True, True, False, True, False)

    # n == 2/3/4 select their dictionary; anything else falls back to 5-grams,
    # matching the original if/elif/else chain.
    grams_by_n = {2: two_grams_dict, 3: three_grams_dict, 4: four_grams_dict}
    entries = grams_by_n.get(n, five_grams_dict).get(undiac, [])

    item = {}
    item['multi_word_lemma'] = multi_word
    # NOTE(review): mirrors the raw input rather than the stripped form —
    # presumably this was meant to be `undiac`; verify before changing.
    item['undiac_multi_word_lemma'] = multi_word
    if entries:
        item['POS'] = entries[0][1]
        item['ids'] = [entry[3] for entry in entries]
    return [item]
filename_four = 'four_grams.pickle' 25 | path =downloader.get_appdatadir() 26 | file_path = os.path.join(path, filename_four) 27 | with open(file_path, 'rb') as f: 28 | four_grams_dict = pickle.load(f, encoding='utf-8') 29 | 30 | 31 | filename_three = 'three_grams.pickle' 32 | path =downloader.get_appdatadir() 33 | file_path = os.path.join(path, filename_three) 34 | with open(file_path, 'rb') as f: 35 | three_grams_dict = pickle.load(f, encoding='utf-8') 36 | 37 | 38 | filename_two = 'two_grams.pickle' 39 | path =downloader.get_appdatadir() 40 | file_path = os.path.join(path, filename_two) 41 | with open(file_path, 'rb') as f: 42 | two_grams_dict = pickle.load(f, encoding='utf-8') 43 | -------------------------------------------------------------------------------- /build/lib/sinatools/ner/__init__.py: -------------------------------------------------------------------------------- 1 | from sinatools.DataDownload import downloader 2 | import os 3 | from sinatools.ner.helpers import load_object 4 | import pickle 5 | import os 6 | import torch 7 | import pickle 8 | import json 9 | from argparse import Namespace 10 | 11 | tagger = None 12 | tag_vocab = None 13 | train_config = None 14 | 15 | filename = 'Wj27012000.tar' 16 | path =downloader.get_appdatadir() 17 | model_path = os.path.join(path, filename) 18 | 19 | _path = os.path.join(model_path, "tag_vocab.pkl") 20 | 21 | with open(_path, "rb") as fh: 22 | tag_vocab = pickle.load(fh) 23 | 24 | train_config = Namespace() 25 | args_path = os.path.join(model_path, "args.json") 26 | 27 | with open(args_path, "r") as fh: 28 | train_config.__dict__ = json.load(fh) 29 | 30 | model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"]) 31 | model = torch.nn.DataParallel(model) 32 | 33 | if torch.cuda.is_available(): 34 | model = model.cuda() 35 | 36 | train_config.trainer_config["kwargs"]["model"] = model 37 | tagger = load_object(train_config.trainer_config["fn"], 
def _scores(y, y_hat):
    # Shared metric computation for both flat and nested evaluation.
    return SimpleNamespace(
        micro_f1=f1_score(y, y_hat, average="micro", scheme=IOB2),
        macro_f1=f1_score(y, y_hat, average="macro", scheme=IOB2),
        weights_f1=f1_score(y, y_hat, average="weighted", scheme=IOB2),
        precision=precision_score(y, y_hat, scheme=IOB2),
        recall=recall_score(y, y_hat, scheme=IOB2),
        accuracy=accuracy_score(y, y_hat),
    )


def compute_nested_metrics(segments, vocabs):
    """
    Compute metrics for nested NER
    :param segments: List[List[arabiner.data.dataset.Token]] - list of segments
    :param vocabs: label vocabularies, one per entity type
    :return: metrics - SimpleNamespace - F1/micro/macro/weights, recall, precision, accuracy
    """
    y, y_hat = [], []

    # The dataset is conceptually duplicated once per entity type: pass i
    # pairs the gold tags matching vocabulary i (or "O") with the i-th
    # predicted tag of every token.
    for i, vocab in enumerate(vocabs):
        tag_pattern = re.compile("|".join(t for t in vocab.get_itos() if "-" in t))
        for segment in segments:
            y.append([(list(filter(tag_pattern.match, token.gold_tag)) or ["O"])[0]
                      for token in segment])
            y_hat.append([token.pred_tag[i]["tag"] for token in segment])

    logging.info("\n" + classification_report(y, y_hat, scheme=IOB2, digits=4))
    return _scores(y, y_hat)


def compute_single_label_metrics(segments):
    """
    Compute metrics for flat NER
    :param segments: List[List[arabiner.data.dataset.Token]] - list of segments
    :return: metrics - SimpleNamespace - F1/micro/macro/weights, recall, precision, accuracy
    """
    y = [[token.gold_tag[0] for token in segment] for segment in segments]
    y_hat = [[token.pred_tag[0]["tag"] for token in segment] for segment in segments]

    logging.info("\n" + classification_report(y, y_hat, scheme=IOB2))
    return _scores(y, y_hat)
class BaseModel(nn.Module):
    """Common base for the NER taggers.

    Stores the tagger configuration and instantiates the shared BERT encoder
    plus a dropout layer for subclasses to apply to the encoder output.
    """

    def __init__(self,
                 bert_model="aubmindlab/bert-base-arabertv2",
                 num_labels=2,
                 dropout=0.1,
                 num_types=0):
        super().__init__()

        self.bert_model = bert_model
        self.num_labels = num_labels
        self.num_types = num_types

        self.bert = BertModel.from_pretrained(bert_model)
        # Fix: the original assigned the raw dropout rate to self.dropout and
        # then immediately overwrote it with the nn.Dropout module — a dead
        # store. Only the module is kept; the rate lives inside it.
        self.dropout = nn.Dropout(dropout)
class BertSeqTagger(nn.Module):
    """Flat sequence tagger: BERT encoder followed by one linear label head."""

    def __init__(self, bert_model, num_labels=2, dropout=0.1):
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(dropout)
        # 768 matches the hidden size of the base BERT encoder used here.
        self.linear = nn.Linear(768, num_labels)

    def forward(self, x):
        """Return per-token logits over the label set."""
        encoded = self.bert(x)["last_hidden_state"]
        return self.linear(self.dropout(encoded))
def get_similarity_score(sentence1, sentence2):
    """
    Computes the degree of association between two sentences across various dimensions: meaning, underlying concepts, domain-specificity, topic overlap, and viewpoint alignment.

    Args:
        sentence1 (:obj:`str`) – The Arabic sentence to find the semantic relatedness between it and the second sentence.
        sentence2 (:obj:`str`) – The Arabic sentence to find the semantic relatedness between it and the first sentence.

    Returns:
        :obj:`float`: A float number that represents the degree of relatedness between the two provided sentences.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score

        sentence1 = "تبلغ سرعة دوران الأرض حول الشمس حوالي 110 كيلومتر في الساعة."
        sentence2 = "تدور الأرض حول محورها بسرعة تصل تقريبا 1670 كيلومتر في الساعة."
        get_similarity_score(sentence1, sentence2)
        Score = 0.90
    """

    def _mean_pooled_embedding(sentence):
        # Encode, run BERT without gradient tracking, then average the token
        # embeddings while masking out padding positions.
        inputs = tokenizer(sentence, return_tensors="pt")
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
        mask = inputs["attention_mask"]
        return torch.sum(hidden * mask.unsqueeze(-1), dim=1) / torch.sum(
            mask, dim=1, keepdim=True)

    emb1 = _mean_pooled_embedding(sentence1)
    emb2 = _mean_pooled_embedding(sentence2)

    # Cosine similarity of the two mean-pooled sentence embeddings.
    return torch.nn.functional.cosine_similarity(emb1, emb2).item()
8 | file_path = os.path.join(path, level2_dict) 9 | with open(file_path, 'rb') as f: 10 | synonyms_level2_dict = pickle.load(f, encoding='utf-8') 11 | 12 | 13 | synonyms_level3_dict = {} 14 | level3_dict = 'graph_l3.pkl' 15 | path = downloader.get_appdatadir() 16 | file_path = os.path.join(path, level3_dict) 17 | with open(file_path, 'rb') as f: 18 | synonyms_level3_dict = pickle.load(f, encoding='utf-8') -------------------------------------------------------------------------------- /build/lib/sinatools/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/build/lib/sinatools/utils/__init__.py -------------------------------------------------------------------------------- /build/lib/sinatools/utils/charsets.py: -------------------------------------------------------------------------------- 1 | # We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html]. 
# We acknowledge that this file, charsets.py, is imported from Camel Tools.
# [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html]
"""Frozen character sets: Unicode general categories (BMP only) and the
Arabic script plus its Buckwalter / Safe Buckwalter / XML Buckwalter / HSB
transliteration alphabets."""

import unicodedata

# NOTE: the original imported ``unichr`` from ``six``; on Python 3 the
# built-in ``chr`` is identical, so the third-party dependency is dropped.


def _category_charset(prefix):
    """Return a frozenset of every Basic Multilingual Plane character whose
    Unicode general category starts with *prefix* (e.g. 'P' for punctuation).
    """
    return frozenset(chr(x) for x in range(65536)
                     if unicodedata.category(chr(x))[0] == prefix)


UNICODE_PUNCT_CHARSET = _category_charset('P')
UNICODE_SYMBOL_CHARSET = _category_charset('S')
UNICODE_PUNCT_SYMBOL_CHARSET = UNICODE_PUNCT_CHARSET | UNICODE_SYMBOL_CHARSET

UNICODE_LETTER_CHARSET = _category_charset('L')
UNICODE_MARK_CHARSET = _category_charset('M')
UNICODE_NUMBER_CHARSET = _category_charset('N')
UNICODE_LETTER_MARK_NUMBER_CHARSET = (UNICODE_LETTER_CHARSET |
                                      UNICODE_MARK_CHARSET |
                                      UNICODE_NUMBER_CHARSET)

# Arabic letters, including tatweel (U+0640) and a few extended letters.
AR_LETTERS_CHARSET = frozenset(u'\u0621\u0622\u0623\u0624\u0625\u0626\u0627'
                               u'\u0628\u0629\u062a\u062b\u062c\u062d\u062e'
                               u'\u062f\u0630\u0631\u0632\u0633\u0634\u0635'
                               u'\u0636\u0637\u0638\u0639\u063a\u0640\u0641'
                               u'\u0642\u0643\u0644\u0645\u0646\u0647\u0648'
                               u'\u0649\u064a\u0671\u067e\u0686\u06a4\u06af')
# Arabic diacritics: tanween/harakat (U+064B-U+0650), shadda, sukun,
# dagger alif (U+0670) and tatweel (U+0640).
AR_DIAC_CHARSET = frozenset(u'\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'
                            u'\u0670\u0640')
AR_CHARSET = AR_LETTERS_CHARSET | AR_DIAC_CHARSET

# Buckwalter transliteration alphabet.
BW_LETTERS_CHARSET = frozenset(u'$&\'*<>ADEGHJPSTVYZ_bdfghjklmnpqrstvwxyz{|}')
BW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
BW_CHARSET = BW_LETTERS_CHARSET | BW_DIAC_CHARSET

# Safe Buckwalter transliteration alphabet.
SAFEBW_LETTERS_CHARSET = frozenset(u'ABCDEGHIJLMOPQSTVWYZ_bcdefghjklmnpqrstvwx'
                                   u'yz')
SAFEBW_DIAC_CHARSET = frozenset(u'FKNaeiou~_')
SAFEBW_CHARSET = SAFEBW_LETTERS_CHARSET | SAFEBW_DIAC_CHARSET

# XML Buckwalter transliteration alphabet.
XMLBW_LETTERS_CHARSET = frozenset(u'$\'*ABDEGHIJOPSTWYZ_bdfghjklmnpqrstvwxyz{|'
                                  u'}')
XMLBW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
XMLBW_CHARSET = XMLBW_LETTERS_CHARSET | XMLBW_DIAC_CHARSET

# HSB transliteration alphabet (presumably Habash-Soudi-Buckwalter, per the
# Camel Tools source this file is taken from).
HSB_LETTERS_CHARSET = frozenset(u'\'ADHST_bcdfghjklmnpqrstvwxyz'
                                u'\u00c2\u00c4\u00e1\u00f0\u00fd\u0100\u0102'
                                u'\u010e\u0127\u0161\u0175\u0177\u03b3\u03b8'
                                u'\u03c2')
HSB_DIAC_CHARSET = frozenset(u'.aiu~\u00c4\u00e1\u00e3\u0129\u0169_')
HSB_CHARSET = HSB_LETTERS_CHARSET | HSB_DIAC_CHARSET
# We acknowledge that this file, tokenizers_words.py, is imported from Camel
# Tools. [https://camel-tools.readthedocs.io/en/latest/api/tokenizers/word.html]

import re

from sinatools.utils.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
from sinatools.utils.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET


# Character pools for the tokenizer pattern, flattened to plain strings.
_PUNCT_CHARS = u''.join(UNICODE_PUNCT_SYMBOL_CHARSET)
_WORD_CHARS = u''.join(UNICODE_LETTER_MARK_NUMBER_CHARSET)

# A token is either a single punctuation/symbol character or a maximal run
# of letter/mark/number characters; pre-compiled once at import time.
_TOKENIZE_RE = re.compile(
    u'[%s]|[%s]+' % (re.escape(_PUNCT_CHARS), re.escape(_WORD_CHARS)))


def simple_word_tokenize(sentence):
    """Split *sentence* into word tokens and single-character
    punctuation/symbol tokens, dropping all other characters (whitespace)."""
    return _TOKENIZE_RE.findall(sentence)
num_labels=2) 24 | 25 | tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path)) -------------------------------------------------------------------------------- /dist/SinaTools-0.1.41-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/dist/SinaTools-0.1.41-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dist/sinatools-0.1.41.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/dist/sinatools-0.1.41.tar.gz -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SOURCEDIR = source 6 | SPHINXOPTS = 7 | SPHINXBUILD = sphinx-build -c $(SOURCEDIR) -c $(SOURCEDIR)/config 8 | SPHINXPROJ = sinatools 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/build/_images/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_images/download.png -------------------------------------------------------------------------------- /docs/build/_static/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_static/download.png -------------------------------------------------------------------------------- /docs/build/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_static/file.png -------------------------------------------------------------------------------- /docs/build/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_static/minus.png -------------------------------------------------------------------------------- /docs/build/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_static/plus.png -------------------------------------------------------------------------------- /docs/build/doctrees/License.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/License.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Overview.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/Overview.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/about.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/about.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/api.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/api.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/api/DataDownload.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/api/DataDownload.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/api/DataDownload/downloader.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/api/DataDownload/downloader.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/api/arabiner.doctree: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/api/arabiner.doctree -------------------------------------------------------------------------------- /docs/build/html/_images/SinaLogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_images/SinaLogo.jpg -------------------------------------------------------------------------------- /docs/build/html/_images/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_images/download.png -------------------------------------------------------------------------------- /docs/build/html/_static/SinaLogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/SinaLogo.jpg -------------------------------------------------------------------------------- /docs/build/html/_static/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/download.png -------------------------------------------------------------------------------- /docs/build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/file.png -------------------------------------------------------------------------------- /docs/build/html/_static/minus.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/plus.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/License.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | MIT License 5 | 6 | Copyright 2023 Birzeit University 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in 16 | all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 
25 | -------------------------------------------------------------------------------- /docs/source/Overview.rst: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | 5 | About 6 | ----- 7 | 8 | Sina Tools is a collection of Arabic natural language processing tools created by the Sina Lab at Birzeit University in Palestine. 9 | 10 | For additional details, please refer to the :doc:`installation` section. 11 | 12 | Sina Tools is available under the MIT license. See :doc:`License` for more information. 13 | 14 | .. _Github repo: https://github.com/SinaLab/sinatools 15 | .. _tarball: https://github.com/SinaLab/sinatools/tarball/master 16 | -------------------------------------------------------------------------------- /docs/source/_static/SinaLogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/source/_static/SinaLogo.jpg -------------------------------------------------------------------------------- /docs/source/_static/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/source/_static/download.png -------------------------------------------------------------------------------- /docs/source/about.rst: -------------------------------------------------------------------------------- 1 | About 2 | ===== 3 | 4 | 5 | 6 | SinaTools is a collection of Arabic natural language processing tools created by the SinaLab at Birzeit University in Palestine. 7 | 8 | For additional details, please refer to the :doc:`installation` section. 9 | 10 | SinaTools is available under the MIT license. See :doc:`License` for more information. 11 | 12 | .. _Github repo: https://github.com/SinaLab/sinatools 13 | .. 
_tarball: https://github.com/SinaLab/sinatools/tarball/master 14 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | Python API Reference 2 | ==================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :titlesonly: 7 | :caption: Modules: 8 | 9 | 10 | api/morphology 11 | api/DataDownload 12 | api/utils 13 | api/ner 14 | api/salma 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/DataDownload.rst: -------------------------------------------------------------------------------- 1 | sinatools.DataDownload 2 | ===================== 3 | 4 | 5 | .. automodule:: sinatools.DataDownload 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | DataDownload/downloader 16 | -------------------------------------------------------------------------------- /docs/source/api/DataDownload/downloader.rst: -------------------------------------------------------------------------------- 1 | sinatools.DataDownload.downloader 2 | ++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.DataDownload.downloader 5 | :members: 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/source/api/arabiner.rst: -------------------------------------------------------------------------------- 1 | sinatools.ner 2 | ================= 3 | 4 | 5 | .. automodule:: sinatools.ner 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. 
toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | ner/entity_extractor -------------------------------------------------------------------------------- /docs/source/api/arabiner/bin/infer.rst: -------------------------------------------------------------------------------- 1 | sinatools.ner.entity_extractor 2 | +++++++++++++++++++++++++++ 3 | 4 | 5 | .. automodule:: sinatools.ner.entity_extractor 6 | :members: 7 | :show-inheritance: 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | :caption: Modules: 13 | -------------------------------------------------------------------------------- /docs/source/api/morphology.rst: -------------------------------------------------------------------------------- 1 | sinatools.morphology 2 | =================== 3 | 4 | 5 | .. automodule:: sinatools.morphology 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | morphology/morph_analyzer 16 | -------------------------------------------------------------------------------- /docs/source/api/morphology/morph_analyzer.rst: -------------------------------------------------------------------------------- 1 | sinatools.morphology.morph_analyzer 2 | ++++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.morphology.morph_analyzer 5 | :members: 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/source/api/salma.rst: -------------------------------------------------------------------------------- 1 | sinatools.salma 2 | ============== 3 | 4 | 5 | .. automodule:: sinatools.salma 6 | :members: 7 | :show-inheritance: 8 | 9 | 10 | .. 
toctree:: 11 | :maxdepth: 1 12 | :caption: Modules: 13 | 14 | salma/views -------------------------------------------------------------------------------- /docs/source/api/salma/views.rst: -------------------------------------------------------------------------------- 1 | sinatools.salma.views 2 | ++++++++++++++++++++ 3 | 4 | 5 | .. automodule:: sinatools.salma.views 6 | :members: 7 | :show-inheritance: 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | :caption: Modules: 13 | -------------------------------------------------------------------------------- /docs/source/api/utils.rst: -------------------------------------------------------------------------------- 1 | sinatools.utils 2 | ============== 3 | 4 | 5 | .. automodule:: sinatools.utils 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | utils/parser 16 | utils/implication 17 | utils/jaccard 18 | utils/text_transliteration 19 | utils/sentence_tokenizer 20 | utils/corpus_tokenizer -------------------------------------------------------------------------------- /docs/source/api/utils/corpus_tokenizer.rst: -------------------------------------------------------------------------------- 1 | sinatools.utils.corpus_tokenizer 2 | +++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.utils.corpus_tokenizer 5 | :members: -------------------------------------------------------------------------------- /docs/source/api/utils/implication.rst: -------------------------------------------------------------------------------- 1 | sinatools.utils.implication 2 | ++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.utils.implication 5 | :members: -------------------------------------------------------------------------------- /docs/source/api/utils/jaccard.rst: -------------------------------------------------------------------------------- 1 | sinatools.utils.jaccard 2 | ++++++++++++++++++++++ 3 | 4 | .. 
automodule:: sinatools.utils.jaccard 5 | :members: -------------------------------------------------------------------------------- /docs/source/api/utils/parser.rst: -------------------------------------------------------------------------------- 1 | sinatools.utils.parser 2 | +++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.utils.parser 5 | :members: 6 | :show-inheritance: 7 | 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Modules: -------------------------------------------------------------------------------- /docs/source/api/utils/sentence_tokenizer.rst: -------------------------------------------------------------------------------- 1 | sinatools.utils.sentence_tokenizer 2 | +++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.utils.sentence_tokenizer 5 | :members: -------------------------------------------------------------------------------- /docs/source/api/utils/text_transliteration.rst: -------------------------------------------------------------------------------- 1 | sinatools.utils.text_transliteration 2 | +++++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.utils.text_transliteration 5 | :members: -------------------------------------------------------------------------------- /docs/source/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/source/cli_tools.rst: -------------------------------------------------------------------------------- 1 | SinaTools Command Line 2 | ======================= 3 | .. 
toctree:: 4 | :maxdepth: 2 5 | :titlesonly: 6 | :caption: Modules: 7 | 8 | 9 | cli_tools/utils 10 | cli_tools/morphology 11 | cli_tools/ner 12 | cli_tools/salma 13 | cli_tools/DataDownload 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/source/cli_tools/DataDownload.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.DataDownload 2 | ========================= 3 | 4 | 5 | .. automodule:: sinatools.CLI.DataDownload 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | DataDownload/download_files 16 | -------------------------------------------------------------------------------- /docs/source/cli_tools/DataDownload/download_files.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.DataDownload.download_files 2 | ++++++++++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.DataDownload.download_files 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/DataDownload/get_appdatadir.rst: -------------------------------------------------------------------------------- 1 | CLI.DataDownload.get_appdatadir 2 | ++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: CLI.DataDownload.get_appdatadir 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/arabiner.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.ner 2 | ========================= 3 | 4 | 5 | .. automodule:: sinatools.CLI.ner 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. 
toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | ner 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/source/cli_tools/arabiner/infer.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.ner.entity_extractor 2 | +++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.ner.entity_extractor 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/morphology.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.morphology 2 | ======================= 3 | 4 | 5 | .. automodule:: sinatools.CLI.morphology 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | morphology/morph_analyzer 16 | morphology/ALMA_multi_word 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/source/cli_tools/morphology/ALMA_multi_word.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.morphology.ALMA_multi_word 2 | +++++++++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.morphology.ALMA_multi_word 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/morphology/morph_analyzer.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.morphology.morph_analyzer 2 | ++++++++++++++++++++++++++++++++++++++ 3 | 4 | .. 
automodule:: sinatools.CLI.morphology.morph_analyzer 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/salma.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.salma 2 | ================== 3 | 4 | 5 | .. automodule:: sinatools.CLI.salma 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | salma/salma_tools 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/source/cli_tools/salma/salma_tools.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.salma.salma_tools 2 | ============================== 3 | 4 | 5 | .. automodule:: sinatools.CLI.salma.salma_tools 6 | :members: 7 | :show-inheritance: 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | :caption: Modules: 13 | 14 | salma/salma_tools 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/source/cli_tools/utils.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils 2 | ================== 3 | 4 | 5 | .. automodule:: sinatools.CLI.utils 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Modules: 14 | 15 | utils/arStrip 16 | utils/latin_remove 17 | utils/remove_punc 18 | utils/implication 19 | utils/sentence_tokenizer 20 | utils/text_transliteration 21 | utils/jaccard 22 | utils/corpus_tokenizer -------------------------------------------------------------------------------- /docs/source/cli_tools/utils/arStrip.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils.arStrip 2 | ++++++++++++++++++++++++++ 3 | 4 | .. 
automodule:: sinatools.CLI.utils.arStrip 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/utils/corpus_tokenizer.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils.corpus_tokenizer 2 | +++++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.utils.corpus_tokenizer 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/utils/implication.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils.implication 2 | ++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.utils.implication 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/utils/jaccard.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils.jaccard 2 | +++++++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.utils.jaccard 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/utils/latin_remove.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils.latin_remove 2 | +++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.utils.latin_remove 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/utils/remove_punc.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils.remove_punc 2 | ++++++++++++++++++++++++++++++ 3 | 4 | .. 
automodule:: sinatools.CLI.utils.remove_punc 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/utils/sentence_tokenizer.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils.sentence_tokenizer 2 | +++++++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.utils.sentence_tokenizer 5 | :members: -------------------------------------------------------------------------------- /docs/source/cli_tools/utils/text_transliteration.rst: -------------------------------------------------------------------------------- 1 | sinatools.CLI.utils.text_transliteration 2 | +++++++++++++++++++++++++++++++++++++++ 3 | 4 | .. automodule:: sinatools.CLI.utils.text_transliteration 5 | :members: -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | SinaTools Documentation 2 | ======================= 3 | 4 | .. image:: _images/SinaLogo.jpg 5 | :alt: SinaTools Logo 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | :caption: Contents: 10 | 11 | about 12 | installation 13 | cli_tools 14 | api 15 | License 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | =============== 4 | Getting Started 5 | =============== 6 | 7 | Installation 8 | ------------ 9 | 10 | You will need Python 3.10.8 (64-bit) as well as 11 | `the Rust compiler `_ installed. 12 | 13 | 14 | Install using pip 15 | ^^^^^^^^^^^^^^^^^ 16 | 17 | To install sinatools, run this command in your terminal: 18 | 19 | .. 
code-block:: console 20 | 21 | $ pip install sinatools 22 | 23 | This is the preferred method to install sinatools, as it will always install the most recent stable release. 24 | 25 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 26 | you through the process. 27 | 28 | .. _pip: https://pip.pypa.io 29 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 30 | 31 | 32 | Install from source 33 | ^^^^^^^^^^^^^^^^^^^ 34 | 35 | The sources for sinatools can be downloaded from the `Github repo`_ 36 | 37 | You can either clone the public repository: 38 | 39 | .. code-block:: console 40 | 41 | $ git clone git://github.com/SinaLab/sinatools/ 42 | 43 | Or download the `tarball`_: 44 | 45 | .. code-block:: console 46 | 47 | $ curl -OJL https://github.com/SinaLab/sinatools/tarball/master 48 | 49 | Once you have a copy of the source, you can install it with: 50 | 51 | .. code-block:: console 52 | 53 | $ python setup.py install 54 | 55 | 56 | .. _Github repo: https://github.com/SinaLab/sinatools/ 57 | .. _tarball: https://github.com/SinaLab/sinatools/tarball/master 58 | 59 | 60 | Installing data 61 | ^^^^^^^^^^^^^^^ 62 | 63 | To install the data sets required by SinaTools See :doc:`reference/packages`. 64 | 65 | 66 | By default, data is stored in 67 | ``C:\Users\your_user_name\AppData\Roaming\sinatools``. 68 | 69 | 70 | Next Steps 71 | ---------- 72 | 73 | To get started, you can follow along 74 | `the Guided Tour `_ 75 | for a quick overview of the components provided by SinaTools. 76 | 77 | See :doc:`cli_tools` for information on using the command-line tools or 78 | :doc:`api` for information on using the Python API. 79 | -------------------------------------------------------------------------------- /docs/source/readme.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../../README.rst 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | version_file = os.path.join(os.path.dirname(__file__), 3 | 'sinatools', 4 | 'VERSION') 5 | with open(version_file, encoding = 'utf-8') as version_fp: 6 | VERSION = version_fp.read().strip() 7 | current_version = VERSION 8 | commit = True 9 | tag = True 10 | 11 | [bumpversion:file:setup.py] 12 | search = version='{current_version}' 13 | replace = version='{new_version}' 14 | 15 | [bumpversion:file:sinatools/__init__.py] 16 | search = __version__ = '{current_version}' 17 | replace = __version__ = '{new_version}' 18 | 19 | [bdist_wheel] 20 | universal = 1 21 | 22 | [flake8] 23 | exclude = docs 24 | 25 | [aliases] 26 | test = pytest 27 | 28 | [tool:pytest] 29 | collect_ignore = ['setup.py'] 30 | 31 | [egg_info] 32 | tag_build = 33 | tag_date = 0 34 | 35 | -------------------------------------------------------------------------------- /sinatools/CLI/DataDownload/__pycache__/download_files.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/DataDownload/__pycache__/download_files.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/CLI/DataDownload/__pycache__/download_files.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/DataDownload/__pycache__/download_files.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/DataDownload/download_files.py: -------------------------------------------------------------------------------- 1 | """ 2 
def main():
    """CLI entry point: download SinaTools data files and models by name.

    Parses ``-f/--files``; each known name maps to one or more downloads.
    With no names given, every available file is downloaded.
    """
    parser = argparse.ArgumentParser(description="Download files from specified URLs.")
    parser.add_argument('-f', '--files', nargs="*",
                        help="Names of the files to download. Available files are: "
                             f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")

    # Ensure the application data directory exists before any download starts.
    get_appdatadir()

    args = parser.parse_args()

    if args.files:
        for file in args.files:
            print("file: ", file)
            if file == "wsd":
                # WSD needs morphology, NER, the gloss models and the n-gram tables.
                download_file(urls["morph"])
                download_file(urls["ner"])
                download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
                download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02")
                download_file(urls["one_gram"])
                download_file(urls["five_grams"])
                download_file(urls["four_grams"])
                download_file(urls["three_grams"])
                download_file(urls["two_grams"])
            elif file == "synonyms":
                # Synonyms need the level-2 and level-3 synonym graphs.
                download_file(urls["graph_l2"])
                download_file(urls["graph_l3"])
            elif file in urls:
                download_file(urls[file])
            else:
                # Fix: an unknown name previously raised an unhandled KeyError.
                print(f"Unknown file name: {file}. Available files are: {', '.join(urls.keys())}")
    else:
        download_files()
def main():
    """CLI entry point: run multi-word morphological analysis on text
    given directly or read from a file, printing the analyses as JSON."""
    parser = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools')

    parser.add_argument('--multi_word', type=str, help='Multi-word text to be analyzed')
    parser.add_argument('--file', type=str, help='File containing the multi-word text to be analyzed')

    args = parser.parse_args()

    # At least one input source is required.
    if args.multi_word is None and args.file is None:
        print("Error: Either --multi_word or --file argument must be provided.")
        return

    # Prefer inline text; otherwise join the file's lines into one string.
    if args.multi_word:
        multi_word_text = args.multi_word
    else:
        multi_word_text = " ".join(read_file(args.file))

    analyses = ALMA_multi_word(multi_word_text)

    print(json.dumps(analyses, ensure_ascii=False, indent=4))
def main():
    """CLI entry point: extract event-argument relations from Arabic text
    supplied via --text or --file and print each relation on its own line."""
    parser = argparse.ArgumentParser(description='Relation Extraction using SinaTools')

    parser.add_argument('--text', type=str, help='The text from which events need to be extracted.')
    parser.add_argument('--file', type=str, help='File containing the text from which events need to be extracted.')

    args = parser.parse_args()

    # At least one input source is required.
    if args.text is None and args.file is None:
        print("Error: Either --text or --file argument must be provided.")
        return

    # Inline text wins; otherwise join the file's lines into one string.
    input_text = args.text or " ".join(read_file(args.file))

    for relation in event_argument_relation_extraction(input_text):
        print(relation)
/sinatools/CLI/semantic_relatedness/__pycache__/settings.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/semantic_relatedness/__pycache__/settings.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/semantic_relatedness/compute_relatedness.py: -------------------------------------------------------------------------------- 1 | """ 2 | About: 3 | ------ 4 | The semantic_relatedness command line interface (CLI) is designed to computes the degree of association between two sentences. As follows: 5 | 6 | Usage: 7 | ------ 8 | Below is the usage information that can be generated by running semantic_relatedness --help. 9 | 10 | .. code-block:: none 11 | 12 | semantic_relatedness –-sentence1 "your Arabic sentence here" --sentence2 "your Arabic sentence here" 13 | 14 | Options: 15 | -------- 16 | .. code-block:: none 17 | 18 | --sentence1 INPUT_TEXT 19 | The Arabic sentence to find the semantic relatedness between it and the second sentence. 20 | --sentence2 INPUT_TEXT 21 | The Arabic sentence to find the semantic relatedness between it and the first sentence. 22 | 23 | Examples: 24 | --------- 25 | .. 
def main():
    """CLI entry point: compute the semantic relatedness score between two
    Arabic sentences and print it to stdout.

    Both --sentence1 and --sentence2 are required.
    """
    parser = argparse.ArgumentParser(description='Computes the degree of association between two sentences across various dimensions, meaning, underlying concepts, domain-specificity, topic overlap, viewpoint alignment.')

    parser.add_argument('--sentence1', type=str, help='The first sentence to be compute similarity based on')
    parser.add_argument('--sentence2', type=str, help='The second sentence to be compute similarity based on')

    args = parser.parse_args()

    # Fix: the score needs BOTH sentences. The original check used "and",
    # so supplying only one sentence slipped through and crashed inside
    # get_similarity_score with a None argument.
    if args.sentence1 is None or args.sentence2 is None:
        print("Error: Both --sentence1 and --sentence2 arguments must be provided.")
        return

    score = get_similarity_score(args.sentence1, args.sentence2)

    print(score)
def main():
    """CLI entry point: evaluate how strongly a candidate synset's members
    are actually synonyms of each other, at the given expansion level."""
    # Fix: the description was copy-pasted from the morphology CLI.
    parser = argparse.ArgumentParser(description='Evaluate a set of synonyms using SinaTools')

    parser.add_argument('--synset', type=str, help='Set of synonyms seperated by |')
    parser.add_argument('--level', type=int, help='The depth of edges the algorithm needs to reach')

    args = parser.parse_args()

    # Fix: evaluate_synonyms() needs BOTH arguments. The original check used
    # "and", so a single missing argument slipped through as None.
    if args.synset is None or args.level is None:
        print("Error: Both --synset and --level arguments must be provided.")
        return

    results = evaluate_synonyms(args.synset, args.level)

    print(results)
def main():
    """CLI entry point: extend a synset with additional synonyms, each tagged
    with a fuzzy membership value, at the given expansion level."""
    # Fix: the description was copy-pasted from the morphology CLI.
    parser = argparse.ArgumentParser(description='Extend a set of synonyms using SinaTools')

    parser.add_argument('--synset', type=str, help='Set of synonyms seperated by |')
    parser.add_argument('--level', type=int, help='The depth of edges the algorithm needs to reach')

    args = parser.parse_args()

    # Fix: extend_synonyms() needs BOTH arguments. The original check used
    # "and", so a single missing argument slipped through as None.
    if args.synset is None or args.level is None:
        print("Error: Both --synset and --level arguments must be provided.")
        return

    results = extend_synonyms(args.synset, args.level)

    print(results)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/__pycache__/arStrip.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/arStrip.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/__pycache__/corpus_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/corpus_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/__pycache__/implication.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/implication.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/__pycache__/jaccard.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/jaccard.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/__pycache__/remove_latin.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/remove_latin.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/__pycache__/remove_punctuation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/remove_punctuation.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/__pycache__/sentence_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/sentence_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/__pycache__/text_transliteration.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/text_transliteration.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/utils/corpus_tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | About: 4 | ------ 5 | The corpus_tokenizer command offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file. 6 | 7 | Usage: 8 | ------- 9 | Below is the usage information that can be generated by running corpus_tokenizer --help. 10 | 11 | .. 
def main():
    """CLI entry point: tokenize every text file under a directory and write
    the tokens plus metadata to a CSV file."""
    parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')

    # Fix: both paths are mandatory. Previously a missing flag was passed
    # through as None and crashed inside corpus_tokenizer; argparse now
    # rejects the invocation with a clear usage error instead.
    parser.add_argument('--dir_path', type=str, required=True, help='The path to the directory containing the text files.')
    parser.add_argument('--output_csv', type=str, required=True, help='The path to the output CSV file.')

    args = parser.parse_args()

    corpus_tokenizer(args.dir_path, args.output_csv)
def read_file(file_path):
    """Return the first line of *file_path* (UTF-8), stripped of whitespace.

    Raises ValueError when that line is empty.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        word = fh.readline().strip()
    if not word:
        raise ValueError(f"File {file_path} must contain at least one word.")
    return word

def main():
    """CLI entry point: evaluate the implication relation between two words
    given inline (--inputWord1/--inputWord2) or via files (--file1/--file2)."""
    parser = argparse.ArgumentParser(description='Evaluate Implication between two words using SinaTools')

    parser.add_argument('--inputWord1', type=str, help='First input word')
    parser.add_argument('--inputWord2', type=str, help='Second input word')
    parser.add_argument('--file1', type=str, help='File containing the first word to evaluate implication')
    parser.add_argument('--file2', type=str, help='File containing the second word to evaluate implication')

    args = parser.parse_args()

    # File input takes precedence; otherwise both inline words are required.
    if args.file1 and args.file2:
        word1, word2 = read_file(args.file1), read_file(args.file2)
    elif args.inputWord1 and args.inputWord2:
        word1, word2 = args.inputWord1, args.inputWord2
    else:
        print("Either --file1 and --file2 arguments or both --inputWord1 and --inputWord2 arguments must be provided.")
        return

    result = Implication(word1, word2).get_verdict()
    print(result)
def main():
    """CLI entry point: strip Latin characters from the given text and print
    the result.

    Accepts the text inline via --text or from a UTF-8 file via --file,
    matching the usage documented in the module docstring.
    """
    parser = argparse.ArgumentParser(description='remove latin characters from the text')

    parser.add_argument('--text', type=str, help='The input text')
    # Fix: --file was documented in the module docstring but never implemented;
    # --text is therefore no longer required=True (still accepted as before).
    parser.add_argument('--file', type=str, help='File containing the input text')
    args = parser.parse_args()

    if args.text is None and args.file is None:
        print("Error: Either --text or --file argument must be provided.")
        return

    if args.text is not None:
        text = args.text
    else:
        with open(args.file, 'r', encoding='utf-8') as fh:
            text = fh.read()

    result = remove_latin(text)

    print(result)
5 | 6 | Usage: 7 | ------ 8 | Below is the usage information that can be generated by running remove_punctuation --help. 9 | 10 | .. code-block:: none 11 | 12 | Usage: 13 | remove_punctuation --text=TEXT 14 | remove_punctuation --file "path/to/your/file.txt" 15 | 16 | Examples: 17 | --------- 18 | .. code-block:: none 19 | 20 | remove_punctuation --text "te%s@t...!!?" 21 | remove_punctuation --file "path/to/your/file.txt" 22 | """ 23 | 24 | import argparse 25 | from sinatools.utils.parser import remove_punctuation 26 | #from sinatools.utils.parser import read_file 27 | #from sinatools.utils.parser import write_file 28 | 29 | 30 | def main(): 31 | parser = argparse.ArgumentParser(description='remove punctuation marks from the text') 32 | 33 | parser.add_argument('--text',required=True,help="input text") 34 | # parser.add_argument('myFile', type=argparse.FileType('r'),help='Input file csv') 35 | args = parser.parse_args() 36 | result = remove_punctuation(args.text) 37 | 38 | print(result) 39 | if __name__ == '__main__': 40 | main() 41 | 42 | 43 | -------------------------------------------------------------------------------- /sinatools/CLI/utils/sentence_tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | About: 3 | ------ 4 | 5 | The sentence_tokenizer command allows you to tokenize text into sentences using the SinaTools utility. It provides 6 | flexibility in tokenizing at different punctuation marks, including dots, question marks, and exclamation marks. It also 7 | allows tokenization at new lines. 8 | 9 | Usage: 10 | ------ 11 | Below is the usage information that can be generated by running sentence_tokenizer --help. 12 | 13 | .. code-block:: none 14 | 15 | Usage: 16 | sentence_tokenizer --text=TEXT [options] 17 | sentence_tokenizer --file=FILE [options] 18 | 19 | .. code-block:: none 20 | 21 | Options: 22 | --text TEXT 23 | Text to be tokenized into sentences. 
24 | --file FILE 25 | File containing the text to be tokenized into sentences 26 | --dot 27 | Tokenize at dots. 28 | --new_line 29 | Tokenize at new lines. 30 | --question_mark 31 | Tokenize at question marks. 32 | --exclamation_mark 33 | Tokenize at exclamation marks. 34 | 35 | Examples: 36 | --------- 37 | 38 | .. code-block:: none 39 | 40 | sentence_tokenizer --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark 41 | sentence_tokenizer --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark 42 | 43 | """ 44 | import argparse 45 | from sinatools.utils.tokenizer import sentence_tokenizer 46 | from sinatools.utils.readfile import read_file 47 | 48 | def main(): 49 | parser = argparse.ArgumentParser(description='Sentence Tokenization using SinaTools') 50 | 51 | # Adding arguments for the text, file, and tokenization options 52 | parser.add_argument('--text', type=str, help='Text to be tokenized into sentences') 53 | parser.add_argument('--file', type=str, help='File containing the text to be tokenized into sentences') 54 | parser.add_argument('--dot', action='store_true', help='Tokenize at dots') 55 | parser.add_argument('--new_line', action='store_true', help='Tokenize at new lines') 56 | parser.add_argument('--question_mark', action='store_true', help='Tokenize at question marks') 57 | parser.add_argument('--exclamation_mark', action='store_true', help='Tokenize at exclamation marks') 58 | 59 | args = parser.parse_args() 60 | 61 | # Check if either text or file is provided 62 | if args.text is None and args.file is None: 63 | print("Either --text or --file argument must be provided.") 64 | return 65 | 66 | text_content = args.text if args.text else read_file(args.file) 67 | 68 | # Perform sentence tokenization 69 | sentences = sentence_tokenizer(" ".join(text_content), dot=args.dot, new_line=args.new_line, 70 | question_mark=args.question_mark, exclamation_mark=args.exclamation_mark) 71 | 72 | # 
Print each sentence in a new line 73 | for sentence in sentences: 74 | print(sentence) 75 | 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /sinatools/CLI/utils/text_dublication_detector.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sinatools.utils.text_dublication_detector import removal 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser(description='Processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold and cosine similarity. It saves the filtered results and the identified duplicates to separate files.') 6 | 7 | parser.add_argument('--csv_file', type=str, help='The path to the input CSV file that will be processed.') 8 | parser.add_argument('--column_name', type=str, help='The name of the column from which duplicates will be removed.') 9 | parser.add_argument('--final_file_name', type=str, help='The name of the output file that will contain the deduplicated results.') 10 | parser.add_argument('--deleted_file_name', type=str, help='The name of the output file that will contain the records that were identified as duplicates and removed.') 11 | parser.add_argument('--similarity_threshold', type=float, default=0.8, help='The similarity threshold for determining duplicates. 
Records with a similarity score above this value will be considered duplicates (default is 0.8).') 12 | 13 | args = parser.parse_args() 14 | 15 | if args.csv_file is None and args.column_name is None: 16 | print("Either --csv_file or --column_name argument must be provided.") 17 | return 18 | 19 | removal(args.csv_file, args.column_name, args.final_file_name, args.deleted_file_name, args.similarity_threshold) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | # text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8 -------------------------------------------------------------------------------- /sinatools/CLI/utils/text_transliteration.py: -------------------------------------------------------------------------------- 1 | """ 2 | About: 3 | ------ 4 | 5 | The transliterate tool allows you to transliterate text using the SinaTools' utility. This command-line utility 6 | takes in a text and a desired schema, and outputs the transliterated text. 7 | 8 | Usage: 9 | ------ 10 | Below is the usage information that can be generated by running transliterate --help. 11 | 12 | Usage: 13 | ------ 14 | 15 | .. code-block:: none 16 | 17 | transliterate --text=TEXT --schema=SCHEMA 18 | 19 | transliterate --file=FILE --schema=SCHEMA 20 | 21 | Options: 22 | -------- 23 | 24 | .. code-block:: none 25 | 26 | --text TEXT 27 | Text to be transliterated. 28 | --schema SCHEMA 29 | Transliteration schema to be used, which is bw2ar or ar2bw. 30 | 31 | Examples: 32 | --------- 33 | 34 | .. 
code-block:: none 35 | 36 | transliterate --text "klmp" --schema "bw2ar" 37 | transliterate --file "path/to/your/file.txt" --schema "ar2bw" 38 | 39 | 40 | """ 41 | import argparse 42 | from sinatools.utils.text_transliteration import perform_transliteration 43 | from sinatools.utils.readfile import read_file 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser(description='Perform text transliteration using SinaTools') 47 | 48 | # Adding arguments for the text, file, and schema 49 | parser.add_argument('--text', type=str, help='Text to be transliterated') 50 | parser.add_argument('--file', type=str, help='File containing the text to be transliterated') 51 | parser.add_argument('--schema', type=str, required=True, help='Transliteration schema to be used') 52 | 53 | args = parser.parse_args() 54 | 55 | # Check if either text or file is provided 56 | if args.text is None and args.file is None: 57 | print("Either --text or --file argument must be provided.") 58 | return 59 | 60 | text_content = args.text if args.text else " ".join(read_file(args.file)) 61 | # Perform transliteration 62 | result = perform_transliteration(text_content, args.schema) 63 | 64 | print(result) 65 | 66 | if __name__ == '__main__': 67 | main() 68 | 69 | #transliterate --text "example text" --schema "bw2ar" 70 | #transliterate --file "path/to/your/file.txt" --schema "bw2ar" 71 | -------------------------------------------------------------------------------- /sinatools/CLI/wsd/__pycache__/disambiguator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/wsd/__pycache__/disambiguator.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/CLI/wsd/disambiguator.py: -------------------------------------------------------------------------------- 1 | """ 2 | About: 3 | ------ 4 | The WSD 
command line interface (CLI) is designed to utilize the word sense disambiguation for Arabic sentences. This CLI allows users to input an Arabic sentence and receive concepts and lemma for each token within the sentence. As follows: 5 | 6 | Usage: 7 | ------ 8 | Below is the usage information that can be generated by running wsd --help. 9 | 10 | .. code-block:: none 11 | 12 | wsd –-text "your Arabic sentence here" 13 | wsd –-file path/to/txt/file 14 | 15 | Options: 16 | -------- 17 | .. code-block:: none 18 | 19 | --text INPUT_TEXT 20 | The text that needs to be analyzed for Named Entity Recognition. 21 | --file txt_file_path 22 | Directory containing the text files to be analyzed for Named Entity Recognition 23 | 24 | Examples: 25 | --------- 26 | .. code-block:: none 27 | 28 | wsd --text "Your text here" 29 | wsd --file "path/to/your/txt/file" 30 | 31 | """ 32 | 33 | import argparse 34 | import json 35 | from sinatools.wsd.disambiguator import disambiguate 36 | from sinatools.utils.readfile import read_file 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools') 40 | 41 | parser.add_argument('--text', type=str, help='Input sentence to process') 42 | parser.add_argument('--file', type=str, help='File containing the Arabic sentence to process') 43 | 44 | args = parser.parse_args() 45 | 46 | if args.text is None and args.file is None: 47 | print("Either --text or --file argument must be provided.") 48 | return 49 | 50 | text_content = args.text if args.text else " ".join(read_file(args.file)) 51 | result = disambiguate(text_content) 52 | print(json.dumps(result, ensure_ascii=False, indent=4)) 53 | 54 | if __name__ == "__main__": 55 | main() 56 | 57 | #wsd --text "your Arabic sentence here" 58 | #wsd --file "path/to/your/file.txt" -------------------------------------------------------------------------------- /sinatools/DataDownload/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__init__.py -------------------------------------------------------------------------------- /sinatools/DataDownload/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/DataDownload/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/DataDownload/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/DataDownload/__pycache__/downloader.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/downloader.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/DataDownload/__pycache__/downloader.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/downloader.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/DataDownload/__pycache__/downloader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/downloader.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.41 -------------------------------------------------------------------------------- /sinatools/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for sinatools.""" 2 | 3 | __author__ = """SinaLab""" 4 | __email__ = 'sina.institute.bzu@gmail.com' 5 | __version__ = '0.8.5' -------------------------------------------------------------------------------- /sinatools/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/__pycache__/sinatools.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/__pycache__/sinatools.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/arabert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /sinatools/arabert/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/arabert/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/arabert/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/arabert/__pycache__/preprocess.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/preprocess.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/arabert/__pycache__/preprocess.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/preprocess.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/arabert/__pycache__/preprocess.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/preprocess.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/install_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | def main(): 5 | # Determine the path to the 'environment.yml' file within the package 6 | package_dir = os.path.dirname(__file__) 7 | env_file = os.path.join(package_dir, 'environment.yml') 8 | 9 | # Create the conda environment using the 'environment.yml' file 10 | subprocess.call(["conda", "env", "create", "-f", env_file]) 11 | 12 | if __name__ == "__main__": 13 | main() 14 | -------------------------------------------------------------------------------- /sinatools/morphology/ALMA_multi_word.py: -------------------------------------------------------------------------------- 1 | from sinatools.utils.parser import arStrip 2 | from . 
import five_grams_dict, four_grams_dict , three_grams_dict , two_grams_dict 3 | 4 | def ALMA_multi_word(multi_word, n): 5 | undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False) # diacs , smallDiacs , shaddah , digit , alif , specialChars 6 | result_word = [] 7 | if n == 2: 8 | if undiac_multi_word in two_grams_dict.keys(): 9 | result_word = two_grams_dict[undiac_multi_word] 10 | elif n == 3: 11 | if undiac_multi_word in three_grams_dict.keys(): 12 | result_word = three_grams_dict[undiac_multi_word] 13 | elif n == 4: 14 | if undiac_multi_word in four_grams_dict.keys(): 15 | result_word = four_grams_dict[undiac_multi_word] 16 | else: 17 | if undiac_multi_word in five_grams_dict.keys(): 18 | result_word = five_grams_dict[undiac_multi_word] 19 | 20 | my_json = {} 21 | output_list = [] 22 | my_json['multi_word_lemma'] = multi_word 23 | my_json['undiac_multi_word_lemma'] = multi_word 24 | ids = [] 25 | if result_word != []: 26 | my_json['POS'] = result_word[0][1] #POS 27 | for result in result_word: 28 | ids.append(result[3]) 29 | my_json['ids'] = ids 30 | output_list.append(my_json) 31 | return output_list -------------------------------------------------------------------------------- /sinatools/morphology/__init__.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sinatools.DataDownload import downloader 3 | import os 4 | 5 | dictionary = {} 6 | five_grams_dict = {} 7 | four_grams_dict = {} 8 | three_grams_dict = {} 9 | two_grams_dict = {} 10 | 11 | filename = 'lemmas_dic.pickle' 12 | path = downloader.get_appdatadir() 13 | file_path = os.path.join(path, filename) 14 | with open(file_path, 'rb') as f: 15 | dictionary = pickle.load(f) 16 | 17 | filename_five = 'five_grams.pickle' 18 | path =downloader.get_appdatadir() 19 | file_path = os.path.join(path, filename_five) 20 | with open(file_path, 'rb') as f: 21 | five_grams_dict = pickle.load(f, encoding='utf-8') 22 | 23 | 24 | 
filename_four = 'four_grams.pickle' 25 | path =downloader.get_appdatadir() 26 | file_path = os.path.join(path, filename_four) 27 | with open(file_path, 'rb') as f: 28 | four_grams_dict = pickle.load(f, encoding='utf-8') 29 | 30 | 31 | filename_three = 'three_grams.pickle' 32 | path =downloader.get_appdatadir() 33 | file_path = os.path.join(path, filename_three) 34 | with open(file_path, 'rb') as f: 35 | three_grams_dict = pickle.load(f, encoding='utf-8') 36 | 37 | 38 | filename_two = 'two_grams.pickle' 39 | path =downloader.get_appdatadir() 40 | file_path = os.path.join(path, filename_two) 41 | with open(file_path, 'rb') as f: 42 | two_grams_dict = pickle.load(f, encoding='utf-8') 43 | -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/ALMA_multi_word.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/ALMA_multi_word.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/ALMA_multi_word.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/charsets.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/charsets.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/morph_analyzer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/morph_analyzer.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/morph_analyzer.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/morph_analyzer.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/morph_analyzer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/morph_analyzer.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/settings.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/settings.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/morphology/__pycache__/tokenizers_words.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/tokenizers_words.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/__init__.py: -------------------------------------------------------------------------------- 1 | from sinatools.DataDownload import downloader 2 | import os 3 | from sinatools.ner.helpers import load_object 4 | import pickle 5 | import os 6 | import torch 7 | import pickle 8 | import json 9 | from argparse import Namespace 10 | 11 | tagger = None 12 | tag_vocab = None 13 | train_config = None 14 | 15 | filename = 'Wj27012000.tar' 16 | path =downloader.get_appdatadir() 17 | model_path = os.path.join(path, filename) 18 | 19 | _path = os.path.join(model_path, "tag_vocab.pkl") 20 | 21 | with 
open(_path, "rb") as fh: 22 | tag_vocab = pickle.load(fh) 23 | 24 | train_config = Namespace() 25 | args_path = os.path.join(model_path, "args.json") 26 | 27 | with open(args_path, "r") as fh: 28 | train_config.__dict__ = json.load(fh) 29 | 30 | model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"]) 31 | model = torch.nn.DataParallel(model) 32 | 33 | if torch.cuda.is_available(): 34 | model = model.cuda() 35 | 36 | train_config.trainer_config["kwargs"]["model"] = model 37 | tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"]) 38 | tagger.load(os.path.join(model_path,"checkpoints")) 39 | -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/data.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/data_format.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data_format.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/data_format.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data_format.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/data_format.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data_format.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/datasets.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/datasets.cpython-310.pyc 
-------------------------------------------------------------------------------- /sinatools/ner/__pycache__/datasets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/datasets.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/entity_extractor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/entity_extractor.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/entity_extractor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/entity_extractor.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/entity_extractor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/entity_extractor.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/helpers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/helpers.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/helpers.cpython-311.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/helpers.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/helpers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/helpers.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/metrics.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/metrics.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/metrics.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/metrics.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/metrics.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/metrics.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/relation_extractor.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/relation_extractor.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/transforms.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/transforms.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/__pycache__/transforms.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/transforms.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/data/__init__.py: -------------------------------------------------------------------------------- 1 | from sinatools.ner.data.datasets import NestedTagsDataset -------------------------------------------------------------------------------- /sinatools/ner/data/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/data/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- 
/sinatools/ner/data/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/data/__pycache__/datasets.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/datasets.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/data/__pycache__/datasets.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/datasets.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/data/__pycache__/datasets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/datasets.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/data/__pycache__/transforms.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/transforms.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/data/__pycache__/transforms.cpython-311.pyc: -------------------------------------------------------------------------------- 
from seqeval.metrics import (
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
)
from seqeval.scheme import IOB2
from types import SimpleNamespace
import logging
import re

logger = logging.getLogger(__name__)


def compute_nested_metrics(segments, vocabs):
    """
    Compute metrics for nested NER.

    :param segments: List[List[arabiner.data.dataset.Token]] - list of segments
    :param vocabs: iterable of tag vocabularies, one per entity type
    :return: metrics - SimpleNamespace - F1 micro/macro/weighted, recall, precision, accuracy
    """
    y, y_hat = list(), list()

    # We duplicate the dataset N times, where N is the number of entity types.
    # For each copy, we create y and y_hat.
    # Example: first copy will create pairs of ground truth and predicted labels
    # for entity type GPE, another copy will create pairs for LOC, etc.
    for i, vocab in enumerate(vocabs):
        # Positioned tags only (e.g. "B-GPE", "I-GPE"); "O" and specials have no "-"
        vocab_tags = [tag for tag in vocab.get_itos() if "-" in tag]
        # re.escape guards against regex metacharacters appearing in tag names
        r = re.compile("|".join(map(re.escape, vocab_tags)))

        # Select the gold tag belonging to this entity type, defaulting to "O"
        y += [[(list(filter(r.match, token.gold_tag)) or ["O"])[0] for token in segment] for segment in segments]
        y_hat += [[token.pred_tag[i]["tag"] for token in segment] for segment in segments]

    # Use the module logger; the original called logging.info, which logs to the
    # root logger and bypasses any configuration applied to this module's logger.
    logger.info("\n" + classification_report(y, y_hat, scheme=IOB2, digits=4))

    metrics = {
        "micro_f1": f1_score(y, y_hat, average="micro", scheme=IOB2),
        "macro_f1": f1_score(y, y_hat, average="macro", scheme=IOB2),
        "weights_f1": f1_score(y, y_hat, average="weighted", scheme=IOB2),
        "precision": precision_score(y, y_hat, scheme=IOB2),
        "recall": recall_score(y, y_hat, scheme=IOB2),
        "accuracy": accuracy_score(y, y_hat),
    }

    return SimpleNamespace(**metrics)


def compute_single_label_metrics(segments):
    """
    Compute metrics for flat NER.

    :param segments: List[List[arabiner.data.dataset.Token]] - list of segments
    :return: metrics - SimpleNamespace - F1 micro/macro/weighted, recall, precision, accuracy
    """
    # Flat tagging keeps a single gold/predicted tag per token
    y = [[token.gold_tag[0] for token in segment] for segment in segments]
    y_hat = [[token.pred_tag[0]["tag"] for token in segment] for segment in segments]

    # Module logger instead of the root logger (see compute_nested_metrics)
    logger.info("\n" + classification_report(y, y_hat, scheme=IOB2))

    metrics = {
        "micro_f1": f1_score(y, y_hat, average="micro", scheme=IOB2),
        "macro_f1": f1_score(y, y_hat, average="macro", scheme=IOB2),
        "weights_f1": f1_score(y, y_hat, average="weighted", scheme=IOB2),
        "precision": precision_score(y, y_hat, scheme=IOB2),
        "recall": recall_score(y, y_hat, scheme=IOB2),
        "accuracy": accuracy_score(y, y_hat),
    }

    return SimpleNamespace(**metrics)
import torch
from torch import nn
from transformers import BertModel
import logging

logger = logging.getLogger(__name__)


class BaseModel(nn.Module):
    """Base for SinaTools NER taggers: a pretrained BERT encoder plus dropout."""

    def __init__(self,
                 bert_model="aubmindlab/bert-base-arabertv2",
                 num_labels=2,
                 dropout=0.1,
                 num_types=0):
        """
        :param bert_model: HuggingFace model name/path of the BERT encoder
        :param num_labels: number of output labels (a sequence for nested taggers)
        :param dropout: dropout rate applied to the encoder output
        :param num_types: number of entity types (used by nested taggers)
        """
        super().__init__()

        self.bert_model = bert_model
        self.num_labels = num_labels
        self.num_types = num_types

        # FIX: the original assigned the raw float rate to self.dropout and then
        # immediately overwrote it with the nn.Dropout module below; the dead
        # first assignment is removed (the attribute's final value is unchanged).
        self.bert = BertModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(dropout)


class BertNestedTagger(BaseModel):
    """Nested NER tagger: one linear classifier per tag type on top of BERT."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # num_labels is a sequence here: one class count per tag type
        self.max_num_labels = max(self.num_labels)
        # FIX: nn.ModuleList replaces nn.Sequential — the classifiers are applied
        # independently (never chained), and ModuleList registers submodules under
        # the same "0", "1", ... keys, so checkpoints remain compatible.
        self.classifiers = nn.ModuleList(
            nn.Linear(768, n) for n in self.num_labels
        )

    def forward(self, x):
        y = self.bert(x)
        y = self.dropout(y["last_hidden_state"])
        output = list()

        for classifier in self.classifiers:
            logits = classifier(y)

            # Pad logits to allow Multi-GPU/DataParallel training to work.
            # The padded dimensions are truncated when the loss is computed
            # in the trainer.
            logits = torch.nn.ConstantPad1d((0, self.max_num_labels - logits.shape[-1]), 0)(logits)
            output.append(logits)

        # Return tensor of the shape B x T x L x C
        # B: batch size, T: sequence length,
        # L: number of tag types, C: number of classes per tag type
        return torch.stack(output).permute((1, 2, 0, 3))
import torch.nn as nn
from transformers import BertModel


class BertSeqTagger(nn.Module):
    """Flat sequence tagger: BERT encoder -> dropout -> per-token linear head."""

    def __init__(self, bert_model, num_labels=2, dropout=0.1):
        """
        :param bert_model: HuggingFace model name/path of the BERT encoder
        :param num_labels: number of output tag classes
        :param dropout: dropout rate applied to the encoder output
        """
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_labels)

    def forward(self, x):
        # Encode, regularize the last hidden states, then project to tag logits
        encoded = self.bert(x)
        hidden = self.dropout(encoded["last_hidden_state"])
        return self.linear(hidden)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BaseModel.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/nn/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | from sinatools.ner.trainers.BaseTrainer import BaseTrainer 2 | from sinatools.ner.trainers.BertTrainer import BertTrainer 3 | from sinatools.ner.trainers.BertNestedTrainer import 
BertNestedTrainer -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-311.pyc 
-------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BertTrainer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BertTrainer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/BertTrainer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/ner/trainers/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- 
from sinatools.DataDownload import downloader
import os
from transformers import pipeline

# Models live under the SinaTools application data directory.
path = downloader.get_appdatadir()

# Text-classification pipeline backing relation extraction. All label scores
# are returned (return_all_scores=True) so callers can rank candidate relations;
# inputs are truncated to 128 tokens.
pipe = pipeline("sentiment-analysis", model=os.path.join(path, "relation_model"), return_all_scores=True, max_length=128, truncation=True)
import warnings
warnings.filterwarnings("ignore")
from sinatools.DataDownload import downloader
import os
from transformers import BertTokenizer, BertModel

# Fine-tuned AraBERT checkpoint used to produce sentence embeddings.
model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
# FIX: the app-data directory was resolved twice with identical results;
# a single lookup is kept and reused for both paths.
path = downloader.get_appdatadir()
model_file_path = os.path.join(path, model_file_name)

# The tokenizer comes from the base (not fine-tuned) AraBERT release.
tokenizer_file_name = "bert-base-arabertv02"
tokenizer_file_path = os.path.join(path, tokenizer_file_name)

# FIX: the redundant '{}'.format(...) wrappers around already-str paths
# were dropped; the arguments are unchanged.
model = BertModel.from_pretrained(model_file_path,
                                  output_hidden_states=True,
                                  num_labels=2
                                  )

tokenizer = BertTokenizer.from_pretrained(tokenizer_file_path)
import torch
from . import tokenizer
from . import model

# Cosine similarity over attention-masked average token embeddings.
def get_similarity_score(sentence1, sentence2):
    """
    Computes the degree of association between two sentences across various dimensions: meaning, underlying concepts, domain-specificity, topic overlap, and viewpoint alignment.

    Args:
        sentence1 (:obj:`str`) – The Arabic sentence to find the semantic relatedness between it and the second sentence.
        sentence2 (:obj:`str`) – The Arabic sentence to find the semantic relatedness between it and the first sentence.

    Returns:
        :obj:`float`: A float number that represents the degree of relatedness between the two provided sentences.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score

        sentence1 = "تبلغ سرعة دوران الأرض حول الشمس حوالي 110 كيلومتر في الساعة."
        sentence2 = "تدور الأرض حول محورها بسرعة تصل تقريبا 1670 كيلومتر في الساعة."
        get_similarity_score(sentence1, sentence2)
        Score = 0.90
    """

    # Tokenize and encode sentences
    inputs1 = tokenizer(sentence1, return_tensors="pt")
    inputs2 = tokenizer(sentence2, return_tensors="pt")

    # Extract embeddings (inference only, so no gradient tracking)
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    embeddings1 = outputs1.last_hidden_state
    embeddings2 = outputs2.last_hidden_state

    # Mask padding tokens
    attention_mask1 = inputs1["attention_mask"]
    attention_mask2 = inputs2["attention_mask"]

    # Average pool across tokens, excluding padding
    embeddings1_avg = torch.sum(embeddings1 * attention_mask1.unsqueeze(-1), dim=1) / torch.sum(attention_mask1, dim=1, keepdim=True)
    embeddings2_avg = torch.sum(embeddings2 * attention_mask2.unsqueeze(-1), dim=1) / torch.sum(attention_mask2, dim=1, keepdim=True)

    # Calculate cosine similarity
    similarity = torch.nn.functional.cosine_similarity(embeddings1_avg, embeddings2_avg)

    return similarity.item()
open(file_path, 'rb') as f: 18 | synonyms_level3_dict = pickle.load(f, encoding='utf-8') -------------------------------------------------------------------------------- /sinatools/synonyms/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/synonyms/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/synonyms/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/synonyms/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/synonyms/__pycache__/synonyms_generator.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/synonyms/__pycache__/synonyms_generator.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/synonyms/__pycache__/synonyms_generator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/synonyms/__pycache__/synonyms_generator.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__init__.py -------------------------------------------------------------------------------- 
/sinatools/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/charsets.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/charsets.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/charsets.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/charsets.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/charsets.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/charsets.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/implication.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/implication.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/implication.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/implication.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/jaccard.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/jaccard.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/jaccard.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/jaccard.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/parser.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/parser.cpython-310.pyc 
-------------------------------------------------------------------------------- /sinatools/utils/__pycache__/parser.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/parser.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/parser.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/parser.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/readfile.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/readfile.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/readfile.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/readfile.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/similarity.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/similarity.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/text_dublication_detector.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/text_dublication_detector.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/text_transliteration.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/text_transliteration.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/utils/__pycache__/tokenizers_words.cpython-310.pyc: -------------------------------------------------------------------------------- 
# We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html].

import unicodedata


def _bmp_charsets_by_category():
    """Bucket every BMP code point (U+0000..U+FFFF) by the first letter
    of its Unicode general category ('P', 'S', 'L', 'M', 'N', ...).

    Returns:
        dict mapping a one-letter category prefix to a frozenset of the
        characters in that category group.
    """
    buckets = {}
    for code_point in range(65536):
        ch = chr(code_point)
        buckets.setdefault(unicodedata.category(ch)[0], []).append(ch)
    return {cat: frozenset(chars) for cat, chars in buckets.items()}


# Fix: the original imported ``unichr`` from six (a Python-2 shim for the
# builtin ``chr``; this package targets Python 3 only) and scanned all
# 65536 code points five separate times. One pass builds every group.
_BY_CATEGORY = _bmp_charsets_by_category()

# Punctuation (P) and symbol (S) characters.
UNICODE_PUNCT_CHARSET = _BY_CATEGORY['P']
UNICODE_SYMBOL_CHARSET = _BY_CATEGORY['S']
UNICODE_PUNCT_SYMBOL_CHARSET = UNICODE_PUNCT_CHARSET | UNICODE_SYMBOL_CHARSET

# Letter (L), mark (M) and number (N) characters.
UNICODE_LETTER_CHARSET = _BY_CATEGORY['L']
UNICODE_MARK_CHARSET = _BY_CATEGORY['M']
UNICODE_NUMBER_CHARSET = _BY_CATEGORY['N']
UNICODE_LETTER_MARK_NUMBER_CHARSET = (UNICODE_LETTER_CHARSET |
                                      UNICODE_MARK_CHARSET |
                                      UNICODE_NUMBER_CHARSET)

# Arabic script letters and diacritics (tatweel U+0640 appears in both).
AR_LETTERS_CHARSET = frozenset(u'\u0621\u0622\u0623\u0624\u0625\u0626\u0627'
                               u'\u0628\u0629\u062a\u062b\u062c\u062d\u062e'
                               u'\u062f\u0630\u0631\u0632\u0633\u0634\u0635'
                               u'\u0636\u0637\u0638\u0639\u063a\u0640\u0641'
                               u'\u0642\u0643\u0644\u0645\u0646\u0647\u0648'
                               u'\u0649\u064a\u0671\u067e\u0686\u06a4\u06af')
AR_DIAC_CHARSET = frozenset(u'\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'
                            u'\u0670\u0640')
AR_CHARSET = AR_LETTERS_CHARSET | AR_DIAC_CHARSET

# Buckwalter transliteration alphabet.
BW_LETTERS_CHARSET = frozenset(u'$&\'*<>ADEGHJPSTVYZ_bdfghjklmnpqrstvwxyz{|}')
BW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
BW_CHARSET = BW_LETTERS_CHARSET | BW_DIAC_CHARSET

# "Safe" Buckwalter variant (no characters needing escaping).
SAFEBW_LETTERS_CHARSET = frozenset(u'ABCDEGHIJLMOPQSTVWYZ_bcdefghjklmnpqrstvwx'
                                   u'yz')
SAFEBW_DIAC_CHARSET = frozenset(u'FKNaeiou~_')
SAFEBW_CHARSET = SAFEBW_LETTERS_CHARSET | SAFEBW_DIAC_CHARSET

# XML-friendly Buckwalter variant.
XMLBW_LETTERS_CHARSET = frozenset(u'$\'*ABDEGHIJOPSTWYZ_bdfghjklmnpqrstvwxyz{|'
                                  u'}')
XMLBW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
XMLBW_CHARSET = XMLBW_LETTERS_CHARSET | XMLBW_DIAC_CHARSET

# Habash-Soudi-Buckwalter transliteration alphabet.
HSB_LETTERS_CHARSET = frozenset(u'\'ADHST_bcdfghjklmnpqrstvwxyz'
                                u'\u00c2\u00c4\u00e1\u00f0\u00fd\u0100\u0102'
                                u'\u010e\u0127\u0161\u0175\u0177\u03b3\u03b8'
                                u'\u03c2')
HSB_DIAC_CHARSET = frozenset(u'.aiu~\u00c4\u00e1\u00e3\u0129\u0169_')
HSB_CHARSET = HSB_LETTERS_CHARSET | HSB_DIAC_CHARSET
# Package initializer for sinatools.wsd (word sense disambiguation).
# Importing this package eagerly loads the pickled gloss dictionary from
# the SinaTools application data directory; the download must have been
# performed beforehand or the ``open`` below raises FileNotFoundError.
from sinatools.wsd import settings
import pickle
from sinatools.DataDownload import downloader
import os

# Gloss lookup table loaded from 'one_gram.pickle'.
# NOTE(review): the key/value schema is not visible in this file —
# presumably word -> glosses; confirm against sinatools.wsd consumers.
glosses_dic = {}
filename = 'one_gram.pickle'
path =downloader.get_appdatadir()
file_path = os.path.join(path, filename)
with open(file_path, 'rb') as f:
    glosses_dic = pickle.load(f)
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/wsd/__pycache__/disambiguator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/disambiguator.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/wsd/__pycache__/disambiguator.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/disambiguator.cpython-311.pyc -------------------------------------------------------------------------------- /sinatools/wsd/__pycache__/disambiguator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/disambiguator.cpython-38.pyc -------------------------------------------------------------------------------- /sinatools/wsd/__pycache__/settings.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/settings.cpython-310.pyc -------------------------------------------------------------------------------- /sinatools/wsd/__pycache__/settings.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/settings.cpython-311.pyc 
"""Shared state for the WSD package.

Importing this module loads the fine-tuned BERT sequence classifier and
its tokenizer from the SinaTools application data directory; the model
files must have been downloaded beforehand. Module attributes
(``model``, ``tokenizer``, ``dftrue``, the ``*_file_path`` names) are
read by the rest of the wsd package.
"""
from transformers import BertTokenizer,BertForSequenceClassification
import warnings
warnings.filterwarnings("ignore")  # silence transformers/pandas noise at import
import pandas as pd




from sinatools.DataDownload import downloader
import os


# Directory (under the app data dir) holding the fine-tuned model weights.
model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
path =downloader.get_appdatadir()
model_file_path = os.path.join(path, model_file_name)

# Directory holding the base AraBERT tokenizer files.
tokenizer_file_name = "bert-base-arabertv02"
path =downloader.get_appdatadir()
tokenizer_file_path = os.path.join(path, tokenizer_file_name)

# Empty shared DataFrame; its purpose is not visible in this module —
# presumably populated by wsd callers. TODO(review): confirm.
dftrue = pd.DataFrame()

# Two-label sequence classifier; output_hidden_states=True presumably so
# downstream code can read encoder embeddings — confirm against callers.
model = BertForSequenceClassification.from_pretrained(model_file_path, output_hidden_states=True, num_labels=2)

# Fix: pass the path directly instead of the redundant '{}'.format(...).
tokenizer = BertTokenizer.from_pretrained(tokenizer_file_path)