├── .eggs
    ├── README.txt
    ├── pytest_runner-6.0.1-py3.10.egg
    │   ├── EGG-INFO
    │   │   ├── LICENSE
    │   │   ├── PKG-INFO
    │   │   ├── RECORD
    │   │   ├── WHEEL
    │   │   ├── entry_points.txt
    │   │   ├── requires.txt
    │   │   └── top_level.txt
    │   └── ptr
    │   │   └── __init__.py
    ├── pytest_runner-6.0.1-py3.11.egg
    │   ├── EGG-INFO
    │   │   ├── LICENSE
    │   │   ├── PKG-INFO
    │   │   ├── RECORD
    │   │   ├── WHEEL
    │   │   ├── entry_points.txt
    │   │   ├── requires.txt
    │   │   └── top_level.txt
    │   └── ptr
    │   │   └── __init__.py
    ├── pytest_runner-6.0.1-py3.12.egg
    │   ├── EGG-INFO
    │   │   ├── LICENSE
    │   │   ├── PKG-INFO
    │   │   ├── RECORD
    │   │   ├── WHEEL
    │   │   ├── entry_points.txt
    │   │   ├── requires.txt
    │   │   └── top_level.txt
    │   └── ptr
    │   │   └── __init__.py
    └── pytest_runner-6.0.1-py3.7.egg
    │   ├── EGG-INFO
    │       ├── LICENSE
    │       ├── PKG-INFO
    │       ├── RECORD
    │       ├── WHEEL
    │       ├── entry_points.txt
    │       ├── requires.txt
    │       └── top_level.txt
    │   └── ptr
    │       └── __init__.py
├── AUTHORS.rst
├── CONTRIBUTING.rst
├── LICENSE
├── MANIFEST.in
├── PKG-INFO
├── README.md
├── README.rst
├── SinaTools.egg-info
    ├── PKG-INFO
    ├── SOURCES.txt
    ├── dependency_links.txt
    ├── entry_points.txt
    ├── not-zip-safe
    ├── requires.txt
    └── top_level.txt
├── build
    └── lib
    │   └── sinatools
    │       ├── CLI
    │           ├── DataDownload
    │           │   └── download_files.py
    │           ├── morphology
    │           │   ├── ALMA_multi_word.py
    │           │   └── morph_analyzer.py
    │           ├── ner
    │           │   ├── corpus_entity_extractor.py
    │           │   └── entity_extractor.py
    │           └── utils
    │           │   ├── __init__.py
    │           │   ├── arStrip.py
    │           │   ├── corpus_tokenizer.py
    │           │   ├── implication.py
    │           │   ├── jaccard.py
    │           │   ├── remove_latin.py
    │           │   ├── remove_punctuation.py
    │           │   ├── sentence_tokenizer.py
    │           │   ├── text_dublication_detector.py
    │           │   └── text_transliteration.py
    │       ├── DataDownload
    │           ├── __init__.py
    │           └── downloader.py
    │       ├── VERSION
    │       ├── __init__.py
    │       ├── arabert
    │           ├── __init__.py
    │           ├── arabert
    │           │   ├── __init__.py
    │           │   ├── create_classification_data.py
    │           │   ├── create_pretraining_data.py
    │           │   ├── extract_features.py
    │           │   ├── lamb_optimizer.py
    │           │   ├── modeling.py
    │           │   ├── optimization.py
    │           │   ├── run_classifier.py
    │           │   ├── run_pretraining.py
    │           │   ├── run_squad.py
    │           │   └── tokenization.py
    │           ├── araelectra
    │           │   ├── __init__.py
    │           │   ├── build_openwebtext_pretraining_dataset.py
    │           │   ├── build_pretraining_dataset.py
    │           │   ├── build_pretraining_dataset_single_file.py
    │           │   ├── configure_finetuning.py
    │           │   ├── configure_pretraining.py
    │           │   ├── finetune
    │           │   │   ├── __init__.py
    │           │   │   ├── feature_spec.py
    │           │   │   ├── preprocessing.py
    │           │   │   ├── scorer.py
    │           │   │   ├── task.py
    │           │   │   └── task_builder.py
    │           │   ├── flops_computation.py
    │           │   ├── model
    │           │   │   ├── __init__.py
    │           │   │   ├── modeling.py
    │           │   │   ├── optimization.py
    │           │   │   └── tokenization.py
    │           │   ├── pretrain
    │           │   │   ├── __init__.py
    │           │   │   ├── pretrain_data.py
    │           │   │   └── pretrain_helpers.py
    │           │   ├── run_finetuning.py
    │           │   ├── run_pretraining.py
    │           │   └── util
    │           │   │   ├── __init__.py
    │           │   │   ├── training_utils.py
    │           │   │   └── utils.py
    │           ├── aragpt2
    │           │   ├── __init__.py
    │           │   ├── create_pretraining_data.py
    │           │   ├── gpt2
    │           │   │   ├── __init__.py
    │           │   │   ├── lamb_optimizer.py
    │           │   │   ├── optimization.py
    │           │   │   └── run_pretraining.py
    │           │   ├── grover
    │           │   │   ├── __init__.py
    │           │   │   ├── dataloader.py
    │           │   │   ├── modeling.py
    │           │   │   ├── modeling_gpt2.py
    │           │   │   ├── optimization_adafactor.py
    │           │   │   ├── train_tpu.py
    │           │   │   └── utils.py
    │           │   └── train_bpe_tokenizer.py
    │           └── preprocess.py
    │       ├── environment.yml
    │       ├── install_env.py
    │       ├── morphology
    │           ├── ALMA_multi_word.py
    │           ├── __init__.py
    │           └── morph_analyzer.py
    │       ├── ner
    │           ├── __init__.py
    │           ├── data
    │           │   ├── __init__.py
    │           │   ├── datasets.py
    │           │   └── transforms.py
    │           ├── data_format.py
    │           ├── datasets.py
    │           ├── entity_extractor.py
    │           ├── helpers.py
    │           ├── metrics.py
    │           ├── nn
    │           │   ├── BaseModel.py
    │           │   ├── BertNestedTagger.py
    │           │   ├── BertSeqTagger.py
    │           │   └── __init__.py
    │           ├── trainers
    │           │   ├── BaseTrainer.py
    │           │   ├── BertNestedTrainer.py
    │           │   ├── BertTrainer.py
    │           │   └── __init__.py
    │           └── transforms.py
    │       ├── relations
    │           ├── __init__.py
    │           └── relation_extractor.py
    │       ├── semantic_relatedness
    │           ├── __init__.py
    │           └── compute_relatedness.py
    │       ├── sinatools.py
    │       ├── synonyms
    │           ├── __init__.py
    │           └── synonyms_generator.py
    │       ├── utils
    │           ├── __init__.py
    │           ├── charsets.py
    │           ├── parser.py
    │           ├── readfile.py
    │           ├── similarity.py
    │           ├── text_dublication_detector.py
    │           ├── text_transliteration.py
    │           ├── tokenizer.py
    │           ├── tokenizers_words.py
    │           └── word_compare.py
    │       └── wsd
    │           ├── __init__.py
    │           ├── disambiguator.py
    │           ├── settings.py
    │           └── wsd.py
├── dist
    ├── SinaTools-0.1.41-py2.py3-none-any.whl
    └── sinatools-0.1.41.tar.gz
├── docs
    ├── Makefile
    ├── build
    │   ├── _images
    │   │   └── download.png
    │   ├── _static
    │   │   ├── download.png
    │   │   ├── file.png
    │   │   ├── minus.png
    │   │   └── plus.png
    │   ├── doctrees
    │   │   ├── License.doctree
    │   │   ├── Overview.doctree
    │   │   ├── about.doctree
    │   │   ├── api.doctree
    │   │   └── api
    │   │   │   ├── DataDownload.doctree
    │   │   │   ├── DataDownload
    │   │   │       └── downloader.doctree
    │   │   │   └── arabiner.doctree
    │   └── html
    │   │   ├── _images
    │   │       ├── SinaLogo.jpg
    │   │       └── download.png
    │   │   └── _static
    │   │       ├── SinaLogo.jpg
    │   │       ├── download.png
    │   │       ├── file.png
    │   │       ├── minus.png
    │   │       └── plus.png
    ├── make.bat
    └── source
    │   ├── License.rst
    │   ├── Overview.rst
    │   ├── _static
    │       ├── SinaLogo.jpg
    │       └── download.png
    │   ├── about.rst
    │   ├── api.rst
    │   ├── api
    │       ├── DataDownload.rst
    │       ├── DataDownload
    │       │   └── downloader.rst
    │       ├── arabiner.rst
    │       ├── arabiner
    │       │   └── bin
    │       │   │   └── infer.rst
    │       ├── morphology.rst
    │       ├── morphology
    │       │   └── morph_analyzer.rst
    │       ├── salma.rst
    │       ├── salma
    │       │   └── views.rst
    │       ├── utils.rst
    │       └── utils
    │       │   ├── corpus_tokenizer.rst
    │       │   ├── implication.rst
    │       │   ├── jaccard.rst
    │       │   ├── parser.rst
    │       │   ├── sentence_tokenizer.rst
    │       │   └── text_transliteration.rst
    │   ├── authors.rst
    │   ├── cli_tools.rst
    │   ├── cli_tools
    │       ├── DataDownload.rst
    │       ├── DataDownload
    │       │   ├── download_files.rst
    │       │   └── get_appdatadir.rst
    │       ├── arabiner.rst
    │       ├── arabiner
    │       │   └── infer.rst
    │       ├── morphology.rst
    │       ├── morphology
    │       │   ├── ALMA_multi_word.rst
    │       │   └── morph_analyzer.rst
    │       ├── salma.rst
    │       ├── salma
    │       │   └── salma_tools.rst
    │       ├── utils.rst
    │       └── utils
    │       │   ├── arStrip.rst
    │       │   ├── corpus_tokenizer.rst
    │       │   ├── implication.rst
    │       │   ├── jaccard.rst
    │       │   ├── latin_remove.rst
    │       │   ├── remove_punc.rst
    │       │   ├── sentence_tokenizer.rst
    │       │   └── text_transliteration.rst
    │   ├── conf.py
    │   ├── index.rst
    │   ├── installation.rst
    │   └── readme.rst
├── setup.cfg
├── setup.py
└── sinatools
    ├── CLI
        ├── DataDownload
        │   ├── __pycache__
        │   │   ├── download_files.cpython-310.pyc
        │   │   └── download_files.cpython-38.pyc
        │   └── download_files.py
        ├── morphology
        │   ├── ALMA_multi_word.py
        │   ├── __pycache__
        │   │   ├── morph_analyzer.cpython-310.pyc
        │   │   └── morph_analyzer.cpython-38.pyc
        │   └── morph_analyzer.py
        ├── ner
        │   ├── __pycache__
        │   │   ├── corpus_entity_extractor.cpython-38.pyc
        │   │   ├── entity_extractor.cpython-310.pyc
        │   │   ├── entity_extractor.cpython-311.pyc
        │   │   └── entity_extractor.cpython-38.pyc
        │   ├── corpus_entity_extractor.py
        │   └── entity_extractor.py
        ├── relations
        │   └── relation_extractor.py
        ├── semantic_relatedness
        │   ├── __pycache__
        │   │   ├── __init__.cpython-38.pyc
        │   │   ├── compute_relatedness.cpython-38.pyc
        │   │   └── settings.cpython-38.pyc
        │   └── compute_relatedness.py
        ├── synonyms
        │   ├── __pycache__
        │   │   ├── __init__.cpython-38.pyc
        │   │   └── synonyms_generator.cpython-38.pyc
        │   ├── evaluate_synonyms.py
        │   └── extend_synonyms.py
        ├── utils
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── __init__.cpython-38.pyc
        │   │   ├── arStrip.cpython-38.pyc
        │   │   ├── corpus_tokenizer.cpython-38.pyc
        │   │   ├── implication.cpython-38.pyc
        │   │   ├── jaccard.cpython-38.pyc
        │   │   ├── remove_latin.cpython-38.pyc
        │   │   ├── remove_punctuation.cpython-38.pyc
        │   │   ├── sentence_tokenizer.cpython-38.pyc
        │   │   └── text_transliteration.cpython-38.pyc
        │   ├── arStrip.py
        │   ├── corpus_tokenizer.py
        │   ├── implication.py
        │   ├── jaccard.py
        │   ├── remove_latin.py
        │   ├── remove_punctuation.py
        │   ├── sentence_tokenizer.py
        │   ├── text_dublication_detector.py
        │   └── text_transliteration.py
        └── wsd
        │   ├── __pycache__
        │       └── disambiguator.cpython-38.pyc
        │   └── disambiguator.py
    ├── DataDownload
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-310.pyc
        │   ├── __init__.cpython-311.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── downloader.cpython-310.pyc
        │   ├── downloader.cpython-311.pyc
        │   └── downloader.cpython-38.pyc
        └── downloader.py
    ├── VERSION
    ├── __init__.py
    ├── __pycache__
        ├── __init__.cpython-310.pyc
        ├── __init__.cpython-311.pyc
        ├── __init__.cpython-38.pyc
        └── sinatools.cpython-38.pyc
    ├── arabert
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-310.pyc
        │   ├── __init__.cpython-311.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── preprocess.cpython-310.pyc
        │   ├── preprocess.cpython-311.pyc
        │   └── preprocess.cpython-38.pyc
        └── preprocess.py
    ├── environment.yml
    ├── install_env.py
    ├── morphology
        ├── ALMA_multi_word.py
        ├── __init__.py
        ├── __pycache__
        │   ├── ALMA_multi_word.cpython-310.pyc
        │   ├── ALMA_multi_word.cpython-311.pyc
        │   ├── ALMA_multi_word.cpython-38.pyc
        │   ├── __init__.cpython-310.pyc
        │   ├── __init__.cpython-311.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── charsets.cpython-310.pyc
        │   ├── morph_analyzer.cpython-310.pyc
        │   ├── morph_analyzer.cpython-311.pyc
        │   ├── morph_analyzer.cpython-38.pyc
        │   ├── settings.cpython-310.pyc
        │   └── tokenizers_words.cpython-310.pyc
        └── morph_analyzer.py
    ├── ner
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-310.pyc
        │   ├── __init__.cpython-311.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── data.cpython-310.pyc
        │   ├── data.cpython-38.pyc
        │   ├── data_format.cpython-310.pyc
        │   ├── data_format.cpython-311.pyc
        │   ├── data_format.cpython-38.pyc
        │   ├── datasets.cpython-310.pyc
        │   ├── datasets.cpython-38.pyc
        │   ├── entity_extractor.cpython-310.pyc
        │   ├── entity_extractor.cpython-311.pyc
        │   ├── entity_extractor.cpython-38.pyc
        │   ├── helpers.cpython-310.pyc
        │   ├── helpers.cpython-311.pyc
        │   ├── helpers.cpython-38.pyc
        │   ├── metrics.cpython-310.pyc
        │   ├── metrics.cpython-311.pyc
        │   ├── metrics.cpython-38.pyc
        │   ├── relation_extractor.cpython-38.pyc
        │   ├── transforms.cpython-310.pyc
        │   └── transforms.cpython-38.pyc
        ├── data
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── __init__.cpython-310.pyc
        │   │   ├── __init__.cpython-311.pyc
        │   │   ├── __init__.cpython-38.pyc
        │   │   ├── datasets.cpython-310.pyc
        │   │   ├── datasets.cpython-311.pyc
        │   │   ├── datasets.cpython-38.pyc
        │   │   ├── transforms.cpython-310.pyc
        │   │   ├── transforms.cpython-311.pyc
        │   │   └── transforms.cpython-38.pyc
        │   ├── datasets.py
        │   └── transforms.py
        ├── data_format.py
        ├── datasets.py
        ├── entity_extractor.py
        ├── helpers.py
        ├── metrics.py
        ├── nn
        │   ├── BaseModel.py
        │   ├── BertNestedTagger.py
        │   ├── BertSeqTagger.py
        │   ├── __init__.py
        │   └── __pycache__
        │   │   ├── BaseModel.cpython-310.pyc
        │   │   ├── BaseModel.cpython-311.pyc
        │   │   ├── BaseModel.cpython-38.pyc
        │   │   ├── BertNestedTagger.cpython-310.pyc
        │   │   ├── BertNestedTagger.cpython-311.pyc
        │   │   ├── BertNestedTagger.cpython-38.pyc
        │   │   ├── BertSeqTagger.cpython-310.pyc
        │   │   ├── BertSeqTagger.cpython-311.pyc
        │   │   ├── BertSeqTagger.cpython-38.pyc
        │   │   ├── __init__.cpython-310.pyc
        │   │   ├── __init__.cpython-311.pyc
        │   │   └── __init__.cpython-38.pyc
        ├── trainers
        │   ├── BaseTrainer.py
        │   ├── BertNestedTrainer.py
        │   ├── BertTrainer.py
        │   ├── __init__.py
        │   └── __pycache__
        │   │   ├── BaseTrainer.cpython-310.pyc
        │   │   ├── BaseTrainer.cpython-311.pyc
        │   │   ├── BaseTrainer.cpython-38.pyc
        │   │   ├── BertNestedTrainer.cpython-310.pyc
        │   │   ├── BertNestedTrainer.cpython-311.pyc
        │   │   ├── BertNestedTrainer.cpython-38.pyc
        │   │   ├── BertTrainer.cpython-310.pyc
        │   │   ├── BertTrainer.cpython-311.pyc
        │   │   ├── BertTrainer.cpython-38.pyc
        │   │   ├── __init__.cpython-310.pyc
        │   │   ├── __init__.cpython-311.pyc
        │   │   └── __init__.cpython-38.pyc
        └── transforms.py
    ├── relations
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-311.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── relation_extractor.cpython-311.pyc
        │   └── relation_extractor.cpython-38.pyc
        └── relation_extractor.py
    ├── semantic_relatedness
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-311.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── compute_relatedness.cpython-311.pyc
        │   ├── compute_relatedness.cpython-38.pyc
        │   └── settings.cpython-38.pyc
        └── compute_relatedness.py
    ├── sinatools.py
    ├── synonyms
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-311.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── synonyms_generator.cpython-311.pyc
        │   └── synonyms_generator.cpython-38.pyc
        └── synonyms_generator.py
    ├── utils
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-310.pyc
        │   ├── __init__.cpython-311.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── charsets.cpython-310.pyc
        │   ├── charsets.cpython-311.pyc
        │   ├── charsets.cpython-38.pyc
        │   ├── implication.cpython-310.pyc
        │   ├── implication.cpython-38.pyc
        │   ├── jaccard.cpython-310.pyc
        │   ├── jaccard.cpython-38.pyc
        │   ├── parser.cpython-310.pyc
        │   ├── parser.cpython-311.pyc
        │   ├── parser.cpython-38.pyc
        │   ├── readfile.cpython-310.pyc
        │   ├── readfile.cpython-38.pyc
        │   ├── similarity.cpython-311.pyc
        │   ├── text_dublication_detector.cpython-38.pyc
        │   ├── text_transliteration.cpython-38.pyc
        │   ├── tokenizer.cpython-310.pyc
        │   ├── tokenizer.cpython-311.pyc
        │   ├── tokenizer.cpython-38.pyc
        │   ├── tokenizers_words.cpython-310.pyc
        │   ├── tokenizers_words.cpython-311.pyc
        │   ├── tokenizers_words.cpython-38.pyc
        │   ├── word_compare.cpython-311.pyc
        │   └── word_compare.cpython-38.pyc
        ├── charsets.py
        ├── parser.py
        ├── readfile.py
        ├── similarity.py
        ├── text_dublication_detector.py
        ├── text_transliteration.py
        ├── tokenizer.py
        ├── tokenizers_words.py
        └── word_compare.py
    └── wsd
        ├── __init__.py
        ├── __pycache__
            ├── __init__.cpython-310.pyc
            ├── __init__.cpython-311.pyc
            ├── __init__.cpython-38.pyc
            ├── disambiguator.cpython-310.pyc
            ├── disambiguator.cpython-311.pyc
            ├── disambiguator.cpython-38.pyc
            ├── settings.cpython-310.pyc
            ├── settings.cpython-311.pyc
            ├── settings.cpython-38.pyc
            ├── views.cpython-38.pyc
            ├── wsd.cpython-310.pyc
            ├── wsd.cpython-311.pyc
            └── wsd.cpython-38.pyc
        ├── disambiguator.py
        ├── settings.py
        └── wsd.py


/.eggs/README.txt:
--------------------------------------------------------------------------------
1 | This directory contains eggs that were downloaded by setuptools to build, test, and run plug-ins.
2 | 
3 | This directory caches those eggs to prevent repeated downloads.
4 | 
5 | However, it is safe to delete this directory.
6 | 
7 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Jason R. Coombs
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to
 5 | deal in the Software without restriction, including without limitation the
 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 7 | sell copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | IN THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/RECORD:
--------------------------------------------------------------------------------
1 | ptr/__init__.py,sha256=0UfzhCooVgCNTBwVEOPOVGEPck4pnl_6PTfsC-QzNGM,6730
2 | pytest_runner-6.0.1.dist-info/LICENSE,sha256=2z8CRrH5J48VhFuZ_sR4uLUG63ZIeZNyL4xuJUKF-vg,1050
3 | pytest_runner-6.0.1.dist-info/METADATA,sha256=Ho3FvAFjFHeY5OQ64WFzkLigFaIpuNr4G3uSmOk3nho,7319
4 | pytest_runner-6.0.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
5 | pytest_runner-6.0.1.dist-info/entry_points.txt,sha256=BqezBqeO63XyzSYmHYE58gKEFIjJUd-XdsRQkXHy2ig,58
6 | pytest_runner-6.0.1.dist-info/top_level.txt,sha256=DPzHbWlKG8yq8EOD5UgEvVNDWeJRPyimrwfShwV6Iuw,4
7 | pytest_runner-6.0.1.dist-info/RECORD,,
8 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/WHEEL:
--------------------------------------------------------------------------------
1 | Wheel-Version: 1.0
2 | Generator: bdist_wheel (0.42.0)
3 | Root-Is-Purelib: true
4 | Tag: py3-none-any
5 | 
6 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/entry_points.txt:
--------------------------------------------------------------------------------
1 | [distutils.commands]
2 | ptr = ptr:PyTest
3 | pytest = ptr:PyTest
4 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/requires.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | [docs]
 3 | sphinx
 4 | jaraco.packaging>=9
 5 | rst.linker>=1.9
 6 | jaraco.tidelift>=1.4
 7 | 
 8 | [testing]
 9 | pytest>=6
10 | pytest-checkdocs>=2.4
11 | pytest-flake8
12 | pytest-cov
13 | pytest-enabler>=1.0.1
14 | pytest-virtualenv
15 | types-setuptools
16 | pytest-black>=0.3.7
17 | pytest-mypy>=0.9.1
18 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.10.egg/EGG-INFO/top_level.txt:
--------------------------------------------------------------------------------
1 | ptr
2 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Jason R. Coombs
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to
 5 | deal in the Software without restriction, including without limitation the
 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 7 | sell copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | IN THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/RECORD:
--------------------------------------------------------------------------------
1 | ptr/__init__.py,sha256=0UfzhCooVgCNTBwVEOPOVGEPck4pnl_6PTfsC-QzNGM,6730
2 | pytest_runner-6.0.1.dist-info/LICENSE,sha256=2z8CRrH5J48VhFuZ_sR4uLUG63ZIeZNyL4xuJUKF-vg,1050
3 | pytest_runner-6.0.1.dist-info/METADATA,sha256=Ho3FvAFjFHeY5OQ64WFzkLigFaIpuNr4G3uSmOk3nho,7319
4 | pytest_runner-6.0.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
5 | pytest_runner-6.0.1.dist-info/entry_points.txt,sha256=BqezBqeO63XyzSYmHYE58gKEFIjJUd-XdsRQkXHy2ig,58
6 | pytest_runner-6.0.1.dist-info/top_level.txt,sha256=DPzHbWlKG8yq8EOD5UgEvVNDWeJRPyimrwfShwV6Iuw,4
7 | pytest_runner-6.0.1.dist-info/RECORD,,
8 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/WHEEL:
--------------------------------------------------------------------------------
1 | Wheel-Version: 1.0
2 | Generator: bdist_wheel (0.42.0)
3 | Root-Is-Purelib: true
4 | Tag: py3-none-any
5 | 
6 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/entry_points.txt:
--------------------------------------------------------------------------------
1 | [distutils.commands]
2 | ptr = ptr:PyTest
3 | pytest = ptr:PyTest
4 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/requires.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | [docs]
 3 | sphinx
 4 | jaraco.packaging>=9
 5 | rst.linker>=1.9
 6 | jaraco.tidelift>=1.4
 7 | 
 8 | [testing]
 9 | pytest>=6
10 | pytest-checkdocs>=2.4
11 | pytest-flake8
12 | pytest-cov
13 | pytest-enabler>=1.0.1
14 | pytest-virtualenv
15 | types-setuptools
16 | pytest-black>=0.3.7
17 | pytest-mypy>=0.9.1
18 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.11.egg/EGG-INFO/top_level.txt:
--------------------------------------------------------------------------------
1 | ptr
2 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Jason R. Coombs
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to
 5 | deal in the Software without restriction, including without limitation the
 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 7 | sell copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | IN THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/RECORD:
--------------------------------------------------------------------------------
1 | ptr/__init__.py,sha256=0UfzhCooVgCNTBwVEOPOVGEPck4pnl_6PTfsC-QzNGM,6730
2 | pytest_runner-6.0.1.dist-info/LICENSE,sha256=2z8CRrH5J48VhFuZ_sR4uLUG63ZIeZNyL4xuJUKF-vg,1050
3 | pytest_runner-6.0.1.dist-info/METADATA,sha256=Ho3FvAFjFHeY5OQ64WFzkLigFaIpuNr4G3uSmOk3nho,7319
4 | pytest_runner-6.0.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
5 | pytest_runner-6.0.1.dist-info/entry_points.txt,sha256=BqezBqeO63XyzSYmHYE58gKEFIjJUd-XdsRQkXHy2ig,58
6 | pytest_runner-6.0.1.dist-info/top_level.txt,sha256=DPzHbWlKG8yq8EOD5UgEvVNDWeJRPyimrwfShwV6Iuw,4
7 | pytest_runner-6.0.1.dist-info/RECORD,,
8 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/WHEEL:
--------------------------------------------------------------------------------
1 | Wheel-Version: 1.0
2 | Generator: bdist_wheel (0.42.0)
3 | Root-Is-Purelib: true
4 | Tag: py3-none-any
5 | 
6 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/entry_points.txt:
--------------------------------------------------------------------------------
1 | [distutils.commands]
2 | ptr = ptr:PyTest
3 | pytest = ptr:PyTest
4 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/requires.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | [docs]
 3 | sphinx
 4 | jaraco.packaging>=9
 5 | rst.linker>=1.9
 6 | jaraco.tidelift>=1.4
 7 | 
 8 | [testing]
 9 | pytest>=6
10 | pytest-checkdocs>=2.4
11 | pytest-flake8
12 | pytest-cov
13 | pytest-enabler>=1.0.1
14 | pytest-virtualenv
15 | types-setuptools
16 | pytest-black>=0.3.7
17 | pytest-mypy>=0.9.1
18 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.12.egg/EGG-INFO/top_level.txt:
--------------------------------------------------------------------------------
1 | ptr
2 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Jason R. Coombs
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to
 5 | deal in the Software without restriction, including without limitation the
 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 7 | sell copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | IN THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/RECORD:
--------------------------------------------------------------------------------
1 | ptr/__init__.py,sha256=0UfzhCooVgCNTBwVEOPOVGEPck4pnl_6PTfsC-QzNGM,6730
2 | pytest_runner-6.0.1.dist-info/LICENSE,sha256=2z8CRrH5J48VhFuZ_sR4uLUG63ZIeZNyL4xuJUKF-vg,1050
3 | pytest_runner-6.0.1.dist-info/METADATA,sha256=Ho3FvAFjFHeY5OQ64WFzkLigFaIpuNr4G3uSmOk3nho,7319
4 | pytest_runner-6.0.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
5 | pytest_runner-6.0.1.dist-info/entry_points.txt,sha256=BqezBqeO63XyzSYmHYE58gKEFIjJUd-XdsRQkXHy2ig,58
6 | pytest_runner-6.0.1.dist-info/top_level.txt,sha256=DPzHbWlKG8yq8EOD5UgEvVNDWeJRPyimrwfShwV6Iuw,4
7 | pytest_runner-6.0.1.dist-info/RECORD,,
8 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/WHEEL:
--------------------------------------------------------------------------------
1 | Wheel-Version: 1.0
2 | Generator: bdist_wheel (0.42.0)
3 | Root-Is-Purelib: true
4 | Tag: py3-none-any
5 | 
6 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/entry_points.txt:
--------------------------------------------------------------------------------
1 | [distutils.commands]
2 | ptr = ptr:PyTest
3 | pytest = ptr:PyTest
4 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/requires.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | [docs]
 3 | jaraco.packaging>=9
 4 | jaraco.tidelift>=1.4
 5 | rst.linker>=1.9
 6 | sphinx
 7 | 
 8 | [testing]
 9 | pytest-black>=0.3.7
10 | pytest-checkdocs>=2.4
11 | pytest-cov
12 | pytest-enabler>=1.0.1
13 | pytest-flake8
14 | pytest-mypy>=0.9.1
15 | pytest-virtualenv
16 | pytest>=6
17 | types-setuptools
18 | 


--------------------------------------------------------------------------------
/.eggs/pytest_runner-6.0.1-py3.7.egg/EGG-INFO/top_level.txt:
--------------------------------------------------------------------------------
1 | ptr
2 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | Credits
 3 | =======
 4 | 
 5 | Development Lead
 6 | ----------------
 7 | 
 8 | * SinaLab <sina.institute.bzu@gmail.com>
 9 | 
10 | Contributors
11 | ------------
12 | 
13 | None yet. Why not be the first?
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023, SinaLab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include AUTHORS.rst
 2 | include CONTRIBUTING.rst
 3 | include HISTORY.rst
 4 | include LICENSE
 5 | include README.rst
 6 | 
 7 | recursive-include tests *
 8 | recursive-exclude * __pycache__
 9 | recursive-exclude * *.py[co]
10 | 
11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
12 | 
13 | 
14 | include setup.py
15 | include sinatools/VERSION
16 | include sinatools/utils/*.py
17 | include sinatools/ner/*.py
18 | include sinatools/arabert/*.py
19 | include sinatools/DataDownload/*.py
20 | include sinatools/morphology/*.py
21 | include sinatools/salma/*.py
22 | include sinatools/CLI/ner/*.py
23 | include sinatools/CLI/morphology/*.py
24 | include sinatools/CLI/salma/*.py
25 | include sinatools/CLI/utils/*.py
26 | include sinatools/CLI/DataDownload/*.py
27 | include tests/*.py
28 | global-exclude *~


--------------------------------------------------------------------------------
/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.0
 2 | Name: SinaTools
 3 | Version: 0.1.1
 4 | Summary: UNKNOWN
 5 | Home-page: https://github.com/SinaLab/sinatools
 6 | Author: UNKNOWN
 7 | Author-email: UNKNOWN
 8 | License: MIT license
 9 | Description: ========
10 |         sinatools
11 |         ========
12 |         
13 |         
14 |         .. image:: https://img.shields.io/pypi/v/sinatools.svg
15 |                 :target: https://pypi.python.org/pypi/SinaTools
16 |         
17 |         .. image:: https://img.shields.io/travis/sina_institute/sinatools.svg
18 |                 :target: https://travis-ci.com/sina_institute/SinaTools
19 |         
20 |         .. image:: https://readthedocs.org/projects/sinatools/badge/?version=latest
21 |                 :target: https://SinaTools.readthedocs.io/en/latest/?version=latest
22 |                 :alt: Documentation Status
23 |         
24 |         
25 |         
26 |         
27 |         Python Boilerplate contains all the boilerplate you need to create a Python package.
28 |         
29 |         
30 |         * Free software: MIT license
31 |         * Documentation: https://sina.birzeit.edu/sinatools/
32 |         
33 |         
34 |         Credits
35 |         -------
36 |         
37 |         This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template.
38 |         
39 |         .. _Cookiecutter: https://github.com/audreyr/cookiecutter
40 |         .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage
41 |         
42 |         
43 |         =======
44 |         History
45 |         =======
46 |         
47 |         0.1.2 (2024-06-04)
48 |         ------------------
49 |         
50 |         
51 | Keywords: sinatools
52 | Platform: UNKNOWN
53 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | SinaTools
 2 | ======================
 3 | Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos.
 4 | 
 5 | See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarly, parser, tokenizers, corpora processing, transliteration, etc).
 6 | 
 7 | See [Demo Pages](https://sina.birzeit.edu/sinatools/).
 8 | 
 9 | See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits. 
10 | 
11 | Installation 
12 | --------
13 | To install SinaTools, ensure you are using Python version 3.11.11, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository.
14 | 
15 | Alternatively, you can execute the following command:
16 | 
17 | ```bash
18 | pip install sinatools
19 | ```
20 | 
21 | Installing Models and Data Files
22 | --------
23 | Some modules in SinaTools require some data files and fine-tuned models to be downloaded. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html).
24 | 
25 | Documentation
26 | --------
27 | For information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online domuementation](https://sina.birzeit.edu/sinatools/documentation). 
28 | 
29 | Citation
30 | -------
31 | Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. ELSEVIER.
32 | 
33 | License
34 | --------
35 | SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information.
36 | 
37 | Reporting Issues
38 | --------
39 | To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues).
40 | 
41 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | SinaTools
 2 | ======================
 3 | Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos.
 4 | 
 5 | See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarly, parser, tokenizers, corpora processing, transliteration, etc).
 6 | 
 7 | See [Demo Pages](https://sina.birzeit.edu/sinatools/).
 8 | 
 9 | See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits. 
10 | 
11 | Installation 
12 | --------
13 | To install SinaTools, ensure you are using Python version 3.10.8, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository.
14 | 
15 | Alternatively, you can execute the following command:
16 | 
17 | ```bash
18 | pip install sinatools
19 | ```
20 | 
21 | Installing Models and Data Files
22 | --------
23 | Some modules in SinaTools require some data files and fine-tuned models to be downloaded. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html).
24 | 
25 | Documentation
26 | --------
27 | For information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online domuementation](https://sina.birzeit.edu/sinatools/documentation). 
28 | 
29 | Citation
30 | -------
31 | Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. ELSEVIER.
32 | 
33 | License
34 | --------
35 | SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information.
36 | 
37 | Reporting Issues
38 | --------
39 | To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues).
40 | 


--------------------------------------------------------------------------------
/SinaTools.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/SinaTools.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
 1 | [console_scripts]
 2 | alma_multi_word = sinatools.CLI.morphology.ALMA_multi_word:main
 3 | appdatadir = sinatools.CLI.DataDownload.get_appdatadir:main
 4 | arStrip = sinatools.CLI.utils.arStrip:main
 5 | corpus_entity_extractor = sinatools.CLI.ner.corpus_entity_extractor:main
 6 | corpus_tokenizer = sinatools.CLI.utils.corpus_tokenizer:main
 7 | download_files = sinatools.CLI.DataDownload.download_files:main
 8 | entity_extractor = sinatools.CLI.ner.entity_extractor:main
 9 | evaluate_synonyms = sinatools.CLI.synonyms.evaluate_synonyms:main
10 | extend_synonyms = sinatools.CLI.synonyms.extend_synonyms:main
11 | implication = sinatools.CLI.utils.implication:main
12 | install_env = sinatools.install_env:main
13 | jaccard_similarity = sinatools.CLI.utils.jaccard:main
14 | morphology_analyzer = sinatools.CLI.morphology.morph_analyzer:main
15 | relation_extractor = sinatools.CLI.relations.relation_extractor:main
16 | remove_latin = sinatools.CLI.utils.remove_latin:main
17 | remove_punctuation = sinatools.CLI.utils.remove_punctuation:main
18 | semantic_relatedness = sinatools.CLI.semantic_relatedness.compute_relatedness:main
19 | sentence_tokenizer = sinatools.CLI.utils.sentence_tokenizer:main
20 | text_dublication_detector = sinatools.CLI.utils.text_dublication_detector:main
21 | transliterate = sinatools.CLI.utils.text_transliteration:main
22 | wsd = sinatools.CLI.wsd.disambiguator:main
23 | 


--------------------------------------------------------------------------------
/SinaTools.egg-info/not-zip-safe:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/SinaTools.egg-info/requires.txt:
--------------------------------------------------------------------------------
 1 | six
 2 | farasapy
 3 | tqdm
 4 | requests
 5 | pathlib
 6 | transformers==4.47.1
 7 | torchvision==0.20.1
 8 | seqeval==1.2.2
 9 | natsort==7.1.1
10 | 


--------------------------------------------------------------------------------
/SinaTools.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | sinatools
2 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/DataDownload/download_files.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | 
 5 | The download_files command, allows users to select specific files and models to download and use it within SinaTools modules. Additionally, it automatically manages the extraction of compressed files, including zip and tar.gz formats.
 6 | 
 7 | Usage:
 8 | ------
 9 | 
10 | Below is the usage information that can be generated by running download_files --help.
11 | 
12 | .. code-block:: none
13 | 
14 |     Usage:
15 |      download_files [OPTIONS]
16 | 
17 | .. code-block:: none
18 | 
19 |         Options:
20 |         -f, --files FILES
21 |             Names of the files to download. Available files are: ner, morph, wsd, synonyms. 
22 |             If no file is specified, all files will be downloaded.
23 | 
24 | Examples:
25 | ---------
26 | 
27 | .. code-block:: none
28 | 
29 |     download_files -f morph ner 
30 |     This command will download only the `morph` and `ner` files to the default directory.
31 | """
32 | 
33 | import argparse
34 | from sinatools.DataDownload.downloader import download_file
35 | from sinatools.DataDownload.downloader import download_files
36 | from sinatools.DataDownload.downloader import get_appdatadir
37 | from sinatools.DataDownload.downloader import download_folder_from_hf
38 | from sinatools.DataDownload.downloader import urls
39 | 
40 | 
41 | def main():
42 |     parser = argparse.ArgumentParser(description="Download files from specified URLs.")
43 |     parser.add_argument('-f', '--files', nargs="*",
44 |                         help="Names of the files to download. Available files are: "
45 |                              f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")
46 |     
47 |     get_appdatadir()
48 | 
49 |     args = parser.parse_args()
50 | 
51 |     if args.files:
52 |         for file in args.files:
53 |             print("file: ", file)
54 |             if file == "wsd":
55 |                 download_file(urls["morph"])
56 |                 download_file(urls["ner"])
57 |                 #download_file(urls["wsd_model"])
58 |                 #download_file(urls["wsd_tokenizer"])
59 |                 download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
60 |                 download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02")
61 |                 download_file(urls["one_gram"])
62 |                 download_file(urls["five_grams"])
63 |                 download_file(urls["four_grams"])
64 |                 download_file(urls["three_grams"])
65 |                 download_file(urls["two_grams"])
66 |             elif file == "synonyms":
67 |                 download_file(urls["graph_l2"])
68 |                 download_file(urls["graph_l3"])
69 |             else:
70 |                url = urls[file]
71 |                download_file(url)
72 |     else:
73 |         download_files()
74 | 
75 | if __name__ == '__main__':
76 |     main()
77 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/morphology/ALMA_multi_word.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 3 | import json
 4 | from sinatools.utils.readfile import read_file
 5 | 
 6 | def main():
 7 |     parser = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools')
 8 |     
 9 |     # Adding arguments for the multi-word input or file containing the multi-word input
10 |     parser.add_argument('--multi_word', type=str, help='Multi-word text to be analyzed')
11 |     parser.add_argument('--file', type=str, help='File containing the multi-word text to be analyzed')
12 | 
13 |     args = parser.parse_args()
14 | 
15 |     if args.multi_word is None and args.file is None:
16 |         print("Error: Either --multi_word or --file argument must be provided.")
17 |         return
18 | 
19 |     # Get the input either from the --multi_word argument or from the file specified in the --file argument
20 |     multi_word_text = args.multi_word if args.multi_word else " ".join(read_file(args.file))
21 | 
22 |     # Perform multi-word analysis
23 |     results = ALMA_multi_word(multi_word_text)
24 |     
25 |     # Print the results in JSON format
26 |     print(json.dumps(results, ensure_ascii=False, indent=4))
27 | 
28 | if __name__ == '__main__':
29 |     main()
30 | #alma_multi_word --multi_word "Your multi-word text here"
31 | #alma_multi_word --file "path/to/your/file.txt"
32 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/build/lib/sinatools/CLI/utils/__init__.py


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/utils/corpus_tokenizer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | About:
 4 | ------
 5 | The corpus_tokenizer command offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file.
 6 | 
 7 | Usage:
 8 | -------
 9 | Below is the usage information that can be generated by running corpus_tokenizer --help.
10 | 
11 | .. code-block:: none
12 | 
13 |     Usage:
14 |         corpus_tokenizer dir_path output_csv
15 | 
16 | .. code-block:: none
17 |     dir_path
18 |         The path to the directory containing the text files.
19 | 
20 |     output_csv
21 |         The path to the output CSV file.
22 | 
23 | Examples:
24 | ---------
25 | .. code-block:: none
26 |     corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv  "outputFile.csv"
27 | """
28 | 
29 | import argparse
30 | from sinatools.utils.tokenizer import corpus_tokenizer
31 | 
32 | # Define the main function that will parse the arguments
33 | def main():
34 |     # Create an ArgumentParser object
35 |     parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
36 |     
37 |     # Add arguments to the parser
38 |     parser.add_argument('--dir_path', type=str, help='The path to the directory containing the text files.')
39 |     parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.')
40 |     
41 |     # Parse the command-line arguments
42 |     args = parser.parse_args()
43 |     
44 |     # Call the corpus_tokenizer function with the parsed arguments
45 |     corpus_tokenizer(args.dir_path, args.output_csv)
46 | 
47 | # Call the main function when the script is executed
48 | if __name__ == '__main__':
49 |     main()
50 | 
51 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/utils/implication.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The implication tool evaluates the implication between two words using the functionalities provided by the `Implication` class of SinaTools. This tool can be utilized to determine the relationship between two words and understand if one implies the other.
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running implication --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     Usage:
13 |         implication --inputWord1=WORD1 --inputWord2=WORD2
14 |         
15 |         implication --inputFile1=File1 --inputFile2=File2  
16 | 
17 | .. code-block:: none
18 | 
19 |     Options:
20 |       --inputWord1 WORD1
21 |             First input word.
22 | 
23 |       --inputWord2 WORD2
24 |             Second input word.
25 | 
26 |       --file1 FILE1
27 |             File containing the words to evaluate the implication.
28 | 
29 |       --file2 FILE2
30 |             File containing the words to evaluate the implication.
31 | Examples:
32 | ---------
33 | 
34 | .. code-block:: none
35 | 
36 |       implication --inputWord1 "word1" --inputWord2 "word2"
37 |       
38 |       implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
39 | 
40 | """
41 | import argparse
42 | from sinatools.utils.word_compare import Implication
43 | 
44 | def read_file(file_path):
45 |     with open(file_path, 'r', encoding='utf-8') as file:
46 |         word = file.readline().strip()
47 |         if word:
48 |             return word
49 |         else:
50 |             raise ValueError(f"File {file_path} must contain at least one word.")
51 | 
52 | def main():
53 |     parser = argparse.ArgumentParser(description='Evaluate Implication between two words using SinaTools')
54 |     
55 |     # Adding optional arguments for the two input words and the files
56 |     parser.add_argument('--inputWord1', type=str, help='First input word')
57 |     parser.add_argument('--inputWord2', type=str, help='Second input word')
58 |     parser.add_argument('--file1', type=str, help='File containing the first word to evaluate implication')
59 |     parser.add_argument('--file2', type=str, help='File containing the second word to evaluate implication')
60 | 
61 |     args = parser.parse_args()
62 | 
63 |     if args.file1 and args.file2:
64 |         word1 = read_file(args.file1)
65 |         word2 = read_file(args.file2)
66 |     elif args.inputWord1 and args.inputWord2:
67 |         word1, word2 = args.inputWord1, args.inputWord2
68 |     else:
69 |         print("Either --file1 and --file2 arguments or both --inputWord1 and --inputWord2 arguments must be provided.")
70 |         return
71 | 
72 |     # Instantiate the Implication class
73 |     implication_obj = Implication(word1, word2)
74 |     
75 |     # For this example, assuming there is a method `get_verdict()` in the Implication class.
76 |     result = implication_obj.get_verdict()
77 |     print(result)
78 | 
79 | if __name__ == '__main__':
80 |     main()
81 | # implication --inputWord1 "word1" --inputWord2 "word2"
82 | # implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt"
83 | 
84 | 
85 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/utils/remove_latin.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The remove_latin command performs delete latin characters from the input text.
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running remove_latin --help.
 9 | 
10 | .. code-block:: none
11 |     remove_latin --text=TEXT
12 |     remove_latin --file "path/to/your/file.txt"
13 | 
14 | Examples:
15 | ---------
16 | .. code-block:: none
17 |     remove_latin --text "123test"    
18 |     remove_latin --file "path/to/your/file.txt"
19 | """
20 | 
21 | import argparse
22 | from sinatools.utils.parser import remove_latin
23 | 
24 | 
25 | def main():
26 |     parser = argparse.ArgumentParser(description='remove latin characters from the text')
27 | 
28 |     parser.add_argument('--text', type=str, required=True, help='The input text')
29 |     args = parser.parse_args()
30 |     result = remove_latin(args.text)
31 |  
32 |     print(result)
33 |     if __name__ == '__main__':
34 |         main()
35 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/utils/remove_punctuation.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The remove_punctuation command performs delete punctuation marks from the input text.
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running remove_punctuation --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     Usage:
13 |         remove_punctuation --text=TEXT
14 |         remove_punctuation --file "path/to/your/file.txt"
15 | 
16 | Examples:
17 | ---------
18 | .. code-block:: none
19 | 
20 |     remove_punctuation --text "te%s@t...!!?"    
21 |     remove_punctuation --file "path/to/your/file.txt"
22 | """
23 | 
24 | import argparse
25 | from sinatools.utils.parser import remove_punctuation
26 | #from sinatools.utils.parser import read_file
27 | #from sinatools.utils.parser import write_file
28 | 
29 | 
30 | def main():
31 |     parser = argparse.ArgumentParser(description='remove punctuation marks from the text')
32 | 
33 |     parser.add_argument('--text',required=True,help="input text")
34 |    # parser.add_argument('myFile', type=argparse.FileType('r'),help='Input file csv')
35 |     args = parser.parse_args()
36 |     result = remove_punctuation(args.text)
37 |  
38 |     print(result)
39 |     if __name__ == '__main__':
40 |         main()
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/utils/sentence_tokenizer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | 
 5 | The sentence_tokenizer command allows you to tokenize text into sentences using the SinaTools utility. It provides 
 6 | flexibility in tokenizing at different punctuation marks, including dots, question marks, and exclamation marks. It also 
 7 | allows tokenization at new lines.
 8 | 
 9 | Usage:
10 | ------
11 | Below is the usage information that can be generated by running sentence_tokenizer --help.
12 | 
13 | .. code-block:: none
14 | 
15 |     Usage:
16 |         sentence_tokenizer --text=TEXT [options]
17 |         sentence_tokenizer --file=FILE [options]
18 |         
19 | .. code-block:: none
20 |       
21 |     Options:
22 |       --text TEXT
23 |             Text to be tokenized into sentences.
24 |       --file FILE
25 |             File containing the text to be tokenized into sentences
26 |       --dot
27 |             Tokenize at dots.
28 |       --new_line
29 |             Tokenize at new lines.
30 |       --question_mark
31 |             Tokenize at question marks.
32 |       --exclamation_mark
33 |             Tokenize at exclamation marks.
34 | 
35 | Examples:
36 | ---------
37 | 
38 | .. code-block:: none
39 | 
40 |   sentence_tokenizer --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
41 |   sentence_tokenizer --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
42 | 
43 | """
44 | import argparse
45 | from sinatools.utils.tokenizer import sentence_tokenizer
46 | from sinatools.utils.readfile import read_file
47 | 
48 | def main():
49 |     parser = argparse.ArgumentParser(description='Sentence Tokenization using SinaTools')
50 |     
51 |     # Adding arguments for the text, file, and tokenization options
52 |     parser.add_argument('--text', type=str, help='Text to be tokenized into sentences')
53 |     parser.add_argument('--file', type=str, help='File containing the text to be tokenized into sentences')
54 |     parser.add_argument('--dot', action='store_true', help='Tokenize at dots')
55 |     parser.add_argument('--new_line', action='store_true', help='Tokenize at new lines')
56 |     parser.add_argument('--question_mark', action='store_true', help='Tokenize at question marks')
57 |     parser.add_argument('--exclamation_mark', action='store_true', help='Tokenize at exclamation marks')
58 | 
59 |     args = parser.parse_args()
60 | 
61 |     # Check if either text or file is provided
62 |     if args.text is None and args.file is None:
63 |         print("Either --text or --file argument must be provided.")
64 |         return
65 | 
66 |     text_content = args.text if args.text else read_file(args.file)
67 | 
68 |     # Perform sentence tokenization
69 |     sentences = sentence_tokenizer(" ".join(text_content), dot=args.dot, new_line=args.new_line, 
70 |                               question_mark=args.question_mark, exclamation_mark=args.exclamation_mark)
71 |     
72 |     # Print each sentence in a new line
73 |     for sentence in sentences:
74 |         print(sentence)
75 | 
76 | if __name__ == '__main__':
77 |     main()
78 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/utils/text_dublication_detector.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from sinatools.utils.text_dublication_detector import removal
 3 | 
 4 | def main():
 5 |     parser = argparse.ArgumentParser(description='Processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold and cosine similarity. It saves the filtered results and the identified duplicates to separate files.')
 6 |     
 7 |     parser.add_argument('--csv_file', type=str, help='The path to the input CSV file that will be processed.')
 8 |     parser.add_argument('--column_name', type=str, help='The name of the column from which duplicates will be removed.')
 9 |     parser.add_argument('--final_file_name', type=str, help='The name of the output file that will contain the deduplicated results.')
10 |     parser.add_argument('--deleted_file_name', type=str, help='The name of the output file that will contain the records that were identified as duplicates and removed.')
11 |     parser.add_argument('--similarity_threshold', type=float, default=0.8, help='The similarity threshold for determining duplicates. Records with a similarity score above this value will be considered duplicates (default is 0.8).')
12 | 
13 |     args = parser.parse_args()
14 | 
15 |     if args.csv_file is None and args.column_name is None:
16 |         print("Either --csv_file or --column_name argument must be provided.")
17 |         return
18 | 
19 |     removal(args.csv_file, args.column_name, args.final_file_name, args.deleted_file_name, args.similarity_threshold)
20 |     
21 | 
22 | if __name__ == '__main__':
23 |     main()
24 | 
25 | # text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8


--------------------------------------------------------------------------------
/build/lib/sinatools/CLI/utils/text_transliteration.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | 
 5 | The transliterate tool allows you to transliterate text using the SinaTools' utility. This command-line utility
 6 | takes in a text and a desired schema, and outputs the transliterated text.
 7 | 
 8 | Usage:
 9 | ------
10 | Below is the usage information that can be generated by running transliterate --help.
11 | 
12 |     Usage:
13 |     ------
14 | 
15 | .. code-block:: none
16 | 
17 |         transliterate --text=TEXT --schema=SCHEMA
18 | 
19 |         transliterate --file=FILE --schema=SCHEMA
20 | 
21 | Options:
22 | --------
23 | 
24 | .. code-block:: none
25 | 
26 |   --text TEXT
27 |         Text to be transliterated.
28 |   --schema SCHEMA
29 |         Transliteration schema to be used, which is bw2ar or ar2bw.
30 | 
31 | Examples:
32 | ---------
33 | 
34 | .. code-block:: none
35 | 
36 |     transliterate --text "klmp" --schema "bw2ar"
37 |     transliterate --file "path/to/your/file.txt" --schema "ar2bw"
38 | 
39 | 
40 | """
41 | import argparse
42 | from sinatools.utils.text_transliteration import perform_transliteration
43 | from sinatools.utils.readfile import read_file
44 | 
45 | def main():
46 |     parser = argparse.ArgumentParser(description='Perform text transliteration using SinaTools')
47 |     
48 |     # Adding arguments for the text, file, and schema
49 |     parser.add_argument('--text', type=str, help='Text to be transliterated')
50 |     parser.add_argument('--file', type=str, help='File containing the text to be transliterated')
51 |     parser.add_argument('--schema', type=str, required=True, help='Transliteration schema to be used')
52 | 
53 |     args = parser.parse_args()
54 | 
55 |     # Check if either text or file is provided
56 |     if args.text is None and args.file is None:
57 |         print("Either --text or --file argument must be provided.")
58 |         return
59 | 
60 |     text_content = args.text if args.text else " ".join(read_file(args.file))
61 |     # Perform transliteration
62 |     result = perform_transliteration(text_content, args.schema)
63 |     
64 |     print(result)
65 | 
66 | if __name__ == '__main__':
67 |     main()
68 | 
69 | #transliterate --text "example text" --schema "bw2ar"
70 | #transliterate --file "path/to/your/file.txt" --schema "bw2ar"
71 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/DataDownload/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/build/lib/sinatools/DataDownload/__init__.py


--------------------------------------------------------------------------------
/build/lib/sinatools/VERSION:
--------------------------------------------------------------------------------
1 | 0.1.41


--------------------------------------------------------------------------------
/build/lib/sinatools/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for sinatools."""
2 | 
3 | __author__ = """SinaLab"""
4 | __email__ = 'sina.institute.bzu@gmail.com'
5 | __version__ = '0.8.5'


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/arabert/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/finetune/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/finetune/feature_spec.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Defines the inputs used when fine-tuning a model."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import numpy as np
23 | import tensorflow as tf
24 | 
25 | import configure_finetuning
26 | 
27 | 
28 | def get_shared_feature_specs(config: configure_finetuning.FinetuningConfig):
29 |   """Non-task-specific model inputs."""
30 |   return [
31 |       FeatureSpec("input_ids", [config.max_seq_length]),
32 |       FeatureSpec("input_mask", [config.max_seq_length]),
33 |       FeatureSpec("segment_ids", [config.max_seq_length]),
34 |       FeatureSpec("task_id", []),
35 |   ]
36 | 
37 | 
38 | class FeatureSpec(object):
39 |   """Defines a feature passed as input to the model."""
40 | 
41 |   def __init__(self, name, shape, default_value_fn=None, is_int_feature=True):
42 |     self.name = name
43 |     self.shape = shape
44 |     self.default_value_fn = default_value_fn
45 |     self.is_int_feature = is_int_feature
46 | 
47 |   def get_parsing_spec(self):
48 |     return tf.io.FixedLenFeature(
49 |         self.shape, tf.int64 if self.is_int_feature else tf.float32)
50 | 
51 |   def get_default_values(self):
52 |     if self.default_value_fn:
53 |       return self.default_value_fn(self.shape)
54 |     else:
55 |       return np.zeros(
56 |           self.shape, np.int64 if self.is_int_feature else np.float32)
57 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/finetune/scorer.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Base class for evaluation metrics."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import abc
23 | 
24 | 
25 | class Scorer(object):
26 |   """Abstract base class for computing evaluation metrics."""
27 | 
28 |   __metaclass__ = abc.ABCMeta
29 | 
30 |   def __init__(self):
31 |     self._updated = False
32 |     self._cached_results = {}
33 | 
34 |   @abc.abstractmethod
35 |   def update(self, results):
36 |     self._updated = True
37 | 
38 |   @abc.abstractmethod
39 |   def get_loss(self):
40 |     pass
41 | 
42 |   @abc.abstractmethod
43 |   def _get_results(self):
44 |     return []
45 | 
46 |   def get_results(self, prefix=""):
47 |     results = self._get_results() if self._updated else self._cached_results
48 |     self._cached_results = results
49 |     self._updated = False
50 |     return [(prefix + k, v) for k, v in results]
51 | 
52 |   def results_str(self):
53 |     return " - ".join(["{:}: {:.2f}".format(k, v)
54 |                        for k, v in self.get_results()])
55 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/finetune/task.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Defines a supervised NLP task."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import abc
23 | from typing import List, Tuple
24 | 
25 | import configure_finetuning
26 | from finetune import feature_spec
27 | from finetune import scorer
28 | from model import modeling
29 | 
30 | 
31 | class Example(object):
32 |   __metaclass__ = abc.ABCMeta
33 | 
34 |   def __init__(self, task_name):
35 |     self.task_name = task_name
36 | 
37 | 
38 | class Task(object):
39 |   """Override this class to add a new fine-tuning task."""
40 | 
41 |   __metaclass__ = abc.ABCMeta
42 | 
43 |   def __init__(self, config: configure_finetuning.FinetuningConfig, name):
44 |     self.config = config
45 |     self.name = name
46 | 
47 |   def get_test_splits(self):
48 |     return ["test"]
49 | 
50 |   @abc.abstractmethod
51 |   def get_examples(self, split):
52 |     pass
53 | 
54 |   @abc.abstractmethod
55 |   def get_scorer(self) -> scorer.Scorer:
56 |     pass
57 | 
58 |   @abc.abstractmethod
59 |   def get_feature_specs(self) -> List[feature_spec.FeatureSpec]:
60 |     pass
61 | 
62 |   @abc.abstractmethod
63 |   def featurize(self, example: Example, is_training: bool,
64 |                 log: bool=False):
65 |     pass
66 | 
67 |   @abc.abstractmethod
68 |   def get_prediction_module(
69 |       self, bert_model: modeling.BertModel, features: dict, is_training: bool,
70 |       percent_done: float) -> Tuple:
71 |     pass
72 | 
73 |   def __repr__(self):
74 |     return "Task(" + self.name + ")"
75 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/finetune/task_builder.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Returns task instances given the task name."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import configure_finetuning
23 | from finetune.classification import classification_tasks
24 | from finetune.qa import qa_tasks
25 | from finetune.tagging import tagging_tasks
26 | from model import tokenization
27 | 
28 | 
29 | def get_tasks(config: configure_finetuning.FinetuningConfig):
30 |   tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
31 |                                          do_lower_case=config.do_lower_case)
32 |   return [get_task(config, task_name, tokenizer)
33 |           for task_name in config.task_names]
34 | 
35 | 
36 | def get_task(config: configure_finetuning.FinetuningConfig, task_name,
37 |              tokenizer):
38 |   """Get an instance of a task based on its name."""
39 |   if task_name == "cola":
40 |     return classification_tasks.CoLA(config, tokenizer)
41 |   elif task_name == "mrpc":
42 |     return classification_tasks.MRPC(config, tokenizer)
43 |   elif task_name == "mnli":
44 |     return classification_tasks.MNLI(config, tokenizer)
45 |   elif task_name == "sst":
46 |     return classification_tasks.SST(config, tokenizer)
47 |   elif task_name == "rte":
48 |     return classification_tasks.RTE(config, tokenizer)
49 |   elif task_name == "qnli":
50 |     return classification_tasks.QNLI(config, tokenizer)
51 |   elif task_name == "qqp":
52 |     return classification_tasks.QQP(config, tokenizer)
53 |   elif task_name == "sts":
54 |     return classification_tasks.STS(config, tokenizer)
55 |   elif task_name == "squad":
56 |     return qa_tasks.SQuAD(config, tokenizer)
57 |   elif task_name == "squadv1":
58 |     return qa_tasks.SQuADv1(config, tokenizer)
59 |   elif task_name == "newsqa":
60 |     return qa_tasks.NewsQA(config, tokenizer)
61 |   elif task_name == "naturalqs":
62 |     return qa_tasks.NaturalQuestions(config, tokenizer)
63 |   elif task_name == "triviaqa":
64 |     return qa_tasks.TriviaQA(config, tokenizer)
65 |   elif task_name == "searchqa":
66 |     return qa_tasks.SearchQA(config, tokenizer)
67 |   elif task_name == "chunk":
68 |     return tagging_tasks.Chunking(config, tokenizer)
69 |   else:
70 |     raise ValueError("Unknown task " + task_name)
71 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/model/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/pretrain/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/util/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/araelectra/util/utils.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2020 The Google Research Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | """A collection of general utility functions."""
 17 | 
 18 | from __future__ import absolute_import
 19 | from __future__ import division
 20 | from __future__ import print_function
 21 | 
 22 | import json
 23 | import pickle
 24 | import sys
 25 | 
 26 | import tensorflow as tf
 27 | 
 28 | 
 29 | def load_json(path):
 30 |   with tf.io.gfile.GFile(path, "r") as f:
 31 |     return json.load(f)
 32 | 
 33 | 
 34 | def write_json(o, path):
 35 |   if "/" in path:
 36 |     tf.io.gfile.makedirs(path.rsplit("/", 1)[0])
 37 |   with tf.io.gfile.GFile(path, "w") as f:
 38 |     json.dump(o, f)
 39 | 
 40 | 
 41 | def load_pickle(path):
 42 |   with tf.io.gfile.GFile(path, "rb") as f:
 43 |     return pickle.load(f)
 44 | 
 45 | 
 46 | def write_pickle(o, path):
 47 |   if "/" in path:
 48 |     tf.io.gfile.makedirs(path.rsplit("/", 1)[0])
 49 |   with tf.io.gfile.GFile(path, "wb") as f:
 50 |     pickle.dump(o, f, -1)
 51 | 
 52 | 
 53 | def mkdir(path):
 54 |   if not tf.io.gfile.exists(path):
 55 |     tf.io.gfile.makedirs(path)
 56 | 
 57 | 
 58 | def rmrf(path):
 59 |   if tf.io.gfile.exists(path):
 60 |     tf.io.gfile.rmtree(path)
 61 | 
 62 | 
 63 | def rmkdir(path):
 64 |   rmrf(path)
 65 |   mkdir(path)
 66 | 
 67 | 
 68 | def log(*args):
 69 |   msg = " ".join(map(str, args))
 70 |   sys.stdout.write(msg + "\n")
 71 |   sys.stdout.flush()
 72 | 
 73 | 
 74 | def log_config(config):
 75 |   for key, value in sorted(config.__dict__.items()):
 76 |     log(key, value)
 77 |   log()
 78 | 
 79 | 
 80 | def heading(*args):
 81 |   log(80 * "=")
 82 |   log(*args)
 83 |   log(80 * "=")
 84 | 
 85 | 
 86 | def nest_dict(d, prefixes, delim="_"):
 87 |   """Go from {prefix_key: value} to {prefix: {key: value}}."""
 88 |   nested = {}
 89 |   for k, v in d.items():
 90 |     for prefix in prefixes:
 91 |       if k.startswith(prefix + delim):
 92 |         if prefix not in nested:
 93 |           nested[prefix] = {}
 94 |         nested[prefix][k.split(delim, 1)[1]] = v
 95 |       else:
 96 |         nested[k] = v
 97 |   return nested
 98 | 
 99 | 
100 | def flatten_dict(d, delim="_"):
101 |   """Go from {prefix: {key: value}} to {prefix_key: value}."""
102 |   flattened = {}
103 |   for k, v in d.items():
104 |     if isinstance(v, dict):
105 |       for k2, v2 in v.items():
106 |         flattened[k + delim + k2] = v2
107 |     else:
108 |       flattened[k] = v
109 |   return flattened
110 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/aragpt2/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/aragpt2/gpt2/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/aragpt2/grover/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/build/lib/sinatools/arabert/aragpt2/grover/__init__.py


--------------------------------------------------------------------------------
/build/lib/sinatools/arabert/aragpt2/train_bpe_tokenizer.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import glob
 3 | import os
 4 | from tokenizers import ByteLevelBPETokenizer
 5 | from transformers import GPT2TokenizerFast
 6 | 
 7 | 
 8 | if __name__ == "__main__":
 9 |     parser = argparse.ArgumentParser()
10 |     parser.add_argument("--data-files", type=str, required=True)
11 |     parser.add_argument("--vocab-size", type=int, required=True)
12 |     parser.add_argument("--output-dir", type=str, required=True)
13 |     parser.add_argument("--output-file-name", type=str, required=True)
14 |     args = parser.parse_args()
15 | 
16 |     gpt2_tok = ByteLevelBPETokenizer(add_prefix_space=True)
17 | 
18 |     files = glob.glob(args.data_files)
19 |     if len(files) > 10:
20 |         print(files[0:10])
21 |     else:
22 |         print(files)
23 | 
24 |     gpt2_tok.train(
25 |         files=files,
26 |         vocab_size=args.vocab_size,
27 |         show_progress=True,
28 |         special_tokens=["<|endoftext|>", "<s>", "<pad>", "</s>"],
29 |     )
30 | 
31 |     if not os.path.exists(args.output_dir):
32 |         os.makedirs(args.output_dir)
33 | 
34 | 
35 |     gpt2_tok.save(
36 |             os.path.join(args.output_dir,"tokenizer.json"), pretty=True
37 |         )  # FIX Access is denied. (os error 5)
38 |     gpt2_tok.save_model(args.output_dir, args.output_file_name)
39 | 
40 |     # tokenizer = GPT2TokenizerFast(
41 |     #     vocab_file=os.path.join(args.output_dir, args.output_file_name) + "-vocab.json",
42 |     #     merges_file=os.path.join(args.output_dir, args.output_file_name)
43 |     #     + "-merges.txt",
44 |     #     add_prefix_space=True,
45 |     # )
46 | 
47 |     # tokenizer.add_special_tokens(
48 |     #     {
49 |     #         "eos_token": "<|endoftext|>",
50 |     #         "bos_token": "<|endoftext|>",
51 |     #         "unk_token": "<|endoftext|>",
52 |     #         "pad_token": "<|endoftext|>",
53 |     #         "mask_token": "<|endoftext|>",
54 |     #     }
55 |     # )
56 | 
57 |     # tokenizer.save_pretrained(
58 |     #     args.output_dir, legacy_format=False, filename_prefix=args.output_file_name
59 |     # )
60 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/install_env.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | 
 4 | def main():
 5 |     # Determine the path to the 'environment.yml' file within the package
 6 |     package_dir = os.path.dirname(__file__)
 7 |     env_file = os.path.join(package_dir, 'environment.yml')
 8 | 
 9 |     # Create the conda environment using the 'environment.yml' file
10 |     subprocess.call(["conda", "env", "create", "-f", env_file])
11 | 
12 | if __name__ == "__main__":
13 |     main()
14 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/morphology/ALMA_multi_word.py:
--------------------------------------------------------------------------------
 1 | from sinatools.utils.parser import arStrip
 2 | from . import five_grams_dict, four_grams_dict  , three_grams_dict , two_grams_dict
 3 | 
 4 | def ALMA_multi_word(multi_word, n):
 5 |     undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False)  # diacs , smallDiacs , shaddah ,  digit , alif , specialChars
 6 |     result_word = []
 7 |     if n == 2:
 8 |         if undiac_multi_word in two_grams_dict.keys():
 9 |             result_word = two_grams_dict[undiac_multi_word]                    
10 |     elif n == 3:
11 |         if undiac_multi_word in three_grams_dict.keys():
12 |             result_word = three_grams_dict[undiac_multi_word]                    
13 |     elif n == 4:
14 |         if undiac_multi_word in four_grams_dict.keys():
15 |             result_word = four_grams_dict[undiac_multi_word]                    
16 |     else:    
17 |      if undiac_multi_word in five_grams_dict.keys():
18 |          result_word = five_grams_dict[undiac_multi_word]            
19 |     
20 |     my_json = {}
21 |     output_list = []
22 |     my_json['multi_word_lemma'] = multi_word
23 |     my_json['undiac_multi_word_lemma'] = multi_word
24 |     ids = []
25 |     if result_word != []:
26 |         my_json['POS'] = result_word[0][1] #POS
27 |         for result in result_word: 
28 |            ids.append(result[3])
29 |         my_json['ids'] = ids
30 |         output_list.append(my_json)    
31 |     return output_list  


--------------------------------------------------------------------------------
/build/lib/sinatools/morphology/__init__.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | from sinatools.DataDownload import downloader
 3 | import os 
 4 | 
 5 | dictionary = {}
 6 | five_grams_dict = {} 
 7 | four_grams_dict = {}
 8 | three_grams_dict = {}
 9 | two_grams_dict = {} 
10 | 
11 | filename = 'lemmas_dic.pickle'
12 | path = downloader.get_appdatadir()
13 | file_path = os.path.join(path, filename)
14 | with open(file_path, 'rb') as f:
15 |     dictionary = pickle.load(f)
16 | 
17 | filename_five = 'five_grams.pickle'
18 | path =downloader.get_appdatadir()
19 | file_path = os.path.join(path, filename_five)
20 | with open(file_path, 'rb') as f:
21 |    five_grams_dict = pickle.load(f, encoding='utf-8')
22 | 
23 | 
24 | filename_four = 'four_grams.pickle'
25 | path =downloader.get_appdatadir()
26 | file_path = os.path.join(path, filename_four)
27 | with open(file_path, 'rb') as f:
28 |   four_grams_dict = pickle.load(f, encoding='utf-8')
29 |   
30 |      
31 | filename_three = 'three_grams.pickle'
32 | path =downloader.get_appdatadir()
33 | file_path = os.path.join(path, filename_three)
34 | with open(file_path, 'rb') as f:
35 |   three_grams_dict = pickle.load(f, encoding='utf-8')
36 |   
37 | 
38 | filename_two = 'two_grams.pickle'
39 | path =downloader.get_appdatadir()
40 | file_path = os.path.join(path, filename_two)
41 | with open(file_path, 'rb') as f:
42 |   two_grams_dict = pickle.load(f, encoding='utf-8')
43 |   


--------------------------------------------------------------------------------
/build/lib/sinatools/ner/__init__.py:
--------------------------------------------------------------------------------
 1 | from sinatools.DataDownload import downloader
 2 | import os
 3 | from sinatools.ner.helpers import load_object
 4 | import pickle
 5 | import os
 6 | import torch
 7 | import pickle
 8 | import json
 9 | from argparse import Namespace
10 | 
11 | tagger = None
12 | tag_vocab = None
13 | train_config = None
14 | 
15 | filename = 'Wj27012000.tar'
16 | path =downloader.get_appdatadir()
17 | model_path = os.path.join(path, filename)
18 | 
19 | _path = os.path.join(model_path, "tag_vocab.pkl")
20 | 
21 | with open(_path, "rb") as fh:
22 |     tag_vocab = pickle.load(fh)
23 | 
24 | train_config = Namespace()
25 | args_path = os.path.join(model_path, "args.json")
26 | 
27 | with open(args_path, "r") as fh:
28 |     train_config.__dict__ = json.load(fh)
29 | 
30 | model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
31 | model = torch.nn.DataParallel(model)
32 | 
33 | if torch.cuda.is_available():
34 |     model = model.cuda()
35 | 
36 | train_config.trainer_config["kwargs"]["model"] = model
37 | tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
38 | tagger.load(os.path.join(model_path,"checkpoints"))
39 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/ner/data/__init__.py:
--------------------------------------------------------------------------------
1 | from sinatools.ner.data.datasets import NestedTagsDataset


--------------------------------------------------------------------------------
/build/lib/sinatools/ner/metrics.py:
--------------------------------------------------------------------------------
 1 | from seqeval.metrics import (
 2 |     classification_report,
 3 |     precision_score,
 4 |     recall_score,
 5 |     f1_score,
 6 |     accuracy_score,
 7 | )
 8 | from seqeval.scheme import IOB2
 9 | from types import SimpleNamespace
10 | import logging
11 | import re
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | def compute_nested_metrics(segments, vocabs):
17 |     """
18 |     Compute metrics for nested NER
19 |     :param segments: List[List[arabiner.data.dataset.Token]] - list of segments
20 |     :return: metrics - SimpleNamespace - F1/micro/macro/weights, recall, precision, accuracy
21 |     """
22 |     y, y_hat = list(), list()
23 | 
24 |     # We duplicate the dataset N times, where N is the number of entity types
25 |     # For each copy, we create y and y_hat
26 |     # Example: first copy, will create pairs of ground truth and predicted labels for entity type GPE
27 |     #          another copy will create pairs for LOC, etc.
28 |     for i, vocab in enumerate(vocabs):
29 |         vocab_tags = [tag for tag in vocab.get_itos() if "-" in tag]
30 |         r = re.compile("|".join(vocab_tags))
31 | 
32 |         y += [[(list(filter(r.match, token.gold_tag)) or ["O"])[0] for token in segment] for segment in segments]
33 |         y_hat += [[token.pred_tag[i]["tag"] for token in segment] for segment in segments]
34 | 
35 |     logging.info("\n" + classification_report(y, y_hat, scheme=IOB2, digits=4))
36 | 
37 |     metrics = {
38 |         "micro_f1": f1_score(y, y_hat, average="micro", scheme=IOB2),
39 |         "macro_f1": f1_score(y, y_hat, average="macro", scheme=IOB2),
40 |         "weights_f1": f1_score(y, y_hat, average="weighted", scheme=IOB2),
41 |         "precision": precision_score(y, y_hat, scheme=IOB2),
42 |         "recall": recall_score(y, y_hat, scheme=IOB2),
43 |         "accuracy": accuracy_score(y, y_hat),
44 |     }
45 | 
46 |     return SimpleNamespace(**metrics)
47 | 
48 | 
49 | def compute_single_label_metrics(segments):
50 |     """
51 |     Compute metrics for flat NER
52 |     :param segments: List[List[arabiner.data.dataset.Token]] - list of segments
53 |     :return: metrics - SimpleNamespace - F1/micro/macro/weights, recall, precision, accuracy
54 |     """
55 |     y = [[token.gold_tag[0] for token in segment] for segment in segments]
56 |     y_hat = [[token.pred_tag[0]["tag"] for token in segment] for segment in segments]
57 | 
58 |     logging.info("\n" + classification_report(y, y_hat, scheme=IOB2))
59 | 
60 |     metrics = {
61 |         "micro_f1": f1_score(y, y_hat, average="micro", scheme=IOB2),
62 |         "macro_f1": f1_score(y, y_hat, average="macro", scheme=IOB2),
63 |         "weights_f1": f1_score(y, y_hat, average="weighted", scheme=IOB2),
64 |         "precision": precision_score(y, y_hat, scheme=IOB2),
65 |         "recall": recall_score(y, y_hat, scheme=IOB2),
66 |         "accuracy": accuracy_score(y, y_hat),
67 |     }
68 | 
69 |     return SimpleNamespace(**metrics)
70 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/ner/nn/BaseModel.py:
--------------------------------------------------------------------------------
 1 | from torch import nn
 2 | from transformers import BertModel
 3 | import logging
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | class BaseModel(nn.Module):
 9 |     def __init__(self,
10 |                  bert_model="aubmindlab/bert-base-arabertv2",
11 |                  num_labels=2,
12 |                  dropout=0.1,
13 |                  num_types=0):
14 |         super().__init__()
15 | 
16 |         self.bert_model = bert_model
17 |         self.num_labels = num_labels
18 |         self.num_types = num_types
19 |         self.dropout = dropout
20 | 
21 |         self.bert = BertModel.from_pretrained(bert_model)
22 |         self.dropout = nn.Dropout(dropout)
23 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/ner/nn/BertNestedTagger.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from sinatools.ner.nn import BaseModel
 4 | 
 5 | 
 6 | class BertNestedTagger(BaseModel):
 7 |     def __init__(self, **kwargs):
 8 |         super(BertNestedTagger, self).__init__(**kwargs)
 9 | 
10 |         self.max_num_labels = max(self.num_labels)
11 |         classifiers = [nn.Linear(768, num_labels) for num_labels in self.num_labels]
12 |         self.classifiers = torch.nn.Sequential(*classifiers)
13 | 
14 |     def forward(self, x):
15 |         y = self.bert(x)
16 |         y = self.dropout(y["last_hidden_state"])
17 |         output = list()
18 | 
19 |         for i, classifier in enumerate(self.classifiers):
20 |             logits = classifier(y)
21 | 
22 |             # Pad logits to allow Multi-GPU/DataParallel training to work
23 |             # We will truncate the padded dimensions when we compute the loss in the trainer
24 |             logits = torch.nn.ConstantPad1d((0, self.max_num_labels - logits.shape[-1]), 0)(logits)
25 |             output.append(logits)
26 | 
27 |         # Return tensor of the shape B x T x L x C
28 |         # B: batch size
29 |         # T: sequence length
30 |         # L: number of tag types
31 |         # C: number of classes per tag type
32 |         output = torch.stack(output).permute((1, 2, 0, 3))
33 |         return output
34 | 
35 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/ner/nn/BertSeqTagger.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from transformers import BertModel
 3 | 
 4 | 
 5 | class BertSeqTagger(nn.Module):
 6 |     def __init__(self, bert_model, num_labels=2, dropout=0.1):
 7 |         super().__init__()
 8 | 
 9 |         self.bert = BertModel.from_pretrained(bert_model)
10 |         self.dropout = nn.Dropout(dropout)
11 |         self.linear = nn.Linear(768, num_labels)
12 | 
13 |     def forward(self, x):
14 |         y = self.bert(x)
15 |         y = self.dropout(y["last_hidden_state"])
16 |         logits = self.linear(y)
17 |         return logits
18 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/ner/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from sinatools.ner.nn.BaseModel import BaseModel
2 | from sinatools.ner.nn.BertSeqTagger import BertSeqTagger
3 | from sinatools.ner.nn.BertNestedTagger import BertNestedTagger


--------------------------------------------------------------------------------
/build/lib/sinatools/ner/trainers/__init__.py:
--------------------------------------------------------------------------------
1 | from sinatools.ner.trainers.BaseTrainer import BaseTrainer
2 | from sinatools.ner.trainers.BertTrainer import BertTrainer
3 | from sinatools.ner.trainers.BertNestedTrainer import BertNestedTrainer


--------------------------------------------------------------------------------
/build/lib/sinatools/relations/__init__.py:
--------------------------------------------------------------------------------
1 | from sinatools.DataDownload import downloader
2 | import os
3 | from transformers import pipeline
4 | 
5 | path =downloader.get_appdatadir()
6 | 
7 | pipe = pipeline("sentiment-analysis", model= os.path.join(path, "relation_model"), return_all_scores =True, max_length=128, truncation=True)


--------------------------------------------------------------------------------
/build/lib/sinatools/semantic_relatedness/__init__.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | warnings.filterwarnings("ignore")
 3 | from sinatools.DataDownload import downloader
 4 | import os 
 5 | from transformers import BertTokenizer,BertModel
 6 | 
 7 | model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
 8 | path =downloader.get_appdatadir()
 9 | model_file_path = os.path.join(path, model_file_name)
10 | 
11 | tokenizer_file_name = "bert-base-arabertv02"
12 | path =downloader.get_appdatadir()
13 | tokenizer_file_path = os.path.join(path, tokenizer_file_name)
14 | 
15 | model = BertModel.from_pretrained('{}'.format(model_file_path),
16 |                                                       output_hidden_states = True,
17 |                                                       num_labels=2
18 |                                                       )
19 | 
20 | tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))


--------------------------------------------------------------------------------
/build/lib/sinatools/semantic_relatedness/compute_relatedness.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from . import tokenizer 
 3 | from . import model 
 4 | 
 5 | #cosine using average embedding 
 6 | def get_similarity_score(sentence1, sentence2):
 7 |   """
 8 |     Computes the degree of association between two sentences across various dimensions, meaning, underlying concepts, domain-specificity, topic overlap, viewpoint alignment. This method is described and implemented on this article.
 9 | 
10 |     Args:
11 |         sentence1 (:obj:`str`) – The Arabic sentence to find the semantic relatedness between it and the second sentence.
12 |         sentence2 (:obj:`int`) – The Arabic sentence to find the semantic relatedness between it and the first sentence.
13 |     
14 |     Returns:
15 |         :obj:`float`: An float number that represents the degree of relatedness between two provided sentences.
16 | 
17 |     **Example:**
18 | 
19 |     .. highlight:: python
20 |     .. code-block:: python
21 | 
22 |     from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score
23 | 
24 |     sentence1 = "تبلغ سرعة دوران الأرض حول الشمس حوالي 110 كيلومتر في الساعة."
25 |     sentence2 = "تدور الأرض حول محورها بسرعة تصل تقريبا 1670 كيلومتر في الساعة."    
26 |     get_similarity_score(sentence1, sentence2)
27 |     Score = 0.90
28 |   """         
29 | 
30 |   # Tokenize and encode sentences
31 |   inputs1 = tokenizer(sentence1, return_tensors="pt")
32 |   inputs2 = tokenizer(sentence2, return_tensors="pt")
33 | 
34 |   # Extract embeddings
35 |   with torch.no_grad():
36 |       outputs1 = model(**inputs1)
37 |       outputs2 = model(**inputs2)
38 |       
39 |       embeddings1 = outputs1.last_hidden_state
40 |       embeddings2 = outputs2.last_hidden_state
41 | 
42 |   # Mask padding tokens
43 |   attention_mask1 = inputs1["attention_mask"]
44 |   attention_mask2 = inputs2["attention_mask"]
45 | 
46 |   # Average pool across tokens, excluding padding
47 |   embeddings1_avg = torch.sum(embeddings1 * attention_mask1.unsqueeze(-1), dim=1) / torch.sum(attention_mask1, dim=1, keepdim=True)
48 |   embeddings2_avg = torch.sum(embeddings2 * attention_mask2.unsqueeze(-1), dim=1) / torch.sum(attention_mask2, dim=1, keepdim=True)
49 | 
50 |   # Calculate cosine similarity
51 |   similarity = torch.nn.functional.cosine_similarity(embeddings1_avg, embeddings2_avg)
52 | 
53 |   return similarity.item()


--------------------------------------------------------------------------------
/build/lib/sinatools/sinatools.py:
--------------------------------------------------------------------------------
1 | """Main module."""
2 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/synonyms/__init__.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | from sinatools.DataDownload import downloader
 3 | import os 
 4 | 
 5 | synonyms_level2_dict = {} 
 6 | level2_dict = 'graph_l2.pkl'
 7 | path = downloader.get_appdatadir()
 8 | file_path = os.path.join(path, level2_dict)
 9 | with open(file_path, 'rb') as f:
10 |     synonyms_level2_dict = pickle.load(f, encoding='utf-8')
11 | 
12 | 
13 | synonyms_level3_dict = {}    
14 | level3_dict = 'graph_l3.pkl'
15 | path = downloader.get_appdatadir()
16 | file_path = os.path.join(path, level3_dict)
17 | with open(file_path, 'rb') as f:
18 |    synonyms_level3_dict = pickle.load(f, encoding='utf-8')


--------------------------------------------------------------------------------
/build/lib/sinatools/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/build/lib/sinatools/utils/__init__.py


--------------------------------------------------------------------------------
/build/lib/sinatools/utils/charsets.py:
--------------------------------------------------------------------------------
 1 | # We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html].
 2 | 
 3 | import unicodedata
 4 | 
 5 | from six import unichr
 6 | 
 7 | 
 8 | UNICODE_PUNCT_CHARSET = frozenset(
 9 |     [unichr(x) for x in range(65536) if unicodedata.category(
10 |         unichr(x))[0] == 'P'])
11 | UNICODE_SYMBOL_CHARSET = frozenset(
12 |     [unichr(x) for x in range(65536) if unicodedata.category(
13 |         unichr(x))[0] == 'S'])
14 | UNICODE_PUNCT_SYMBOL_CHARSET = UNICODE_PUNCT_CHARSET | UNICODE_SYMBOL_CHARSET
15 | 
16 | UNICODE_LETTER_CHARSET = frozenset(
17 |     [unichr(x) for x in range(65536) if unicodedata.category(
18 |         unichr(x))[0] == 'L'])
19 | UNICODE_MARK_CHARSET = frozenset(
20 |     [unichr(x) for x in range(65536) if unicodedata.category(
21 |         unichr(x))[0] == 'M'])
22 | UNICODE_NUMBER_CHARSET = frozenset(
23 |     [unichr(x) for x in range(65536) if unicodedata.category(
24 |         unichr(x))[0] == 'N'])
25 | UNICODE_LETTER_MARK_NUMBER_CHARSET = (UNICODE_LETTER_CHARSET |
26 |                                       UNICODE_MARK_CHARSET |
27 |                                       UNICODE_NUMBER_CHARSET)
28 | 
29 | AR_LETTERS_CHARSET = frozenset(u'\u0621\u0622\u0623\u0624\u0625\u0626\u0627'
30 |                                u'\u0628\u0629\u062a\u062b\u062c\u062d\u062e'
31 |                                u'\u062f\u0630\u0631\u0632\u0633\u0634\u0635'
32 |                                u'\u0636\u0637\u0638\u0639\u063a\u0640\u0641'
33 |                                u'\u0642\u0643\u0644\u0645\u0646\u0647\u0648'
34 |                                u'\u0649\u064a\u0671\u067e\u0686\u06a4\u06af')
35 | AR_DIAC_CHARSET = frozenset(u'\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'
36 |                             u'\u0670\u0640')
37 | AR_CHARSET = AR_LETTERS_CHARSET | AR_DIAC_CHARSET
38 | 
39 | BW_LETTERS_CHARSET = frozenset(u'$&\'*<>ADEGHJPSTVYZ_bdfghjklmnpqrstvwxyz{|}')
40 | BW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
41 | BW_CHARSET = BW_LETTERS_CHARSET | BW_DIAC_CHARSET
42 | 
43 | SAFEBW_LETTERS_CHARSET = frozenset(u'ABCDEGHIJLMOPQSTVWYZ_bcdefghjklmnpqrstvwx'
44 |                                    u'yz')
45 | SAFEBW_DIAC_CHARSET = frozenset(u'FKNaeiou~_')
46 | SAFEBW_CHARSET = SAFEBW_LETTERS_CHARSET | SAFEBW_DIAC_CHARSET
47 | 
48 | XMLBW_LETTERS_CHARSET = frozenset(u'$\'*ABDEGHIJOPSTWYZ_bdfghjklmnpqrstvwxyz{|'
49 |                                   u'}')
50 | XMLBW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
51 | XMLBW_CHARSET = XMLBW_LETTERS_CHARSET | XMLBW_DIAC_CHARSET
52 | 
53 | HSB_LETTERS_CHARSET = frozenset(u'\'ADHST_bcdfghjklmnpqrstvwxyz'
54 |                                 u'\u00c2\u00c4\u00e1\u00f0\u00fd\u0100\u0102'
55 |                                 u'\u010e\u0127\u0161\u0175\u0177\u03b3\u03b8'
56 |                                 u'\u03c2')
57 | HSB_DIAC_CHARSET = frozenset(u'.aiu~\u00c4\u00e1\u00e3\u0129\u0169_')
58 | HSB_CHARSET = HSB_LETTERS_CHARSET | HSB_DIAC_CHARSET
59 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/utils/readfile.py:
--------------------------------------------------------------------------------
1 | def read_file(file_path):
2 |     with open(file_path, 'r', encoding='utf-8') as file:
3 |         return [line.strip() for line in file]
4 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/utils/tokenizers_words.py:
--------------------------------------------------------------------------------
 1 | # We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/tokenizers/word.html].
 2 | 
 3 | import re
 4 | from sinatools.utils.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
 5 | from sinatools.utils.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
 6 | 
 7 | 
 8 | _ALL_PUNCT = u''.join(UNICODE_PUNCT_SYMBOL_CHARSET)
 9 | _ALL_LETTER_MARK_NUMBER = u''.join(UNICODE_LETTER_MARK_NUMBER_CHARSET)
10 | _TOKENIZE_RE = re.compile(r'[' + re.escape(_ALL_PUNCT) + r']|[' +
11 |                           re.escape(_ALL_LETTER_MARK_NUMBER) + r']+')
12 | 
13 | 
14 | def simple_word_tokenize(sentence):
15 | 
16 |     return _TOKENIZE_RE.findall(sentence)
17 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/wsd/__init__.py:
--------------------------------------------------------------------------------
 1 | from sinatools.wsd import settings 
 2 | import pickle
 3 | from sinatools.DataDownload import downloader
 4 | import os 
 5 | 
 6 | glosses_dic = {}
 7 | filename = 'one_gram.pickle'
 8 | path =downloader.get_appdatadir()
 9 | file_path = os.path.join(path, filename)
10 | with open(file_path, 'rb') as f:
11 |     glosses_dic = pickle.load(f)
12 | 


--------------------------------------------------------------------------------
/build/lib/sinatools/wsd/settings.py:
--------------------------------------------------------------------------------
 1 | from transformers import BertTokenizer,BertForSequenceClassification
 2 | import warnings
 3 | warnings.filterwarnings("ignore")
 4 | import pandas as pd
 5 | 
 6 | 
 7 | 
 8 | 
 9 | from sinatools.DataDownload import downloader
10 | import os 
11 | 
12 | 
13 | model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
14 | path =downloader.get_appdatadir()
15 | model_file_path = os.path.join(path, model_file_name)
16 | 
17 | tokenizer_file_name = "bert-base-arabertv02"
18 | path =downloader.get_appdatadir()
19 | tokenizer_file_path = os.path.join(path, tokenizer_file_name)
20 | 
21 | dftrue = pd.DataFrame()
22 | 
23 | model = BertForSequenceClassification.from_pretrained(model_file_path, output_hidden_states=True, num_labels=2)
24 | 
25 | tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))


--------------------------------------------------------------------------------
/dist/SinaTools-0.1.41-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/dist/SinaTools-0.1.41-py2.py3-none-any.whl


--------------------------------------------------------------------------------
/dist/sinatools-0.1.41.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/dist/sinatools-0.1.41.tar.gz


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SOURCEDIR     = source
 6 | SPHINXOPTS    =
 7 | SPHINXBUILD   = sphinx-build -c $(SOURCEDIR) -c $(SOURCEDIR)/config
 8 | SPHINXPROJ    = sinatools
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 |     @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 |     @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/build/_images/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_images/download.png


--------------------------------------------------------------------------------
/docs/build/_static/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_static/download.png


--------------------------------------------------------------------------------
/docs/build/_static/file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_static/file.png


--------------------------------------------------------------------------------
/docs/build/_static/minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_static/minus.png


--------------------------------------------------------------------------------
/docs/build/_static/plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/_static/plus.png


--------------------------------------------------------------------------------
/docs/build/doctrees/License.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/License.doctree


--------------------------------------------------------------------------------
/docs/build/doctrees/Overview.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/Overview.doctree


--------------------------------------------------------------------------------
/docs/build/doctrees/about.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/about.doctree


--------------------------------------------------------------------------------
/docs/build/doctrees/api.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/api.doctree


--------------------------------------------------------------------------------
/docs/build/doctrees/api/DataDownload.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/api/DataDownload.doctree


--------------------------------------------------------------------------------
/docs/build/doctrees/api/DataDownload/downloader.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/api/DataDownload/downloader.doctree


--------------------------------------------------------------------------------
/docs/build/doctrees/api/arabiner.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/doctrees/api/arabiner.doctree


--------------------------------------------------------------------------------
/docs/build/html/_images/SinaLogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_images/SinaLogo.jpg


--------------------------------------------------------------------------------
/docs/build/html/_images/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_images/download.png


--------------------------------------------------------------------------------
/docs/build/html/_static/SinaLogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/SinaLogo.jpg


--------------------------------------------------------------------------------
/docs/build/html/_static/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/download.png


--------------------------------------------------------------------------------
/docs/build/html/_static/file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/file.png


--------------------------------------------------------------------------------
/docs/build/html/_static/minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/minus.png


--------------------------------------------------------------------------------
/docs/build/html/_static/plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/build/html/_static/plus.png


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/source/License.rst:
--------------------------------------------------------------------------------
 1 | License
 2 | =======
 3 | 
 4 | MIT License
 5 | 
 6 | Copyright 2023 Birzeit University
 7 | 
 8 | Permission is hereby granted, free of charge, to any person obtaining a copy
 9 | of this software and associated documentation files (the "Software"), to deal
10 | in the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | copies of the Software, and to permit persons to whom the Software is
13 | furnished to do so, subject to the following conditions:
14 | 
15 | The above copyright notice and this permission notice shall be included in
16 | all copies or substantial portions of the Software.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 | 


--------------------------------------------------------------------------------
/docs/source/Overview.rst:
--------------------------------------------------------------------------------
 1 | Overview
 2 | ========
 3 | 
 4 | 
 5 | About
 6 | -----
 7 | 
 8 | Sina Tools is a collection of Arabic natural language processing tools created by the Sina Lab at Birzeit University in Palestine. 
 9 | 
10 | For additional details, please refer to the :doc:`installation` section.
11 | 
12 | Sina Tools is available under the MIT license. See :doc:`License` for more information.
13 | 
14 | .. _Github repo: https://github.com/SinaLab/sinatools
15 | .. _tarball: https://github.com/SinaLab/sinatools/tarball/master
16 | 


--------------------------------------------------------------------------------
/docs/source/_static/SinaLogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/source/_static/SinaLogo.jpg


--------------------------------------------------------------------------------
/docs/source/_static/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/docs/source/_static/download.png


--------------------------------------------------------------------------------
/docs/source/about.rst:
--------------------------------------------------------------------------------
 1 | About
 2 | =====
 3 | 
 4 | 
 5 | 
 6 | SinaTools is a collection of Arabic natural language processing tools created by the SinaLab at Birzeit University in Palestine. 
 7 | 
 8 | For additional details, please refer to the :doc:`installation` section.
 9 | 
10 | SinaTools is available under the MIT license. See :doc:`License` for more information.
11 | 
12 | .. _Github repo: https://github.com/SinaLab/sinatools
13 | .. _tarball: https://github.com/SinaLab/sinatools/tarball/master
14 | 


--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
 1 | Python API Reference
 2 | ====================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :titlesonly:
 7 |    :caption: Modules:
 8 | 
 9 |  
10 |    api/morphology
11 |    api/DataDownload
12 |    api/utils
13 |    api/ner
14 |    api/salma
15 | 
16 |   
17 | 


--------------------------------------------------------------------------------
/docs/source/api/DataDownload.rst:
--------------------------------------------------------------------------------
 1 | sinatools.DataDownload
 2 | =====================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.DataDownload
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    DataDownload/downloader
16 | 


--------------------------------------------------------------------------------
/docs/source/api/DataDownload/downloader.rst:
--------------------------------------------------------------------------------
1 | sinatools.DataDownload.downloader
2 | ++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.DataDownload.downloader
5 |    :members:
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/docs/source/api/arabiner.rst:
--------------------------------------------------------------------------------
 1 | sinatools.ner
 2 | =================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.ner
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    ner/entity_extractor


--------------------------------------------------------------------------------
/docs/source/api/arabiner/bin/infer.rst:
--------------------------------------------------------------------------------
 1 | sinatools.ner.entity_extractor
 2 | +++++++++++++++++++++++++++
 3 | 
 4 | 
 5 | .. automodule:: sinatools.ner.entity_extractor
 6 |    :members:
 7 |    :show-inheritance:
 8 | 
 9 | 
10 | .. toctree::
11 |    :maxdepth: 1
12 |    :caption: Modules:
13 | 


--------------------------------------------------------------------------------
/docs/source/api/morphology.rst:
--------------------------------------------------------------------------------
 1 | sinatools.morphology
 2 | ===================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.morphology
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    morphology/morph_analyzer
16 | 


--------------------------------------------------------------------------------
/docs/source/api/morphology/morph_analyzer.rst:
--------------------------------------------------------------------------------
 1 | sinatools.morphology.morph_analyzer
 2 | ++++++++++++++++++++++++++++++++++
 3 | 
 4 | .. automodule:: sinatools.morphology.morph_analyzer
 5 |    :members:
 6 | 
 7 | 
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/docs/source/api/salma.rst:
--------------------------------------------------------------------------------
 1 | sinatools.salma
 2 | ==============
 3 | 
 4 | 
 5 | .. automodule:: sinatools.salma
 6 |    :members:
 7 |    :show-inheritance:
 8 | 
 9 | 
10 | .. toctree::
11 |    :maxdepth: 1
12 |    :caption: Modules:
13 | 
14 |    salma/views


--------------------------------------------------------------------------------
/docs/source/api/salma/views.rst:
--------------------------------------------------------------------------------
 1 | sinatools.salma.views
 2 | ++++++++++++++++++++
 3 | 
 4 | 
 5 | .. automodule:: sinatools.salma.views
 6 |    :members:
 7 |    :show-inheritance:
 8 | 
 9 | 
10 | .. toctree::
11 |    :maxdepth: 1
12 |    :caption: Modules:
13 | 


--------------------------------------------------------------------------------
/docs/source/api/utils.rst:
--------------------------------------------------------------------------------
 1 | sinatools.utils
 2 | ==============
 3 | 
 4 | 
 5 | .. automodule:: sinatools.utils
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    utils/parser
16 |    utils/implication
17 |    utils/jaccard
18 |    utils/text_transliteration
19 |    utils/sentence_tokenizer
20 |    utils/corpus_tokenizer


--------------------------------------------------------------------------------
/docs/source/api/utils/corpus_tokenizer.rst:
--------------------------------------------------------------------------------
1 | sinatools.utils.corpus_tokenizer
2 | +++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.utils.corpus_tokenizer
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/api/utils/implication.rst:
--------------------------------------------------------------------------------
1 | sinatools.utils.implication
2 | ++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.utils.implication
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/api/utils/jaccard.rst:
--------------------------------------------------------------------------------
1 | sinatools.utils.jaccard
2 | ++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.utils.jaccard
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/api/utils/parser.rst:
--------------------------------------------------------------------------------
 1 | sinatools.utils.parser
 2 | +++++++++++++++++++++
 3 | 
 4 | .. automodule:: sinatools.utils.parser
 5 |    :members:
 6 |    :show-inheritance:
 7 | 
 8 | 
 9 | .. toctree::
10 |    :maxdepth: 1
11 |    :caption: Modules:


--------------------------------------------------------------------------------
/docs/source/api/utils/sentence_tokenizer.rst:
--------------------------------------------------------------------------------
1 | sinatools.utils.sentence_tokenizer
2 | +++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.utils.sentence_tokenizer
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/api/utils/text_transliteration.rst:
--------------------------------------------------------------------------------
1 | sinatools.utils.text_transliteration
2 | +++++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.utils.text_transliteration
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../../AUTHORS.rst
2 | 


--------------------------------------------------------------------------------
/docs/source/cli_tools.rst:
--------------------------------------------------------------------------------
 1 | SinaTools Command Line
 2 | =======================
 3 | .. toctree::
 4 |    :maxdepth: 2
 5 |    :titlesonly:
 6 |    :caption: Modules:
 7 | 
 8 |  
 9 |    cli_tools/utils
10 |    cli_tools/morphology
11 |    cli_tools/ner
12 |    cli_tools/salma
13 |    cli_tools/DataDownload
14 |          
15 | 
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/docs/source/cli_tools/DataDownload.rst:
--------------------------------------------------------------------------------
 1 | sinatools.CLI.DataDownload
 2 | =========================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.CLI.DataDownload
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    DataDownload/download_files
16 | 


--------------------------------------------------------------------------------
/docs/source/cli_tools/DataDownload/download_files.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.DataDownload.download_files
2 | ++++++++++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.DataDownload.download_files
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/DataDownload/get_appdatadir.rst:
--------------------------------------------------------------------------------
1 | CLI.DataDownload.get_appdatadir
2 | ++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: CLI.DataDownload.get_appdatadir
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/arabiner.rst:
--------------------------------------------------------------------------------
 1 | sinatools.CLI.ner
 2 | =========================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.CLI.ner
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    ner
16 |      
17 |    
18 |    
19 |    
20 |    
21 |    


--------------------------------------------------------------------------------
/docs/source/cli_tools/arabiner/infer.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.ner.entity_extractor
2 | +++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.ner.entity_extractor
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/morphology.rst:
--------------------------------------------------------------------------------
 1 | sinatools.CLI.morphology
 2 | =======================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.CLI.morphology
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    morphology/morph_analyzer
16 |    morphology/ALMA_multi_word
17 |    
18 |    
19 |    
20 |    
21 |    
22 |    


--------------------------------------------------------------------------------
/docs/source/cli_tools/morphology/ALMA_multi_word.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.morphology.ALMA_multi_word
2 | +++++++++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.morphology.ALMA_multi_word
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/morphology/morph_analyzer.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.morphology.morph_analyzer
2 | ++++++++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.morphology.morph_analyzer
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/salma.rst:
--------------------------------------------------------------------------------
 1 | sinatools.CLI.salma
 2 | ==================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.CLI.salma
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    salma/salma_tools
16 |    
17 |    
18 |    
19 |    
20 |    
21 |    
22 |    


--------------------------------------------------------------------------------
/docs/source/cli_tools/salma/salma_tools.rst:
--------------------------------------------------------------------------------
 1 | sinatools.CLI.salma.salma_tools
 2 | ==============================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.CLI.salma.salma_tools
 6 |    :members:
 7 |    :show-inheritance:
 8 | 
 9 | 
10 | .. toctree::
11 |    :maxdepth: 1
12 |    :caption: Modules:
13 | 
14 |    salma/salma_tools
15 |      
16 |    
17 |    
18 |    
19 |    
20 |    


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils.rst:
--------------------------------------------------------------------------------
 1 | sinatools.CLI.utils
 2 | ==================
 3 | 
 4 | 
 5 | .. automodule:: sinatools.CLI.utils
 6 |    :members:
 7 |    :undoc-members:
 8 |    :show-inheritance:
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Modules:
14 | 
15 |    utils/arStrip
16 |    utils/latin_remove
17 |    utils/remove_punc
18 |    utils/implication
19 |    utils/sentence_tokenizer
20 |    utils/text_transliteration
21 |    utils/jaccard
22 |    utils/corpus_tokenizer


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils/arStrip.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.utils.arStrip
2 | ++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.utils.arStrip
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils/corpus_tokenizer.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.utils.corpus_tokenizer
2 | +++++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.utils.corpus_tokenizer
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils/implication.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.utils.implication
2 | ++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.utils.implication
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils/jaccard.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.utils.jaccard
2 | +++++++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.utils.jaccard
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils/latin_remove.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.utils.latin_remove
2 | +++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.utils.latin_remove
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils/remove_punc.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.utils.remove_punc
2 | ++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.utils.remove_punc
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils/sentence_tokenizer.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.utils.sentence_tokenizer
2 | +++++++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.utils.sentence_tokenizer
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/cli_tools/utils/text_transliteration.rst:
--------------------------------------------------------------------------------
1 | sinatools.CLI.utils.text_transliteration
2 | +++++++++++++++++++++++++++++++++++++++
3 | 
4 | .. automodule:: sinatools.CLI.utils.text_transliteration
5 |    :members:


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | SinaTools Documentation
 2 | =======================
 3 | 
 4 | .. image:: _images/SinaLogo.jpg
 5 |    :alt: SinaTools Logo
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 1
 9 |    :caption: Contents:
10 | 
11 |    about
12 |    installation
13 |    cli_tools
14 |    api
15 |    License
16 | 
17 | 
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
 1 | .. highlight:: shell
 2 | 
 3 | ===============
 4 | Getting Started
 5 | ===============
 6 | 
 7 | Installation
 8 | ------------
 9 | 
10 | You will need Python 3.10.8 (64-bit) as well as
11 | `the Rust compiler <https://www.rust-lang.org/learn/get-started>`_ installed.
12 | 
13 | 
14 | Install using pip
15 | ^^^^^^^^^^^^^^^^^
16 | 
17 | To install sinatools, run this command in your terminal:
18 | 
19 | .. code-block:: console
20 | 
21 |     $ pip install sinatools
22 | 
23 | This is the preferred method to install sinatools, as it will always install the most recent stable release.
24 | 
25 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
26 | you through the process.
27 | 
28 | .. _pip: https://pip.pypa.io
29 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
30 | 
31 | 
32 | Install from source
33 | ^^^^^^^^^^^^^^^^^^^
34 | 
35 | The sources for sinatools can be downloaded from the `Github repo`_
36 | 
37 | You can either clone the public repository:
38 | 
39 | .. code-block:: console
40 | 
41 |     $ git clone git://github.com/SinaLab/sinatools/
42 | 
43 | Or download the `tarball`_:
44 | 
45 | .. code-block:: console
46 | 
47 |     $ curl -OJL https://github.com/SinaLab/sinatools/tarball/master
48 | 
49 | Once you have a copy of the source, you can install it with:
50 | 
51 | .. code-block:: console
52 | 
53 |     $ python setup.py install
54 | 
55 | 
56 | .. _Github repo: https://github.com/SinaLab/sinatools/
57 | .. _tarball: https://github.com/SinaLab/sinatools/tarball/master
58 | 
59 | 
60 | Installing data
61 | ^^^^^^^^^^^^^^^
62 | 
63 | To install the data sets required by SinaTools See :doc:`reference/packages`.
64 | 
65 | 
66 | By default, data is stored in
67 | ``C:\Users\your_user_name\AppData\Roaming\sinatools``.
68 | 
69 | 
70 | Next Steps
71 | ----------
72 | 
73 | To get started, you can follow along
74 | `the Guided Tour <https://colab.research.google.com/>`_
75 | for a quick overview of the components provided by SinaTools.
76 | 
77 | See :doc:`cli_tools` for information on using the command-line tools or 
78 | :doc:`api` for information on using the Python API.
79 | 


--------------------------------------------------------------------------------
/docs/source/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../../README.rst
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | version_file = os.path.join(os.path.dirname(__file__),
 3 | 	'sinatools',
 4 | 	'VERSION')
 5 | with open(version_file, encoding = 'utf-8') as version_fp:
 6 | 	VERSION = version_fp.read().strip()
 7 | current_version = VERSION
 8 | commit = True
 9 | tag = True
10 | 
11 | [bumpversion:file:setup.py]
12 | search = version='{current_version}'
13 | replace = version='{new_version}'
14 | 
15 | [bumpversion:file:sinatools/__init__.py]
16 | search = __version__ = '{current_version}'
17 | replace = __version__ = '{new_version}'
18 | 
19 | [bdist_wheel]
20 | universal = 1
21 | 
22 | [flake8]
23 | exclude = docs
24 | 
25 | [aliases]
26 | test = pytest
27 | 
28 | [tool:pytest]
29 | collect_ignore = ['setup.py']
30 | 
31 | [egg_info]
32 | tag_build = 
33 | tag_date = 0
34 | 
35 | 


--------------------------------------------------------------------------------
/sinatools/CLI/DataDownload/__pycache__/download_files.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/DataDownload/__pycache__/download_files.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/DataDownload/__pycache__/download_files.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/DataDownload/__pycache__/download_files.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/DataDownload/download_files.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | 
 5 | The download_files command, allows users to select specific files and models to download and use it within SinaTools modules. Additionally, it automatically manages the extraction of compressed files, including zip and tar.gz formats.
 6 | 
 7 | Usage:
 8 | ------
 9 | 
10 | Below is the usage information that can be generated by running download_files --help.
11 | 
12 | .. code-block:: none
13 | 
14 |     Usage:
15 |      download_files [OPTIONS]
16 | 
17 | .. code-block:: none
18 | 
19 |         Options:
20 |         -f, --files FILES
21 |             Names of the files to download. Available files are: ner, morph, wsd, synonyms. 
22 |             If no file is specified, all files will be downloaded.
23 | 
24 | Examples:
25 | ---------
26 | 
27 | .. code-block:: none
28 | 
29 |     download_files -f morph ner 
30 |     This command will download only the `morph` and `ner` files to the default directory.
31 | """
32 | 
33 | import argparse
34 | from sinatools.DataDownload.downloader import download_file
35 | from sinatools.DataDownload.downloader import download_files
36 | from sinatools.DataDownload.downloader import get_appdatadir
37 | from sinatools.DataDownload.downloader import download_folder_from_hf
38 | from sinatools.DataDownload.downloader import urls
39 | 
40 | 
41 | def main():
42 |     parser = argparse.ArgumentParser(description="Download files from specified URLs.")
43 |     parser.add_argument('-f', '--files', nargs="*",
44 |                         help="Names of the files to download. Available files are: "
45 |                              f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")
46 |     
47 |     get_appdatadir()
48 | 
49 |     args = parser.parse_args()
50 | 
51 |     if args.files:
52 |         for file in args.files:
53 |             print("file: ", file)
54 |             if file == "wsd":
55 |                 download_file(urls["morph"])
56 |                 download_file(urls["ner"])
57 |                 #download_file(urls["wsd_model"])
58 |                 #download_file(urls["wsd_tokenizer"])
59 |                 download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
60 |                 download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02")
61 |                 download_file(urls["one_gram"])
62 |                 download_file(urls["five_grams"])
63 |                 download_file(urls["four_grams"])
64 |                 download_file(urls["three_grams"])
65 |                 download_file(urls["two_grams"])
66 |             elif file == "synonyms":
67 |                 download_file(urls["graph_l2"])
68 |                 download_file(urls["graph_l3"])
69 |             else:
70 |                url = urls[file]
71 |                download_file(url)
72 |     else:
73 |         download_files()
74 | 
75 | if __name__ == '__main__':
76 |     main()
77 | 


--------------------------------------------------------------------------------
/sinatools/CLI/morphology/ALMA_multi_word.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 3 | import json
 4 | from sinatools.utils.readfile import read_file
 5 | 
 6 | def main():
 7 |     parser = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools')
 8 |     
 9 |     # Adding arguments for the multi-word input or file containing the multi-word input
10 |     parser.add_argument('--multi_word', type=str, help='Multi-word text to be analyzed')
11 |     parser.add_argument('--file', type=str, help='File containing the multi-word text to be analyzed')
12 | 
13 |     args = parser.parse_args()
14 | 
15 |     if args.multi_word is None and args.file is None:
16 |         print("Error: Either --multi_word or --file argument must be provided.")
17 |         return
18 | 
19 |     # Get the input either from the --multi_word argument or from the file specified in the --file argument
20 |     multi_word_text = args.multi_word if args.multi_word else " ".join(read_file(args.file))
21 | 
22 |     # Perform multi-word analysis
23 |     results = ALMA_multi_word(multi_word_text)
24 |     
25 |     # Print the results in JSON format
26 |     print(json.dumps(results, ensure_ascii=False, indent=4))
27 | 
28 | if __name__ == '__main__':
29 |     main()
30 | #alma_multi_word --multi_word "Your multi-word text here"
31 | #alma_multi_word --file "path/to/your/file.txt"
32 | 


--------------------------------------------------------------------------------
/sinatools/CLI/morphology/__pycache__/morph_analyzer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/morphology/__pycache__/morph_analyzer.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/morphology/__pycache__/morph_analyzer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/morphology/__pycache__/morph_analyzer.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/ner/__pycache__/corpus_entity_extractor.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/ner/__pycache__/corpus_entity_extractor.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/ner/__pycache__/entity_extractor.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/ner/__pycache__/entity_extractor.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/ner/__pycache__/entity_extractor.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/ner/__pycache__/entity_extractor.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/ner/__pycache__/entity_extractor.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/ner/__pycache__/entity_extractor.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/relations/relation_extractor.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The relation_extractor command is designed to extract events and their corresponding arguments (agents, locations, and dates) within a text using the SinaTools relation_extractor API.
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running relation_extractor --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     relation_extractor --text=TEXT [OPTIONS]
13 |     relation_extractor --file=FILE [OPTIONS]
14 | 
15 | Options:
16 | --------
17 | 
18 | .. code-block:: none
19 | 
20 |   --text TEXT
21 |         The text from which events need to be extracted.
22 | 
23 |   --file FILE
24 |         File containing the text from which events need to be extracted.
25 | 
26 | Examples:
27 | ---------
28 | 
29 | .. code-block:: none
30 | 
31 |   relation_extractor --text "Your Arabic text here"
32 |   relation_extractor --file "path/to/your/file.txt"
33 | 
34 | """
35 | 
36 | import argparse
37 | from sinatools.relations.relation_extractor import event_argument_relation_extraction
38 | from sinatools.utils.readfile import read_file
39 | 
40 | def main():
41 |     parser = argparse.ArgumentParser(description='Relation Extraction using SinaTools')
42 |       
43 |     parser.add_argument('--text', type=str, help='The text from which events need to be extracted.')
44 |     parser.add_argument('--file', type=str, help='File containing the text from which events need to be extracted.')
45 | 
46 |     args = parser.parse_args()
47 | 
48 |     if args.text is None and args.file is None:
49 |         print("Error: Either --text or --file argument must be provided.")
50 |         return
51 | 
52 |     input_text = args.text if args.text else " ".join(read_file(args.file))
53 | 
54 |     results = event_argument_relation_extraction(input_text)
55 | 
56 |     for result in results:
57 |         print(result)
58 | 
59 | if __name__ == '__main__':
60 |     main()
61 | 


--------------------------------------------------------------------------------
/sinatools/CLI/semantic_relatedness/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/semantic_relatedness/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/semantic_relatedness/__pycache__/compute_relatedness.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/semantic_relatedness/__pycache__/compute_relatedness.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/semantic_relatedness/__pycache__/settings.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/semantic_relatedness/__pycache__/settings.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/semantic_relatedness/compute_relatedness.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The semantic_relatedness command line interface (CLI) is designed to computes the degree of association between two sentences. As follows:
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running semantic_relatedness --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     semantic_relatedness –-sentence1 "your Arabic sentence here" --sentence2 "your Arabic sentence here"
13 | 
14 | Options:
15 | --------
16 | .. code-block:: none
17 | 
18 |     --sentence1 INPUT_TEXT
19 |           The Arabic sentence to find the semantic relatedness between it and the second sentence.
20 |     --sentence2 INPUT_TEXT
21 |           The Arabic sentence to find the semantic relatedness between it and the first sentence.
22 | 
23 | Examples:
24 | ---------
25 | .. code-block:: none
26 | 
27 |     semantic_relatedness --sentence1 "Your text here" --sentence2 "Your text here"
28 | """
29 | 
30 | import argparse
31 | from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score
32 | 
33 | def main():
34 |     parser = argparse.ArgumentParser(description='Computes the degree of association between two sentences across various dimensions, meaning, underlying concepts, domain-specificity, topic overlap, viewpoint alignment.')
35 |       
36 |     parser.add_argument('--sentence1', type=str, help='The first sentence to be compute similarity based on')
37 |     parser.add_argument('--sentence2', type=str, help='The second sentence to be compute similarity based on')
38 |     
39 | 
40 |     args = parser.parse_args()
41 | 
42 |     if args.sentence1 is None and args.sentence2 is None:
43 |         print("Error: Either --sentence1 or --sentence2 argument must be provided.")
44 |         return
45 | 
46 |     score = get_similarity_score(args.sentence1, args.sentence2)
47 | 
48 |     print(score)
49 | 
50 | if __name__ == '__main__':
51 |     main()
52 | 


--------------------------------------------------------------------------------
/sinatools/CLI/synonyms/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/synonyms/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/synonyms/__pycache__/synonyms_generator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/synonyms/__pycache__/synonyms_generator.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/synonyms/evaluate_synonyms.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The evaluate_synonyms command line interface (CLI) is designed to take a set of synonyms and a level number, then evaluate how much these synonyms are realy a synonyms in this set. As follows:
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running synonyms --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     evaluate_synonyms –-synset "your synset here" --level level_number
13 | 
14 | Options:
15 | --------
16 | .. code-block:: none
17 | 
18 |     --synset INPUT_TEXT
19 |             A set of synonyms to be evaluated (string of synonyms seperated by |)
20 |     --level Integer
21 |             The level number indicating the depth of synonym extension. Which could be 2 or 3
22 | 
23 | Examples:
24 | ---------
25 | .. code-block:: none
26 | 
27 |     evaluate_synonyms --synset "ممر | طريق" --level 2
28 | """
29 | 
30 | import argparse
31 | from sinatools.synonyms.synonyms_generator import evaluate_synonyms
32 | 
33 | def main():
34 |     parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')
35 |       
36 |     parser.add_argument('--synset', type=str, help='Set of synonyms seperated by |')
37 |     parser.add_argument('--level', type=int, help='The depth of edges the algorithm needs to reach')
38 | 
39 |     args = parser.parse_args()
40 | 
41 |     if args.synset is None and args.level is None:
42 |         print("Error: Either --synset or --level argument must be provided.")
43 |         return
44 | 
45 |     results = evaluate_synonyms(args.synset, args.level)
46 | 
47 |     print(results)
48 | 
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/sinatools/CLI/synonyms/extend_synonyms.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The extend_synonyms command line interface (CLI) is designed to receives a set of synonyms and a level number, then extends this set of synonyms with additional synonyms. Each synonym is assigned a fuzzy value based on the specified level. As follows:
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running synonyms --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     extend_synonyms –-synset "your synset here" --level level_number
13 | 
14 | Options:
15 | --------
16 | .. code-block:: none
17 | 
18 |     --synset INPUT_TEXT
19 |           A set of initial synonyms to be extended (string of synonyms seperated by |)
20 |     --level Integer
21 |           The level number indicating the depth of synonym extension. Which could be 2 or 3
22 | 
23 | Examples:
24 | ---------
25 | .. code-block:: none
26 | 
27 |     extend_synonyms --synset "ممر | طريق" --level 2
28 | """
29 | 
30 | import argparse
31 | from sinatools.synonyms.synonyms_generator import extend_synonyms
32 | 
33 | def main():
34 |     parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')
35 |       
36 |     parser.add_argument('--synset', type=str, help='Set of synonyms seperated by |')
37 |     parser.add_argument('--level', type=int, help='The depth of edges the algorithm needs to reach')
38 | 
39 |     args = parser.parse_args()
40 | 
41 |     if args.synset is None and args.level is None:
42 |         print("Error: Either --synset or --level argument must be provided.")
43 |         return
44 | 
45 |     results = extend_synonyms(args.synset, args.level)
46 | 
47 |     print(results)
48 | 
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__init__.py


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/arStrip.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/arStrip.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/corpus_tokenizer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/corpus_tokenizer.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/implication.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/implication.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/jaccard.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/jaccard.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/remove_latin.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/remove_latin.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/remove_punctuation.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/remove_punctuation.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/sentence_tokenizer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/sentence_tokenizer.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/__pycache__/text_transliteration.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/utils/__pycache__/text_transliteration.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/utils/corpus_tokenizer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | About:
 4 | ------
 5 | The corpus_tokenizer command offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file.
 6 | 
 7 | Usage:
 8 | -------
 9 | Below is the usage information that can be generated by running corpus_tokenizer --help.
10 | 
11 | .. code-block:: none
12 | 
13 |     Usage:
14 |         corpus_tokenizer dir_path output_csv
15 | 
16 | .. code-block:: none
17 |     dir_path
18 |         The path to the directory containing the text files.
19 | 
20 |     output_csv
21 |         The path to the output CSV file.
22 | 
23 | Examples:
24 | ---------
25 | .. code-block:: none
26 |     corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv  "outputFile.csv"
27 | """
28 | 
29 | import argparse
30 | from sinatools.utils.tokenizer import corpus_tokenizer
31 | 
32 | # Define the main function that will parse the arguments
33 | def main():
34 |     # Create an ArgumentParser object
35 |     parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
36 |     
37 |     # Add arguments to the parser
38 |     parser.add_argument('--dir_path', type=str, help='The path to the directory containing the text files.')
39 |     parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.')
40 |     
41 |     # Parse the command-line arguments
42 |     args = parser.parse_args()
43 |     
44 |     # Call the corpus_tokenizer function with the parsed arguments
45 |     corpus_tokenizer(args.dir_path, args.output_csv)
46 | 
47 | # Call the main function when the script is executed
48 | if __name__ == '__main__':
49 |     main()
50 | 
51 | 


--------------------------------------------------------------------------------
/sinatools/CLI/utils/implication.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The implication tool evaluates the implication between two words using the functionalities provided by the `Implication` class of SinaTools. This tool can be utilized to determine the relationship between two words and understand if one implies the other.
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running implication --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     Usage:
13 |         implication --inputWord1=WORD1 --inputWord2=WORD2
14 |         
15 |         implication --inputFile1=File1 --inputFile2=File2  
16 | 
17 | .. code-block:: none
18 | 
19 |     Options:
20 |       --inputWord1 WORD1
21 |             First input word.
22 | 
23 |       --inputWord2 WORD2
24 |             Second input word.
25 | 
26 |       --file1 FILE1
27 |             File containing the words to evaluate the implication.
28 | 
29 |       --file2 FILE2
30 |             File containing the words to evaluate the implication.
31 | Examples:
32 | ---------
33 | 
34 | .. code-block:: none
35 | 
36 |       implication --inputWord1 "word1" --inputWord2 "word2"
37 |       
38 |       implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
39 | 
40 | """
41 | import argparse
42 | from sinatools.utils.word_compare import Implication
43 | 
44 | def read_file(file_path):
45 |     with open(file_path, 'r', encoding='utf-8') as file:
46 |         word = file.readline().strip()
47 |         if word:
48 |             return word
49 |         else:
50 |             raise ValueError(f"File {file_path} must contain at least one word.")
51 | 
52 | def main():
53 |     parser = argparse.ArgumentParser(description='Evaluate Implication between two words using SinaTools')
54 |     
55 |     # Adding optional arguments for the two input words and the files
56 |     parser.add_argument('--inputWord1', type=str, help='First input word')
57 |     parser.add_argument('--inputWord2', type=str, help='Second input word')
58 |     parser.add_argument('--file1', type=str, help='File containing the first word to evaluate implication')
59 |     parser.add_argument('--file2', type=str, help='File containing the second word to evaluate implication')
60 | 
61 |     args = parser.parse_args()
62 | 
63 |     if args.file1 and args.file2:
64 |         word1 = read_file(args.file1)
65 |         word2 = read_file(args.file2)
66 |     elif args.inputWord1 and args.inputWord2:
67 |         word1, word2 = args.inputWord1, args.inputWord2
68 |     else:
69 |         print("Either --file1 and --file2 arguments or both --inputWord1 and --inputWord2 arguments must be provided.")
70 |         return
71 | 
72 |     # Instantiate the Implication class
73 |     implication_obj = Implication(word1, word2)
74 |     
75 |     # For this example, assuming there is a method `get_verdict()` in the Implication class.
76 |     result = implication_obj.get_verdict()
77 |     print(result)
78 | 
79 | if __name__ == '__main__':
80 |     main()
81 | # implication --inputWord1 "word1" --inputWord2 "word2"
82 | # implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt"
83 | 
84 | 
85 | 


--------------------------------------------------------------------------------
/sinatools/CLI/utils/remove_latin.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The remove_latin command performs delete latin characters from the input text.
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running remove_latin --help.
 9 | 
10 | .. code-block:: none
11 |     remove_latin --text=TEXT
12 |     remove_latin --file "path/to/your/file.txt"
13 | 
14 | Examples:
15 | ---------
16 | .. code-block:: none
17 |     remove_latin --text "123test"    
18 |     remove_latin --file "path/to/your/file.txt"
19 | """
20 | 
21 | import argparse
22 | from sinatools.utils.parser import remove_latin
23 | 
24 | 
25 | def main():
26 |     parser = argparse.ArgumentParser(description='remove latin characters from the text')
27 | 
28 |     parser.add_argument('--text', type=str, required=True, help='The input text')
29 |     args = parser.parse_args()
30 |     result = remove_latin(args.text)
31 |  
32 |     print(result)
33 |     if __name__ == '__main__':
34 |         main()
35 | 


--------------------------------------------------------------------------------
/sinatools/CLI/utils/remove_punctuation.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The remove_punctuation command performs delete punctuation marks from the input text.
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running remove_punctuation --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     Usage:
13 |         remove_punctuation --text=TEXT
14 |         remove_punctuation --file "path/to/your/file.txt"
15 | 
16 | Examples:
17 | ---------
18 | .. code-block:: none
19 | 
20 |     remove_punctuation --text "te%s@t...!!?"    
21 |     remove_punctuation --file "path/to/your/file.txt"
22 | """
23 | 
24 | import argparse
25 | from sinatools.utils.parser import remove_punctuation
26 | #from sinatools.utils.parser import read_file
27 | #from sinatools.utils.parser import write_file
28 | 
29 | 
30 | def main():
31 |     parser = argparse.ArgumentParser(description='remove punctuation marks from the text')
32 | 
33 |     parser.add_argument('--text',required=True,help="input text")
34 |    # parser.add_argument('myFile', type=argparse.FileType('r'),help='Input file csv')
35 |     args = parser.parse_args()
36 |     result = remove_punctuation(args.text)
37 |  
38 |     print(result)
39 |     if __name__ == '__main__':
40 |         main()
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/sinatools/CLI/utils/sentence_tokenizer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | 
 5 | The sentence_tokenizer command allows you to tokenize text into sentences using the SinaTools utility. It provides 
 6 | flexibility in tokenizing at different punctuation marks, including dots, question marks, and exclamation marks. It also 
 7 | allows tokenization at new lines.
 8 | 
 9 | Usage:
10 | ------
11 | Below is the usage information that can be generated by running sentence_tokenizer --help.
12 | 
13 | .. code-block:: none
14 | 
15 |     Usage:
16 |         sentence_tokenizer --text=TEXT [options]
17 |         sentence_tokenizer --file=FILE [options]
18 |         
19 | .. code-block:: none
20 |       
21 |     Options:
22 |       --text TEXT
23 |             Text to be tokenized into sentences.
24 |       --file FILE
25 |             File containing the text to be tokenized into sentences
26 |       --dot
27 |             Tokenize at dots.
28 |       --new_line
29 |             Tokenize at new lines.
30 |       --question_mark
31 |             Tokenize at question marks.
32 |       --exclamation_mark
33 |             Tokenize at exclamation marks.
34 | 
35 | Examples:
36 | ---------
37 | 
38 | .. code-block:: none
39 | 
40 |   sentence_tokenizer --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
41 |   sentence_tokenizer --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
42 | 
43 | """
44 | import argparse
45 | from sinatools.utils.tokenizer import sentence_tokenizer
46 | from sinatools.utils.readfile import read_file
47 | 
48 | def main():
49 |     parser = argparse.ArgumentParser(description='Sentence Tokenization using SinaTools')
50 |     
51 |     # Adding arguments for the text, file, and tokenization options
52 |     parser.add_argument('--text', type=str, help='Text to be tokenized into sentences')
53 |     parser.add_argument('--file', type=str, help='File containing the text to be tokenized into sentences')
54 |     parser.add_argument('--dot', action='store_true', help='Tokenize at dots')
55 |     parser.add_argument('--new_line', action='store_true', help='Tokenize at new lines')
56 |     parser.add_argument('--question_mark', action='store_true', help='Tokenize at question marks')
57 |     parser.add_argument('--exclamation_mark', action='store_true', help='Tokenize at exclamation marks')
58 | 
59 |     args = parser.parse_args()
60 | 
61 |     # Check if either text or file is provided
62 |     if args.text is None and args.file is None:
63 |         print("Either --text or --file argument must be provided.")
64 |         return
65 | 
66 |     text_content = args.text if args.text else read_file(args.file)
67 | 
68 |     # Perform sentence tokenization
69 |     sentences = sentence_tokenizer(" ".join(text_content), dot=args.dot, new_line=args.new_line, 
70 |                               question_mark=args.question_mark, exclamation_mark=args.exclamation_mark)
71 |     
72 |     # Print each sentence in a new line
73 |     for sentence in sentences:
74 |         print(sentence)
75 | 
76 | if __name__ == '__main__':
77 |     main()
78 | 


--------------------------------------------------------------------------------
/sinatools/CLI/utils/text_dublication_detector.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from sinatools.utils.text_dublication_detector import removal
 3 | 
 4 | def main():
 5 |     parser = argparse.ArgumentParser(description='Processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold and cosine similarity. It saves the filtered results and the identified duplicates to separate files.')
 6 |     
 7 |     parser.add_argument('--csv_file', type=str, help='The path to the input CSV file that will be processed.')
 8 |     parser.add_argument('--column_name', type=str, help='The name of the column from which duplicates will be removed.')
 9 |     parser.add_argument('--final_file_name', type=str, help='The name of the output file that will contain the deduplicated results.')
10 |     parser.add_argument('--deleted_file_name', type=str, help='The name of the output file that will contain the records that were identified as duplicates and removed.')
11 |     parser.add_argument('--similarity_threshold', type=float, default=0.8, help='The similarity threshold for determining duplicates. Records with a similarity score above this value will be considered duplicates (default is 0.8).')
12 | 
13 |     args = parser.parse_args()
14 | 
15 |     if args.csv_file is None and args.column_name is None:
16 |         print("Either --csv_file or --column_name argument must be provided.")
17 |         return
18 | 
19 |     removal(args.csv_file, args.column_name, args.final_file_name, args.deleted_file_name, args.similarity_threshold)
20 |     
21 | 
22 | if __name__ == '__main__':
23 |     main()
24 | 
25 | # text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8


--------------------------------------------------------------------------------
/sinatools/CLI/utils/text_transliteration.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | 
 5 | The transliterate tool allows you to transliterate text using the SinaTools' utility. This command-line utility
 6 | takes in a text and a desired schema, and outputs the transliterated text.
 7 | 
 8 | Usage:
 9 | ------
10 | Below is the usage information that can be generated by running transliterate --help.
11 | 
12 |     Usage:
13 |     ------
14 | 
15 | .. code-block:: none
16 | 
17 |         transliterate --text=TEXT --schema=SCHEMA
18 | 
19 |         transliterate --file=FILE --schema=SCHEMA
20 | 
21 | Options:
22 | --------
23 | 
24 | .. code-block:: none
25 | 
26 |   --text TEXT
27 |         Text to be transliterated.
28 |   --schema SCHEMA
29 |         Transliteration schema to be used, which is bw2ar or ar2bw.
30 | 
31 | Examples:
32 | ---------
33 | 
34 | .. code-block:: none
35 | 
36 |     transliterate --text "klmp" --schema "bw2ar"
37 |     transliterate --file "path/to/your/file.txt" --schema "ar2bw"
38 | 
39 | 
40 | """
41 | import argparse
42 | from sinatools.utils.text_transliteration import perform_transliteration
43 | from sinatools.utils.readfile import read_file
44 | 
45 | def main():
46 |     parser = argparse.ArgumentParser(description='Perform text transliteration using SinaTools')
47 |     
48 |     # Adding arguments for the text, file, and schema
49 |     parser.add_argument('--text', type=str, help='Text to be transliterated')
50 |     parser.add_argument('--file', type=str, help='File containing the text to be transliterated')
51 |     parser.add_argument('--schema', type=str, required=True, help='Transliteration schema to be used')
52 | 
53 |     args = parser.parse_args()
54 | 
55 |     # Check if either text or file is provided
56 |     if args.text is None and args.file is None:
57 |         print("Either --text or --file argument must be provided.")
58 |         return
59 | 
60 |     text_content = args.text if args.text else " ".join(read_file(args.file))
61 |     # Perform transliteration
62 |     result = perform_transliteration(text_content, args.schema)
63 |     
64 |     print(result)
65 | 
66 | if __name__ == '__main__':
67 |     main()
68 | 
69 | #transliterate --text "example text" --schema "bw2ar"
70 | #transliterate --file "path/to/your/file.txt" --schema "bw2ar"
71 | 


--------------------------------------------------------------------------------
/sinatools/CLI/wsd/__pycache__/disambiguator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/CLI/wsd/__pycache__/disambiguator.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/CLI/wsd/disambiguator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | About:
 3 | ------
 4 | The WSD command line interface (CLI) is designed to utilize the word sense disambiguation for Arabic sentences. This CLI allows users to input an Arabic sentence and receive concepts and lemma for each token within the sentence. As follows:
 5 | 
 6 | Usage:
 7 | ------
 8 | Below is the usage information that can be generated by running wsd --help.
 9 | 
10 | .. code-block:: none
11 | 
12 |     wsd –-text "your Arabic sentence here"
13 |     wsd –-file path/to/txt/file
14 | 
15 | Options:
16 | --------
17 | .. code-block:: none
18 | 
19 |   --text INPUT_TEXT
20 |         The text that needs to be analyzed for Named Entity Recognition.
21 |   --file txt_file_path
22 |         Directory containing the text files to be analyzed for Named Entity Recognition
23 | 
24 | Examples:
25 | ---------
26 | .. code-block:: none
27 | 
28 |     wsd --text "Your text here"
29 |     wsd --file "path/to/your/txt/file"
30 | 
31 | """
32 | 
33 | import argparse
34 | import json
35 | from sinatools.wsd.disambiguator import disambiguate
36 | from sinatools.utils.readfile import read_file
37 | 
38 | def main():
39 |     parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
40 |     
41 |     parser.add_argument('--text', type=str, help='Input sentence to process')
42 |     parser.add_argument('--file', type=str, help='File containing the Arabic sentence to process')
43 | 
44 |     args = parser.parse_args()
45 | 
46 |     if args.text is None and args.file is None:
47 |         print("Either --text or --file argument must be provided.")
48 |         return
49 |     
50 |     text_content = args.text if args.text else " ".join(read_file(args.file))
51 |     result = disambiguate(text_content)
52 |     print(json.dumps(result, ensure_ascii=False, indent=4))
53 | 
54 | if __name__ == "__main__":
55 |     main()
56 | 
57 | #wsd --text "your Arabic sentence here"
58 | #wsd --file "path/to/your/file.txt"


--------------------------------------------------------------------------------
/sinatools/DataDownload/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__init__.py


--------------------------------------------------------------------------------
/sinatools/DataDownload/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/DataDownload/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/DataDownload/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/DataDownload/__pycache__/downloader.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/downloader.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/DataDownload/__pycache__/downloader.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/downloader.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/DataDownload/__pycache__/downloader.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/DataDownload/__pycache__/downloader.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/VERSION:
--------------------------------------------------------------------------------
1 | 0.1.41


--------------------------------------------------------------------------------
/sinatools/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for sinatools."""
2 | 
3 | __author__ = """SinaLab"""
4 | __email__ = 'sina.institute.bzu@gmail.com'
5 | __version__ = '0.8.5'


--------------------------------------------------------------------------------
/sinatools/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/__pycache__/sinatools.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/__pycache__/sinatools.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/arabert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 


--------------------------------------------------------------------------------
/sinatools/arabert/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/arabert/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/arabert/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/arabert/__pycache__/preprocess.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/preprocess.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/arabert/__pycache__/preprocess.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/preprocess.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/arabert/__pycache__/preprocess.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/arabert/__pycache__/preprocess.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/install_env.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | 
 4 | def main():
 5 |     # Determine the path to the 'environment.yml' file within the package
 6 |     package_dir = os.path.dirname(__file__)
 7 |     env_file = os.path.join(package_dir, 'environment.yml')
 8 | 
 9 |     # Create the conda environment using the 'environment.yml' file
10 |     subprocess.call(["conda", "env", "create", "-f", env_file])
11 | 
12 | if __name__ == "__main__":
13 |     main()
14 | 


--------------------------------------------------------------------------------
/sinatools/morphology/ALMA_multi_word.py:
--------------------------------------------------------------------------------
 1 | from sinatools.utils.parser import arStrip
 2 | from . import five_grams_dict, four_grams_dict  , three_grams_dict , two_grams_dict
 3 | 
 4 | def ALMA_multi_word(multi_word, n):
 5 |     undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False)  # diacs , smallDiacs , shaddah ,  digit , alif , specialChars
 6 |     result_word = []
 7 |     if n == 2:
 8 |         if undiac_multi_word in two_grams_dict.keys():
 9 |             result_word = two_grams_dict[undiac_multi_word]                    
10 |     elif n == 3:
11 |         if undiac_multi_word in three_grams_dict.keys():
12 |             result_word = three_grams_dict[undiac_multi_word]                    
13 |     elif n == 4:
14 |         if undiac_multi_word in four_grams_dict.keys():
15 |             result_word = four_grams_dict[undiac_multi_word]                    
16 |     else:    
17 |      if undiac_multi_word in five_grams_dict.keys():
18 |          result_word = five_grams_dict[undiac_multi_word]            
19 |     
20 |     my_json = {}
21 |     output_list = []
22 |     my_json['multi_word_lemma'] = multi_word
23 |     my_json['undiac_multi_word_lemma'] = multi_word
24 |     ids = []
25 |     if result_word != []:
26 |         my_json['POS'] = result_word[0][1] #POS
27 |         for result in result_word: 
28 |            ids.append(result[3])
29 |         my_json['ids'] = ids
30 |         output_list.append(my_json)    
31 |     return output_list  


--------------------------------------------------------------------------------
/sinatools/morphology/__init__.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | from sinatools.DataDownload import downloader
 3 | import os 
 4 | 
 5 | dictionary = {}
 6 | five_grams_dict = {} 
 7 | four_grams_dict = {}
 8 | three_grams_dict = {}
 9 | two_grams_dict = {} 
10 | 
11 | filename = 'lemmas_dic.pickle'
12 | path = downloader.get_appdatadir()
13 | file_path = os.path.join(path, filename)
14 | with open(file_path, 'rb') as f:
15 |     dictionary = pickle.load(f)
16 | 
17 | filename_five = 'five_grams.pickle'
18 | path =downloader.get_appdatadir()
19 | file_path = os.path.join(path, filename_five)
20 | with open(file_path, 'rb') as f:
21 |    five_grams_dict = pickle.load(f, encoding='utf-8')
22 | 
23 | 
24 | filename_four = 'four_grams.pickle'
25 | path =downloader.get_appdatadir()
26 | file_path = os.path.join(path, filename_four)
27 | with open(file_path, 'rb') as f:
28 |   four_grams_dict = pickle.load(f, encoding='utf-8')
29 |   
30 |      
31 | filename_three = 'three_grams.pickle'
32 | path =downloader.get_appdatadir()
33 | file_path = os.path.join(path, filename_three)
34 | with open(file_path, 'rb') as f:
35 |   three_grams_dict = pickle.load(f, encoding='utf-8')
36 |   
37 | 
38 | filename_two = 'two_grams.pickle'
39 | path =downloader.get_appdatadir()
40 | file_path = os.path.join(path, filename_two)
41 | with open(file_path, 'rb') as f:
42 |   two_grams_dict = pickle.load(f, encoding='utf-8')
43 |   


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/ALMA_multi_word.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/charsets.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/charsets.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/morph_analyzer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/morph_analyzer.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/morph_analyzer.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/morph_analyzer.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/morph_analyzer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/morph_analyzer.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/settings.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/settings.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/morphology/__pycache__/tokenizers_words.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/morphology/__pycache__/tokenizers_words.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__init__.py:
--------------------------------------------------------------------------------
 1 | from sinatools.DataDownload import downloader
 2 | import os
 3 | from sinatools.ner.helpers import load_object
 4 | import pickle
 5 | import os
 6 | import torch
 7 | import pickle
 8 | import json
 9 | from argparse import Namespace
10 | 
11 | tagger = None
12 | tag_vocab = None
13 | train_config = None
14 | 
15 | filename = 'Wj27012000.tar'
16 | path =downloader.get_appdatadir()
17 | model_path = os.path.join(path, filename)
18 | 
19 | _path = os.path.join(model_path, "tag_vocab.pkl")
20 | 
21 | with open(_path, "rb") as fh:
22 |     tag_vocab = pickle.load(fh)
23 | 
24 | train_config = Namespace()
25 | args_path = os.path.join(model_path, "args.json")
26 | 
27 | with open(args_path, "r") as fh:
28 |     train_config.__dict__ = json.load(fh)
29 | 
30 | model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
31 | model = torch.nn.DataParallel(model)
32 | 
33 | if torch.cuda.is_available():
34 |     model = model.cuda()
35 | 
36 | train_config.trainer_config["kwargs"]["model"] = model
37 | tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
38 | tagger.load(os.path.join(model_path,"checkpoints"))
39 | 


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/data.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/data.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/data_format.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data_format.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/data_format.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data_format.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/data_format.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/data_format.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/datasets.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/datasets.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/datasets.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/datasets.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/entity_extractor.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/entity_extractor.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/entity_extractor.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/entity_extractor.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/entity_extractor.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/entity_extractor.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/helpers.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/helpers.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/helpers.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/helpers.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/helpers.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/helpers.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/metrics.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/metrics.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/metrics.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/metrics.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/metrics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/metrics.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/relation_extractor.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/relation_extractor.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/transforms.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/transforms.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/__pycache__/transforms.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/__pycache__/transforms.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__init__.py:
--------------------------------------------------------------------------------
1 | from sinatools.ner.data.datasets import NestedTagsDataset


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/datasets.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/datasets.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/datasets.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/datasets.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/datasets.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/datasets.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/transforms.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/transforms.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/transforms.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/transforms.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/data/__pycache__/transforms.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/data/__pycache__/transforms.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/metrics.py:
--------------------------------------------------------------------------------
 1 | from seqeval.metrics import (
 2 |     classification_report,
 3 |     precision_score,
 4 |     recall_score,
 5 |     f1_score,
 6 |     accuracy_score,
 7 | )
 8 | from seqeval.scheme import IOB2
 9 | from types import SimpleNamespace
10 | import logging
11 | import re
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | def compute_nested_metrics(segments, vocabs):
17 |     """
18 |     Compute metrics for nested NER
19 |     :param segments: List[List[arabiner.data.dataset.Token]] - list of segments
20 |     :return: metrics - SimpleNamespace - F1/micro/macro/weights, recall, precision, accuracy
21 |     """
22 |     y, y_hat = list(), list()
23 | 
24 |     # We duplicate the dataset N times, where N is the number of entity types
25 |     # For each copy, we create y and y_hat
26 |     # Example: first copy, will create pairs of ground truth and predicted labels for entity type GPE
27 |     #          another copy will create pairs for LOC, etc.
28 |     for i, vocab in enumerate(vocabs):
29 |         vocab_tags = [tag for tag in vocab.get_itos() if "-" in tag]
30 |         r = re.compile("|".join(vocab_tags))
31 | 
32 |         y += [[(list(filter(r.match, token.gold_tag)) or ["O"])[0] for token in segment] for segment in segments]
33 |         y_hat += [[token.pred_tag[i]["tag"] for token in segment] for segment in segments]
34 | 
35 |     logging.info("\n" + classification_report(y, y_hat, scheme=IOB2, digits=4))
36 | 
37 |     metrics = {
38 |         "micro_f1": f1_score(y, y_hat, average="micro", scheme=IOB2),
39 |         "macro_f1": f1_score(y, y_hat, average="macro", scheme=IOB2),
40 |         "weights_f1": f1_score(y, y_hat, average="weighted", scheme=IOB2),
41 |         "precision": precision_score(y, y_hat, scheme=IOB2),
42 |         "recall": recall_score(y, y_hat, scheme=IOB2),
43 |         "accuracy": accuracy_score(y, y_hat),
44 |     }
45 | 
46 |     return SimpleNamespace(**metrics)
47 | 
48 | 
49 | def compute_single_label_metrics(segments):
50 |     """
51 |     Compute metrics for flat NER
52 |     :param segments: List[List[arabiner.data.dataset.Token]] - list of segments
53 |     :return: metrics - SimpleNamespace - F1/micro/macro/weights, recall, precision, accuracy
54 |     """
55 |     y = [[token.gold_tag[0] for token in segment] for segment in segments]
56 |     y_hat = [[token.pred_tag[0]["tag"] for token in segment] for segment in segments]
57 | 
58 |     logging.info("\n" + classification_report(y, y_hat, scheme=IOB2))
59 | 
60 |     metrics = {
61 |         "micro_f1": f1_score(y, y_hat, average="micro", scheme=IOB2),
62 |         "macro_f1": f1_score(y, y_hat, average="macro", scheme=IOB2),
63 |         "weights_f1": f1_score(y, y_hat, average="weighted", scheme=IOB2),
64 |         "precision": precision_score(y, y_hat, scheme=IOB2),
65 |         "recall": recall_score(y, y_hat, scheme=IOB2),
66 |         "accuracy": accuracy_score(y, y_hat),
67 |     }
68 | 
69 |     return SimpleNamespace(**metrics)
70 | 


--------------------------------------------------------------------------------
/sinatools/ner/nn/BaseModel.py:
--------------------------------------------------------------------------------
 1 | from torch import nn
 2 | from transformers import BertModel
 3 | import logging
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | class BaseModel(nn.Module):
 9 |     def __init__(self,
10 |                  bert_model="aubmindlab/bert-base-arabertv2",
11 |                  num_labels=2,
12 |                  dropout=0.1,
13 |                  num_types=0):
14 |         super().__init__()
15 | 
16 |         self.bert_model = bert_model
17 |         self.num_labels = num_labels
18 |         self.num_types = num_types
19 |         self.dropout = dropout
20 | 
21 |         self.bert = BertModel.from_pretrained(bert_model)
22 |         self.dropout = nn.Dropout(dropout)
23 | 


--------------------------------------------------------------------------------
/sinatools/ner/nn/BertNestedTagger.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from sinatools.ner.nn import BaseModel
 4 | 
 5 | 
 6 | class BertNestedTagger(BaseModel):
 7 |     def __init__(self, **kwargs):
 8 |         super(BertNestedTagger, self).__init__(**kwargs)
 9 | 
10 |         self.max_num_labels = max(self.num_labels)
11 |         classifiers = [nn.Linear(768, num_labels) for num_labels in self.num_labels]
12 |         self.classifiers = torch.nn.Sequential(*classifiers)
13 | 
14 |     def forward(self, x):
15 |         y = self.bert(x)
16 |         y = self.dropout(y["last_hidden_state"])
17 |         output = list()
18 | 
19 |         for i, classifier in enumerate(self.classifiers):
20 |             logits = classifier(y)
21 | 
22 |             # Pad logits to allow Multi-GPU/DataParallel training to work
23 |             # We will truncate the padded dimensions when we compute the loss in the trainer
24 |             logits = torch.nn.ConstantPad1d((0, self.max_num_labels - logits.shape[-1]), 0)(logits)
25 |             output.append(logits)
26 | 
27 |         # Return tensor of the shape B x T x L x C
28 |         # B: batch size
29 |         # T: sequence length
30 |         # L: number of tag types
31 |         # C: number of classes per tag type
32 |         output = torch.stack(output).permute((1, 2, 0, 3))
33 |         return output
34 | 
35 | 


--------------------------------------------------------------------------------
/sinatools/ner/nn/BertSeqTagger.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from transformers import BertModel
 3 | 
 4 | 
 5 | class BertSeqTagger(nn.Module):
 6 |     def __init__(self, bert_model, num_labels=2, dropout=0.1):
 7 |         super().__init__()
 8 | 
 9 |         self.bert = BertModel.from_pretrained(bert_model)
10 |         self.dropout = nn.Dropout(dropout)
11 |         self.linear = nn.Linear(768, num_labels)
12 | 
13 |     def forward(self, x):
14 |         y = self.bert(x)
15 |         y = self.dropout(y["last_hidden_state"])
16 |         logits = self.linear(y)
17 |         return logits
18 | 


--------------------------------------------------------------------------------
/sinatools/ner/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from sinatools.ner.nn.BaseModel import BaseModel
2 | from sinatools.ner.nn.BertSeqTagger import BertSeqTagger
3 | from sinatools.ner.nn.BertNestedTagger import BertNestedTagger


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BaseModel.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BaseModel.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BaseModel.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BaseModel.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BaseModel.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BaseModel.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertNestedTagger.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/BertSeqTagger.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/nn/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/nn/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__init__.py:
--------------------------------------------------------------------------------
1 | from sinatools.ner.trainers.BaseTrainer import BaseTrainer
2 | from sinatools.ner.trainers.BertTrainer import BertTrainer
3 | from sinatools.ner.trainers.BertNestedTrainer import BertNestedTrainer


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BaseTrainer.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertNestedTrainer.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/BertTrainer.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/ner/trainers/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/ner/trainers/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/relations/__init__.py:
--------------------------------------------------------------------------------
1 | from sinatools.DataDownload import downloader
2 | import os
3 | from transformers import pipeline
4 | 
5 | path =downloader.get_appdatadir()
6 | 
7 | pipe = pipeline("sentiment-analysis", model= os.path.join(path, "relation_model"), return_all_scores =True, max_length=128, truncation=True)


--------------------------------------------------------------------------------
/sinatools/relations/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/relations/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/relations/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/relations/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/relations/__pycache__/relation_extractor.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/relations/__pycache__/relation_extractor.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/relations/__pycache__/relation_extractor.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/relations/__pycache__/relation_extractor.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/semantic_relatedness/__init__.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | warnings.filterwarnings("ignore")
 3 | from sinatools.DataDownload import downloader
 4 | import os 
 5 | from transformers import BertTokenizer,BertModel
 6 | 
 7 | model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
 8 | path =downloader.get_appdatadir()
 9 | model_file_path = os.path.join(path, model_file_name)
10 | 
11 | tokenizer_file_name = "bert-base-arabertv02"
12 | path =downloader.get_appdatadir()
13 | tokenizer_file_path = os.path.join(path, tokenizer_file_name)
14 | 
15 | model = BertModel.from_pretrained('{}'.format(model_file_path),
16 |                                                       output_hidden_states = True,
17 |                                                       num_labels=2
18 |                                                       )
19 | 
20 | tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))


--------------------------------------------------------------------------------
/sinatools/semantic_relatedness/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/semantic_relatedness/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/semantic_relatedness/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/semantic_relatedness/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/semantic_relatedness/__pycache__/compute_relatedness.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/semantic_relatedness/__pycache__/compute_relatedness.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/semantic_relatedness/__pycache__/compute_relatedness.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/semantic_relatedness/__pycache__/compute_relatedness.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/semantic_relatedness/__pycache__/settings.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/semantic_relatedness/__pycache__/settings.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/semantic_relatedness/compute_relatedness.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from . import tokenizer 
 3 | from . import model 
 4 | 
 5 | #cosine using average embedding 
 6 | def get_similarity_score(sentence1, sentence2):
 7 |   """
 8 |     Computes the degree of association between two sentences across various dimensions, meaning, underlying concepts, domain-specificity, topic overlap, viewpoint alignment. This method is described and implemented on this article.
 9 | 
10 |     Args:
11 |         sentence1 (:obj:`str`) – The Arabic sentence to find the semantic relatedness between it and the second sentence.
12 |         sentence2 (:obj:`int`) – The Arabic sentence to find the semantic relatedness between it and the first sentence.
13 |     
14 |     Returns:
15 |         :obj:`float`: An float number that represents the degree of relatedness between two provided sentences.
16 | 
17 |     **Example:**
18 | 
19 |     .. highlight:: python
20 |     .. code-block:: python
21 | 
22 |     from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score
23 | 
24 |     sentence1 = "تبلغ سرعة دوران الأرض حول الشمس حوالي 110 كيلومتر في الساعة."
25 |     sentence2 = "تدور الأرض حول محورها بسرعة تصل تقريبا 1670 كيلومتر في الساعة."    
26 |     get_similarity_score(sentence1, sentence2)
27 |     Score = 0.90
28 |   """         
29 | 
30 |   # Tokenize and encode sentences
31 |   inputs1 = tokenizer(sentence1, return_tensors="pt")
32 |   inputs2 = tokenizer(sentence2, return_tensors="pt")
33 | 
34 |   # Extract embeddings
35 |   with torch.no_grad():
36 |       outputs1 = model(**inputs1)
37 |       outputs2 = model(**inputs2)
38 |       
39 |       embeddings1 = outputs1.last_hidden_state
40 |       embeddings2 = outputs2.last_hidden_state
41 | 
42 |   # Mask padding tokens
43 |   attention_mask1 = inputs1["attention_mask"]
44 |   attention_mask2 = inputs2["attention_mask"]
45 | 
46 |   # Average pool across tokens, excluding padding
47 |   embeddings1_avg = torch.sum(embeddings1 * attention_mask1.unsqueeze(-1), dim=1) / torch.sum(attention_mask1, dim=1, keepdim=True)
48 |   embeddings2_avg = torch.sum(embeddings2 * attention_mask2.unsqueeze(-1), dim=1) / torch.sum(attention_mask2, dim=1, keepdim=True)
49 | 
50 |   # Calculate cosine similarity
51 |   similarity = torch.nn.functional.cosine_similarity(embeddings1_avg, embeddings2_avg)
52 | 
53 |   return similarity.item()


--------------------------------------------------------------------------------
/sinatools/sinatools.py:
--------------------------------------------------------------------------------
1 | """Main module."""
2 | 


--------------------------------------------------------------------------------
/sinatools/synonyms/__init__.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | from sinatools.DataDownload import downloader
 3 | import os 
 4 | 
 5 | synonyms_level2_dict = {} 
 6 | level2_dict = 'graph_l2.pkl'
 7 | path = downloader.get_appdatadir()
 8 | file_path = os.path.join(path, level2_dict)
 9 | with open(file_path, 'rb') as f:
10 |     synonyms_level2_dict = pickle.load(f, encoding='utf-8')
11 | 
12 | 
13 | synonyms_level3_dict = {}    
14 | level3_dict = 'graph_l3.pkl'
15 | path = downloader.get_appdatadir()
16 | file_path = os.path.join(path, level3_dict)
17 | with open(file_path, 'rb') as f:
18 |    synonyms_level3_dict = pickle.load(f, encoding='utf-8')


--------------------------------------------------------------------------------
/sinatools/synonyms/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/synonyms/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/synonyms/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/synonyms/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/synonyms/__pycache__/synonyms_generator.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/synonyms/__pycache__/synonyms_generator.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/synonyms/__pycache__/synonyms_generator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/synonyms/__pycache__/synonyms_generator.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__init__.py


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/charsets.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/charsets.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/charsets.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/charsets.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/charsets.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/charsets.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/implication.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/implication.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/implication.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/implication.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/jaccard.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/jaccard.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/jaccard.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/jaccard.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/parser.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/parser.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/parser.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/parser.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/parser.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/parser.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/readfile.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/readfile.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/readfile.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/readfile.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/similarity.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/similarity.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/text_dublication_detector.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/text_dublication_detector.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/text_transliteration.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/text_transliteration.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/tokenizer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizer.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/tokenizer.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizer.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/tokenizer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizer.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/tokenizers_words.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizers_words.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/tokenizers_words.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizers_words.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/tokenizers_words.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/tokenizers_words.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/word_compare.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/word_compare.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/utils/__pycache__/word_compare.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/utils/__pycache__/word_compare.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/utils/charsets.py:
--------------------------------------------------------------------------------
 1 | # We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html].
 2 | 
 3 | import unicodedata
 4 | 
 5 | from six import unichr
 6 | 
 7 | 
 8 | UNICODE_PUNCT_CHARSET = frozenset(
 9 |     [unichr(x) for x in range(65536) if unicodedata.category(
10 |         unichr(x))[0] == 'P'])
11 | UNICODE_SYMBOL_CHARSET = frozenset(
12 |     [unichr(x) for x in range(65536) if unicodedata.category(
13 |         unichr(x))[0] == 'S'])
14 | UNICODE_PUNCT_SYMBOL_CHARSET = UNICODE_PUNCT_CHARSET | UNICODE_SYMBOL_CHARSET
15 | 
16 | UNICODE_LETTER_CHARSET = frozenset(
17 |     [unichr(x) for x in range(65536) if unicodedata.category(
18 |         unichr(x))[0] == 'L'])
19 | UNICODE_MARK_CHARSET = frozenset(
20 |     [unichr(x) for x in range(65536) if unicodedata.category(
21 |         unichr(x))[0] == 'M'])
22 | UNICODE_NUMBER_CHARSET = frozenset(
23 |     [unichr(x) for x in range(65536) if unicodedata.category(
24 |         unichr(x))[0] == 'N'])
25 | UNICODE_LETTER_MARK_NUMBER_CHARSET = (UNICODE_LETTER_CHARSET |
26 |                                       UNICODE_MARK_CHARSET |
27 |                                       UNICODE_NUMBER_CHARSET)
28 | 
29 | AR_LETTERS_CHARSET = frozenset(u'\u0621\u0622\u0623\u0624\u0625\u0626\u0627'
30 |                                u'\u0628\u0629\u062a\u062b\u062c\u062d\u062e'
31 |                                u'\u062f\u0630\u0631\u0632\u0633\u0634\u0635'
32 |                                u'\u0636\u0637\u0638\u0639\u063a\u0640\u0641'
33 |                                u'\u0642\u0643\u0644\u0645\u0646\u0647\u0648'
34 |                                u'\u0649\u064a\u0671\u067e\u0686\u06a4\u06af')
35 | AR_DIAC_CHARSET = frozenset(u'\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'
36 |                             u'\u0670\u0640')
37 | AR_CHARSET = AR_LETTERS_CHARSET | AR_DIAC_CHARSET
38 | 
39 | BW_LETTERS_CHARSET = frozenset(u'$&\'*<>ADEGHJPSTVYZ_bdfghjklmnpqrstvwxyz{|}')
40 | BW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
41 | BW_CHARSET = BW_LETTERS_CHARSET | BW_DIAC_CHARSET
42 | 
43 | SAFEBW_LETTERS_CHARSET = frozenset(u'ABCDEGHIJLMOPQSTVWYZ_bcdefghjklmnpqrstvwx'
44 |                                    u'yz')
45 | SAFEBW_DIAC_CHARSET = frozenset(u'FKNaeiou~_')
46 | SAFEBW_CHARSET = SAFEBW_LETTERS_CHARSET | SAFEBW_DIAC_CHARSET
47 | 
48 | XMLBW_LETTERS_CHARSET = frozenset(u'$\'*ABDEGHIJOPSTWYZ_bdfghjklmnpqrstvwxyz{|'
49 |                                   u'}')
50 | XMLBW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
51 | XMLBW_CHARSET = XMLBW_LETTERS_CHARSET | XMLBW_DIAC_CHARSET
52 | 
53 | HSB_LETTERS_CHARSET = frozenset(u'\'ADHST_bcdfghjklmnpqrstvwxyz'
54 |                                 u'\u00c2\u00c4\u00e1\u00f0\u00fd\u0100\u0102'
55 |                                 u'\u010e\u0127\u0161\u0175\u0177\u03b3\u03b8'
56 |                                 u'\u03c2')
57 | HSB_DIAC_CHARSET = frozenset(u'.aiu~\u00c4\u00e1\u00e3\u0129\u0169_')
58 | HSB_CHARSET = HSB_LETTERS_CHARSET | HSB_DIAC_CHARSET
59 | 


--------------------------------------------------------------------------------
/sinatools/utils/readfile.py:
--------------------------------------------------------------------------------
1 | def read_file(file_path):
2 |     with open(file_path, 'r', encoding='utf-8') as file:
3 |         return [line.strip() for line in file]
4 | 


--------------------------------------------------------------------------------
/sinatools/utils/tokenizers_words.py:
--------------------------------------------------------------------------------
 1 | # We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/tokenizers/word.html].
 2 | 
 3 | import re
 4 | from sinatools.utils.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
 5 | from sinatools.utils.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
 6 | 
 7 | 
 8 | _ALL_PUNCT = u''.join(UNICODE_PUNCT_SYMBOL_CHARSET)
 9 | _ALL_LETTER_MARK_NUMBER = u''.join(UNICODE_LETTER_MARK_NUMBER_CHARSET)
10 | _TOKENIZE_RE = re.compile(r'[' + re.escape(_ALL_PUNCT) + r']|[' +
11 |                           re.escape(_ALL_LETTER_MARK_NUMBER) + r']+')
12 | 
13 | 
14 | def simple_word_tokenize(sentence):
15 | 
16 |     return _TOKENIZE_RE.findall(sentence)
17 | 


--------------------------------------------------------------------------------
/sinatools/wsd/__init__.py:
--------------------------------------------------------------------------------
 1 | from sinatools.wsd import settings 
 2 | import pickle
 3 | from sinatools.DataDownload import downloader
 4 | import os 
 5 | 
 6 | glosses_dic = {}
 7 | filename = 'one_gram.pickle'
 8 | path =downloader.get_appdatadir()
 9 | file_path = os.path.join(path, filename)
10 | with open(file_path, 'rb') as f:
11 |     glosses_dic = pickle.load(f)
12 | 


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/disambiguator.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/disambiguator.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/disambiguator.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/disambiguator.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/disambiguator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/disambiguator.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/settings.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/settings.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/settings.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/settings.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/settings.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/settings.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/views.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/views.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/wsd.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/wsd.cpython-310.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/wsd.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/wsd.cpython-311.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/__pycache__/wsd.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SinaLab/sinatools/039d0173c434f41940fd23e17d1871519718b794/sinatools/wsd/__pycache__/wsd.cpython-38.pyc


--------------------------------------------------------------------------------
/sinatools/wsd/settings.py:
--------------------------------------------------------------------------------
 1 | from transformers import BertTokenizer,BertForSequenceClassification
 2 | import warnings
 3 | warnings.filterwarnings("ignore")
 4 | import pandas as pd
 5 | 
 6 | 
 7 | 
 8 | 
 9 | from sinatools.DataDownload import downloader
10 | import os 
11 | 
12 | 
13 | model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
14 | path =downloader.get_appdatadir()
15 | model_file_path = os.path.join(path, model_file_name)
16 | 
17 | tokenizer_file_name = "bert-base-arabertv02"
18 | path =downloader.get_appdatadir()
19 | tokenizer_file_path = os.path.join(path, tokenizer_file_name)
20 | 
21 | dftrue = pd.DataFrame()
22 | 
23 | model = BertForSequenceClassification.from_pretrained(model_file_path, output_hidden_states=True, num_labels=2)
24 | 
25 | tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))


--------------------------------------------------------------------------------