├── tests
├── __init__.py
├── domain
│   ├── __init__.py
│   └── test_token.py
├── test_data
│   ├── __init__.py
│   └── test_processor_cache.py
├── test_pipeline
│   ├── __init__.py
│   └── test_pipeline.py
├── test_processors
│   ├── __init__.py
│   ├── test_g2g.py
│   ├── test_ner.py
│   ├── test_dp.py
│   ├── test_pos.py
│   └── test_tokenizer.py
└── guide_for_testing.md
├── gr_nlp_toolkit
├── data
│   ├── __init__.py
│   ├── downloader.py
│   ├── downloader_stub.py
│   ├── downloader_gdrive.py
│   ├── processor_cache.py
│   └── util.py
├── domain
│   ├── __init__.py
│   ├── dataset.py
│   ├── document.py
│   ├── token.py
│   └── textVectorizer.py
├── models
│   ├── __init__.py
│   ├── util.py
│   ├── g2g_transformer_model.py
│   ├── ner_model.py
│   ├── pos_model.py
│   ├── dp_model.py
│   └── g2g_RBNLM_model.py
├── configs
│   ├── __init__.py
│   ├── dp_labels.py
│   ├── ner_labels.py
│   ├── pos_labels.py
│   └── dictionary_tables.py
├── pipeline
│   ├── __init__.py
│   └── pipeline.py
├── processors
│   ├── __init__.py
│   ├── abstract_processor.py
│   ├── ner.py
│   ├── dp.py
│   ├── pos.py
│   ├── g2g.py
│   └── tokenizer.py
├── __init__.py
└── RBNLM_weights
│   ├── RBNLMtextVectorizer.pkl
│   └── LSTM_LM_50000_char_120_32_512.pt
├── logo.png
├── requirements.txt
├── pyproject.toml
├── setup.py
├── .gitignore
├── README.md
└── LICENSE

/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/domain/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_data/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_processors/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/configs/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpaueb/gr-nlp-toolkit/HEAD/logo.png
--------------------------------------------------------------------------------
/gr_nlp_toolkit/__init__.py:
--------------------------------------------------------------------------------
1 | from gr_nlp_toolkit.pipeline.pipeline import Pipeline
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | huggingface_hub==0.23.5
2 | torch==2.4.0
3 | transformers==4.44.0
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=42",
4 |     "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpaueb/gr-nlp-toolkit/HEAD/gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl
--------------------------------------------------------------------------------
/gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpaueb/gr-nlp-toolkit/HEAD/gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/downloader.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | 
4 | class Downloader(ABC):
5 | 
6 |     @abstractmethod
7 |     def download_processor(self, processor_name: str, target_path: str):
8 |         pass
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/abstract_processor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | from gr_nlp_toolkit.domain.document import Document
4 | 
5 | 
6 | class AbstractProcessor(ABC):
7 |     @abstractmethod
8 |     def __call__(self, doc: Document):
9 |         pass
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/downloader_stub.py:
--------------------------------------------------------------------------------
1 | from gr_nlp_toolkit.data.downloader import Downloader
2 | import os
3 | 
4 | 
5 | class DownloaderStub(Downloader):
6 |     def download_processor(self, processor_name: str, target_path: str):
7 |         with open(target_path, 'wb') as f:
8 |             pass
--------------------------------------------------------------------------------
/tests/domain/test_token.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from gr_nlp_toolkit.domain.token import Token
4 | 
5 | 
6 | class MyTestCase(unittest.TestCase):
7 |     def test_new_token_object(self):
8 |         token = Token(['α'])
9 |         self.assertEqual(['α'], token.subwords)
10 | 
11 | 
12 | if __name__ == '__main__':
13 |     unittest.main()
--------------------------------------------------------------------------------
/gr_nlp_toolkit/configs/dp_labels.py:
--------------------------------------------------------------------------------
1 | # The labels for the output of the dp model.
2 | # A string label can be obtained by an output index
3 | dp_labels = \
4 |     ['obl',
5 |      'obj',
6 |      'dep',
7 |      'mark',
8 |      'case',
9 |      'flat',
10 |      'nummod',
11 |      'obl:arg',
12 |      'punct',
13 |      'cop',
14 |      'acl:relcl',
15 |      'expl',
16 |      'nsubj',
17 |      'csubj:pass',
18 |      'root',
19 |      'advmod',
20 |      'nsubj:pass',
21 |      'ccomp',
22 |      'conj',
23 |      'amod',
24 |      'xcomp',
25 |      'aux',
26 |      'appos',
27 |      'csubj',
28 |      'fixed',
29 |      'nmod',
30 |      'iobj',
31 |      'parataxis',
32 |      'orphan',
33 |      'det',
34 |      'advcl',
35 |      'vocative',
36 |      'compound',
37 |      'cc',
38 |      'discourse',
39 |      'acl',
40 |      'obl:agent']
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/downloader_gdrive.py:
--------------------------------------------------------------------------------
1 | from gr_nlp_toolkit.data.downloader import Downloader
2 | import gdown
3 | 
4 | 
5 | class GDriveDownloader(Downloader):
6 |     def __init__(self):
7 |         self.urls = {
8 |             'pos': 'https://drive.google.com/uc?id=1Or5HDk1kVnxI3_w0fwgR8-dzO0jvcc_L',  # pos link
9 |             'ner': 'https://drive.google.com/uc?id=1fx0pHtcN7F2Vj9L8y5TUpbjSqKTUaT3i',  # ner link
10 |             'dp': 'https://drive.google.com/uc?id=1NhEqmLBf67Ydw-LdI7eB-f0afMPgNSmG'  # dp link
11 |         }
12 | 
13 |     def download_processor(self, processor_name: str, target_path: str):
14 |         gdown.download(self.urls[processor_name], output=target_path, quiet=False)
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/dataset.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 | 
4 | 
5 | class DatasetImpl(Dataset):
6 |     def __init__(self, input_ids):
7 |         self._input_ids = input_ids
8 | 
9 |     def __getitem__(self, index):
10 |         return {
11 |             "input": [
12 |                 torch.tensor(self._input_ids[index], dtype=torch.long),
13 |                 torch.tensor(len(self._input_ids[index])),
14 |             ]
15 |         }
16 | 
17 |     def __len__(self):
18 |         return 1
19 | 
20 |     @property
21 |     def input_ids(self):
22 |         return self._input_ids
23 | 
24 |     @input_ids.setter
25 |     def input_ids(self, value):
26 |         self._input_ids = value
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/util.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | def create_mask_from_length(length_tensor, mask_size):
5 | 
6 |     """
7 |     Creates a binary mask based on length.
8 | 
9 |     Args:
10 |         length_tensor (torch.Tensor): ND Tensor containing the lengths.
11 |         mask_size (int): Integer specifying the mask size. Usually the largest length in the batch.
12 | 
13 |     Returns:
14 |         torch.Tensor: (N+1)D Int Tensor (..., mask_size) containing the binary mask.
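15 | 
16 |     Example (illustrative sketch, not a verified doctest):
17 |         >>> lengths = torch.tensor([2, 3])
18 |         >>> create_mask_from_length(lengths, 4)
19 |         tensor([[ True,  True, False, False],
20 |                 [ True,  True,  True, False]])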
21 |     """
22 | 
23 |     mask = torch.arange(0, mask_size, dtype=torch.int, device=length_tensor.device)
24 | 
25 |     mask = mask.int().view([1] * (len(length_tensor.shape)) + [-1])
26 | 
27 |     return mask < length_tensor.int().unsqueeze(-1)
28 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 |     long_description = fh.read()
5 | 
6 | setuptools.setup(
7 |     name="gr-nlp-toolkit",
8 |     version="0.2.0",
9 |     author="nlpaueb",
10 |     author_email="p3170148@aueb.gr, p3170039@aueb.gr, spirosbarbakos7@gmail.com, eleftheriosloukas@aueb.gr, ipavlopoulos@aueb.gr",
11 |     description="The state-of-the-art NLP toolkit for (modern) Greek",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/nlpaueb/gr-nlp-toolkit",
15 |     project_urls={
16 |         # "Bug Tracker": "https://github.com/pypa/sampleproject/issues",
17 |     },
18 |     classifiers=[
19 |         "Programming Language :: Python :: 3",
20 |         "License :: OSI Approved :: Apache Software License",
21 |         "Operating System :: OS Independent",
22 |         "Topic :: Text Processing :: Linguistic",
23 |         "Natural Language :: Greek",
24 |     ],
25 |     packages=setuptools.find_packages(where=".", exclude=("tests", "tests.*")),  # exclude expects an iterable of patterns, not a path string
26 |     python_requires=">=3.9",
27 |     install_requires=[
28 |         "torch>=2.1.2",
29 |         "transformers>=4.11.1",
30 |         "huggingface_hub>=0.23.5",
31 |     ],
32 | )
--------------------------------------------------------------------------------
/tests/test_processors/test_g2g.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from gr_nlp_toolkit.processors.g2g import G2G
4 | from gr_nlp_toolkit.processors.g2g import detect_language
5 | from gr_nlp_toolkit.domain.document import Document
6 | 
7 | class MyTestCase(unittest.TestCase):
8 | 
9 |     def test_g2g_lstm(self):
10 | 
11 |         g2g = G2G(mode="LSTM", model_path="gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt", tokenizer_path="gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl")
12 |         self.assertIsNotNone(g2g.model)
13 |         self.assertIsNotNone(g2g.text_vectorizer)
14 |         self.assertIsNotNone(g2g.LM)
15 | 
16 | 
17 |         doc = Document("o volos kai h larisa einai poleis ths thessalias")
18 |         doc = g2g(doc)
19 |         self.assertEqual(detect_language(doc.text), 'greek')
20 |         self.assertEqual(doc.text.split()[5], "είναι")
21 | 
22 | 
23 |     def test_g2g_transformer(self):
24 | 
25 |         g2g = G2G(mode="transformer", model_path="AUEB-NLP/ByT5_g2g")
26 |         self.assertIsNotNone(g2g.model)
27 | 
28 |         doc = Document('"o volos kai h larisa einai poleis ths thessalias"')
29 |         doc = g2g(doc)
30 |         self.assertEqual(detect_language(doc.text), 'greek')
31 |         self.assertEqual(doc.text.split()[5], "είναι")
32 | 
33 | 
34 | if __name__ == '__main__':
35 |     unittest.main()
--------------------------------------------------------------------------------
/tests/guide_for_testing.md:
--------------------------------------------------------------------------------
1 | In Visual Studio Code, make sure:
2 | - you have created a virtual environment (venv) with a supported Python version
3 | - you have installed the dependencies into the venv with `pip install -r requirements.txt`
4 | - you have configured this interpreter (`ctrl + shift + p` -> `Python: Select Interpreter`)
5 | - **Important**: Install the toolkit as a `package` in *editable* mode via `pip install -e .` (run this command from the root directory of the project!)
6 | - Configure the tests accordingly (`ctrl + shift + p`). Select `unittest` as your test framework, `tests` as the directory containing the tests, and `test_*` as the file pattern to be matched for test files. This will create a `.vscode/settings.json` file like this for you:
7 | ```json
8 | {
9 |     "python.testing.unittestArgs": [
10 |         "-v",
11 |         "-s",
12 |         "tests",
13 |         "-p",
14 |         "test_*.py"
15 |     ],
16 |     "python.testing.unittestEnabled": true,
17 | }
18 | ```
19 | - Now, go to the `Testing` tab in the left column of Visual Studio Code and click the `Refresh Tests` button to discover them.
20 | 
21 | 
22 | Bonus (for Windows):
23 | 
24 | If you use Windows, it helps to enable Developer Mode; this speeds up the under-the-hood caching mechanism of huggingface_hub. (https://learn.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development)
--------------------------------------------------------------------------------
/tests/test_processors/test_ner.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from transformers import AutoModel
4 | 
5 | from gr_nlp_toolkit.domain.document import Document
6 | from gr_nlp_toolkit.processors.ner import NER
7 | from gr_nlp_toolkit.processors.tokenizer import Tokenizer
8 | 
9 | from gr_nlp_toolkit.configs.ner_labels import ner_labels
10 | 
11 | 
12 | class MyTestCase(unittest.TestCase):
13 | 
14 |     def test_ner_with_one_example(self):
15 |         tokenizer = Tokenizer()
16 |         doc = tokenizer(Document('Ο ποιητής'))
17 | 
18 |         ner = NER(entities=18)
19 | 
20 |         self.assertEqual(69, ner.output_size)
21 |         self.assertIsNotNone(ner._model)
22 |         doc = ner(doc)
23 | 
24 |         tokens = doc.tokens
25 |         for token in tokens:
26 |             self.assertIsNotNone(token.ner)
27 |             self.assertTrue(token.ner in ner_labels)
28 | 
29 |     def test_ner_with_one_example_with_subwords(self):
30 |         tokenizer = Tokenizer()
31 |         doc = tokenizer(Document('ενα ποιηματακι'))
32 | 
33 |         ner = NER()
34 |         self.assertIsNotNone(ner._model)
35 |         doc = ner(doc)
36 | 
37 |         tokens = doc.tokens
38 |         for token in tokens:
39 |             self.assertIsNotNone(token.ner)
40 |             self.assertTrue(token.ner in ner_labels)
41 | 
42 |     def test_ner_with_value_exception(self):
43 |         with self.assertRaises(ValueError):
44 |             NER(entities=2)
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     unittest.main()
--------------------------------------------------------------------------------
/tests/test_processors/test_dp.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from transformers import AutoModel
4 | 
5 | from gr_nlp_toolkit.domain.document import Document
6 | from gr_nlp_toolkit.processors.dp import DP
7 | from gr_nlp_toolkit.processors.tokenizer import Tokenizer
8 | 
9 | from gr_nlp_toolkit.configs.dp_labels import dp_labels
10 | 
11 | 
12 | class MyTestCase(unittest.TestCase):
13 | 
14 |     def test_dp_with_one_example(self):
15 |         tokenizer = Tokenizer()
16 |         doc = tokenizer(Document('Ο ποιητής'))
17 | 
18 |         dp = DP()
19 |         self.assertIsNotNone(dp._model)
20 | 
21 |         doc = dp(doc)
22 | 
23 |         tokens = doc.tokens
24 |         for token in tokens:
25 |             self.assertIsNotNone(token.head)
26 |             self.assertIsNotNone(token.deprel)
27 |             self.assertTrue(token.head in range(0, len(tokens)))
28 |             self.assertTrue(token.deprel in dp_labels)
29 | 
30 |     def test_dp_with_one_example_with_subwords(self):
31 |         tokenizer = Tokenizer()
32 |         doc = tokenizer(Document('ενα ποιηματακι'))
33 | 
34 |         # bert model init
35 |         dp = DP()
36 | 
37 |         self.assertIsNotNone(dp._model)
38 |         doc = dp(doc)
39 | 
40 |         tokens = doc.tokens
41 |         for token in tokens:
42 |             self.assertIsNotNone(token.head)
43 |             self.assertIsNotNone(token.deprel)
44 |             self.assertTrue(token.head in range(0, len(tokens)))
45 |             self.assertTrue(token.deprel in dp_labels)
46 | 
47 | 
48 | if __name__ == '__main__':
49 |     unittest.main()
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/processor_cache.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from os.path import expanduser
4 | 
5 | from gr_nlp_toolkit.data.downloader import Downloader
6 | 
7 | 
8 | class ProcessorCache:
9 |     def __init__(self, downloader: Downloader, cache_path: str):
10 |         """
11 |         Initializes the cache of processors, creating the necessary directories
12 |         :param downloader: an object with the Downloader interface
13 |         """
14 |         # Get home directory
15 |         self.home = expanduser("~")
16 |         self.sep = os.sep
17 |         self.cache_path = cache_path
18 |         self.downloader = downloader
19 |         # Initialize the filenames for each processor
20 |         self.processor_names_to_filenames = {
21 |             'ner': 'ner_processor',
22 |             'pos': 'pos_processor',
23 |             'dp': 'dp_processor'
24 |         }
25 |         self.update_cache_path()
26 | 
27 |     def update_cache_path(self):
28 |         Path(self.cache_path).mkdir(parents=True, exist_ok=True)
29 | 
30 |     def get_processor_path(self, processor_name: str) -> str:
31 |         # Update cache path in case any changes occurred
32 |         self.update_cache_path()
33 |         target_filename = self.processor_names_to_filenames[processor_name]
34 |         if not os.path.exists(self.cache_path + self.sep + target_filename):
35 |             self.downloader.download_processor(processor_name, self.cache_path + self.sep + target_filename)
36 |         # Return the path
37 |         return self.cache_path + self.sep + target_filename
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/g2g_transformer_model.py:
--------------------------------------------------------------------------------
1 | from transformers import T5ForConditionalGeneration, AutoTokenizer
2 | from torch import nn
3 | import torch
4 | 
5 | 
6 | 
7 | class ByT5Model(nn.Module):
8 |     """
9 |     A wrapper class for the T5 model for conditional generation
10 | 
11 |     Attributes:
12 |         model (T5ForConditionalGeneration): The pre-trained ByT5 model
13 |         tokenizer (AutoTokenizer): The tokenizer for the ByT5 model
14 |     """
15 | 
16 |     def __init__(self, model_path=None, device='cpu'):
17 |         """
18 |         Initializes the ByT5Model with a pretrained T5 model and tokenizer.
19 | 
20 |         Args:
21 |             model_path: The path to the pretrained model and tokenizer.
22 |         """
23 |         super(ByT5Model, self).__init__()
24 | 
25 |         self.model = T5ForConditionalGeneration.from_pretrained(model_path)
26 |         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
27 |         self.device = torch.device(device)
28 |         self.model.to(self.device)
29 | 
30 |     def forward(self, text):
31 |         """
32 |         Runs inference with the ByT5 model to generate the transliterated text
33 | 
34 |         Args:
35 |             text: the input text in Greeklish
36 | 
37 |         Returns:
38 |             The output text in Greek
39 |         """
40 |         self.model.eval()
41 |         tokenized_text = self.tokenizer(text, return_tensors="pt").input_ids
42 | 
43 |         output = self.model.generate(tokenized_text.to(self.device), max_length=10000)
44 | 
45 |         return self.tokenizer.decode(output[0], skip_special_tokens=True)
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/ner_model.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | 
3 | from gr_nlp_toolkit.models.util import create_mask_from_length
4 | 
5 | 
6 | class NERBERTModel(nn.Module):
7 |     """
8 |     Named Entity Recognition (NER) model class based on BERT.
9 | 
10 |     This class defines a NER model using a pre-trained BERT model
11 |     with a dropout and a linear layer on top.
12 | 
13 |     Attributes:
14 |         _bert_model (AutoModel): The pre-trained BERT model.
15 |         _dp (nn.Dropout): Dropout layer for regularization.
16 |         _output_linear (nn.Linear): Linear layer to produce model outputs.
17 |     """
18 | 
19 |     def __init__(self, bert_model, model_output_size, dp):
20 |         """
21 |         Initializes the NERBERTModel with the specified parameters.
22 | 
23 |         Args:
24 |             bert_model (AutoModel): The pre-trained BERT model.
25 |             model_output_size (int): The size of the output layer.
26 |             dp (float): Dropout probability.
27 |         """
28 | 
29 |         super(NERBERTModel, self).__init__()
30 |         self._bert_model = bert_model
31 |         self._dp = nn.Dropout(dp)
32 |         self._output_linear = nn.Linear(768, model_output_size)
33 | 
34 |     def forward(self, text, text_len):
35 |         """
36 |         Performs a forward pass of the model.
37 | 
38 |         Args:
39 |             text (torch.Tensor): Input tensor containing token IDs.
40 |             text_len (torch.Tensor): Tensor containing the lengths of each sequence in the batch.
41 | 
42 |         Returns:
43 |             torch.Tensor: The output of the linear layer after applying dropout and BERT.
44 |         """
45 | 
46 |         # Create attention mask
47 |         attention_mask = create_mask_from_length(text_len, text.shape[1])
48 | 
49 |         return self._output_linear(
50 |             self._dp(self._bert_model(text, attention_mask=attention_mask)[0])
51 |         )
--------------------------------------------------------------------------------
/gr_nlp_toolkit/configs/ner_labels.py:
--------------------------------------------------------------------------------
1 | # The labels for the output of the ner model.
2 | # A string label can be obtained by an output index
3 | ner_labels = ['O',
4 |               'S-GPE',
5 |               'S-ORG',
6 |               'S-CARDINAL',
7 |               'B-ORG',
8 |               'E-ORG',
9 |               'B-DATE',
10 |               'E-DATE',
11 |               'S-NORP',
12 |               'B-GPE',
13 |               'E-GPE',
14 |               'S-EVENT',
15 |               'S-DATE',
16 |               'S-PRODUCT',
17 |               'S-LOC',
18 |               'I-ORG',
19 |               'S-PERSON',
20 |               'S-ORDINAL',
21 |               'B-PERSON',
22 |               'I-PERSON',
23 |               'E-PERSON',
24 |               'B-LAW',
25 |               'I-LAW',
26 |               'E-LAW',
27 |               'B-MONEY',
28 |               'I-MONEY',
29 |               'E-MONEY',
30 |               'B-EVENT',
31 |               'I-EVENT',
32 |               'E-EVENT',
33 |               'B-FAC',
34 |               'E-FAC',
35 |               'I-DATE',
36 |               'S-PERCENT',
37 |               'B-QUANTITY',
38 |               'E-QUANTITY',
39 |               'B-WORK_OF_ART',
40 |               'I-WORK_OF_ART',
41 |               'E-WORK_OF_ART',
42 |               'I-FAC',
43 |               'S-LAW',
44 |               'S-TIME',
45 |               'B-LOC',
46 |               'E-LOC',
47 |               'I-LOC',
48 |               'S-FAC',
49 |               'B-TIME',
50 |               'E-TIME',
51 |               'S-WORK_OF_ART',
52 |               'B-PRODUCT',
53 |               'E-PRODUCT',
54 |               'B-CARDINAL',
55 |               'E-CARDINAL',
56 |               'S-MONEY',
57 |               'S-LANGUAGE',
58 |               'I-TIME',
59 |               'I-PRODUCT',
60 |               'I-GPE',
61 |               'I-QUANTITY',
62 |               'B-NORP',
63 |               'E-NORP',
64 |               'S-QUANTITY',
65 |               'B-PERCENT',
66 |               'I-PERCENT',
67 |               'E-PERCENT',
68 |               'I-CARDINAL',
69 |               'B-ORDINAL',
70 |               'I-ORDINAL',
71 |               'E-ORDINAL']
72 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/pos_model.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from gr_nlp_toolkit.models.util import create_mask_from_length
3 | 
4 | class POSModel(nn.Module):
5 |     """
6 |     Part-Of-Speech (POS) tagging model class based on BERT.
7 | 
8 |     This class defines a POS model using a pre-trained BERT model
9 |     with a dropout and multiple linear layers on top.
10 | 
11 |     Attributes:
12 |         _bert_model (AutoModel): The pre-trained BERT model.
13 |         _dp (nn.Dropout): Dropout layer for regularization.
14 |         _linear_dict (nn.ModuleDict): Dictionary of linear layers for different features.
15 |     """
16 | 
17 |     def __init__(self, bert_model, feat_to_size, dp):
18 |         """
19 |         Initializes the POSModel with the specified parameters.
20 | 
21 |         Args:
22 |             bert_model (AutoModel): The pre-trained BERT model.
23 |             feat_to_size (dict): A dictionary mapping feature names to the size of their output layers.
24 |             dp (float): Dropout probability.
25 |         """
26 | 
27 |         super(POSModel, self).__init__()
28 |         self._bert_model = bert_model
29 |         self._dp = nn.Dropout(dp)
30 | 
31 |         self._linear_dict = nn.ModuleDict({feat: nn.Linear(768, feat_to_size[feat]) for feat in feat_to_size})
32 | 
33 |     def forward(self, text, text_len):
34 |         """
35 |         Performs a forward pass of the model.
36 | 
37 |         Args:
38 |             text (torch.Tensor): Input tensor containing token IDs.
39 |             text_len (torch.Tensor): Tensor containing the lengths of each sequence in the batch.
40 | 
41 |         Returns:
42 |             dict: A dictionary containing the output tensors for each feature.
43 |         """
44 | 
45 |         attention_mask = create_mask_from_length(text_len, text.shape[1])
46 |         bert_output = self._dp(self._bert_model(text, attention_mask=attention_mask)[0])
47 | 
48 |         output_dict = {feat: self._linear_dict[feat](bert_output) for feat in self._linear_dict}
49 |         return output_dict
50 | 
51 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/configs/pos_labels.py:
--------------------------------------------------------------------------------
1 | # This is a dict mapping each UPOS tag to the list of
2 | # morphological features that apply to it
3 | pos_properties = {'ADJ': ['Degree', 'Number', 'Gender', 'Case'],
4 |                   'ADP': ['Number', 'Gender', 'Case'],
5 |                   'ADV': ['Degree', 'Abbr'],
6 |                   'AUX': ['Mood',
7 |                           'Aspect',
8 |                           'Tense',
9 |                           'Number',
10 |                           'Person',
11 |                           'VerbForm',
12 |                           'Voice'],
13 |                   'CCONJ': [],
14 |                   'DET': ['Number', 'Gender', 'PronType', 'Definite', 'Case'],
15 |                   'NOUN': ['Number', 'Gender', 'Abbr', 'Case'],
16 |                   'NUM': ['NumType', 'Number', 'Gender', 'Case'],
17 |                   'PART': [],
18 |                   'PRON': ['Number', 'Gender', 'Person', 'Poss', 'PronType', 'Case'],
19 |                   'PROPN': ['Number', 'Gender', 'Case'],
20 |                   'PUNCT': [],
21 |                   'SCONJ': [],
22 |                   'SYM': [],
23 |                   'VERB': ['Mood',
24 |                           'Aspect',
25 |                           'Tense',
26 |                           'Number',
27 |                           'Gender',
28 |                           'Person',
29 |                           'VerbForm',
30 |                           'Voice',
31 |                           'Case'],
32 |                   'X': ['Foreign'],
33 |                   '_': []}
34 | 
35 | # The labels for each morphological feature (and for the UPOS tags) of the pos model.
36 | # A string label can be obtained by an output index
37 | pos_labels = {'Abbr': ['_', 'Yes'],
38 |               'Aspect': ['Perf', '_', 'Imp'],
39 |               'Case': ['Dat', '_', 'Acc', 'Gen', 'Nom', 'Voc'],
40 |               'Definite': ['Ind', 'Def', '_'],
41 |               'Degree': ['Cmp', 'Sup', '_'],
42 |               'Foreign': ['_', 'Yes'],
43 |               'Gender': ['Fem', 'Masc', '_', 'Neut'],
44 |               'Mood': ['Ind', '_', 'Imp'],
45 |               'NumType': ['Mult', 'Card', '_', 'Ord', 'Sets'],
46 |               'Number': ['Plur', '_', 'Sing'],
47 |               'Person': ['3', '1', '_', '2'],
48 |               'Poss': ['_', 'Yes'],
49 |               'PronType': ['Ind', 'Art', '_', 'Rel', 'Dem', 'Prs', 'Ind,Rel', 'Int'],
50 |               'Tense': ['Pres', 'Past', '_'],
51 |               'VerbForm': ['Part', 'Conv', '_', 'Inf', 'Fin'],
52 |               'Voice': ['Pass', 'Act', '_'],
53 |               'upos': ['X',
54 |                        'PROPN',
55 |                        'PRON',
56 |                        'ADJ',
57 |                        'AUX',
58 |                        'PART',
59 |                        'ADV',
60 |                        '_',
61 |                        'DET',
62 |                        'SYM',
63 |                        'NUM',
64 |                        'CCONJ',
65 |                        'PUNCT',
66 |                        'NOUN',
67 |                        'SCONJ',
68 |                        'ADP',
69 |                        'VERB']}
--------------------------------------------------------------------------------
/tests/test_data/test_processor_cache.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import shutil
4 | 
5 | from gr_nlp_toolkit.data.downloader_stub import DownloaderStub
6 | from gr_nlp_toolkit.data.processor_cache import ProcessorCache
7 | 
8 | 
9 | class TestProcessorCache(unittest.TestCase):
10 |     def test_download_processors_sequentially(self):
11 | 
12 |         sep = os.sep
13 |         cache_path = "./test"
14 |         stub = DownloaderStub()
15 |         processor_cache = ProcessorCache(stub, cache_path)
16 |         processor_cache.get_processor_path('ner')
17 |         self.assertTrue(os.path.exists(cache_path + sep + "ner_processor"))
18 |         processor_cache.get_processor_path('pos')
19 |         self.assertTrue(os.path.exists(cache_path + sep + "pos_processor"))
20 |         dp_path = processor_cache.get_processor_path('dp')
21 |         self.assertTrue(type(dp_path) == str)
22 |         self.assertTrue(os.path.exists(cache_path + sep + "dp_processor"))
23 |         self.assertTrue(dp_path == (cache_path + sep + "dp_processor"))
24 |         # Remove any files created
25 |         shutil.rmtree(cache_path)
26 | 
27 |     def test_download_processor_removing_file_and_folder(self):
28 | 
29 |         home = os.path.expanduser("~")
30 |         sep = os.sep
31 |         cache_path = "./test"
32 |         stub = DownloaderStub()
33 |         processor_cache = ProcessorCache(stub, cache_path)
34 |         processor_cache.get_processor_path('ner')
35 |         self.assertTrue(os.path.exists(cache_path + sep + "ner_processor"))
36 |         os.remove(cache_path + sep + "ner_processor")
37 |         # Assert that the file is removed
38 |         self.assertTrue(not os.path.exists(cache_path + sep + "ner_processor"))
39 |         processor_cache.get_processor_path('ner')
40 |         # Assert that the file has appeared again
41 |         self.assertTrue(os.path.exists(cache_path + sep + "ner_processor"))
42 |         processor_cache.get_processor_path('pos')
43 |         # Remove entire directory
44 |         shutil.rmtree(cache_path)
45 |         processor_cache.get_processor_path('pos')
46 |         # Assert that the certain processor has appeared again
47 |         self.assertTrue(os.path.exists(cache_path + sep + "pos_processor"))
48 |         # Remove any files created
49 |         shutil.rmtree(cache_path)
50 | 
51 | if __name__ == '__main__':
52 |     unittest.main()
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/document.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | class Document:
4 |     """
5 |     Document class that represents an annotated text
6 |     """
7 | 
8 |     def __init__(self, text: str):
9 |         """
10 |         Create a Document object setting possible parameters other than the text as None
11 | 
12 |         Keyword arguments:
13 |         param text: The text of the document
14 |         """
15 |         self._text = text
16 | 
17 |         self._input_ids = None
18 |         self._token_mask = None
19 | 
20 |         self._tokens = None
21 | 
22 |         self._dataloader = None
23 | 
24 |         self._subword2word = None
25 | 
26 | 
27 |     @property
28 |     def text(self):
29 |         """
30 |         Return the original text of the document
31 |         """
32 |         return self._text
33 | 
34 |     @text.setter
35 |     def text(self, value):
36 |         self._text = value
37 | 
38 | 
39 |     @property
40 |     def tokens(self):
41 |         """
42 |         A list of Tokens containing the tokens of the text as well as token-level annotations
43 |         """
44 |         return self._tokens
45 | 
46 |     @tokens.setter
47 |     def tokens(self, value):
48 |         self._tokens = value
49 | 
50 | 
51 |     @property
52 |     def input_ids(self):
53 |         """
54 |         A tensor of shape [1, mseq] containing the input ids created with the BERT tokenizer
55 |         """
56 |         return self._input_ids
57 | 
58 |     @input_ids.setter
59 |     def input_ids(self, value):
60 |         self._input_ids = value
61 | 
62 | 
63 |     @property
64 |     def token_mask(self):
65 |         """
66 |         A tensor of shape [1, mseq] containing zeros at the positions of the input_ids tensor that map to non-first subword tokens
67 |         """
68 |         return self._token_mask
69 | 
70 |     @token_mask.setter
71 |     def token_mask(self, value):
72 |         self._token_mask = value
73 | 
74 | 
75 |     @property
76 |     def dataloader(self):
77 |         return self._dataloader
78 | 
79 | 
80 |     @dataloader.setter
81 |     def dataloader(self, value):
82 |         self._dataloader = value
83 | 
84 |     @property
85 |     def subword2word(self):
86 |         """
87 |         A mapping for each subword to the word it belongs to
88 |         """
89 |         return self._subword2word
90 | 
91 |     @subword2word.setter
92 |     def subword2word(self, value):
93 |         self._subword2word = value
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | target/
76 | 
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | 
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 | 
84 | # pyenv
85 | .python-version
86 | 
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 | 
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # Pycharm
132 | .idea
133 | 
134 | # Temporary files
135 | gr_nlp_toolkit/tmp/
136 | .vscode/
137 | dist/
138 | .pytest_cache/
139 | test_toolkit_venv/
140 | 
141 | #pipenv
142 | Pipfile
143 | Pipfile.lock
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/token.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | 
3 | 
4 | class Token:
5 |     """
6 |     Token class which represents a word/token
7 |     """
8 | 
9 |     def __init__(self, subwords: List[str]):
10 |         """
11 |         Create a Token object setting possible parameters other than the text as None
12 | 
13 |         Keyword arguments:
14 |         subwords: A list with the token's subwords
15 |         """
16 | 
17 |         # the text
18 |         self._text = ""
19 | 
20 |         # the subwords
21 |         self._subwords = subwords
22 | 
23 |         # the ids
24 |         self._ids = []
25 | 
26 |         # Named Entity Recognition parameters
27 |         # the named entity
28 |         self._ner = None
29 | 
30 |         # Part of Speech Tagging parameters
31 |         # the universal pos tag
32 |         self._upos = None
33 |         # the universal morphological features
34 |         self._feats = {}
35 | 
36 |         # Dependency Parsing parameters
37 |         # the index of this token's head word in the sentence
38 |         self._head = None
39 |         # the label of the dependency relation between this word and its head
40 |         self._deprel = None
41 | 
42 |     @property
43 |     def text(self):
44 |         """
45 |         The text
46 |         """
47 |         return self._text
48 | 
49 |     @text.setter
50 |     def text(self, value):
51 |         self._text = value
52 | 
53 |     @property
54 |     def subwords(self):
55 |         """
56 |         A list with Token's subwords
57 |         """
58 |         return self._subwords
59 | 
60 |     @subwords.setter
61 |     def subwords(self, value):
62 |         self._subwords = value
63 | 
64 |     @property
65 |     def ids(self):
66 |         return self._ids
67 | 
68 |     @ids.setter
69 |     def ids(self, value):
70 |         self._ids = value
71 | 
72 |     @property
73 |     def ner(self):
74 |         """
75 |         The Named Entity
76 |         """
77 |         return self._ner
78 | 
79 |     @ner.setter
80 |     def ner(self, value):
81 |         self._ner = value
82 | 
83 |     @property
84 |     def upos(self):
85 |         """
86 |         The universal pos tag
87 |         """
88 |         return self._upos
89 | 
90 |     @upos.setter
91 |     def upos(self, value):
92 |         self._upos = value
93 | 
94 |     @property
95 |     def feats(self):
96 |         """
97 |         The universal morphological features
98 |         """
99 |         return self._feats
100 | 
101 |     @feats.setter
102 |     def feats(self, value):
103 |         self._feats = value
104 | 
105 |     @property
106 |     def head(self):
107 |         """
108 |         The index of this token's head word in the sentence
109 |         """
110 |         return self._head
111 | 
112 |     @head.setter
113 |     def head(self, value):
114 |         self._head = value
115 | 
116 |     @property
117 |     def deprel(self):
118 |         """
119 |         The label of the dependency relation between this word and its head
120 |         """
121 |         return self._deprel
122 | 
123 |     @deprel.setter
124 |     def deprel(self, value):
125 |         self._deprel = value
126 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/ner.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | 
4 | from transformers import AutoModel
5 | 
6 | from gr_nlp_toolkit.configs.ner_labels import ner_labels
7 | from gr_nlp_toolkit.domain.document import Document
8 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor
9 | 
10 | from gr_nlp_toolkit.models.ner_model import NERBERTModel
11 | 
12 | 
13 | model_params = {'dp': 0}
14 | 
15 | 
16 | class NER(AbstractProcessor):
17 |     """
18 |     Named Entity Recognition (NER) processor class.
19 | 
20 |     This class performs NER using a pre-trained BERT model. It initializes the model,
21 |     loads the necessary components, and provides functionality to process documents
22 |     and perform NER on them.
23 | 
24 |     Attributes:
25 |         I2L (list): A list of label names for the NER task.
26 |         output_size (int): The number of output labels.
27 |         _model (NERBERTModel): The NER model based on BERT.
28 |         softmax (nn.Softmax): Softmax function for output normalization.
29 |         device (torch.device): Device on which the model is loaded.
30 |     """
31 | 
32 |     def __init__(self, model_path=None, device='cpu', entities=18):
33 |         """
34 |         Initializes the NER class with the specified parameters.
35 | 
36 |         Args:
37 |             model_path (str, optional): Path to the pre-trained model. Defaults to None.
38 |             device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
39 |             entities (int, optional): Number of entity labels. Should be set to 18. Defaults to 18.
40 | 
41 |         Raises:
42 |             ValueError: If the number of entities is not 18.
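43 | 
44 |         Example (illustrative sketch, not a verified doctest; mirrors tests/test_processors/test_ner.py):
45 |             >>> ner = NER(entities=18)
46 |             >>> ner.output_size
47 |             69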
48 |         """
49 | 
50 |         # Entities are the semantic categories of the NER task (more info: http://nlp.cs.aueb.gr/theses/smyrnioudis_bsc_thesis.pdf)
51 |         if entities == 18:
52 |             self.I2L = ner_labels
53 |             self.output_size = len(self.I2L)
54 |         else:
55 |             raise ValueError('Entities should be set to 18')
56 | 
57 |         # Initialize the BERT model
58 |         bert_model = AutoModel.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
59 |         self._model = NERBERTModel(bert_model, self.output_size, **model_params)
60 |         self.softmax = nn.Softmax(dim=-1)
61 |         self.device = torch.device(device)
62 |         self._model.to(self.device)
63 |         self._model.eval()
64 | 
65 |         # load the pretrained model if provided
66 |         if model_path is not None:
67 |             self._model.load_state_dict(torch.load(model_path, map_location=self.device, weights_only=True), strict=False)
68 | 
69 |     def __call__(self, doc: Document) -> Document:
70 |         """
71 |         Processes a document to perform Named Entity Recognition.
72 | 
73 |         Args:
74 |             doc (Document): The document to process.
75 | 
76 |         Returns:
77 |             Document: The document with NER tags assigned to the tokens.
78 |         """
79 | 
80 |         # Get the input ids and text length of the document
81 |         input_ids, text_len = next(iter(doc.dataloader))['input']
82 | 
83 |         # Perform NER with the model
84 |         output = self._model(input_ids.to(self.device), text_len.to(self.device))
85 |         predictions = self.softmax(output)
86 |         predictions = torch.argmax(predictions[0], axis=-1).detach().cpu().numpy()
87 | 
88 |         # map predictions -> tokens, special tokens are not included
89 |         i = 0
90 |         for mask, pred in zip(doc.token_mask, predictions[1: len(predictions) - 1]):
91 |             if mask:
92 |                 token = doc.tokens[i]
93 |                 token.ner = self.I2L[pred]
94 |                 i += 1
95 | 
96 |         return doc
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/dp.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | 
4 | from transformers import AutoModel
5 | 
6 | from gr_nlp_toolkit.configs.dp_labels import dp_labels
7 | from gr_nlp_toolkit.domain.document import Document
8 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor
9 | 
10 | from gr_nlp_toolkit.models.dp_model import DPModel
11 | 
12 | 
13 | class DP(AbstractProcessor):
14 |     """
15 |     Dependency Parsing (DP) processor class.
16 | 
17 |     This class performs dependency parsing using a pre-trained BERT model. It initializes the model,
18 |     loads the necessary components, and provides functionality to process documents
19 |     and assign head and dependency relation (deprel) tags to tokens.
20 | 
21 |     Attributes:
22 |         I2L (list): A list of dependency relation labels.
23 |         output_size (int): The number of output labels.
24 |         _model (DPModel): The dependency parsing model based on BERT.
25 |         softmax (nn.Softmax): Softmax function for output normalization.
26 |         device (torch.device): Device on which the model is loaded.
27 |     """
28 | 
29 |     def __init__(self, model_path=None, device='cpu'):
30 |         """
31 |         Initializes the DP class with the specified parameters.
32 | 
33 |         Args:
34 |             model_path (str, optional): Path to the pre-trained model. Defaults to None.
35 |             device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
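36 | 
37 |         Example (illustrative sketch, not a verified doctest; mirrors tests/test_processors/test_dp.py):
38 |             >>> dp = DP()
39 |             >>> doc = dp(Tokenizer()(Document('Ο ποιητής')))
40 |             >>> all(token.deprel in dp_labels for token in doc.tokens)
41 |             True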
42 |         """
43 | 
44 |         self.I2L = dp_labels
45 |         self.output_size = len(self.I2L)
46 | 
47 |         # Initialize the BERT model
48 |         bert_model = AutoModel.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
49 |         self._model = DPModel(bert_model, self.I2L, 0)
50 | 
51 |         self.softmax = nn.Softmax(dim=-1)
52 |         self.device = torch.device(device)
53 |         self._model.to(self.device)
54 |         self._model.eval()
55 | 
56 |         # Load the pretrained model if provided
57 |         if model_path is not None:
58 |             self._model.load_state_dict(torch.load(model_path, map_location=self.device, weights_only=True), strict=False)
59 | 
60 |     def __call__(self, doc: Document) -> Document:
61 |         """
62 |         Processes a document to perform dependency parsing.
63 | 
64 |         Args:
65 |             doc (Document): The document to process.
66 | 
67 |         Returns:
68 |             Document: The document with head and deprel tags assigned to the tokens.
69 |         """
70 | 
71 |         # Predict heads
72 |         input_ids, text_len = next(iter(doc.dataloader))['input']
73 | 
74 |         output_heads = 'heads'
75 | 
76 |         predictions_heads = self._model(input_ids.to(self.device), text_len.to(self.device))
77 |         predictions_heads = self.softmax(predictions_heads[output_heads])
78 |         predictions_heads = torch.argmax(predictions_heads[0], axis=-1).detach().cpu().numpy()
79 | 
80 |         # Predict dependency relations (deprels)
81 |         output_deprels = 'gathered_deprels'
82 | 
83 |         predictions_deprels = self._model(input_ids.to(self.device), text_len.to(self.device))
84 |         predictions_deprels = self.softmax(predictions_deprels[output_deprels])
85 |         predictions_deprels = torch.argmax(predictions_deprels[0], axis=-1).detach().cpu().numpy()
86 | 
87 |         # map predictions -> tokens, special tokens are not included
88 |         i = 0
89 |         for mask, pred_head, pred_deprel in zip(doc.token_mask, predictions_heads[1: len(predictions_heads) - 1],
90 |                                                 predictions_deprels[1: len(predictions_deprels) - 1]):
91 |             if mask:
92 |                 token = doc.tokens[i]
93 |                 token.head = doc.subword2word[pred_head]
94 |                 token.deprel = self.I2L[pred_deprel]
95 |                 i += 1
96 | 
97 |         return doc
--------------------------------------------------------------------------------
/tests/test_processors/test_pos.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from gr_nlp_toolkit.domain.document import Document
4 | from gr_nlp_toolkit.processors.pos import POS
5 | from gr_nlp_toolkit.processors.tokenizer import Tokenizer
6 | from gr_nlp_toolkit.configs.pos_labels import pos_labels, pos_properties
7 | 
8 | 
9 | class MyTestCase(unittest.TestCase):
10 | 
11 |     def test_pos_with_one_example(self):
12 |         tokenizer = Tokenizer()
13 |         doc = tokenizer(Document('Ο ποιητής'))
14 | 
15 |         pos = POS()
16 |         self.assertIsNotNone(pos._model)
17 |         doc = pos(doc)
18 | 
19 |         tokens = doc.tokens
20 |         for token in tokens:
21 |             self.assertIsNotNone(token.upos)
22 |             self.assertTrue(token.upos in pos_labels['upos'])
23 | 
24 |             self.assertIsNotNone(token.feats)
25 |             self.assertEqual(len(list(token.feats.keys())), len(pos_properties[token.upos]))
26 | 
27 |             for feat, value in token.feats.items():
28 |                 self.assertTrue(feat in pos_properties[token.upos])
29 |                 self.assertTrue(value in pos_labels[feat])
30 | 
31 |     def test_pos_with_one_example_with_subwords(self):
32 |         tokenizer = Tokenizer()
33 |         doc = tokenizer(Document('ενα ποιηματακι'))
34 | 
35 |         pos = POS()
36 |         self.assertIsNotNone(pos._model)
37 |         doc = pos(doc)
38 | 
39 |         tokens = doc.tokens
40 |         for token in tokens:
41 |             self.assertIsNotNone(token.upos)
42 |             self.assertTrue(token.upos in pos_labels['upos'])
43 | 
44 |             self.assertIsNotNone(token.feats)
45 |             self.assertEqual(len(list(token.feats.keys())), len(pos_properties[token.upos]))
46 | 
47 |             for feat, value in token.feats.items():
48 |                 self.assertTrue(feat in pos_properties[token.upos])
49 |                 self.assertTrue(value in pos_labels[feat])
50 | 
51 | 
52 | if __name__ == '__main__':
53 |     unittest.main()
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/pos.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | 
4 | from transformers import AutoModel
5 | 
6 | from gr_nlp_toolkit.domain.document import Document
7 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor
8 | from gr_nlp_toolkit.configs.pos_labels import pos_labels, pos_properties
9 | 
10 | 
11 | from gr_nlp_toolkit.models.pos_model import POSModel
12 | 
13 | 
14 | class POS(AbstractProcessor):
15 |     """
16 |     Part-Of-Speech (POS) processor class.
17 | 
18 |     This class performs POS tagging using a pre-trained BERT model. It initializes the model,
19 |     loads the necessary components, and provides functionality to process documents
20 |     and assign POS tags and features to tokens.
21 | 
22 |     Attributes:
23 |         properties_POS (dict): Dictionary containing properties for POS tags.
24 |         feat_to_I2L (dict): Dictionary mapping feature names to label lists.
25 |         feat_to_size (dict): Dictionary mapping feature names to the size of their label lists.
26 |         _model (POSModel): The POS model based on BERT.
27 |         softmax (nn.Softmax): Softmax function for output normalization.
28 |         device (torch.device): Device on which the model is loaded.
29 |     """
30 | 
31 |     def __init__(self, model_path=None, device='cpu'):
32 |         """
33 |         Initializes the POS class with the specified parameters.
34 | 35 | Args: 36 | model_path (str, optional): Path to the pre-trained model. Defaults to None. 37 | device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cpu'. 38 | """ 39 | 40 | self.properties_POS = pos_properties 41 | self.feat_to_I2L = pos_labels 42 | self.feat_to_size = {k: len(v) for k, v in self.feat_to_I2L.items()} 43 | 44 | # model init 45 | bert_model = AutoModel.from_pretrained('nlpaueb/bert-base-greek-uncased-v1') 46 | self._model = POSModel(bert_model, self.feat_to_size, 0) 47 | self.softmax = nn.Softmax(dim=-1) 48 | self.device = torch.device(device) 49 | self._model.to(self.device) 50 | self._model.eval() 51 | 52 | # load the pretrained model 53 | if model_path is not None: 54 | self._model.load_state_dict(torch.load(model_path, map_location=self.device, weights_only=True), strict=False) 55 | 56 | def __call__(self, doc: Document) -> Document: 57 | """ 58 | Processes a document to perform Part-Of-Speech tagging and assign features. 59 | 60 | Args: 61 | doc (Document): The document to process. 62 | 63 | Returns: 64 | Document: The document with POS tags and features assigned to the tokens. 65 | """ 66 | 67 | predictions = {} 68 | 69 | input_ids, text_len = next(iter(doc.dataloader))['input'] 70 | 71 | for feat in self.feat_to_I2L.keys(): 72 | output = self._model(input_ids.to(self.device), text_len.to(self.device)) 73 | output = self.softmax(output[feat]) 74 | 75 | 76 | predictions[feat] = torch.argmax(output[0], axis=-1).detach().cpu().numpy() 77 | 78 | # set upos 79 | upos_predictions = predictions['upos'] 80 | i = 0 81 | for mask, pred in zip(doc.token_mask, upos_predictions[1: len(upos_predictions) - 1]): 82 | if mask: 83 | token = doc.tokens[i] 84 | token.upos = self.feat_to_I2L['upos'][pred] 85 | # Advance to the next word (not subtoken) 86 | i+=1 87 | 88 | # set features 89 | for feat in self.feat_to_I2L.keys(): 90 | if feat != 'upos': 91 | current_predictions = predictions[feat] 92 | i = 0 93 | for mask, pred in zip(doc.token_mask, current_predictions[1: len(current_predictions) - 1]): 94 | if mask: 95 | token = doc.tokens[i] 96 | if feat in self.properties_POS[token.upos]: 97 | token.feats[feat] = self.feat_to_I2L[feat][pred] 98 | # Advance to the next word (not subtoken) 99 | i += 1 100 | 101 | return doc 102 | -------------------------------------------------------------------------------- /tests/test_processors/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from gr_nlp_toolkit.processors.tokenizer import * 4 | 5 | 6 | class TestTokenizer(unittest.TestCase): 7 | def test_strip_accents_and_lowercase1(self): 8 | result = strip_accents_and_lowercase('ποιητής') 9 | self.assertEqual('ποιητης', result) 10 | 11 | def test_strip_accents_and_lowercase2(self): 12 | result = strip_accents_and_lowercase('ΠΟΙΗΤΗΣ') 13 | self.assertEqual('ποιητης', result) 14 | 15 | """" 16 | Tests with no sub-words: 17 | """ 18 | 19 | def test_create_ids_without_subwords(self): 20 | ids = create_ids('ο ποιητης') 21 | # 2 special tokens + 2 given words 22 | self.assertEqual(4, len(ids)) 23 | 24 | def test_create_tokens_without_subwords(self): 25 | ids = [101, 247, 6981, 102] 26 | tokens = convert_to_tokens(ids) 27 | # 2 words, special tokens are not included 28 | self.assertEqual(2, len(tokens)) 29 | self.assertEqual('ο', tokens[0]) 30 | self.assertEqual('ποιητης', tokens[1]) 31 | 32 | def test_create_mask_and_tokens_without_subwords(self): 33 | tokens = ['ο', 'ποιητης'] 34 | mask, 
tokens, subword2word = create_mask_and_tokens(tokens, [247, 6981]) 35 | 36 | self.assertEqual(2, len(mask)) 37 | self.assertEqual([True, True], mask) 38 | self.assertEqual(2, len(tokens)) 39 | self.assertEqual(1, len(tokens[0].subwords)) 40 | self.assertEqual(1, len(tokens[1].subwords)) 41 | self.assertEqual(247, tokens[0]._ids[0]) 42 | self.assertEqual(6981, tokens[1]._ids[0]) 43 | self.assertEqual('ο', tokens[0].subwords[0]) 44 | self.assertEqual('ποιητης', tokens[1].subwords[0]) 45 | self.assertEqual('ο', tokens[0].text) 46 | self.assertEqual('ποιητης', tokens[1].text) 47 | self.assertEqual(len(subword2word.keys()), 3) 48 | self.assertEqual(subword2word[1], 1) 49 | self.assertEqual(subword2word[2], 2) 50 | 51 | """" 52 | Tests with sub-words: 53 | """ 54 | 55 | def test_create_ids_with_subwords(self): 56 | ids = create_ids('ενα ποιηματακι') 57 | # 2 special tokens + 1 word without sub-words + 1 word with 1 sub-word 58 | self.assertEqual(5, len(ids)) 59 | 60 | def test_create_tokens_with_subwords(self): 61 | ids = [101, 370, 6623, 701, 102] 62 | tokens = convert_to_tokens(ids) 63 | # 2 words, special tokens are not included 64 | self.assertEqual(3, len(tokens)) 65 | self.assertEqual('ενα', tokens[0]) 66 | self.assertEqual('ποιηματα', tokens[1]) 67 | self.assertEqual('##κι', tokens[2]) 68 | 69 | def test_create_mask_and_tokens_with_subwords(self): 70 | tokens = ['ενα', 'ποιηματα', '##κι'] 71 | mask, tokens, subword2word = create_mask_and_tokens(tokens, [370, 6623, 701]) 72 | 73 | self.assertEqual(3, len(mask)) 74 | self.assertEqual([True, True, False], mask) 75 | self.assertEqual(2, len(tokens)) 76 | self.assertEqual(1, len(tokens[0].subwords)) 77 | self.assertEqual(2, len(tokens[1].subwords)) 78 | self.assertEqual(370, tokens[0]._ids[0]) 79 | self.assertEqual(6623, tokens[1]._ids[0]) 80 | self.assertEqual(701, tokens[1]._ids[1]) 81 | self.assertEqual('ενα', tokens[0].subwords[0]) 82 | self.assertEqual('ποιηματα', tokens[1].subwords[0]) 83 | self.assertEqual('##κι', tokens[1].subwords[1]) 84 | self.assertEqual('ενα', tokens[0].text) 85 | self.assertEqual('ποιηματακι', tokens[1].text) 86 | self.assertEqual(len(subword2word.keys()), 4) 87 | self.assertEqual(subword2word[1], 1) 88 | self.assertEqual(subword2word[2], 2) 89 | self.assertEqual(subword2word[3], 2) 90 | 91 | def test_tokenizer(self): 92 | tokenizer = Tokenizer() 93 | doc = tokenizer(Document('Ο ποιητής')) 94 | # document has all field set 95 | self.assertIsNotNone(doc.text) 96 | self.assertIsNotNone(doc.input_ids) 97 | self.assertIsNotNone(doc.token_mask) 98 | self.assertIsNotNone(doc.tokens) 99 | self.assertIsNotNone(doc.subword2word) 100 | 101 | def test_create_dataset_and_dataloader(self): 102 | input_ids = [101, 370, 6623, 701, 102] 103 | dataset, dataloader = create_dataset_and_dataloader(input_ids) 104 | self.assertIsNotNone(dataset.input_ids) 105 | self.assertIsNotNone(dataloader.dataset) 106 | self.assertEqual(dataset, dataloader.dataset) 107 | self.assertEqual(dataset.input_ids, [input_ids]) 108 | 109 | 110 | if __name__ == '__main__': 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /gr_nlp_toolkit/processors/g2g.py: -------------------------------------------------------------------------------- 1 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor 2 | from gr_nlp_toolkit.domain.document import Document 3 | from gr_nlp_toolkit.models import g2g_RBNLM_model 4 | from gr_nlp_toolkit.domain.textVectorizer import TextVectorizer 5 | from 
gr_nlp_toolkit.models.g2g_RBNLM_model import LanguageModel 6 | from gr_nlp_toolkit.models.g2g_transformer_model import ByT5Model 7 | import torch 8 | import pickle 9 | 10 | def detect_language(text): 11 | """ 12 | Checks whether the majority of the letters in the input text are in the Greek or the Latin script. 13 | It is used to identify whether the text is in Greek or Greeklish (Latin script), in order to skip unnecessary conversions. 14 | 15 | Args: 16 | text (str): The input text 17 | 18 | Returns: 19 | script (str): The dominant script 20 | """ 21 | # Filter out non-letter characters 22 | valid_characters = [char for char in text if char.isalpha()] 23 | 24 | # Count Greek and English letters 25 | greek_count = sum(1 for char in valid_characters if '\u0370' <= char <= '\u03FF' or '\u1F00' <= char <= '\u1FFF') 26 | english_count = sum(1 for char in valid_characters if '\u0041' <= char <= '\u005A' or '\u0061' <= char <= '\u007A') 27 | 28 | script = "greek" if greek_count >= english_count else "latin" 29 | return script 30 | 31 | 32 | class G2G(AbstractProcessor): 33 | """ 34 | Greeklish to Greek (G2G) processor class. 35 | 36 | This class performs G2G conversion using either an LSTM-based model or a transformer-based model. 37 | It initializes the model, loads the necessary components, and provides functionality to process documents 38 | and convert text using the specified mode. 39 | """ 40 | 41 | def __init__(self, mode = 'LSTM', model_path = None, tokenizer_path = None, device = 'cpu'): 42 | 43 | """ 44 | Initializes the G2G class with the specified parameters. 45 | 46 | Args: 47 | mode (str, optional): The mode of the model, either 'LSTM' or 'transformer'. Defaults to 'LSTM'. 48 | model_path (str, optional): Path to the pre-trained model. Defaults to None. 49 | tokenizer_path (str, optional): Path to the tokenizer for LSTM mode. Defaults to None. 50 | device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cpu'. 51 | """ 52 | 53 | 54 | self.mode = mode 55 | self.device = torch.device(device) 56 | 57 | if self.mode == 'LSTM': 58 | # Define the model parameters (more info: https://aclanthology.org/2024.lrec-main.1330/) 59 | input_size = 120 60 | embed_size = 32 61 | hidden_size = 512 62 | output_size = 120 63 | 64 | # Load and initialize the LSTM model 65 | self.beam_size = 5 66 | self.model = g2g_RBNLM_model.LSTM_LangModel(input_size, embed_size, hidden_size, output_size) 67 | 68 | 69 | # Load and initialize the tokenizer 70 | self.text_vectorizer = TextVectorizer("char") 71 | 72 | if(model_path is not None): 73 | self.model.load_state_dict(torch.load(model_path, map_location=self.device, weights_only=True)) 74 | 75 | 76 | if(tokenizer_path is not None): 77 | with open(tokenizer_path, "rb") as file: 78 | self.text_vectorizer = pickle.load(file) 79 | 80 | # Initialize the LanguageModel 81 | self.LM = LanguageModel(self.text_vectorizer, self.model, device=self.device) 82 | 83 | 84 | elif self.mode == 'transformer': 85 | self.model = ByT5Model(model_path, device=self.device) 86 | self.model.eval() 87 | 88 | 89 | def __call__(self, doc: Document) -> Document: 90 | """ 91 | Processes a document to perform Greeklish to Greek conversion. 92 | 93 | Args: 94 | doc (Document): The document to process. 95 | 96 | Returns: 97 | Document: The document with text converted using the specified model.
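Example (an illustrative sketch, not from the source file; the transformer mode and the 'AUEB-NLP/ByT5_g2g' path are the ones wired up in pipeline.py):
    g2g = G2G(mode='transformer', model_path='AUEB-NLP/ByT5_g2g', device='cpu')
    doc = g2g(Document('kalimera'))  # doc.text is expected to become 'καλημέρα'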
98 | """ 99 | 100 | # If the text is in already in greek, skip the g2g conversion 101 | if(detect_language(doc.text) == 'greek'): 102 | return doc 103 | 104 | 105 | 106 | # Perform G2G conversion based on the mode 107 | if(self.mode == 'LSTM'): 108 | doc.text = self.LM.translate([doc.text], self.beam_size)[0] 109 | elif(self.mode == 'transformer'): 110 | doc.text = self.model(doc.text) 111 | 112 | return doc -------------------------------------------------------------------------------- /tests/test_pipeline/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from gr_nlp_toolkit.configs.dp_labels import dp_labels 4 | from gr_nlp_toolkit.configs.ner_labels import ner_labels 5 | from gr_nlp_toolkit.configs.pos_labels import pos_labels, pos_properties 6 | from gr_nlp_toolkit.pipeline.pipeline import Pipeline 7 | 8 | 9 | 10 | class TestPipeline(unittest.TestCase): 11 | # The unit tests test the transformer-based g2g processor. If you want to test the LSTM-based processor, you can change 'g2g' to 'g2g_lite' 12 | def test_using_all_processors(self): 13 | nlp = Pipeline('dp,pos,ner,g2g') 14 | 15 | sentences = ["Η Ιταλία κέρδισε την Αγγλία στον τελικό του Euro το 2021", 16 | "Το ποιηματάκι το έγραψε ο διάσημος ποιητής, Νίκος Νικολαϊδης", 17 | "Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun"] 18 | for sent in sentences: 19 | doc = nlp(sent) 20 | 21 | for token in doc.tokens: 22 | print(token.text, token.ner, token.upos, token.feats, token.head, token.deprel) 23 | self.assertIsNotNone(token.ner) 24 | self.assertTrue(token.ner in ner_labels) 25 | self.assertIsNotNone(token.head) 26 | self.assertIsNotNone(token.deprel) 27 | # We have to add plus one, because the cls token is removed 28 | self.assertTrue(token.head in range(0, len(doc.tokens) + 1)) 29 | self.assertTrue(token.deprel in dp_labels) 30 | self.assertIsNotNone(token.upos) 31 | self.assertTrue(token.upos in pos_labels['upos']) 32 | 33 | self.assertIsNotNone(token.feats) 34 | self.assertEqual(len(list(token.feats.keys())), len(pos_properties[token.upos])) 35 | 36 | for feat, value in token.feats.items(): 37 | self.assertTrue(feat in pos_properties[token.upos]) 38 | self.assertTrue(value in pos_labels[feat]) 39 | print(token.text, token.ner, token.upos, token.feats, token.head, token.deprel) 40 | self.assertIsNotNone(token.ner) 41 | self.assertTrue(token.ner in ner_labels) 42 | self.assertIsNotNone(token.head) 43 | self.assertIsNotNone(token.deprel) 44 | # We have to add plus one, because the cls token is removed 45 | self.assertTrue(token.head in range(0, len(doc.tokens) + 1)) 46 | self.assertTrue(token.deprel in dp_labels) 47 | self.assertIsNotNone(token.upos) 48 | self.assertTrue(token.upos in pos_labels['upos']) 49 | 50 | def test_annotations_are_same_with_multiple_configurations(self): 51 | nlp = Pipeline('dp,pos,ner,g2g') 52 | doc = nlp("Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun") 53 | 54 | deprels_preds = [] 55 | upos_preds = [] 56 | ner_preds = [] 57 | for token in doc.tokens: 58 | deprels_preds.append(token.deprel) 59 | upos_preds.append(token.upos) 60 | ner_preds.append(token.ner) 61 | 62 | nlp = Pipeline('dp,g2g') 63 | doc = nlp("Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun") 64 | new_deprels_preds = [] 65 | 66 | for token in doc.tokens: 67 | new_deprels_preds.append(token.deprel) 68 | 69 | nlp = Pipeline('pos,g2g') 70 | doc = nlp("Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou 
xanoun") 71 | new_upos_preds =[] 72 | 73 | for token in doc.tokens: 74 | new_upos_preds.append(token.upos) 75 | 76 | nlp = Pipeline('ner,g2g') 77 | doc = nlp("Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun") 78 | new_ner_preds =[] 79 | for token in doc.tokens: 80 | new_ner_preds.append(token.ner) 81 | 82 | self.assertEqual(new_deprels_preds, deprels_preds) 83 | self.assertEqual(new_upos_preds, upos_preds) 84 | self.assertEqual(new_ner_preds, ner_preds) 85 | 86 | 87 | 88 | def test_using_only_one_processor(self): 89 | nlp = Pipeline('ner') 90 | doc = nlp("Η Ιταλία κέρδισε την Αγγλία στον τελικό του Euro το 2021") 91 | 92 | for token in doc.tokens: 93 | self.assertIsNotNone(token.ner) 94 | self.assertTrue(token.ner in ner_labels) 95 | self.assertIsNone(token.head) 96 | self.assertIsNone(token.deprel) 97 | self.assertFalse(token.head in range(0, len(doc.tokens))) 98 | self.assertFalse(token.deprel in dp_labels) 99 | self.assertIsNone(token.upos) 100 | self.assertFalse(token.upos in pos_labels['upos']) 101 | 102 | for feat, value in token.feats.items(): 103 | self.assertFalse(feat in pos_properties[token.upos]) 104 | self.assertFalse(value in pos_labels[feat]) 105 | 106 | 107 | if __name__ == '__main__': 108 | unittest.main() 109 | -------------------------------------------------------------------------------- /gr_nlp_toolkit/models/dp_model.py: -------------------------------------------------------------------------------- 1 | from torch.nn import LeakyReLU 2 | 3 | from gr_nlp_toolkit.models.util import create_mask_from_length 4 | 5 | from torch import nn 6 | import torch 7 | 8 | 9 | class DPModel(nn.Module): 10 | """ 11 | Dependency Parsing model. 12 | 13 | Attributes: 14 | numrels (int): Number of dependency relation labels. 15 | _bert_model (nn.Module): The BERT model. 16 | _dp (nn.Dropout): Dropout layer. 17 | arc_head (nn.Linear): Linear layer for arc head representation. 18 | arc_dep (nn.Linear): Linear layer for arc dependent representation. 19 | rel_head (nn.Linear): Linear layer for relation head representation. 20 | rel_dep (nn.Linear): Linear layer for relation dependent representation. 21 | arc_bias (nn.Parameter): Bias parameter for arc representation. 22 | rel_bias (nn.Parameter): Bias parameter for relation representation. 23 | u_rel (nn.Parameter): Parameter for relation representation. 24 | w_arc (nn.Parameter): Parameter for arc representation. 25 | w_rel_head (nn.Parameter): Parameter for relation head representation. 26 | w_rel_dep (nn.Parameter): Parameter for relation dependent representation. 27 | deprel_linear_2 (nn.Linear): Linear layer for dependency relation labels. 28 | relu (LeakyReLU): LeakyReLU activation function. 29 | """ 30 | 31 | def __init__(self, bert_model, deprel_i2l, dp): 32 | """ 33 | Initialize the DPModel. 34 | 35 | Args: 36 | bert_model (nn.Module): The BERT model. 37 | deprel_i2l (list): List of dependency relation labels. 38 | dp (float): The dropout probability. 
39 | 40 | """ 41 | super(DPModel, self).__init__() 42 | self.numrels = len(deprel_i2l) 43 | self._bert_model = bert_model 44 | self._dp = nn.Dropout(dp) 45 | 46 | self.arc_head = nn.Linear(768, 768) 47 | self.arc_dep = nn.Linear(768, 768) 48 | 49 | self.rel_head = nn.Linear(768, 768) 50 | self.rel_dep = nn.Linear(768, 768) 51 | 52 | self.arc_bias = nn.Parameter(torch.zeros(1, 768, 1)) 53 | self.rel_bias = nn.Parameter(torch.zeros(1, 1, 1, self.numrels)) 54 | 55 | self.u_rel = nn.Parameter(torch.zeros(1, 768, self.numrels * 768)) 56 | 57 | self.w_arc = nn.Parameter(torch.zeros(1, 768, 768)) 58 | self.w_rel_head = nn.Parameter(torch.zeros(1, 1, 768, self.numrels)) 59 | self.w_rel_dep = nn.Parameter(torch.zeros(1, 1, 768, self.numrels)) 60 | 61 | self.deprel_linear_2 = nn.Linear(768, len(deprel_i2l) * 768) 62 | 63 | self.relu = LeakyReLU(1) 64 | 65 | 66 | def forward(self, text, text_len): 67 | """ 68 | Forward pass of the DPModel. 69 | 70 | Args: 71 | text (Tensor): Input text. 72 | text_len (Tensor): Length of the input text. 73 | 74 | Returns: 75 | output (dict): Dictionary containing the output of the model. 76 | 77 | """ 78 | output = {} 79 | 80 | attention_mask = create_mask_from_length(text_len, text.shape[1]) 81 | bert = self._bert_model(text, attention_mask=attention_mask) 82 | 83 | # output size bs , mseq , 768 84 | bert_output = self._dp(bert[0]) 85 | bs = bert_output.shape[0] 86 | mseq = bert_output.shape[1] 87 | 88 | # Specialized vector representations 89 | arc_head = self.relu(self.arc_head(bert_output)) # bs,mseq,768 90 | arc_dep = self.relu(self.arc_dep(bert_output)) # bs,mseq,768 91 | rel_head = self.relu(self.rel_head(bert_output)) # bs,mseq,768 92 | rel_dep = self.relu(self.rel_dep(bert_output)) # bs,mseq,768 93 | 94 | # bs,mseq,768 @ bs,768,mseq + bs,mseq,768 @ 1,768,1 95 | output_linear_head = arc_head @ (arc_dep @ self.w_arc).transpose(1, 2) + arc_head @ self.arc_bias 96 | # arcdep * self w.arc = (bs,mseq,768) * (1,768,768) = (bs, mseq , 768) 97 | 98 | # bs,mseq, 768 * 1,768,768 *numrel = bs,mseq,numrel,768,` 99 | label_biaffine = rel_dep @ self.u_rel # bs,mseq,768 * numrel 100 | label_biaffine = label_biaffine.reshape(bs,mseq,self.numrels,768) 101 | label_biaffine = label_biaffine @ rel_head.transpose(1,2).unsqueeze(1) # bs,mseq,numrel,mseq 102 | label_biaffine = label_biaffine.transpose(2,3) 103 | 104 | label_head_affine = (rel_head.unsqueeze(2) @ self.w_rel_head) 105 | label_dep_affine = (rel_dep.unsqueeze(2) @ self.w_rel_dep) 106 | label_bias = self.rel_bias 107 | 108 | output_linear_rel = label_biaffine + label_head_affine + label_dep_affine + label_bias 109 | # (bs,mseq,1,768) @ (1 , 1 , 768 ,numrels) + ( 1, 1 , 1, numrels) 110 | # (bs,mseq,1,numrels) 111 | 112 | output['heads'] = output_linear_head 113 | output['deprels'] = output_linear_rel.reshape(bs, mseq, mseq, self.numrels) 114 | 115 | selected_arcs = output_linear_head.argmax(-1) # bs,mseq (indexes in [0,mseq) ) 116 | selected_arcs = selected_arcs.unsqueeze(-1).repeat(1, 1, mseq) # bs,mseq,mseq 117 | selected_arcs = selected_arcs.unsqueeze(-1).repeat(1, 1, 1, self.numrels) # bs,mseq,mseq, numrels 118 | 119 | deprels_output = torch.gather(output_linear_rel, dim=2, index=selected_arcs) # bs,mseq,mseq,numrels 120 | # dim 2 is redundant so must be deleted ( there is only one head for every token) 121 | deprels_output = deprels_output.narrow(2, 0, 1) # bs,mseq,1,numrels 122 | deprels_output = deprels_output.squeeze(2) # bs , mseq,numrels 123 | output['gathered_deprels'] = deprels_output 124 | 125 | return output 
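For readers tracing the shape comments above, here is a minimal, self-contained sketch (illustrative only, with toy dimensions; not part of dp_model.py) of the biaffine arc scoring and greedy head selection that `DPModel.forward` performs:

```python
import torch

bs, mseq, hidden = 2, 5, 768              # toy batch size, sequence length, encoder width
arc_head = torch.randn(bs, mseq, hidden)  # stand-ins for the specialized representations
arc_dep = torch.randn(bs, mseq, hidden)
w_arc = torch.randn(1, hidden, hidden)
arc_bias = torch.randn(1, hidden, 1)

# Biaffine arc scores: scores[b, i, j] scores token j as the head of token i
scores = arc_head @ (arc_dep @ w_arc).transpose(1, 2) + arc_head @ arc_bias
assert scores.shape == (bs, mseq, mseq)

heads = scores.argmax(-1)  # greedy head index per token, as in 'selected_arcs'
```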
-------------------------------------------------------------------------------- /gr_nlp_toolkit/pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | from gr_nlp_toolkit.domain.document import Document 2 | from gr_nlp_toolkit.processors.dp import DP 3 | from gr_nlp_toolkit.processors.ner import NER 4 | from gr_nlp_toolkit.processors.pos import POS 5 | from gr_nlp_toolkit.processors.g2g import G2G 6 | 7 | from gr_nlp_toolkit.processors.tokenizer import Tokenizer 8 | from huggingface_hub import hf_hub_download 9 | 10 | from typing import Literal 11 | import torch 12 | 13 | from transformers import logging 14 | logging.set_verbosity_error() 15 | 16 | def get_device_name() -> Literal["mps", "cuda", "cpu"]: 17 | """ 18 | Returns the name of the device where this module is running. 19 | 20 | This is a simple implementation that doesn't cover cases where more powerful GPUs are available 21 | but are not the primary device ('cuda:0'), or where an MPS device is available but not configured properly: 22 | https://pytorch.org/docs/master/notes/mps.html 23 | 24 | Returns: 25 | Literal["mps", "cuda", "cpu"]: Device name, like 'cuda' or 'cpu'. 26 | 27 | Examples: 28 | >>> torch.cuda.is_available = lambda: True 29 | >>> torch.backends.mps.is_available = lambda: False 30 | >>> get_device_name() 31 | 'cuda' 32 | 33 | >>> torch.cuda.is_available = lambda: False 34 | >>> torch.backends.mps.is_available = lambda: True 35 | >>> get_device_name() 36 | 'mps' 37 | 38 | >>> torch.cuda.is_available = lambda: False 39 | >>> torch.backends.mps.is_available = lambda: False 40 | >>> get_device_name() 41 | 'cpu' 42 | """ 43 | if torch.cuda.is_available(): 44 | return "cuda" 45 | elif torch.backends.mps.is_available(): 46 | return "mps" 47 | else: 48 | return "cpu" 49 | 50 | 51 | class Pipeline: 52 | """ 53 | The central class of the toolkit. A pipeline is created once a list of processors is specified.
The user can 54 | then annotate a document by using the __call__ method of the Pipeline. 55 | 56 | Attributes: 57 | _processors: A list of the processors that will be used in the pipeline 58 | _processor_cache: A ProcessorCache object that is used to download the processors 59 | device: The device where the pipeline will run 60 | 61 | """ 62 | 63 | def __init__(self, processors: str, use_cpu: bool = False): 64 | """ 65 | Initializes the pipeline with the specified processors 66 | 67 | Args: 68 | processors: A comma-separated string with the names of the processors to load; available values: 'ner', 'pos', 'dp', 'g2g', 'g2g_lite' 69 | use_cpu: A boolean that specifies if the pipeline will run on the CPU 70 | """ 71 | 72 | # if the user wants to use the CPU, we set the device to 'cpu' 73 | if(use_cpu): 74 | self.device = "cpu" 75 | else: 76 | self.device = get_device_name() 77 | 78 | self._processors = [] 79 | 80 | processors = set(processors.split(",")) 81 | 82 | # ner: Named Entity Recognition Processor 83 | # pos: Part-of-Speech Tagging Processor 84 | # dp: Dependency Parsing 85 | # g2g: Greeklish to Greek Transliteration Processor (ByT5 model) 86 | # g2g_lite: Greeklish to Greek Transliteration Processor (LSTM model) 87 | available_processors = ['ner', 'pos', 'dp', 'g2g_lite', 'g2g'] 88 | 89 | 90 | # Adding the g2g processor, which must be the first in the pipeline 91 | if("g2g_lite" in processors): 92 | self._processors.append(G2G(mode="LSTM", model_path="gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt", tokenizer_path="gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl", device=self.device)) 93 | processors.remove("g2g_lite") 94 | elif("g2g" in processors): 95 | self._processors.append(G2G(mode="transformer", model_path="AUEB-NLP/ByT5_g2g", device=self.device)) 96 | processors.remove("g2g") 97 | 98 | 99 | # Adding the tokenizer processor 100 | self._processors.append(Tokenizer()) 101 | for p in processors: 102 | if p == available_processors[0]: 103 | ner_path = hf_hub_download(repo_id="AUEB-NLP/gr-nlp-toolkit", filename="ner_processor") 104 | self._processors.append(NER(model_path=ner_path, device=self.device)) 105 | elif p == available_processors[1]: 106 | pos_path = hf_hub_download(repo_id="AUEB-NLP/gr-nlp-toolkit", filename="pos_processor") 107 | self._processors.append(POS(model_path=pos_path, device=self.device)) 108 | elif p == available_processors[2]: 109 | dp_path = hf_hub_download(repo_id="AUEB-NLP/gr-nlp-toolkit", filename="dp_processor") 110 | self._processors.append(DP(model_path=dp_path, device=self.device)) 111 | else: 112 | raise Exception(f"Invalid processor name, please choose one of {available_processors}") 113 | 114 | def __call__(self, text: str) -> Document: 115 | 116 | """ 117 | Annotate a text with the processors present in the pipeline 118 | 119 | Args: 120 | text: The text that will be annotated 121 | """ 122 | 123 | # Create a document from the text 124 | self._doc = Document(text) 125 | 126 | # Pass the document through every processor 127 | for processor in self._processors: 128 | # print(processor) 129 | processor(self._doc) 130 | 131 | return self._doc 132 | 133 | if __name__ == "__main__": 134 | 135 | 136 | nlp = Pipeline("g2g,ner,dp,pos") 137 | 138 | txts = ["Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou hanoun", 139 | "o volos kai h larisa einai poleis ths thessalias", 140 | "Η Αθήνα είναι η μεγαλύτερη πόλη της Ελλάδας"] 141 | 142 | for txt in txts: 143 | 144 | doc = nlp(txt) 145 | 146 | print(doc.text) 147 | for token in doc.tokens: 148 |
print(f"{token.text}: {token.ner}, {token.upos}, {token.feats}, {token.head}, {token.deprel}") # the text of the token 149 | 150 | 151 | -------------------------------------------------------------------------------- /gr_nlp_toolkit/configs/dictionary_tables.py: -------------------------------------------------------------------------------- 1 | # Without intonation 2 | greek_to_greeklish = {'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'], 3 | 'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'], 4 | 'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'], 5 | 'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'χ': ['x', 'h'], 'φ': ['f'], 'ψ': ['ps'], 6 | 7 | 'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'I'], 8 | 'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'], 9 | 'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'], 10 | 'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['P'], 11 | 12 | 'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'], 13 | 'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'], 14 | 15 | 'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'], 16 | 'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D']} 17 | 18 | greeklish_to_greek = {'A': ['Α'], 'Ai': ['Αι'], 'B': ['Β', 'Μπ'], 'D': ['Δ', 'Ντ'], 'E': ['Ε', 'Αι'], 'Ei': ['Ει'], 19 | 'F': ['Φ'], 'G': ['Γ'], 'H': ['Η', 'Χ'], 'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι'], 'K': ['Κ'], 20 | 'Ks': ['Ξ'], 'L': ['Λ'], 'M': ['Μ'], 'Mp': ['Μπ'], 'N': ['Ν'], 'Nt': ['Ντ'], 'O': ['Ο', 'Ω'], 21 | 'Oi': ['Οι'], 'Ou': ['Ου'], 'P': ['Π', 'Ψ'], 'Q': ['Θ'], 'R': ['Ρ'], 'S': ['Σ'], 'T': ['Τ'], 22 | 'Th': ['Θ'], 'U': ['Θ', 'Ου', 'Y'], 'V': ['Β'], 'W': ['Ω'], 'X': ['Ξ', 'Χ'], 'Y': ['Υ'], 'Yi': ['Υι'], 23 | 'Z': ['Ζ'], 'a': ['α'], 'ai': ['αι'], 'b': ['β', 'μπ'], 'd': ['δ', 'ντ'], 24 | 'e': ['ε', 'αι'], 'ei': ['ει'], 'f': ['φ'], 'g': ['γ'], 'h': ['η', 'χ'], 25 | 'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι'], 'k': ['κ'], 'ks': ['ξ'], 'l': ['λ'], 'm': ['μ'], 26 | 'mp': ['μπ'], 'n': ['ν'], 'nt': ['ντ'], 'o': ['ο', 'ω'], 'oi': ['οι'], 'ou': ['ου'], 'p': ['π'], 27 | 'ps': ['ψ'], 'r': ['ρ'], 's': ['σ', 'ς'], 't': ['τ'], 'th': ['θ'], 'u': ['υ', 'θ', 'ου'], 28 | 'ui': ['υι'], 'v': ['β'], 'w': ['ω'], 'x': ['ξ', 'χ'], 'y': ['υ'], 'z': ['ζ']} 29 | 30 | # With intonation 31 | greek_to_greeklish_intonated = {'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'], 32 | 'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'], 33 | 'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'], 34 | 'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'φ': ['f'], 'χ': ['x', 'h'], 'ψ': ['ps'], 35 | 36 | 'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'U', 'I'], 37 | 'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'], 38 | 'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'], 39 | 'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['P'], 40 | 41 | 'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'], 42 | 'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'], 43 | 44 | 'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'], 45 | 'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D'], 46 | 47 | 'ά': 
['a'], 'έ': ['e'], 'ή': ['h', 'i'], 'ί': ['i'], 'ό': ['o'], 'ύ': ['u', 'y', 'i'], 48 | 'ώ': ['w', 'o'], 49 | 'Ά': ['A'], 'Έ': ['E'], 'Ή': ['H', 'I'], 'Ί': ['I'], 'Ό': ['O'], 'Ύ': ['Y', 'U', 'I'], 50 | 'Ώ': ['W', 'O'], 51 | 52 | 'εί': ['ei', 'i'], 'οί': ['oi', 'i'], 'ού': ['ou', 'u'], 'αί': ['ai', 'e'], 53 | 'υί': ['ui', 'i'], 54 | 'Εί': ['Ei', 'I'], 'Οί': ['Oi', 'I'], 'Ού': ['Ou', 'U'], 'Αί': ['Ai', 'E'], 55 | 'Υί': ['Yi', 'I'], 56 | } 57 | 58 | 59 | greeklish_to_greek_intonated = {'A': ['Α', 'Ά'], 'Ai': ['Αι', 'Αί'], 'B': ['Β', 'Μπ'], 'D': ['Δ', 'Ντ'], 60 | 'E': ['Ε', 'Αι', 'Έ', 'Αί'], 'Ei': ['Ει', 'Εί'], 'F': ['Φ'], 'G': ['Γ'], 61 | 'H': ['Η', 'Χ', 'Ή'], 62 | 'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι', 'Ή', 'Ί', 'Ύ', 'Εί', 'Οί', 'Υί'], 63 | 'K': ['Κ'], 'Ks': ['Ξ'], 'L': ['Λ'], 'M': ['Μ'], 'Mp': ['Μπ'], 'N': ['Ν'], 64 | 'Nt': ['Ντ'], 'O': ['Ο', 'Ω', 'Ό', 'Ώ'], 'Oi': ['Οι', 'Οί'], 'Ou': ['Ου', 'Ού'], 65 | 'P': ['Π', 'Ψ'], 'Q': ['Θ'], 'R': ['Ρ'], 'S': ['Σ'], 'T': ['Τ'], 'Th': ['Θ'], 66 | 'U': ['Θ', 'Ου', 'Ού', 'Υ', 'Ύ'], 'V': ['Β'], 'W': ['Ω', 'Ώ'], 'X': ['Ξ', 'Χ'], 'Y': ['Υ', 'Ύ'], 67 | 'Yi': ['Υι', 'Υί'], 'Z': ['Ζ'], 'a': ['α', 'ά'], 'ai': ['αι', 'αί'], 'b': ['β', 'μπ'], 68 | 'd': ['δ', 'ντ'], 'e': ['ε', 'αι', 'έ', 'αί'], 'ei': ['ει', 'εί'], 'f': ['φ'], 69 | 'g': ['γ'], 'h': ['η', 'χ', 'ή'], 70 | 'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι', 'ή', 'ί', 'ύ', 'εί', 'οί', 'υί'], 71 | 'k': ['κ'], 'ks': ['ξ'], 'l': ['λ'], 'm': ['μ'], 'mp': ['μπ'], 'n': ['ν'], 72 | 'nt': ['ντ'], 'o': ['ο', 'ω', 'ό', 'ώ'], 'oi': ['οι', 'οί'], 'ou': ['ου', 'ού'], 73 | 'p': ['π'], 'ps': ['ψ'], 'r': ['ρ'], 's': ['σ', 'ς'], 't': ['τ'], 'th': ['θ'], 74 | 'u': ['υ', 'θ', 'ου', 'ύ', 'ού'], 'ui': ['υι', 'υί'], 'v': ['β'], 'w': ['ω', 'ώ'], 75 | 'x': ['ξ', 'χ'], 'y': ['υ', 'ύ'], 'z': ['ζ']} 76 | 77 | -------------------------------------------------------------------------------- /gr_nlp_toolkit/processors/tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Dict 2 | from torch.utils.data import DataLoader, Dataset 3 | 4 | from gr_nlp_toolkit.domain.dataset import DatasetImpl 5 | from gr_nlp_toolkit.domain.document import Document 6 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor 7 | 8 | import unicodedata 9 | 10 | from gr_nlp_toolkit.domain.token import Token 11 | 12 | from transformers import AutoTokenizer 13 | 14 | tokenizer_greek = AutoTokenizer.from_pretrained('nlpaueb/bert-base-greek-uncased-v1') 15 | 16 | 17 | def strip_accents_and_lowercase(s: str) -> str: 18 | """ 19 | Strips accents from a string and converts it to lowercase. 20 | 21 | Args: 22 | s: A string from which to strip accents. 23 | 24 | Returns: 25 | A new string with accents removed and converted to lowercase. 26 | """ 27 | return ''.join(c for c in unicodedata.normalize('NFD', s) 28 | if unicodedata.category(c) != 'Mn').lower() 29 | 30 | 31 | def create_ids(text: str) -> List[int]: 32 | """ 33 | Encodes a given text into a list of token IDs using a tokenizer. 34 | 35 | Args: 36 | text: A string to encode. 37 | 38 | Returns: 39 | A list of token IDs. 40 | """ 41 | return tokenizer_greek.encode(text) 42 | 43 | 44 | def create_text(ids: List[int]) -> List[int]: 45 | """ 46 | Decodes a list of token IDs back into a text string. 47 | 48 | Args: 49 | ids: A list of token IDs to decode. 50 | 51 | Returns: 52 | A decoded string with special tokens skipped. 
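Example (illustrative; the IDs are the ones used in tests/test_processors/test_tokenizer.py):
    create_text([101, 247, 6981, 102])  # -> 'ο ποιητης'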
53 | """ 54 | return tokenizer_greek.decode(ids, skip_special_tokens=True) 55 | 56 | 57 | def convert_to_tokens(input_ids: List[int]) -> List[str]: 58 | """ 59 | Converts a list of token IDs into their corresponding token strings. 60 | 61 | Args: 62 | input_ids: A list of token IDs. 63 | 64 | Returns: 65 | A list of token strings with special tokens skipped. 66 | """ 67 | return tokenizer_greek.convert_ids_to_tokens(input_ids, skip_special_tokens=True) 68 | 69 | 70 | def remove_special_tokens(input_ids: List[int]) -> List[int]: 71 | """ 72 | Removes special tokens from a list of token IDs. 73 | 74 | Args: 75 | input_ids: A list of token IDs. 76 | 77 | Returns: 78 | A new list of token IDs with special tokens removed. 79 | """ 80 | input_ids_without_special_tokens = [] 81 | for input_id in input_ids: 82 | if input_id not in tokenizer_greek.all_special_ids: 83 | input_ids_without_special_tokens.append(input_id) 84 | return input_ids_without_special_tokens 85 | 86 | 87 | 88 | def create_mask_and_tokens(input_tokens: List[str], input_ids: List[int]) -> Tuple[List[str], List[Token], Dict]: 89 | """ 90 | Creates a mask, tokens, and subword-to-word mapping from input tokens and IDs. 91 | 92 | Args: 93 | input_tokens: A list of input token strings. 94 | input_ids: A list of input token IDs. 95 | 96 | Returns: 97 | A tuple containing: 98 | - A list of booleans indicating whether each token is a subword. 99 | - A list of Token objects. 100 | - A dictionary mapping subword indices to word indices. 101 | """ 102 | mask = [] 103 | tokens = [] 104 | subword2word = {} 105 | 106 | word = 0 107 | # for each token 108 | for j, input in enumerate(zip(input_tokens, input_ids), 1): 109 | t = input[0] 110 | i = input[1] 111 | # it isn't a sub-word 112 | if not t.startswith("##"): 113 | # create a token object 114 | tokenObj = Token([t]) 115 | tokenObj.ids.append(i) 116 | tokens.append(tokenObj) 117 | mask.append(True) 118 | word = word + 1 119 | else: 120 | 121 | # add sub-words to token 122 | tokenObj.subwords.append(t) 123 | tokenObj.ids.append(i) 124 | mask.append(False) 125 | subword2word[j] = word 126 | 127 | # create text 128 | for token in tokens: 129 | token.text = create_text(token.ids) 130 | 131 | # Adding a 0-0 mapping to subword2word 132 | subword2word[0] = 0 133 | 134 | return mask, tokens, subword2word 135 | 136 | 137 | def create_dataset_and_dataloader(input_ids) -> Tuple[Dataset, DataLoader]: 138 | """ 139 | Creates a dataset and dataloader from input IDs. 140 | 141 | Args: 142 | input_ids: A list of input token IDs. 143 | 144 | Returns: 145 | A tuple containing: 146 | - A Dataset object. 147 | - A DataLoader object. 148 | """ 149 | dataset = DatasetImpl([input_ids]) 150 | dataloader = DataLoader(dataset) 151 | return dataset, dataloader 152 | 153 | 154 | 155 | class Tokenizer(AbstractProcessor): 156 | """ 157 | Tokenizer class that takes a document as an input with the text field set, tokenizes and returns a document with 158 | all fields set 159 | """ 160 | 161 | def __call__(self, doc: Document) -> Document: 162 | """ 163 | Processes a document by tokenizing its text and setting relevant fields. 164 | 165 | Args: 166 | doc: A Document object with the text field set. 167 | 168 | Returns: 169 | A Document object with the following fields set: 170 | - text: The original text stripped of accents and converted to lowercase. 171 | - input_ids: List of token IDs created from the text. 172 | - token_mask: List of booleans indicating whether each token is a subword. 
173 | - tokens: List of Token objects. 174 | - subword2word: Dictionary mapping subword indices to word indices. 175 | - dataset: A Dataset object created from the input IDs. 176 | - dataloader: A DataLoader object created from the dataset. 177 | """ 178 | # get document's text and strip accents and lowercase 179 | doc.text = strip_accents_and_lowercase(doc.text) 180 | # create ids 181 | doc.input_ids = create_ids(doc.text) 182 | # create mask and tokens 183 | doc.token_mask, doc.tokens, doc.subword2word = create_mask_and_tokens(convert_to_tokens(doc.input_ids), 184 | remove_special_tokens(doc.input_ids)) 185 | 186 | # create dataloader 187 | doc.dataset, doc.dataloader = create_dataset_and_dataloader(doc.input_ids) 188 | return doc -------------------------------------------------------------------------------- /gr_nlp_toolkit/data/util.py: -------------------------------------------------------------------------------- 1 | import string 2 | from collections import Counter 3 | from torch.utils.data import Dataset 4 | import torch 5 | import torch.nn as nn 6 | 7 | class TextVectorizer: 8 | """ 9 | Used to vectorize given text based on a learned vocabulary. 10 | After training the vocabulary on a corpus, the resulting encoding is: 11 | 0 : Padding 12 | [1 to max_vocab_size] : tokens learnt in the vocab. (This could be smaller than the actual max number provided) 13 | len(vocab)-1 : [SOS] symbol 14 | len(vocab) : OOV tokens 15 | """ 16 | def __init__(self, mode): 17 | """ 18 | vocab: dictionary (token --> index) 19 | idx2token: list (idx --> token) 20 | :param mode: Tokenization mode, either "word" or "char". 21 | """ 22 | assert mode in {"word", "char"} 23 | self.vocab = dict() 24 | self.idx2token = [] 25 | self.mode = mode 26 | 27 | def build_vocab(self, corpus, max_size=25000): 28 | """ 29 | Builds the vocabulary from a corpus of sentences. The words get encoded by 30 | count of appearances in the data. 31 | :param corpus: A list of sentences as strings. 32 | :param max_size: The max size of words that can be encoded, excluding codes for the <pad>, <SOS> & OOV tokens. 33 | """ 34 | counts = Counter() 35 | self.vocab["<pad>"] = 0 # index 0 is reserved for padding 36 | self.idx2token.append("<pad>") 37 | idx = 1 38 | if self.mode == "word": 39 | # In the case of words, we remove punctuation and split on whitespaces 40 | for line in corpus: 41 | # Remove punctuation 42 | line = line.translate(str.maketrans("", "", string.punctuation)) 43 | # Split the line on whitespaces to get the words 44 | tokens = line.split() 45 | # Update counts 46 | counts.update(tokens) 47 | # mode == "char" 48 | else: 49 | # Here we do not do any regularization, and split on every character. 50 | for line in corpus: 51 | tokens = [char for char in line] 52 | counts.update(tokens) 53 | # Add the most frequent tokens to the vocabulary. 54 | for (name, count) in counts.most_common(max_size): 55 | self.vocab[name] = idx 56 | self.idx2token.append(name) 57 | idx += 1 58 | # Add [SOS] token. 59 | self.vocab["<SOS>"] = idx 60 | self.idx2token.append("<SOS>") 61 | 62 | 63 | def encode_dataset(self, corpus): 64 | """ 65 | Takes as input a corpus of sentences, generates source/target training pairs 66 | and encodes them based on the vocabulary. Then it returns the pairs as tuples of tensors. 67 | :param corpus: Array of sentences in the form of strings. 68 | :return: A GreekDataset of source/target torch.LongTensor pairs 69 | """ 70 | # We start by tokenizing the corpus. 71 | tokenized_dataset = [] 72 | for line in corpus: 73 | if self.mode == "word": 74 | # Strip punctuation and split on whitespaces.
75 | tokens = line.translate(str.maketrans("", "", string.punctuation)).split() 76 | else: 77 | # No regularization applied for characters. 78 | tokens = [char for char in line] 79 | # Also find the length of the longest sequence, taking into account the addition of a [SOS]/[EOS] symbol. 80 | tokenized_dataset.append(tokens) 81 | # Make source & target sentences and encode them based on the dictionary. 82 | source_vecs, target_vecs = [], [] 83 | for sequence in tokenized_dataset: 84 | # Ignore strings that may be reduced to empty after stripping punctuation & whitespaces 85 | # (only happens if mode == "word") 86 | if not sequence: 87 | continue 88 | # Initialize source vectorized sentence with the <SOS> token. 89 | source_vector = [self.vocab["<SOS>"]] 90 | target_vector = [] 91 | for idx in range(len(sequence)-1): 92 | source_vector.append(self.vocab.get(sequence[idx], len(self.vocab))) 93 | target_vector.append(self.vocab.get(sequence[idx], len(self.vocab))) 94 | target_vector.append(self.vocab.get(sequence[-1], len(self.vocab))) 95 | # Add to sources/targets. 96 | source_vecs.append(source_vector) 97 | target_vecs.append(target_vector) 98 | 99 | """# Get the length for each sequence in the data 100 | source_lengths = torch.LongTensor(list(map(len, source_vecs))) 101 | target_lengths = torch.LongTensor(list(map(len, target_vecs)))""" 102 | # Convert data to LongTensors. 103 | for i in range(len(source_vecs)): 104 | source_vecs[i] = torch.LongTensor(source_vecs[i]) 105 | target_vecs[i] = torch.LongTensor(target_vecs[i]) 106 | # Pad & Sort sequences 107 | source_tensors = nn.utils.rnn.pad_sequence(source_vecs, batch_first=True) 108 | target_tensors = nn.utils.rnn.pad_sequence(target_vecs, batch_first=True) 109 | # Create Dataset object 110 | dataset = GreekDataset(source_tensors, target_tensors) 111 | # Return the Dataset & the sequence lengths (to be used for packing) 112 | return dataset 113 | 114 | def split_sequence(self, sequence): 115 | """ 116 | Splits a sequence based on the tokenization mode configured, and returns the tokens without indexing them. 117 | """ 118 | if self.mode == "word": 119 | tokens = sequence.translate(str.maketrans("", "", string.punctuation)).split() 120 | else: 121 | tokens = [char for char in sequence] 122 | 123 | return tokens 124 | 125 | def input_tensor(self, sequence): 126 | """ 127 | Takes a sentence and returns its encoding, based on the vocabulary, to be used for inference. 128 | :param sequence: (String) The sentence to be encoded. 129 | :return: Encoded sentence in form of a torch.LongTensor object.
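Example (illustrative; exact indices depend on the learned vocabulary):
    vec = TextVectorizer("char")
    vec.build_vocab(["καλημερα"])
    vec.input_tensor("καλη")  # LongTensor of 4 character indices; unseen characters map to len(vocab)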
130 | """ 131 | if self.mode == "word": 132 | tokens = sequence.translate(str.maketrans("", "", string.punctuation)).split() 133 | else: 134 | tokens = [char for char in sequence] 135 | vectorized_input = [] 136 | for token in tokens: 137 | vectorized_input.append(self.vocab.get(token, len(self.vocab))) 138 | 139 | # Convert to tensor 140 | vectorized_input = torch.LongTensor(vectorized_input) 141 | 142 | return vectorized_input 143 | 144 | 145 | class GreekDataset(Dataset): 146 | 147 | def __init__(self, source_vecs, target_vecs): 148 | """ 149 | Gets two arrays of source and target vectors and outputs a Dataset object of those arrays 150 | :param source_vecs: array of source vectors 151 | :param target_vecs: array of target vectors 152 | """ 153 | self.n_samples = source_vecs.size(0) 154 | self.x_data = source_vecs 155 | self.y_data = target_vecs 156 | 157 | 158 | def __getitem__(self, index): 159 | return self.x_data[index], self.y_data[index] 160 | 161 | def __len__(self): 162 | return self.n_samples -------------------------------------------------------------------------------- /gr_nlp_toolkit/domain/textVectorizer.py: -------------------------------------------------------------------------------- 1 | import string 2 | from collections import Counter 3 | from torch.utils.data import Dataset 4 | import torch 5 | import torch.nn as nn 6 | import pickle 7 | 8 | class TextVectorizer: 9 | """ 10 | Used to vectorize given text based on a learned vocabulary. 11 | It is used by the RBNLM model to encode the text into numbers that can be then fed into the LSTM model. 12 | After training the vocabulary on a corpus, the resulting encoding is: 13 | 1 : Padding 14 | [1 to max_vocab_size+1] : tokens learnt in the vocab. (This could be smaller than the actual max number provided) 15 | len(vocab)-1 : [SOS] symbol 16 | len(vocab) : OOV tokens 17 | 18 | Attributes: 19 | vocab (dict): A dictionary mapping tokens to their indices. (token --> index) 20 | idx2token (list): A list mapping indices to tokens. (idx --> index) 21 | mode (str): The tokenization mode, either 'word' or 'char'. 22 | """ 23 | def __init__(self, mode): 24 | """ 25 | Initializes the TextVectorizer with the specified mode. 26 | 27 | Args: 28 | mode (str): The tokenization mode, either 'word' or 'char'. 29 | """ 30 | assert mode in {"word", "char"} 31 | self.vocab = dict() 32 | self.idx2token = [] 33 | self.mode = mode 34 | 35 | def build_vocab(self, corpus, max_size=25000): 36 | """ 37 | Builds the vocabulary from a corpus of sentences. The words get encoded by 38 | count of appearances in the data. 39 | 40 | Args: 41 | corpus (str): A list of sentences as strings. 42 | max_size (int): The max size of words that can be encoded, excluding codes for & OOV tokens. 43 | """ 44 | counts = Counter() 45 | self.vocab[""] = 0 46 | self.idx2token.append("") 47 | idx = 1 48 | if self.mode == "word": 49 | # In the case of words, we remove punctuation and split on whitespaces 50 | for line in corpus: 51 | # Remove punctuation 52 | line = line.translate(str.maketrans("", "", string.punctuation)) 53 | # Split the line in whitespaces to get the words 54 | tokens = line.split() 55 | # Update counts 56 | counts.update(tokens) 57 | # mode == "char" 58 | else: 59 | # Here we do not do any regularization, and split on every character. 60 | for line in corpus: 61 | tokens = [char for char in line] 62 | counts.update(tokens) 63 | # Add the most frequent tokens to the vocabulary. 
64 | for (name, count) in counts.most_common(max_size): 65 | self.vocab[name] = idx 66 | self.idx2token.append(name) 67 | idx += 1 68 | # Add [SOS] token. 69 | self.vocab["<SOS>"] = idx 70 | self.idx2token.append("<SOS>") 71 | 72 | 73 | 74 | def encode_dataset(self, corpus): 75 | """ 76 | Takes as input a corpus of sentences, generates source/target training pairs 77 | and encodes them based on the vocabulary. Then it returns the pairs as tuples of tensors. 78 | 79 | Args: 80 | corpus (list): Array of sentences in the form of strings. 81 | 82 | Returns: 83 | dataset (GreekDataset): A Dataset of padded source/target torch.LongTensor pairs 84 | """ 85 | # We start by tokenizing the corpus. 86 | tokenized_dataset = [] 87 | for line in corpus: 88 | if self.mode == "word": 89 | # Strip punctuation and split on whitespaces. 90 | tokens = line.translate(str.maketrans("", "", string.punctuation)).split() 91 | else: 92 | # No regularization applied for characters. 93 | tokens = [char for char in line] 94 | # Also find the length of the longest sequence, taking into account the addition of a [SOS]/[EOS] symbol. 95 | tokenized_dataset.append(tokens) 96 | # Make source & target sentences and encode them based on the dictionary. 97 | source_vecs, target_vecs = [], [] 98 | for sequence in tokenized_dataset: 99 | # Ignore strings that may be reduced to empty after stripping punctuation & whitespaces 100 | # (only happens if mode == "word") 101 | if not sequence: 102 | continue 103 | # Initialize source vectorized sentence with the <SOS> token. 104 | source_vector = [self.vocab["<SOS>"]] 105 | target_vector = [] 106 | for idx in range(len(sequence)-1): 107 | source_vector.append(self.vocab.get(sequence[idx], len(self.vocab))) 108 | target_vector.append(self.vocab.get(sequence[idx], len(self.vocab))) 109 | target_vector.append(self.vocab.get(sequence[-1], len(self.vocab))) 110 | # Add to sources/targets. 111 | source_vecs.append(source_vector) 112 | target_vecs.append(target_vector) 113 | 114 | """# Get the length for each sequence in the data 115 | source_lengths = torch.LongTensor(list(map(len, source_vecs))) 116 | target_lengths = torch.LongTensor(list(map(len, target_vecs)))""" 117 | # Convert data to LongTensors. 118 | for i in range(len(source_vecs)): 119 | source_vecs[i] = torch.LongTensor(source_vecs[i]) 120 | target_vecs[i] = torch.LongTensor(target_vecs[i]) 121 | # Pad & Sort sequences 122 | source_tensors = nn.utils.rnn.pad_sequence(source_vecs, batch_first=True) 123 | target_tensors = nn.utils.rnn.pad_sequence(target_vecs, batch_first=True) 124 | # Create Dataset object 125 | dataset = GreekDataset(source_tensors, target_tensors) 126 | # Return the Dataset & the sequence lengths (to be used for packing) 127 | return dataset 128 | 129 | def split_sequence(self, sequence): 130 | """ 131 | Splits a sequence based on the tokenization mode configured, and returns the tokens without indexing them. 132 | 133 | Args: 134 | sequence (str): The sentence to be split. 135 | 136 | Returns: 137 | tokens (list): The sequence split based on the tokenization mode. 138 | """ 139 | if self.mode == "word": 140 | tokens = sequence.translate(str.maketrans("", "", string.punctuation)).split() 141 | else: 142 | tokens = [char for char in sequence] 143 | 144 | return tokens 145 | 146 | def input_tensor(self, sequence): 147 | """ 148 | Takes a sentence and returns its encoding, based on the vocabulary, to be used for inference. 149 | 150 | Args: 151 | sequence (String): The sentence to be encoded.
152 | 153 | Returns: 154 | vectorized_input (torch.LongTensor): Encoded sentence in the form of a torch.LongTensor object. 155 | """ 156 | if self.mode == "word": 157 | tokens = sequence.translate(str.maketrans("", "", string.punctuation)).split() 158 | else: 159 | tokens = [char for char in sequence] 160 | vectorized_input = [] 161 | for token in tokens: 162 | vectorized_input.append(self.vocab.get(token, len(self.vocab))) 163 | 164 | # Convert to tensor 165 | vectorized_input = torch.LongTensor(vectorized_input) 166 | 167 | return vectorized_input -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `gr-nlp-toolkit` 2 | 3 |

4 | ![gr-nlp-toolkit Logo](logo.png) 5 |

6 | 7 | `gr-nlp-toolkit` is a Python toolkit with state-of-the-art performance in (modern) Greek, supporting the following functionalities: 8 | 1. Named Entity Recognition (NER) 9 | 2. Part-of-Speech Tagging (POS Tagging) 10 | 3. Morphological tagging 11 | 4. Dependency parsing 12 | 5. Greeklish to Greek transliteration ("kalimera" -> "καλημερα") 13 | 14 | ## Web Demo 🤗 15 | 16 | Apart from the Python library (details below), you can also interact with `gr-nlp-toolkit` in a no-code fashion by visiting our web playground here: https://huggingface.co/spaces/AUEB-NLP/greek-nlp-toolkit-demo 17 | 18 | Thanks to HuggingFace 🤗 for the GPUs. 19 | 20 | ## Installation 21 | The toolkit supports Python 3.9+. 22 | 23 | You can install it from PyPI by executing the following in the command line: 24 | 25 | ```sh 26 | pip install gr-nlp-toolkit 27 | ``` 28 | 29 | ## Usage 30 | 31 | ### Available Processors/Features 32 | 33 | To use the toolkit, first initialize a `Pipeline`, specifying which task processors you need. Each processor 34 | annotates the text with a specific task's annotations. 35 | 36 | For example: 37 | - To obtain Part-of-Speech and Morphological Tagging annotations, add the `pos` processor 38 | - To obtain Named Entity Recognition annotations, add the `ner` processor 39 | - To obtain Dependency Parsing annotations, add the `dp` processor 40 | - To enable the transliteration from Greeklish to Greek, add the `g2g` processor, or the `g2g_lite` processor for a lighter but less accurate model 41 | (Greeklish to Greek transliteration example: "thessalonikh" -> "θεσσαλονίκη") 42 | 43 | ### Example Usage Scenarios 44 | 45 | - DP, POS, NER processors (input text in Greek) 46 | 47 | ```python 48 | from gr_nlp_toolkit import Pipeline 49 | 50 | nlp = Pipeline("pos,ner,dp") # Instantiate the Pipeline with the DP, POS and NER processors 51 | doc = nlp("Η Ιταλία κέρδισε την Αγγλία στον τελικό του Euro 2020.") # Apply the pipeline to a sentence in Greek 52 | 53 | ``` 54 | 55 | A `Document` object is created and annotated. The original text is tokenized 56 | and split into tokens. 57 | 58 | ```python 59 | # Iterate over the generated tokens 60 | for token in doc.tokens: 61 | print(token.text) # the text of the token 62 | 63 | print(token.ner) # the named entity label in IOBES encoding : str 64 | 65 | print(token.upos) # the UPOS tag of the token 66 | print(token.feats) # the morphological features for the token 67 | 68 | print(token.head) # the head of the token 69 | print(token.deprel) # the dependency relation between the current token and its head 70 | ``` 71 | 72 | `token.ner` is set by the `ner` processor, `token.upos` and `token.feats` are set by the `pos` processor, 73 | and `token.head` and `token.deprel` are set by the `dp` processor. 74 | 75 | One small detail: to get the `Token` object that is the head of another token, you need to access 76 | `doc.tokens[head-1]`. The reason is that the enumeration of the tokens starts from 1, and when the 77 | field `token.head` is set to 0, the token is the root of the sentence.
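A minimal sketch of that convention (the `head_token` helper below is hypothetical, not part of the toolkit's API):

```python
# token.head == 0 means the token is the root of the sentence
def head_token(doc, token):  # hypothetical helper, not part of gr-nlp-toolkit
    return None if token.head == 0 else doc.tokens[token.head - 1]

for token in doc.tokens:
    head = head_token(doc, token)
    print(token.text, "->", head.text if head else "ROOT", f"({token.deprel})")
```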
78 | 79 | - Greeklish to Greek Conversion (input text in Greeklish) 80 | 81 | ```python 82 | from gr_nlp_toolkit import Pipeline 83 | nlp = Pipeline("g2g") # Instantiate the pipeline with the g2g processor 84 | 85 | doc = nlp("O Volos kai h Larisa einai sth Thessalia") # Apply the pipeline to a sentence in Greeklish 86 | print(doc.text) # Access the transliterated text, which is "ο Βόλος και η Λάρισα είναι στη Θεσσαλία" 87 | ``` 88 | - Use all the processors together (input text in Greeklish) 89 | 90 | ```python 91 | from gr_nlp_toolkit import Pipeline 92 | nlp = Pipeline("pos,ner,dp,g2g") # Instantiate the Pipeline with the G2G, DP, POS and NER processors 93 | 94 | doc = nlp("O Volos kai h Larisa einai sthn Thessalia") # Apply the pipeline to a sentence in Greeklish 95 | 96 | print(doc.text) # Print the transliterated text 97 | 98 | # Iterate over the generated tokens 99 | for token in doc.tokens: 100 | print(token.text) # the text of the token 101 | 102 | print(token.ner) # the named entity label in IOBES encoding : str 103 | 104 | print(token.upos) # the UPOS tag of the token 105 | print(token.feats) # the morphological features for the token 106 | 107 | print(token.head) # the head of the token 108 | print(token.deprel) # the dependency relation between the current token and its head 109 | ``` 110 | 111 | ## Paper 112 | The software was presented as a paper at COLING 2025. 113 | Read the full technical report/paper here: [https://aclanthology.org/2025.coling-demos.17/](https://aclanthology.org/2025.coling-demos.17/) 114 | 115 | If you use our toolkit, please cite it: 116 | ```bibtex 117 | @inproceedings{loukas-etal-coling2025-greek-nlp-toolkit, 118 | title = "{GR}-{NLP}-{TOOLKIT}: An Open-Source {NLP} Toolkit for {M}odern {G}reek", 119 | author = "Loukas, Lefteris and 120 | Smyrnioudis, Nikolaos and 121 | Dikonomaki, Chrysa and 122 | Barbakos, Spiros and 123 | Toumazatos, Anastasios and 124 | Koutsikakis, John and 125 | Kyriakakis, Manolis and 126 | Georgiou, Mary and 127 | Vassos, Stavros and 128 | Pavlopoulos, John and 129 | Androutsopoulos, Ion", 130 | editor = "Rambow, Owen and 131 | Wanner, Leo and 132 | Apidianaki, Marianna and 133 | Al-Khalifa, Hend and 134 | Eugenio, Barbara Di and 135 | Schockaert, Steven and 136 | Mather, Brodie and 137 | Dras, Mark", 138 | booktitle = "Proceedings of the 31st International Conference on Computational Linguistics: System Demonstrations", 139 | month = jan, 140 | year = "2025", 141 | address = "Abu Dhabi, UAE", 142 | publisher = "Association for Computational Linguistics", 143 | url = "https://aclanthology.org/2025.coling-demos.17/", 144 | pages = "174--182", 145 | } 146 | ``` 147 | 148 | ---- 149 | ### Technical Notes: 150 | 151 | - The *first* time you use a processor, the models are downloaded from Hugging Face and stored into the .cache folder. The NER, DP and POS processors are each about 500 MB, while the G2G processor is about 1.2 GB in size. 152 | - If the input text is already in Greek, the G2G (Greeklish-to-Greek) processor is skipped. 153 | - If your machine has an accelerator but you want to run the process on the CPU, you can pass the flag `use_cpu=True` to the Pipeline object. By default, `use_cpu` is set to *False*. 
154 | - The Greeklish-to-Greek transliteration processor (ByT5) weights can be found in HuggingFace: [https://huggingface.co/AUEB-NLP/ByT5_g2g](https://huggingface.co/AUEB-NLP/ByT5_g2g) 155 | - The NER/POS/DP processors/weights can be found in HuggingFace: [https://huggingface.co/AUEB-NLP/gr-nlp-toolkit](https://huggingface.co/AUEB-NLP/gr-nlp-toolkit) 156 | 157 | ## References 158 | While many methodology details are shared in the [GR-NLP-TOOLKIT paper publication @ COLING 2025 (see above)](https://arxiv.org/abs/2412.08520), additional research details can be found here: 159 | 1. C. Dikonimaki, "A Transformer-based natural language processing toolkit for Greek -- Part of speech tagging and dependency parsing", BSc thesis, Department of Informatics, Athens University of Economics and Business, 2021. http://nlp.cs.aueb.gr/theses/dikonimaki_bsc_thesis.pdf *(POS/DP/Morphological tagging processor)* 160 | 161 | 2. N. Smyrnioudis, "A Transformer-based natural language processing toolkit for Greek -- Named entity recognition and multi-task learning", BSc thesis, Department of Informatics, Athens University of Economics and Business, 2021. http://nlp.cs.aueb.gr/theses/smyrnioudis_bsc_thesis.pdf *(NER processor)* 162 | 163 | 3. A. Toumazatos, J. Pavlopoulos, I. Androutsopoulos, & S. Vassos, "Still All Greeklish to Me: Greeklish to Greek Transliteration." In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024) (pp. 15309–15319). https://aclanthology.org/2024.lrec-main.1330/ *(Greeklish-to-Greek processor)* 164 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
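As a final usage note on the `use_cpu` flag from the Technical Notes above (a small sketch; the processor string is arbitrary):

```python
from gr_nlp_toolkit import Pipeline

# Force CPU execution even when a CUDA/MPS accelerator is available
nlp = Pipeline("ner", use_cpu=True)
doc = nlp("Η Αθήνα είναι η μεγαλύτερη πόλη της Ελλάδας")
```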
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 AUEB NLP Group 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/g2g_RBNLM_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | from gr_nlp_toolkit.configs.dictionary_tables import greeklish_to_greek_intonated
5 | 
6 | 
7 | class LSTM_LangModel(nn.Module):
8 |     """
9 |     LSTM-based character language model.
10 | 
11 |     Attributes:
12 |         hidden_size (int): The size of the hidden layer
13 |         embed (nn.Embedding): The embedding layer
14 |         lstm (nn.LSTM): The LSTM layer
15 |         dense (nn.Linear): The dense (output) layer
16 |         dropout (nn.Dropout): The dropout layer
17 |     """
18 |     def __init__(self, input_size, embed_size, hidden_size, output_size):
19 |         """
20 |         Initializes the LSTM_LangModel with the specified parameters.
21 | 
22 |         Args:
23 |             input_size (int): The size of the input layer (vocabulary size)
24 |             embed_size (int): The size of the embedding layer
25 |             hidden_size (int): The size of the hidden layer
26 |             output_size (int): The size of the output layer
27 |         """
28 |         super().__init__()
29 |         self.hidden_size = hidden_size
30 | 
31 |         self.embed = nn.Embedding(input_size, embed_size, padding_idx=0)
32 |         self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
33 |         self.dense = nn.Linear(hidden_size, output_size)
34 |         self.dropout = nn.Dropout(0.5)
35 | 
36 |     def forward(self, x, h0=None, c0=None):
37 |         """
38 |         Forward pass of the LSTM_LangModel.
39 | 
40 |         Args:
41 |             x (Tensor): The input tensor
42 |             h0 (Tensor): The initial hidden state
43 |             c0 (Tensor): The initial cell state
44 | 
45 |         Returns:
46 |             output (Tensor): The output tensor
47 |             h (Tensor): The final hidden state
48 |             c (Tensor): The final cell state
49 |         """
50 |         input_embedded = self.embed(x)
51 |         if h0 is None and c0 is None:
52 |             output_lstm, (h, c) = self.lstm(input_embedded)
53 |         else:
54 |             output_lstm, (h, c) = self.lstm(input_embedded, (h0, c0))
55 |         output = self.dropout(output_lstm)
56 |         output = self.dense(output)
57 |         return output, h, c
58 | 
59 | 
60 | class State:
61 |     """
62 |     Container class for the attributes of each candidate replacement.
63 | 
64 |     Attributes:
65 |         translated (list): List of tokens already translated to the desired language.
66 |         remaining (str): The remaining sentence to be translated.
67 |         out (Tensor): The last output of the LSTM for that particular state. Contains
68 |             logits that become probabilities with the application of a softmax.
69 |         hidden (tuple): Contains the hidden states of the candidate.
70 |         score (float): The score given to the translation based on the language model.
71 |     """
72 | 
73 |     def __init__(self, translated, remaining, out, hidden, score):
74 |         """
75 |         Initializes the State object with the specified parameters.
76 | 
77 |         Args:
78 |             translated (list): List of tokens already translated to the desired language.
79 |             remaining (str): The remaining sentence to be translated.
80 |             out (Tensor): The last output of the LSTM for that particular state. Contains
81 |                 logits that become probabilities with the application of a softmax.
82 |             hidden (tuple): Contains the hidden states of the candidate.
83 |             score (float): The score given to the translation based on the language model.
86 | """ 87 | self.translated = translated 88 | self.remaining = remaining 89 | self.hidden = hidden 90 | self.output = out 91 | self.score = score 92 | 93 | def __eq__(self, other): 94 | """ 95 | Equality operator, needed for eliminating duplicate states. 96 | """ 97 | if isinstance(other, State): 98 | if (self.translated == other.translated and 99 | self.remaining == other.remaining and 100 | self.score == other.score): 101 | return True 102 | 103 | return False 104 | 105 | class LanguageModel: 106 | """ 107 | Language model for Greeklish to Greek conversion. 108 | 109 | Attributes: 110 | vectorizer (TextVectorizer): The vectorizer used to convert tokens to indices. 111 | model (LSTM_LangModel): The language model used for translation. 112 | device (str): The device to run the model on. 113 | softmax (nn.LogSoftmax): The log version of the softmax function. 114 | """ 115 | 116 | def __init__(self, vectorizer, model, device='cpu'): 117 | """ 118 | Initializes the LanguageModel with the specified parameters. 119 | 120 | Args: 121 | vectorizer: (TextVectorizer) The vectorizer used to convert tokens to indices. 122 | model: (LSTM_LangModel) The language model used for translation. 123 | device: (str) The device to run the model on. 124 | """ 125 | self.vectorizer = vectorizer 126 | self.model = model 127 | self.mode = vectorizer.mode 128 | self.device = torch.device(device) 129 | self.model.to(self.device) 130 | 131 | # Use the log version of Softmax to sum scores instead of multiplying them and avoid decay. 132 | self.softmax = nn.LogSoftmax(dim=1) 133 | 134 | def load_model(self, path): 135 | """ 136 | Load a pre-trained model as a state dictionary. 137 | 138 | Args: 139 | path (str): The path to the pre-trained model. 140 | """ 141 | self.model.load_state_dict(torch.load(path, weights_only=True)) 142 | 143 | def translate(self, sentences, beams): 144 | """ 145 | Takes a list of sentences and translates them. 146 | 147 | Args: 148 | sentences: (list) Sentences you want to translate 149 | beams: (int) The number of parameters 150 | 151 | Returns: 152 | translated_sentences: (list) Translated sentences 153 | 154 | """ 155 | # Don't forget to put the model in eval mode. 156 | self.model.eval() 157 | with torch.no_grad(): 158 | translated_sentences = [] 159 | for sentence in sentences: 160 | translated = [] 161 | remaining = sentence 162 | # We start with the first state, with a score of 1 163 | # The format of a state is: (translated_sent, remaining_sent, (h0, c0), score) 164 | # -------------------------------------------------------------------------------- 165 | # First, we need to "prep" the char model. This is done by feeding the network with the 166 | # token and saving the output hidden states for the initial State() object. 167 | start_input = self.vectorizer.input_tensor("") 168 | out, h_n, c_n = self.model(start_input.to(self.device), None, None) 169 | # The score of the initial state in 0, because we use LogSoftmax instead of regular Softmax. 170 | initial_state = State(translated, remaining, out, (h_n, c_n), 0) 171 | states = [initial_state] 172 | for i in range(len(sentence)): 173 | candidates = [] 174 | # Look through the current states. 175 | for state in states: 176 | # Produce the next-char-candidates from each state, along with their probabilities. 
175 |                         new_states = self.get_candidates(state)
176 |                         candidates += new_states
177 | 
178 |                     # Remove any duplicate candidates.
179 |                     unique_candidates = []
180 |                     for cand in candidates:
181 |                         if cand not in unique_candidates:
182 |                             unique_candidates.append(cand)
183 |                     candidates = unique_candidates
184 |                     # Keep the best states, according to the number of beams in the search.
185 |                     best_candidates = []
186 |                     for _ in range(beams):
187 |                         if candidates:
188 |                             # Scores of each candidate new state.
189 |                             scores = [cand.score for cand in candidates]
190 |                             # Pop the best candidate off the list...
191 |                             best_cand = candidates.pop(scores.index(max(scores)))
192 |                             # ...and add it to the list of best candidates.
193 |                             best_candidates.append(best_cand)
194 |                     # Make the list of the best candidates the new states.
195 |                     states = best_candidates
196 | 
197 |                 # Once the sentence is consumed, the state with the highest score
198 |                 # holds the best translation.
199 |                 scores = [state.score for state in states]
200 |                 best_state = states[scores.index(max(scores))]
201 |                 # Join the list of translated tokens into a sentence.
202 |                 translation = "".join(best_state.translated)
203 |                 translated_sentences.append(translation)
204 |         return translated_sentences
205 | 
206 |     def get_candidates(self, state):
207 |         """
208 |         Get the next candidate states for the translation.
209 | 
210 |         Args:
211 |             state (State): The current state of the translation.
212 | 
213 |         Returns:
214 |             candidates (list): A list of the next candidate states.
215 |         """
216 |         # If the state is already a final state (no remaining text to translate),
217 |         # it returns only itself as a candidate.
218 |         if not state.remaining:
219 |             return [state]
220 | 
221 |         # If it is not a final state, generate the next candidate states.
222 |         candidates = []
223 | 
224 |         # Look at both the first character and the first two characters of the
225 |         # remaining sentence, as some Greek characters are represented by two
226 |         # characters in Greeklish.
227 |         for length in [1, 2]:
228 |             if len(state.remaining) >= length:
229 |                 # Fetch the valid replacements from the dictionary.
230 |                 if length == 2:
231 |                     token = state.remaining[0] + state.remaining[1]
232 |                     replacements = greeklish_to_greek_intonated.get(token, [])
233 |                 else:
234 |                     # If the look-up is a miss (e.g. the token is a space, a number
235 |                     # or punctuation), fall back to the token itself.
236 |                     token = state.remaining[0]
237 |                     replacements = greeklish_to_greek_intonated.get(token, [token])
238 | 
239 |                 # For each candidate replacement, accumulate its score under the LM.
240 |                 for item in replacements:
241 |                     h_n, c_n = state.hidden[0], state.hidden[1]
242 |                     out = state.output
243 |                     score = state.score
244 |                     for char in item:
245 |                         # Apply softmax to the model's output and read off the
246 |                         # log-probability of the character at its vocabulary index.
247 |                         probs = self.softmax(out)
248 |                         idx = self.vectorizer.vocab.get(char, len(self.vectorizer.vocab))
249 |                         # Update the running score.
250 |                         score = score + probs[0][idx].item()
251 |                         # Feed the character to the model to get the next output and hidden states.
252 |                         char_input = self.vectorizer.input_tensor(char)
253 |                         out, h_n, c_n = self.model(char_input.to(self.device), h_n, c_n)
254 | 
255 |                     translated_tokens = list(item)
256 | 
257 |                     new_candidate = State(state.translated + translated_tokens,
258 |                                           state.remaining[length:],
259 |                                           out, (h_n, c_n), score)
260 |                     candidates.append(new_candidate)
261 | 
262 |         return candidates
--------------------------------------------------------------------------------
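
For reference, here is a minimal usage sketch of the beam-search transliterator above. It is a hypothetical example, not part of the toolkit's documented API: the weight paths come from the repository tree, but the embedding/hidden sizes are assumptions read off the checkpoint file name (32 and 512), the extra output slot mirrors the out-of-vocabulary index used in get_candidates, and the sample input/output is illustrative only.

import pickle

from gr_nlp_toolkit.models.g2g_RBNLM_model import LSTM_LangModel, LanguageModel

# Unpickling the vectorizer requires gr_nlp_toolkit to be importable, since the
# pickle references the toolkit's TextVectorizer class.
with open("gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# ASSUMPTION: these sizes must match the shipped checkpoint; 32 and 512 are
# inferred from the checkpoint file name, not confirmed values. One extra slot
# is reserved because get_candidates() maps unknown characters to len(vocab).
vocab_size = len(vectorizer.vocab) + 1
model = LSTM_LangModel(input_size=vocab_size, embed_size=32,
                       hidden_size=512, output_size=vocab_size)

lm = LanguageModel(vectorizer, model, device="cpu")
lm.load_model("gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt")

# Wider beams keep more partial transliterations alive per character, trading
# speed for output quality.
print(lm.translate(["kalhmera"], beams=5))  # e.g. ['καλημέρα'] (model-dependent)

In the toolkit itself this wiring presumably happens inside the g2g processor; the sketch only isolates the moving parts of the RBNLM path.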