├── tests
├── __init__.py
├── domain
│ ├── __init__.py
│ └── test_token.py
├── test_data
│ ├── __init__.py
│ └── test_processor_cache.py
├── test_pipeline
│ ├── __init__.py
│ └── test_pipeline.py
├── test_processors
│ ├── __init__.py
│ ├── test_g2g.py
│ ├── test_ner.py
│ ├── test_dp.py
│ ├── test_pos.py
│ └── test_tokenizer.py
└── guide_for_testing.md
├── gr_nlp_toolkit
├── data
│ ├── __init__.py
│ ├── downloader.py
│ ├── downloader_stub.py
│ ├── downloader_gdrive.py
│ ├── processor_cache.py
│ └── util.py
├── domain
│ ├── __init__.py
│ ├── dataset.py
│ ├── document.py
│ ├── token.py
│ └── textVectorizer.py
├── models
│ ├── __init__.py
│ ├── util.py
│ ├── g2g_transformer_model.py
│ ├── ner_model.py
│ ├── pos_model.py
│ ├── dp_model.py
│ └── g2g_RBNLM_model.py
├── configs
│ ├── __init__.py
│ ├── dp_labels.py
│ ├── ner_labels.py
│ ├── pos_labels.py
│ └── dictionary_tables.py
├── pipeline
│ ├── __init__.py
│ └── pipeline.py
├── processors
│ ├── __init__.py
│ ├── abstract_processor.py
│ ├── ner.py
│ ├── dp.py
│ ├── pos.py
│ ├── g2g.py
│ └── tokenizer.py
├── __init__.py
└── RBNLM_weights
  ├── RBNLMtextVectorizer.pkl
  └── LSTM_LM_50000_char_120_32_512.pt
├── logo.png
├── requirements.txt
├── pyproject.toml
├── setup.py
├── .gitignore
├── README.md
└── LICENSE
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/domain/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_processors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/configs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpaueb/gr-nlp-toolkit/HEAD/logo.png
--------------------------------------------------------------------------------
/gr_nlp_toolkit/__init__.py:
--------------------------------------------------------------------------------
1 | from gr_nlp_toolkit.pipeline.pipeline import Pipeline
--------------------------------------------------------------------------------
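
The line above re-exports Pipeline as the package's public entry point. A minimal usage sketch (an illustration, assuming the Pipeline constructor accepts a comma-separated string of processor names as described in the project README; the token attributes printed below are the ones defined in gr_nlp_toolkit/domain/token.py):

```python
# Usage sketch (assumption: Pipeline("dp,pos,ner") selects the dependency-parsing,
# POS-tagging and NER processors, as in the project README).
from gr_nlp_toolkit import Pipeline

nlp = Pipeline("dp,pos,ner")              # loads (and, if needed, downloads) the processors
doc = nlp("Ο ποιητής έγραψε ένα ποίημα")  # returns an annotated Document

for token in doc.tokens:
    # token-level annotations filled in by the selected processors
    print(token.text, token.ner, token.upos, token.feats, token.head, token.deprel)
```
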
/requirements.txt:
--------------------------------------------------------------------------------
1 | huggingface_hub==0.23.5
2 | torch==2.4.0
3 | transformers==4.44.0
4 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpaueb/gr-nlp-toolkit/HEAD/gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl
--------------------------------------------------------------------------------
/gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpaueb/gr-nlp-toolkit/HEAD/gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/downloader.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 |
4 | class Downloader(ABC):
5 |
6 | @abstractmethod
7 | def download_processor(self, processor_name: str, target_path: str):
8 | pass
9 |
--------------------------------------------------------------------------------
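
Downloader is the single abstraction the cache (gr_nlp_toolkit/data/processor_cache.py) depends on: one method that fetches a processor file into a target path. An illustrative implementation, not part of the repo, backed by plain HTTP(S):

```python
# Hypothetical Downloader implementation (illustration only): fetches each
# processor file over HTTP(S) into the path chosen by ProcessorCache.
import urllib.request

from gr_nlp_toolkit.data.downloader import Downloader


class HTTPDownloader(Downloader):
    def __init__(self, urls: dict):
        # urls maps a processor name (e.g. 'ner') to a downloadable URL
        self.urls = urls

    def download_processor(self, processor_name: str, target_path: str):
        # Stream the remote file to disk
        urllib.request.urlretrieve(self.urls[processor_name], target_path)
```
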
/gr_nlp_toolkit/processors/abstract_processor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | from gr_nlp_toolkit.domain.document import Document
4 |
5 |
6 | class AbstractProcessor(ABC):
7 | @abstractmethod
8 | def __call__(self, doc : Document):
9 | pass
--------------------------------------------------------------------------------
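
Every pipeline step (tokenizer, NER, POS, DP, G2G) implements this one-method interface: take a Document, annotate it in place, return it. A minimal illustrative processor (hypothetical, not part of the repo):

```python
# Hypothetical processor that satisfies the AbstractProcessor interface:
# it lower-cases the document text and returns the same Document object.
from gr_nlp_toolkit.domain.document import Document
from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor


class LowercaseProcessor(AbstractProcessor):
    def __call__(self, doc: Document) -> Document:
        doc.text = doc.text.lower()
        return doc
```
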
/gr_nlp_toolkit/data/downloader_stub.py:
--------------------------------------------------------------------------------
1 | from gr_nlp_toolkit.data.downloader import Downloader
2 | import os
3 |
4 |
5 | class DownloaderStub(Downloader):
6 | def download_processor(self, processor_name: str, target_path: str):
7 | with open(target_path, 'wb'):
8 | pass
--------------------------------------------------------------------------------
/tests/domain/test_token.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from gr_nlp_toolkit.domain.token import Token
4 |
5 |
6 | class MyTestCase(unittest.TestCase):
7 | def test_new_token_object(self):
8 | token = Token(['α'])
9 | self.assertEqual(['α'], token.subwords)
10 |
11 |
12 | if __name__ == '__main__':
13 | unittest.main()
14 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/configs/dp_labels.py:
--------------------------------------------------------------------------------
1 | # The labels for the output of the dp model.
2 | # A string label can be obtained by an output index
3 | dp_labels = \
4 | ['obl',
5 | 'obj',
6 | 'dep',
7 | 'mark',
8 | 'case',
9 | 'flat',
10 | 'nummod',
11 | 'obl:arg',
12 | 'punct',
13 | 'cop',
14 | 'acl:relcl',
15 | 'expl',
16 | 'nsubj',
17 | 'csubj:pass',
18 | 'root',
19 | 'advmod',
20 | 'nsubj:pass',
21 | 'ccomp',
22 | 'conj',
23 | 'amod',
24 | 'xcomp',
25 | 'aux',
26 | 'appos',
27 | 'csubj',
28 | 'fixed',
29 | 'nmod',
30 | 'iobj',
31 | 'parataxis',
32 | 'orphan',
33 | 'det',
34 | 'advcl',
35 | 'vocative',
36 | 'compound',
37 | 'cc',
38 | 'discourse',
39 | 'acl',
40 | 'obl:agent']
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/downloader_gdrive.py:
--------------------------------------------------------------------------------
1 | from gr_nlp_toolkit.data.downloader import Downloader
2 | import gdown
3 |
4 |
5 | class GDriveDownloader(Downloader):
6 | def __init__(self):
7 | self.urls = {
8 | 'pos': 'https://drive.google.com/uc?id=1Or5HDk1kVnxI3_w0fwgR8-dzO0jvcc_L', # pos link
9 | 'ner': 'https://drive.google.com/uc?id=1fx0pHtcN7F2Vj9L8y5TUpbjSqKTUaT3i', # ner link
10 | 'dp': 'https://drive.google.com/uc?id=1NhEqmLBf67Ydw-LdI7eB-f0afMPgNSmG' # dp link
11 | }
12 |
13 | def download_processor(self, processor_name: str, target_path: str):
14 | gdown.download(self.urls[processor_name], output=target_path, quiet=False)
15 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/dataset.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 |
4 |
5 | class DatasetImpl(Dataset):
6 | def __init__(self, input_ids):
7 | self._input_ids = input_ids
8 |
9 | def __getitem__(self, index):
10 | return {
11 | "input": [
12 | torch.tensor(self._input_ids[index], dtype=torch.long),
13 | torch.tensor(len(self._input_ids[index])),
14 | ]
15 | }
16 |
17 | def __len__(self):
18 | return 1
19 |
20 | @property
21 | def input_ids(self):
22 | return self._input_ids
23 |
24 | @input_ids.setter
25 | def input_ids(self, value):
26 | self._input_ids = value
27 |
--------------------------------------------------------------------------------
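
DatasetImpl wraps a single tokenized sentence so the processors can read it back through a standard PyTorch DataLoader as an (input_ids, length) pair; the toolkit's own wiring lives in the tokenizer's create_dataset_and_dataloader helper. A small stand-alone sketch:

```python
# Sketch: wrap one BERT-tokenized sentence in DatasetImpl and read it back
# through a DataLoader, mirroring how processors consume doc.dataloader.
from torch.utils.data import DataLoader

from gr_nlp_toolkit.domain.dataset import DatasetImpl

input_ids = [[101, 247, 6981, 102]]        # one sentence, already converted to ids
dataset = DatasetImpl(input_ids)
dataloader = DataLoader(dataset, batch_size=1)

ids, length = next(iter(dataloader))["input"]
print(ids.shape, length)                   # torch.Size([1, 4]) tensor([4])
```
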
/gr_nlp_toolkit/models/util.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def create_mask_from_length(length_tensor, mask_size):
5 |
6 | """
7 | Creates a binary mask based on length.
8 |
9 | Args:
10 | length_tensor (torch.Tensor): ND Tensor containing the lengths.
11 | mask_size (int): Integer specifying the mask size. Usually the largest length in the batch
12 |
13 | Returns:
14 | torch.Tensor: (N+1)D boolean tensor of shape (..., mask_size) containing the binary mask.
15 | """
16 |
17 | mask = torch.arange(0, mask_size, dtype=torch.int, device=length_tensor.device)
18 |
19 | mask = mask.int().view([1] * (len(length_tensor.shape)) + [-1])
20 |
21 | return mask < length_tensor.int().unsqueeze(-1)
22 |
23 |
--------------------------------------------------------------------------------
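
A quick worked check of create_mask_from_length: lengths [2, 4] with mask_size 4 give a 2x4 boolean mask with True on the first `length` positions of each row.

```python
# Worked example for create_mask_from_length
import torch

from gr_nlp_toolkit.models.util import create_mask_from_length

mask = create_mask_from_length(torch.tensor([2, 4]), 4)
print(mask)
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])
```
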
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 | long_description = fh.read()
5 |
6 | setuptools.setup(
7 | name="gr-nlp-toolkit",
8 | version="0.2.0",
9 | author="nlpaueb",
10 | author_email="p3170148@aueb.gr, p3170039@aueb.gr, spirosbarbakos7@gmail.com, eleftheriosloukas@aueb.gr, ipavlopoulos@aueb.gr",
11 | description="The state-of-the-art NLP toolkit for (modern) Greek",
12 | long_description=long_description,
13 | long_description_content_type="text/markdown",
14 | url="https://github.com/nlpaueb/gr-nlp-toolkit",
15 | project_urls={
16 | # "Bug Tracker": "https://github.com/pypa/sampleproject/issues",
17 | },
18 | classifiers=[
19 | "Programming Language :: Python :: 3",
20 | "License :: OSI Approved :: Apache Software License",
21 | "Operating System :: OS Independent",
22 | "Topic :: Text Processing :: Linguistic",
23 | "Natural Language :: Greek",
24 | ],
25 | packages=setuptools.find_packages(where=".", exclude=["tests", "tests.*"]),
26 | python_requires=">=3.9",
27 | install_requires=[
28 | "torch>=2.1.2",
29 | "transformers>=4.11.1",
30 | "huggingface_hub>=0.23.5",
31 | ],
32 | )
33 |
--------------------------------------------------------------------------------
/tests/test_processors/test_g2g.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from gr_nlp_toolkit.processors.g2g import G2G
4 | from gr_nlp_toolkit.processors.g2g import detect_language
5 | from gr_nlp_toolkit.domain.document import Document
6 |
7 | class MyTestCase(unittest.TestCase):
8 |
9 | def test_g2g_lstm(self):
10 |
11 | g2g = G2G(mode="LSTM",model_path="gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt", tokenizer_path="gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl")
12 | self.assertIsNotNone(g2g.model)
13 | self.assertIsNotNone(g2g.text_vectorizer)
14 | self.assertIsNotNone(g2g.LM)
15 |
16 |
17 | doc = Document("o volos kai h larisa einai poleis ths thessalias")
18 | doc = g2g(doc)
19 | self.assertEqual(detect_language(doc.text), 'greek')
20 | self.assertEqual(doc.text.split()[5], "είναι")
21 |
22 |
23 | def test_g2g_transformer(self):
24 |
25 | g2g = G2G(mode="transformer", model_path="AUEB-NLP/ByT5_g2g")
26 | self.assertIsNotNone(g2g.model)
27 |
28 | doc = Document('"o volos kai h larisa einai poleis ths thessalias"')
29 | doc = g2g(doc)
30 | self.assertEqual(detect_language(doc.text), 'greek')
31 | self.assertEqual(doc.text.split()[5], "είναι")
32 |
33 |
34 | if __name__ == '__main__':
35 | unittest.main()
36 |
--------------------------------------------------------------------------------
/tests/guide_for_testing.md:
--------------------------------------------------------------------------------
1 | In Visual Studio Code, make sure:
2 | - you have created a virtual environment (venv) with a supported Python version
3 | - you have installed the dependencies into the venv: `pip install -r requirements.txt`
4 | - you have configured this interpreter (`ctrl + shift + p` -> `Python: Select Interpreter`)
5 | - **Important**: install the toolkit as a package in *editable* mode via `pip install -e .` (run this command from the root directory of the project!)
6 | - configure the tests accordingly (`ctrl + shift + p`): select `unittest` as your test framework, `tests` as the directory containing the tests, and `test_*` as the file pattern to be matched for test files. This should create a `.vscode/settings.json` file like this for you:
7 | ```json
8 | {
9 | "python.testing.unittestArgs": [
10 | "-v",
11 | "-s",
12 | "tests",
13 | "-p",
14 | "test_*.py"
15 | ],
16 | "python.testing.unittestEnabled": true,
17 | }
18 | ```
19 | - Now, go to the `Testing` tab in the left sidebar of Visual Studio Code and click the `Refresh Tests` button to discover the tests.
20 |
21 |
22 | Bonus (for Windows):
23 |
24 | If you use Windows, it is helpful to enable Developer Mode. This speeds up the caching mechanism used under the hood by the Hugging Face Hub. (https://learn.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development)
--------------------------------------------------------------------------------
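
Outside Visual Studio Code, the same suite can be discovered with the standard library, e.g. `python -m unittest discover -v -s tests -p "test_*.py"` from the project root, or programmatically (a minimal equivalent of the settings above):

```python
# Programmatic equivalent of the VS Code unittest configuration above
# (run from the project root, with the toolkit installed in editable mode).
import unittest

suite = unittest.defaultTestLoader.discover(start_dir="tests", pattern="test_*.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```
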
/tests/test_processors/test_ner.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from transformers import AutoModel
4 |
5 | from gr_nlp_toolkit.domain.document import Document
6 | from gr_nlp_toolkit.processors.ner import NER
7 | from gr_nlp_toolkit.processors.tokenizer import Tokenizer
8 |
9 | from gr_nlp_toolkit.configs.ner_labels import ner_labels
10 |
11 |
12 | class MyTestCase(unittest.TestCase):
13 |
14 | def test_ner_with_one_example(self):
15 | tokenizer = Tokenizer()
16 | doc = tokenizer(Document('Ο ποιητής'))
17 |
18 | ner = NER(entities=18)
19 |
20 | self.assertEqual(69, ner.output_size)
21 | self.assertIsNotNone(ner._model)
22 | doc = ner(doc)
23 |
24 | tokens = doc.tokens
25 | for token in tokens:
26 | self.assertIsNotNone(token.ner)
27 | self.assertTrue(token.ner in ner_labels)
28 |
29 | def test_ner_with_one_example_with_subwords(self):
30 | tokenizer = Tokenizer()
31 | doc = tokenizer(Document('ενα ποιηματακι'))
32 |
33 | ner = NER()
34 | self.assertIsNotNone(ner._model)
35 | doc = ner(doc)
36 |
37 | tokens = doc.tokens
38 | for token in tokens:
39 | self.assertIsNotNone(token.ner)
40 | self.assertTrue(token.ner in ner_labels)
41 |
42 | def test_ner_with_value_exception(self):
43 | with self.assertRaises(ValueError):
44 | NER(entities=2)
45 |
46 |
47 | if __name__ == '__main__':
48 | unittest.main()
49 |
--------------------------------------------------------------------------------
/tests/test_processors/test_dp.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from transformers import AutoModel
4 |
5 | from gr_nlp_toolkit.domain.document import Document
6 | from gr_nlp_toolkit.processors.dp import DP
7 | from gr_nlp_toolkit.processors.tokenizer import Tokenizer
8 |
9 | from gr_nlp_toolkit.configs.dp_labels import dp_labels
10 |
11 |
12 | class MyTestCase(unittest.TestCase):
13 |
14 | def test_dp_with_one_example(self):
15 | tokenizer = Tokenizer()
16 | doc = tokenizer(Document('Ο ποιητής'))
17 |
18 | dp = DP()
19 | self.assertIsNotNone(dp._model)
20 |
21 | doc = dp(doc)
22 |
23 | tokens = doc.tokens
24 | for token in tokens:
25 | self.assertIsNotNone(token.head)
26 | self.assertIsNotNone(token.deprel)
27 | self.assertTrue(token.head in range(0, len(tokens)))
28 | self.assertTrue(token.deprel in dp_labels)
29 |
30 | def test_dp_with_one_example_with_subwords(self):
31 | tokenizer = Tokenizer()
32 | doc = tokenizer(Document('ενα ποιηματακι'))
33 |
34 | # bert model init
35 | dp = DP()
36 |
37 | self.assertIsNotNone(dp._model)
38 | doc = dp(doc)
39 |
40 | tokens = doc.tokens
41 | for token in tokens:
42 | self.assertIsNotNone(token.head)
43 | self.assertIsNotNone(token.deprel)
44 | self.assertTrue(token.head in range(0, len(tokens)))
45 | self.assertTrue(token.deprel in dp_labels)
46 |
47 |
48 | if __name__ == '__main__':
49 | unittest.main()
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/processor_cache.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from os.path import expanduser
4 |
5 | from gr_nlp_toolkit.data.downloader import Downloader
6 |
7 |
8 | class ProcessorCache:
9 | def __init__(self, downloader : Downloader, cache_path : str):
10 | """
11 | Initializes the cache of processors creating necessary directories
12 | :param downloader: an object with the Downloader interface
13 | """
14 | # Get home directory
15 | self.home = expanduser("~")
16 | self.sep = os.sep
17 | self.cache_path = cache_path
18 | self.downloader = downloader
19 | # Initialize the filenames for each processor
20 | self.processor_names_to_filenames = {
21 | 'ner': 'ner_processor',
22 | 'pos': 'pos_processor',
23 | 'dp': 'dp_processor'
24 | }
25 | self.update_cache_path()
26 |
27 | def update_cache_path(self):
28 | Path(self.cache_path).mkdir(parents=True, exist_ok=True)
29 |
30 | def get_processor_path(self, processor_name: str) -> str:
31 | # Update cache path in case any changes occured
32 | self.update_cache_path()
33 | target_filename = self.processor_names_to_filenames[processor_name]
34 | if not os.path.exists(self.cache_path + self.sep + target_filename):
35 | self.downloader.download_processor(processor_name, self.cache_path + self.sep + target_filename)
36 | # Return the path
37 | return self.cache_path + self.sep + target_filename
38 |
--------------------------------------------------------------------------------
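
ProcessorCache only invokes the downloader when the requested file is missing from cache_path; subsequent calls return the cached path. A usage sketch with the stub downloader used by the tests (the stub simply creates an empty file, so no network access is needed):

```python
# Sketch: resolve a processor path through the cache using the test stub.
import os

from gr_nlp_toolkit.data.downloader_stub import DownloaderStub
from gr_nlp_toolkit.data.processor_cache import ProcessorCache

cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "gr_nlp_toolkit_demo")
cache = ProcessorCache(DownloaderStub(), cache_dir)

ner_path = cache.get_processor_path("ner")   # first call "downloads" ner_processor
ner_path = cache.get_processor_path("ner")   # second call is a cache hit
print(ner_path)                              # .../gr_nlp_toolkit_demo/ner_processor
```
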
/gr_nlp_toolkit/models/g2g_transformer_model.py:
--------------------------------------------------------------------------------
1 | from transformers import T5ForConditionalGeneration, AutoTokenizer
2 | from torch import nn
3 | import torch
4 |
5 |
6 |
7 | class ByT5Model(nn.Module):
8 | """
9 | A wrapper class for the T5 model for conditional generation
10 |
11 | Attributes:
12 | model (T5ForConditionalGeneration): The pre-trained ByT5 model
13 | tokenizer (AutoTokenizer): The tokenizer for the ByT5 model
14 | """
15 |
16 | def __init__(self, model_path = None, device = 'cpu'):
17 | """
18 | Initializes the ByT5Model with a pretrained T5 model and tokenizer.
19 |
20 | Args:
21 | model_path: The path to the pretrained model and tokenizer.
22 | """
23 | super(ByT5Model, self).__init__()
24 |
25 | self.model = T5ForConditionalGeneration.from_pretrained(model_path)
26 | self.tokenizer = AutoTokenizer.from_pretrained(model_path)
27 | self.device = torch.device(device)
28 | self.model.to(self.device)
29 |
30 | def forward(self, text):
31 | """
32 | Performs inference to the ByT5 to generate the transliterated text
33 |
34 | Args:
35 | text: the input text in greeklish
36 |
37 | Returns:
38 | The output text in greek
39 | """
40 | self.model.eval()
41 | tokenized_text = self.tokenizer(text, return_tensors="pt").input_ids
42 |
43 | output = self.model.generate(tokenized_text.to(self.device), max_length=10000)
44 |
45 | return self.tokenizer.decode(output[0], skip_special_tokens=True)
46 |
--------------------------------------------------------------------------------
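
A minimal inference sketch for the wrapper above, assuming the AUEB-NLP/ByT5_g2g checkpoint referenced in tests/test_processors/test_g2g.py can be fetched from the Hugging Face Hub:

```python
# Sketch: transliterate one greeklish sentence with the ByT5 wrapper
# (downloads the AUEB-NLP/ByT5_g2g checkpoint on first use).
from gr_nlp_toolkit.models.g2g_transformer_model import ByT5Model

model = ByT5Model(model_path="AUEB-NLP/ByT5_g2g", device="cpu")
print(model("o volos kai h larisa einai poleis ths thessalias"))
# expected: the sentence rendered in the Greek script
```
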
/gr_nlp_toolkit/models/ner_model.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from gr_nlp_toolkit.models.util import create_mask_from_length
4 |
5 |
6 | class NERBERTModel(nn.Module):
7 | """
8 | Named Entity Recognition (NER) model class based on BERT.
9 |
10 | This class defines a NER model using a pre-trained BERT model
11 | with a dropout and a linear layer on top.
12 |
13 | Attributes:
14 | _bert_model (AutoModel): The pre-trained BERT model.
15 | _dp (nn.Dropout): Dropout layer for regularization.
16 | _output_linear (nn.Linear): Linear layer to produce model outputs.
17 | """
18 |
19 | def __init__(self, bert_model, model_output_size, dp):
20 | """
21 | Initializes the NERBERTModel with the specified parameters.
22 |
23 | Args:
24 | bert_model (AutoModel): The pre-trained BERT model.
25 | model_output_size (int): The size of the output layer.
26 | dp (float): Dropout probability.
27 | """
28 |
29 | super(NERBERTModel, self).__init__()
30 | self._bert_model = bert_model
31 | self._dp = nn.Dropout(dp)
32 | self._output_linear = nn.Linear(768, model_output_size)
33 |
34 | def forward(self, text, text_len):
35 | """
36 | Performs a forward pass of the model.
37 |
38 | Args:
39 | text (torch.Tensor): Input tensor containing token IDs.
40 | text_len (torch.Tensor): Tensor containing the lengths of each sequence in the batch.
41 |
42 | Returns:
43 | torch.Tensor: The output of the linear layer after applying dropout and BERT.
44 | """
45 |
46 | # Create attention mask
47 | attention_mask = create_mask_from_length(text_len, text.shape[1])
48 |
49 | return self._output_linear(
50 | self._dp(self._bert_model(text, attention_mask=attention_mask)[0])
51 | )
52 |
--------------------------------------------------------------------------------
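
The forward pass maps a [batch, seq] tensor of token ids to [batch, seq, model_output_size] label scores. A shape-only sketch with a dummy encoder standing in for the Greek BERT model (an illustration, so the example stays lightweight and offline):

```python
# Shape-only sketch: a dummy encoder replaces the real Greek BERT; like BERT,
# it returns a tuple whose first element is a [batch, seq, 768] tensor.
import torch
from torch import nn

from gr_nlp_toolkit.models.ner_model import NERBERTModel


class DummyEncoder(nn.Module):
    def forward(self, text, attention_mask=None):
        return (torch.zeros(text.shape[0], text.shape[1], 768),)


model = NERBERTModel(DummyEncoder(), model_output_size=69, dp=0.0)
scores = model(torch.zeros(1, 4, dtype=torch.long), torch.tensor([4]))
print(scores.shape)                        # torch.Size([1, 4, 69])
```
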
/gr_nlp_toolkit/configs/ner_labels.py:
--------------------------------------------------------------------------------
1 | # The labels for the output of the ner model.
2 | # A string label can be obtained by an output index
3 | ner_labels = ['O',
4 | 'S-GPE',
5 | 'S-ORG',
6 | 'S-CARDINAL',
7 | 'B-ORG',
8 | 'E-ORG',
9 | 'B-DATE',
10 | 'E-DATE',
11 | 'S-NORP',
12 | 'B-GPE',
13 | 'E-GPE',
14 | 'S-EVENT',
15 | 'S-DATE',
16 | 'S-PRODUCT',
17 | 'S-LOC',
18 | 'I-ORG',
19 | 'S-PERSON',
20 | 'S-ORDINAL',
21 | 'B-PERSON',
22 | 'I-PERSON',
23 | 'E-PERSON',
24 | 'B-LAW',
25 | 'I-LAW',
26 | 'E-LAW',
27 | 'B-MONEY',
28 | 'I-MONEY',
29 | 'E-MONEY',
30 | 'B-EVENT',
31 | 'I-EVENT',
32 | 'E-EVENT',
33 | 'B-FAC',
34 | 'E-FAC',
35 | 'I-DATE',
36 | 'S-PERCENT',
37 | 'B-QUANTITY',
38 | 'E-QUANTITY',
39 | 'B-WORK_OF_ART',
40 | 'I-WORK_OF_ART',
41 | 'E-WORK_OF_ART',
42 | 'I-FAC',
43 | 'S-LAW',
44 | 'S-TIME',
45 | 'B-LOC',
46 | 'E-LOC',
47 | 'I-LOC',
48 | 'S-FAC',
49 | 'B-TIME',
50 | 'E-TIME',
51 | 'S-WORK_OF_ART',
52 | 'B-PRODUCT',
53 | 'E-PRODUCT',
54 | 'B-CARDINAL',
55 | 'E-CARDINAL',
56 | 'S-MONEY',
57 | 'S-LANGUAGE',
58 | 'I-TIME',
59 | 'I-PRODUCT',
60 | 'I-GPE',
61 | 'I-QUANTITY',
62 | 'B-NORP',
63 | 'E-NORP',
64 | 'S-QUANTITY',
65 | 'B-PERCENT',
66 | 'I-PERCENT',
67 | 'E-PERCENT',
68 | 'I-CARDINAL',
69 | 'B-ORDINAL',
70 | 'I-ORDINAL',
71 | 'E-ORDINAL']
72 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/pos_model.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from gr_nlp_toolkit.models.util import create_mask_from_length
3 |
4 | class POSModel(nn.Module):
5 | """
6 | Part-Of-Speech (POS) tagging model class based on BERT.
7 |
8 | This class defines a POS model using a pre-trained BERT model
9 | with a dropout and multiple linear layers on top.
10 |
11 | Attributes:
12 | _bert_model (AutoModel): The pre-trained BERT model.
13 | _dp (nn.Dropout): Dropout layer for regularization.
14 | _linear_dict (nn.ModuleDict): Dictionary of linear layers for different features.
15 | """
16 |
17 | def __init__(self, bert_model, feat_to_size, dp):
18 | """
19 | Initializes the POSModel with the specified parameters.
20 |
21 | Args:
22 | bert_model (AutoModel): The pre-trained BERT model.
23 | feat_to_size (dict): A dictionary mapping feature names to the size of their output layers.
24 | dp (float): Dropout probability.
25 | """
26 |
27 | super(POSModel, self).__init__()
28 | self._bert_model = bert_model
29 | self._dp = nn.Dropout(dp)
30 |
31 | self._linear_dict = nn.ModuleDict({feat: nn.Linear(768, feat_to_size[feat]) for feat in feat_to_size})
32 |
33 | def forward(self, text, text_len):
34 | """
35 | Performs a forward pass of the model.
36 |
37 | Args:
38 | text (torch.Tensor): Input tensor containing token IDs.
39 | text_len (torch.Tensor): Tensor containing the lengths of each sequence in the batch.
40 |
41 | Returns:
42 | dict: A dictionary containing the output tensors for each feature.
43 | """
44 |
45 | attention_mask = create_mask_from_length(text_len, text.shape[1])
46 | bert_output = self._dp(self._bert_model(text, attention_mask=attention_mask)[0])
47 |
48 | output_dict = {feat: self._linear_dict[feat](bert_output) for feat in self._linear_dict}
49 | return output_dict
50 |
51 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/configs/pos_labels.py:
--------------------------------------------------------------------------------
1 | # A dict mapping each UPOS tag to the list of morphological features that apply to it
2 | # (an empty list if the tag takes no features)
3 | pos_properties = {'ADJ': ['Degree', 'Number', 'Gender', 'Case'],
4 | 'ADP': ['Number', 'Gender', 'Case'],
5 | 'ADV': ['Degree', 'Abbr'],
6 | 'AUX': ['Mood',
7 | 'Aspect',
8 | 'Tense',
9 | 'Number',
10 | 'Person',
11 | 'VerbForm',
12 | 'Voice'],
13 | 'CCONJ': [],
14 | 'DET': ['Number', 'Gender', 'PronType', 'Definite', 'Case'],
15 | 'NOUN': ['Number', 'Gender', 'Abbr', 'Case'],
16 | 'NUM': ['NumType', 'Number', 'Gender', 'Case'],
17 | 'PART': [],
18 | 'PRON': ['Number', 'Gender', 'Person', 'Poss', 'PronType', 'Case'],
19 | 'PROPN': ['Number', 'Gender', 'Case'],
20 | 'PUNCT': [],
21 | 'SCONJ': [],
22 | 'SYM': [],
23 | 'VERB': ['Mood',
24 | 'Aspect',
25 | 'Tense',
26 | 'Number',
27 | 'Gender',
28 | 'Person',
29 | 'VerbForm',
30 | 'Voice',
31 | 'Case'],
32 | 'X': ['Foreign'],
33 | '_': []}
34 |
35 | # The labels for each morphological feature of the pos model (the key 'upos' holds the UPOS tags).
36 | # A string label can be obtained by an output index
37 | pos_labels = {'Abbr': ['_', 'Yes'],
38 | 'Aspect': ['Perf', '_', 'Imp'],
39 | 'Case': ['Dat', '_', 'Acc', 'Gen', 'Nom', 'Voc'],
40 | 'Definite': ['Ind', 'Def', '_'],
41 | 'Degree': ['Cmp', 'Sup', '_'],
42 | 'Foreign': ['_', 'Yes'],
43 | 'Gender': ['Fem', 'Masc', '_', 'Neut'],
44 | 'Mood': ['Ind', '_', 'Imp'],
45 | 'NumType': ['Mult', 'Card', '_', 'Ord', 'Sets'],
46 | 'Number': ['Plur', '_', 'Sing'],
47 | 'Person': ['3', '1', '_', '2'],
48 | 'Poss': ['_', 'Yes'],
49 | 'PronType': ['Ind', 'Art', '_', 'Rel', 'Dem', 'Prs', 'Ind,Rel', 'Int'],
50 | 'Tense': ['Pres', 'Past', '_'],
51 | 'VerbForm': ['Part', 'Conv', '_', 'Inf', 'Fin'],
52 | 'Voice': ['Pass', 'Act', '_'],
53 | 'upos': ['X',
54 | 'PROPN',
55 | 'PRON',
56 | 'ADJ',
57 | 'AUX',
58 | 'PART',
59 | 'ADV',
60 | '_',
61 | 'DET',
62 | 'SYM',
63 | 'NUM',
64 | 'CCONJ',
65 | 'PUNCT',
66 | 'NOUN',
67 | 'SCONJ',
68 | 'ADP',
69 | 'VERB']}
70 |
--------------------------------------------------------------------------------
/tests/test_data/test_processor_cache.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import shutil
4 |
5 | from gr_nlp_toolkit.data.downloader_stub import DownloaderStub
6 | from gr_nlp_toolkit.data.processor_cache import ProcessorCache
7 |
8 |
9 | class TestProcessorCache(unittest.TestCase):
10 | def test_download_processors_sequentially(self):
11 |
12 | sep = os.sep
13 | cache_path = "./test"
14 | stub = DownloaderStub()
15 | processor_cache = ProcessorCache(stub , cache_path)
16 | processor_cache.get_processor_path('ner')
17 | self.assertTrue(os.path.exists(cache_path + sep + "ner_processor"))
18 | processor_cache.get_processor_path('pos')
19 | self.assertTrue(os.path.exists(cache_path + sep + "pos_processor"))
20 | dp_path = processor_cache.get_processor_path('dp')
21 | self.assertTrue(type(dp_path) == str)
22 | self.assertTrue(os.path.exists(cache_path + sep + "dp_processor"))
23 | self.assertTrue(dp_path == (cache_path + sep + "dp_processor"))
24 | # Remove any files created
25 | shutil.rmtree(cache_path)
26 |
27 | def test_download_processor_removing_file_and_folder(self):
28 |
29 | home = os.path.expanduser("~")
30 | sep = os.sep
31 | cache_path = "./test"
32 | stub = DownloaderStub()
33 | processor_cache = ProcessorCache(stub, cache_path)
34 | processor_cache.get_processor_path('ner')
35 | self.assertTrue(os.path.exists(cache_path + sep + "ner_processor"))
36 | os.remove(cache_path + sep + "ner_processor")
37 | # Assert that the file is removed
38 | self.assertTrue(not os.path.exists(cache_path + sep + "ner_processor"))
39 | processor_cache.get_processor_path('ner')
40 | # Assert that the file has appeared again
41 | self.assertTrue(os.path.exists(cache_path + sep + "ner_processor"))
42 | processor_cache.get_processor_path('pos')
43 | # Remove entire directory
44 | shutil.rmtree(cache_path)
45 | processor_cache.get_processor_path('pos')
46 | # Assert that the certain processor has appeared again
47 | self.assertTrue(os.path.exists(cache_path + sep + "pos_processor"))
48 | # Remove any files created
49 | shutil.rmtree(cache_path)
50 |
51 | if __name__ == '__main__':
52 | unittest.main()
53 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/document.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | class Document:
4 | """
5 | Document class that represents an annotated text
6 | """
7 |
8 | def __init__(self, text: str):
9 | """
10 | Create a Document object, setting all fields other than the text to None
11 |
12 | Keyword arguments:
13 | param text: The text of the document
14 | """
15 | self._text = text
16 |
17 | self._input_ids = None
18 | self._token_mask = None
19 |
20 | self._tokens = None
21 |
22 | self._dataloader = None
23 |
24 | self._subword2word = None
25 |
26 |
27 | @property
28 | def text(self):
29 | """
30 | Return the original text of the document
31 | """
32 | return self._text
33 |
34 | @text.setter
35 | def text(self, value):
36 | self._text = value
37 |
38 |
39 | @property
40 | def tokens(self):
41 | """
42 | A list of Tokens containing the tokens of the text as well as token level annotations
43 | """
44 | return self._tokens
45 |
46 | @tokens.setter
47 | def tokens(self, value):
48 | self._tokens = value
49 |
50 |
51 | @property
52 | def input_ids(self):
53 | """
54 | A tensor of shape [1,mseq] containing the input ids created with the BERT tokenizer
55 | """
56 | return self._input_ids
57 |
58 | @input_ids.setter
59 | def input_ids(self, value):
60 | self._input_ids = value
61 |
62 |
63 | @property
64 | def token_mask(self):
65 | """
66 | A tensor of shape [1,mseq] containing zeros at the positions of the input_ids tensor that correspond to non-first subword tokens
67 | """
68 | return self._token_mask
69 |
70 | @token_mask.setter
71 | def token_mask(self, value):
72 | self._token_mask = value
73 |
74 |
75 | @property
76 | def dataloader(self):
77 | return self._dataloader
78 |
79 |
80 | @dataloader.setter
81 | def dataloader(self, value):
82 | self._dataloader = value
83 |
84 | @property
85 | def subword2word(self):
86 | """
87 | A mapping for each subword to the word
88 | """
89 | return self._subword2word
90 |
91 | @subword2word.setter
92 | def subword2word(self, value):
93 | self._subword2word = value
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Pycharm
132 | .idea
133 |
134 | # Temporary files
135 | gr_nlp_toolkit/tmp/
136 | .vscode/
137 | dist/
138 | .pytest_cache/
139 | test_toolkit_venv/
140 |
141 | #pipenv
142 | Pipfile
143 | Pipfile.lock
144 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/token.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 |
4 | class Token:
5 | """
6 | Token class which represents a word/token
7 | """
8 |
9 | def __init__(self, subwords: List[str]):
10 | """
11 | Create a Token object, setting all annotation fields other than the subwords to None
12 |
13 | Keyword arguments:
14 | subwords: A list with the token's subwords
15 | """
16 |
17 | # the text
18 | self._text = ""
19 |
20 | # the subwords
21 | self._subwords = subwords
22 |
23 | # the ids
24 | self._ids = []
25 |
26 | # Named Entity Recognition parameters
27 | # the named entity
28 | self._ner = None
29 |
30 | # Part of Speech Tagging parameters
31 | # the universal pos tag
32 | self._upos = None
33 | # the universal morphological features
34 | self._feats = {}
35 |
36 | # Dependency Parsing parameters
37 | # the index of the head word in the sentence
38 | self._head = None
39 | # the label of the dependency relation between this word and its head
40 | self._deprel = None
41 |
42 | @property
43 | def text(self):
44 | """
45 | The text
46 | """
47 | return self._text
48 |
49 | @text.setter
50 | def text(self, value):
51 | self._text = value
52 |
53 | @property
54 | def subwords(self):
55 | """
56 | A list with Token's subwords
57 | """
58 | return self._subwords
59 |
60 | @subwords.setter
61 | def subwords(self, value):
62 | self._subwords = value
63 |
64 | @property
65 | def ids(self):
66 | return self._ids
67 |
68 | @ids.setter
69 | def ids(self, value):
70 | self._ids = value
71 |
72 | @property
73 | def ner(self):
74 | """
75 | The Named Entity
76 | """
77 | return self._ner
78 |
79 | @ner.setter
80 | def ner(self, value):
81 | self._ner = value
82 |
83 | @property
84 | def upos(self):
85 | """
86 | The universal pos tag
87 | """
88 | return self._upos
89 |
90 | @upos.setter
91 | def upos(self, value):
92 | self._upos = value
93 |
94 | @property
95 | def feats(self):
96 | """
97 | The universal morphological features
98 | """
99 | return self._feats
100 |
101 | @feats.setter
102 | def feats(self, value):
103 | self._feats = value
104 |
105 | @property
106 | def head(self):
107 | """
108 | The index of the head word in the sentence
109 | """
110 | return self._head
111 |
112 | @head.setter
113 | def head(self, value):
114 | self._head = value
115 |
116 | @property
117 | def deprel(self):
118 | """
119 | The label of the dependency relation between this word and its head
120 | """
121 | return self._deprel
122 |
123 | @deprel.setter
124 | def deprel(self, value):
125 | self._deprel = value
126 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/ner.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from transformers import AutoModel
5 |
6 | from gr_nlp_toolkit.configs.ner_labels import ner_labels
7 | from gr_nlp_toolkit.domain.document import Document
8 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor
9 |
10 | from gr_nlp_toolkit.models.ner_model import NERBERTModel
11 |
12 |
13 | model_params = {'dp': 0}
14 |
15 |
16 | class NER(AbstractProcessor):
17 | """
18 | Named Entity Recognition (NER) processor class.
19 |
20 | This class performs NER using a pre-trained BERT model. It initializes the model,
21 | loads the necessary components, and provides functionality to process documents
22 | and perform NER on them.
23 |
24 | Attributes:
25 | I2L (list): A list of label names for the NER task.
26 | output_size (int): The number of output labels.
27 | _model (NERBERTModel): The NER model based on BERT.
28 | softmax (nn.Softmax): Softmax function for output normalization.
29 | device (torch.device): Device on which the model is loaded.
30 | """
31 |
32 | def __init__(self, model_path=None, device='cpu', entities=18,):
33 | """
34 | Initializes the NER class with the specified parameters.
35 |
36 | Args:
37 | model_path (str, optional): Path to the pre-trained model. Defaults to None.
38 | device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
39 | entities (int, optional): Number of entity labels. Should be set to 18. Defaults to 18.
40 |
41 | Raises:
42 | ValueError: If the number of entities is not 18.
43 | """
44 |
45 | # Entities are the semantic categories of the NER task (more info: http://nlp.cs.aueb.gr/theses/smyrnioudis_bsc_thesis.pdf)
46 | if entities == 18:
47 | self.I2L = ner_labels
48 | self.output_size = len(self.I2L)
49 | else:
50 | raise ValueError('Entities should be set to 18')
51 |
52 | # Initialize the BERT model
53 | bert_model = AutoModel.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
54 | self._model = NERBERTModel(bert_model, self.output_size, **model_params)
55 | self.softmax = nn.Softmax(dim=-1)
56 | self.device = torch.device(device)
57 | self._model.to(self.device)
58 | self._model.eval()
59 |
60 | # load the pretrained model if provided
61 | if model_path is not None:
62 | self._model.load_state_dict(torch.load(model_path, map_location=self.device, weights_only=True), strict=False)
63 |
64 |
65 |
66 | def __call__(self, doc: Document) -> Document:
67 | """
68 | Processes a document to perform Named Entity Recognition.
69 |
70 | Args:
71 | doc (Document): The document to process.
72 |
73 | Returns:
74 | Document: The document with NER tags assigned to the tokens.
75 | """
76 |
77 | # Get the input ids and text length of the document
78 | input_ids, text_len = next(iter(doc.dataloader))['input']
79 |
80 | # Perform NER with the model
81 | output = self._model(input_ids.to(self.device), text_len.to(self.device))
82 | predictions = self.softmax(output)
83 | predictions = torch.argmax(predictions[0], axis=-1).detach().cpu().numpy()
84 |
85 | # map predictions -> tokens, special tokens are not included
86 | i = 0
87 | for mask, pred in zip(doc.token_mask, predictions[1: len(predictions) - 1]):
88 | if mask:
89 | token = doc.tokens[i]
90 | token.ner = self.I2L[pred]
91 | i+=1
92 |
93 | return doc
--------------------------------------------------------------------------------
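
End to end, the processor expects a Document that has already been through the Tokenizer (which fills input_ids, dataloader, token_mask and tokens). A usage sketch mirroring tests/test_processors/test_ner.py; note that without a model_path the classification head is untrained, so the predicted labels are arbitrary members of ner_labels rather than meaningful predictions:

```python
# Sketch: tokenize first, then run the NER processor (mirrors the unit tests).
from gr_nlp_toolkit.domain.document import Document
from gr_nlp_toolkit.processors.ner import NER
from gr_nlp_toolkit.processors.tokenizer import Tokenizer

doc = Tokenizer()(Document("Ο ποιητής"))
doc = NER()(doc)          # pass model_path=... to load the trained weights

for token in doc.tokens:
    print(token.text, token.ner)
```
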
/gr_nlp_toolkit/processors/dp.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from transformers import AutoModel
5 |
6 | from gr_nlp_toolkit.configs.dp_labels import dp_labels
7 | from gr_nlp_toolkit.domain.document import Document
8 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor
9 |
10 | from gr_nlp_toolkit.models.dp_model import DPModel
11 |
12 |
13 | class DP(AbstractProcessor):
14 | """
15 | Dependency Parsing (DP) processor class.
16 |
17 | This class performs dependency parsing using a pre-trained BERT model. It initializes the model,
18 | loads the necessary components, and provides functionality to process documents
19 | and assign head and dependency relation (deprel) tags to tokens.
20 |
21 | Attributes:
22 | I2L (list): A list of dependency relation labels.
23 | output_size (int): The number of output labels.
24 | _model (DPModel): The dependency parsing model based on BERT.
25 | softmax (nn.Softmax): Softmax function for output normalization.
26 | device (torch.device): Device on which the model is loaded.
27 | """
28 |
29 | def __init__(self, model_path=None, device='cpu'):
30 | """
31 | Initializes the DP class with the specified parameters.
32 |
33 | Args:
34 | model_path (str, optional): Path to the pre-trained model. Defaults to None.
35 | device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
36 | """
37 |
38 | self.I2L = dp_labels
39 | self.output_size = len(self.I2L)
40 |
41 | # Initialize the BERT model
42 | bert_model = AutoModel.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
43 | self._model = DPModel(bert_model, self.I2L, 0)
44 |
45 | self.softmax = nn.Softmax(dim=-1)
46 | self.device = torch.device(device)
47 | self._model.to(self.device)
48 | self._model.eval()
49 |
50 | # Load the pretrained model if provided
51 | if model_path is not None:
52 | self._model.load_state_dict(torch.load(model_path, map_location=self.device, weights_only=True), strict=False)
53 |
54 | def __call__(self, doc: Document) -> Document:
55 | """
56 | Processes a document to perform dependency parsing.
57 |
58 | Args:
59 | doc (Document): The document to process.
60 |
61 | Returns:
62 | Document: The document with head and deprel tags assigned to the tokens.
63 | """
64 |
65 | # Predict heads
66 | input_ids, text_len = next(iter(doc.dataloader))['input']
67 |
68 | output_heads = 'heads'
69 |
70 | predictions_heads = self._model(input_ids.to(self.device), text_len.to(self.device))
71 | predictions_heads = self.softmax(predictions_heads[output_heads])
72 | predictions_heads = torch.argmax(predictions_heads[0], axis=-1).detach().cpu().numpy()
73 |
74 | # Predict dependency relations (deprels)
75 | output_deprels = 'gathered_deprels'
76 |
77 | predictions_deprels = self._model(input_ids.to(self.device), text_len.to(self.device))
78 | predictions_deprels = self.softmax(predictions_deprels[output_deprels])
79 | predictions_deprels = torch.argmax(predictions_deprels[0], axis=-1).detach().cpu().numpy()
80 |
81 | # map predictions -> tokens, special tokens are not included
82 | i = 0
83 | for mask, pred_head, pred_deprel in zip(doc.token_mask, predictions_heads[1: len(predictions_heads) - 1],
84 | predictions_deprels[1: len(predictions_deprels) - 1]):
85 | if mask:
86 | token = doc.tokens[i]
87 | token.head = doc.subword2word[pred_head]
88 | token.deprel = self.I2L[pred_deprel]
89 | i +=1
90 |
91 | return doc
92 |
--------------------------------------------------------------------------------
/tests/test_processors/test_pos.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from transformers import AutoModel
4 |
5 | from gr_nlp_toolkit.domain.document import Document
6 | from gr_nlp_toolkit.processors.pos import POS
7 | from gr_nlp_toolkit.processors.tokenizer import Tokenizer
8 | from gr_nlp_toolkit.configs.pos_labels import pos_labels, pos_properties
9 |
10 |
11 | class MyTestCase(unittest.TestCase):
12 |
13 | @classmethod
14 | def setUpClass(cls) -> None:
15 | cls.bert_model = AutoModel.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
16 |
17 |
18 | def test_pos_with_one_example(self):
19 | tokenizer = Tokenizer()
20 | doc = tokenizer(Document('Ο ποιητής'))
21 |
22 | pos = POS()
23 | self.assertIsNotNone(pos._model)
24 | # self.assertIsNotNone(pos.system)
25 | doc = pos(doc)
26 |
27 | tokens = doc.tokens
28 | for token in tokens:
29 | self.assertIsNotNone(token.upos)
30 | self.assertTrue(token.upos in pos_labels['upos'])
31 |
32 | self.assertIsNotNone(token.feats)
33 | self.assertEqual(len(list(token.feats.keys())), len(pos_properties[token.upos]))
34 |
35 | for feat, value in token.feats.items():
36 | self.assertTrue(feat in pos_properties[token.upos])
37 | self.assertTrue(value in pos_labels[feat])
38 |
39 | def test_pos_with_one_example_with_subwords(self):
40 | tokenizer = Tokenizer()
41 | doc = tokenizer(Document('ενα ποιηματακι'))
42 |
43 | pos = POS()
44 | self.assertIsNotNone(pos._model)
45 | # self.assertIsNotNone(pos.system)
46 | doc = pos(doc)
47 |
48 | tokens = doc.tokens
49 | for token in tokens:
50 | self.assertIsNotNone(token.upos)
51 | self.assertTrue(token.upos in pos_labels['upos'])
52 |
53 | self.assertIsNotNone(token.feats)
54 | self.assertEqual(len(list(token.feats.keys())), len(pos_properties[token.upos]))
55 |
56 | for feat, value in token.feats.items():
57 | self.assertTrue(feat in pos_properties[token.upos])
58 | self.assertTrue(value in pos_labels[feat])
59 |
60 |
61 | if __name__ == '__main__':
62 | unittest.main()
63 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/pos.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from transformers import AutoModel
5 |
6 | from gr_nlp_toolkit.domain.document import Document
7 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor
8 | from gr_nlp_toolkit.configs.pos_labels import pos_labels, pos_properties
9 |
10 |
11 | from gr_nlp_toolkit.models.pos_model import POSModel
12 |
13 |
14 | class POS(AbstractProcessor):
15 | """
16 | Part-Of-Speech (POS) processor class.
17 |
18 | This class performs POS tagging using a pre-trained BERT model. It initializes the model,
19 | loads the necessary components, and provides functionality to process documents
20 | and assign POS tags and features to tokens.
21 |
22 | Attributes:
23 | properties_POS (dict): Dictionary containing properties for POS tags.
24 | feat_to_I2L (dict): Dictionary mapping feature names to label lists.
25 | feat_to_size (dict): Dictionary mapping feature names to the size of their label lists.
26 | _model (POSModel): The POS model based on BERT.
27 | softmax (nn.Softmax): Softmax function for output normalization.
28 | device (torch.device): Device on which the model is loaded.
29 | """
30 |
31 | def __init__(self, model_path=None, device='cpu'):
32 | """
33 | Initializes the POS class with the specified parameters.
34 |
35 | Args:
36 | model_path (str, optional): Path to the pre-trained model. Defaults to None.
37 | device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
38 | """
39 |
40 | self.properties_POS = pos_properties
41 | self.feat_to_I2L = pos_labels
42 | self.feat_to_size = {k: len(v) for k, v in self.feat_to_I2L.items()}
43 |
44 | # model init
45 | bert_model = AutoModel.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
46 | self._model = POSModel(bert_model, self.feat_to_size, 0)
47 | self.softmax = nn.Softmax(dim=-1)
48 | self.device = torch.device(device)
49 | self._model.to(self.device)
50 | self._model.eval()
51 |
52 | # load the pretrained model
53 | if model_path is not None:
54 | self._model.load_state_dict(torch.load(model_path, map_location=self.device, weights_only=True), strict=False)
55 |
56 | def __call__(self, doc: Document) -> Document:
57 | """
58 | Processes a document to perform Part-Of-Speech tagging and assign features.
59 |
60 | Args:
61 | doc (Document): The document to process.
62 |
63 | Returns:
64 | Document: The document with POS tags and features assigned to the tokens.
65 | """
66 |
67 | predictions = {}
68 |
69 | input_ids, text_len = next(iter(doc.dataloader))['input']
70 |
71 | # Run the model once; its output dict holds one tensor of scores per feature
72 | output = self._model(input_ids.to(self.device), text_len.to(self.device))
73 |
74 | for feat in self.feat_to_I2L.keys():
75 | feat_output = self.softmax(output[feat])
76 | predictions[feat] = torch.argmax(feat_output[0], axis=-1).detach().cpu().numpy()
77 |
78 | # set upos
79 | upos_predictions = predictions['upos']
80 | i = 0
81 | for mask, pred in zip(doc.token_mask, upos_predictions[1: len(upos_predictions) - 1]):
82 | if mask:
83 | token = doc.tokens[i]
84 | token.upos = self.feat_to_I2L['upos'][pred]
85 | # Advance to the next word (not subtoken)
86 | i+=1
87 |
88 | # set features
89 | for feat in self.feat_to_I2L.keys():
90 | if feat != 'upos':
91 | current_predictions = predictions[feat]
92 | i = 0
93 | for mask, pred in zip(doc.token_mask, current_predictions[1: len(current_predictions) - 1]):
94 | if mask:
95 | token = doc.tokens[i]
96 | if feat in self.properties_POS[token.upos]:
97 | token.feats[feat] = self.feat_to_I2L[feat][pred]
98 | # Advance to the next word (not subtoken)
99 | i += 1
100 |
101 | return doc
102 |
--------------------------------------------------------------------------------
/tests/test_processors/test_tokenizer.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from gr_nlp_toolkit.processors.tokenizer import *
4 |
5 |
6 | class TestTokenizer(unittest.TestCase):
7 | def test_strip_accents_and_lowercase1(self):
8 | result = strip_accents_and_lowercase('ποιητής')
9 | self.assertEqual('ποιητης', result)
10 |
11 | def test_strip_accents_and_lowercase2(self):
12 | result = strip_accents_and_lowercase('ΠΟΙΗΤΗΣ')
13 | self.assertEqual('ποιητης', result)
14 |
15 | """"
16 | Tests with no sub-words:
17 | """
18 |
19 | def test_create_ids_without_subwords(self):
20 | ids = create_ids('ο ποιητης')
21 | # 2 special tokens + 2 given words
22 | self.assertEqual(4, len(ids))
23 |
24 | def test_create_tokens_without_subwords(self):
25 | ids = [101, 247, 6981, 102]
26 | tokens = convert_to_tokens(ids)
27 | # 2 words, special tokens are not included
28 | self.assertEqual(2, len(tokens))
29 | self.assertEqual('ο', tokens[0])
30 | self.assertEqual('ποιητης', tokens[1])
31 |
32 | def test_create_mask_and_tokens_without_subwords(self):
33 | tokens = ['ο', 'ποιητης']
34 | mask, tokens, subword2word = create_mask_and_tokens(tokens, [247, 6981])
35 |
36 | self.assertEqual(2, len(mask))
37 | self.assertEqual([True, True], mask)
38 | self.assertEqual(2, len(tokens))
39 | self.assertEqual(1, len(tokens[0].subwords))
40 | self.assertEqual(1, len(tokens[1].subwords))
41 | self.assertEqual(247, tokens[0]._ids[0])
42 | self.assertEqual(6981, tokens[1]._ids[0])
43 | self.assertEqual('ο', tokens[0].subwords[0])
44 | self.assertEqual('ποιητης', tokens[1].subwords[0])
45 | self.assertEqual('ο', tokens[0].text)
46 | self.assertEqual('ποιητης', tokens[1].text)
47 | self.assertEqual(len(subword2word.keys()), 3)
48 | self.assertEqual(subword2word[1], 1)
49 | self.assertEqual(subword2word[2], 2)
50 |
51 | """"
52 | Tests with sub-words:
53 | """
54 |
55 | def test_create_ids_with_subwords(self):
56 | ids = create_ids('ενα ποιηματακι')
57 | # 2 special tokens + 1 single-piece word + 1 word split into 2 sub-word pieces
58 | self.assertEqual(5, len(ids))
59 |
60 | def test_create_tokens_with_subwords(self):
61 | ids = [101, 370, 6623, 701, 102]
62 | tokens = convert_to_tokens(ids)
63 | # 3 sub-word tokens (2 words, one of them split into 2 pieces); special tokens are not included
64 | self.assertEqual(3, len(tokens))
65 | self.assertEqual('ενα', tokens[0])
66 | self.assertEqual('ποιηματα', tokens[1])
67 | self.assertEqual('##κι', tokens[2])
68 |
69 | def test_create_mask_and_tokens_with_subwords(self):
70 | tokens = ['ενα', 'ποιηματα', '##κι']
71 | mask, tokens, subword2word = create_mask_and_tokens(tokens, [370, 6623, 701])
72 |
73 | self.assertEqual(3, len(mask))
74 | self.assertEqual([True, True, False], mask)
75 | self.assertEqual(2, len(tokens))
76 | self.assertEqual(1, len(tokens[0].subwords))
77 | self.assertEqual(2, len(tokens[1].subwords))
78 | self.assertEqual(370, tokens[0]._ids[0])
79 | self.assertEqual(6623, tokens[1]._ids[0])
80 | self.assertEqual(701, tokens[1]._ids[1])
81 | self.assertEqual('ενα', tokens[0].subwords[0])
82 | self.assertEqual('ποιηματα', tokens[1].subwords[0])
83 | self.assertEqual('##κι', tokens[1].subwords[1])
84 | self.assertEqual('ενα', tokens[0].text)
85 | self.assertEqual('ποιηματακι', tokens[1].text)
86 | self.assertEqual(len(subword2word.keys()), 4)
87 | self.assertEqual(subword2word[1], 1)
88 | self.assertEqual(subword2word[2], 2)
89 | self.assertEqual(subword2word[3], 2)
90 |
91 | def test_tokenizer(self):
92 | tokenizer = Tokenizer()
93 | doc = tokenizer(Document('Ο ποιητής'))
94 | # document has all field set
95 | self.assertIsNotNone(doc.text)
96 | self.assertIsNotNone(doc.input_ids)
97 | self.assertIsNotNone(doc.token_mask)
98 | self.assertIsNotNone(doc.tokens)
99 | self.assertIsNotNone(doc.subword2word)
100 |
101 | def test_create_dataset_and_dataloader(self):
102 | input_ids = [101, 370, 6623, 701, 102]
103 | dataset, dataloader = create_dataset_and_dataloader(input_ids)
104 | self.assertIsNotNone(dataset.input_ids)
105 | self.assertIsNotNone(dataloader.dataset)
106 | self.assertEqual(dataset, dataloader.dataset)
107 | self.assertEqual(dataset.input_ids, [input_ids])
108 |
109 |
110 | if __name__ == '__main__':
111 | unittest.main()
112 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/g2g.py:
--------------------------------------------------------------------------------
1 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor
2 | from gr_nlp_toolkit.domain.document import Document
3 | from gr_nlp_toolkit.models import g2g_RBNLM_model
4 | from gr_nlp_toolkit.domain.textVectorizer import TextVectorizer
5 | from gr_nlp_toolkit.models.g2g_RBNLM_model import LanguageModel
6 | from gr_nlp_toolkit.models.g2g_transformer_model import ByT5Model
7 | import torch
8 | import pickle
9 |
10 | def detect_language(text):
11 | """
12 |     Checks whether the majority of the letters in the input text are in the Greek or the Latin script.
13 |     It is used to identify whether the text is in Greek or Greeklish (Latin script), in order to skip unnecessary conversions.
14 |
15 | Args:
16 | text (str): The input text
17 |
18 | Returns:
19 | script (str): The dominant script
20 | """
21 | # Filter out non-letter characters
22 | valid_characters = [char for char in text if char.isalpha()]
23 |
24 | # Count Greek and English letters
25 | greek_count = sum(1 for char in valid_characters if '\u0370' <= char <= '\u03FF' or '\u1F00' <= char <= '\u1FFF')
26 | english_count = sum(1 for char in valid_characters if '\u0041' <= char <= '\u005A' or '\u0061' <= char <= '\u007A')
27 |
28 | script = "greek" if greek_count >= english_count else "latin"
29 | return script
30 |
31 |
32 | class G2G(AbstractProcessor):
33 | """
34 |     Greeklish to Greek (G2G) processor class.
35 |
36 | This class performs G2G conversion using either an LSTM-based model or a transformer-based model.
37 | It initializes the model, loads the necessary components, and provides functionality to process documents
38 | and convert text using the specified mode.
39 | """
40 |
41 | def __init__(self, mode = 'LSTM', model_path = None, tokenizer_path = None, device = 'cpu'):
42 |
43 | """
44 | Initializes the G2G class with the specified parameters.
45 |
46 | Args:
47 | mode (str, optional): The mode of the model, either 'LSTM' or 'transformer'. Defaults to 'LSTM'.
48 | model_path (str, optional): Path to the pre-trained model. Defaults to None.
49 | tokenizer_path (str, optional): Path to the tokenizer for LSTM mode. Defaults to None.
50 | device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
51 | """
52 |
53 |
54 | self.mode = mode
55 | self.device = torch.device(device)
56 |
57 | if self.mode == 'LSTM':
58 | # Define the model parameters (more info: https://aclanthology.org/2024.lrec-main.1330/)
59 | input_size = 120
60 | embed_size = 32
61 | hidden_size = 512
62 | output_size = 120
63 |
64 | # Load and initialize the LSTM model
65 | self.beam_size = 5
66 | self.model = g2g_RBNLM_model.LSTM_LangModel(input_size, embed_size, hidden_size, output_size)
67 |
68 |
69 | # Load and initialize the tokenizer
70 | self.text_vectorizer = TextVectorizer("char")
71 |
72 | if(model_path is not None):
73 | self.model.load_state_dict(torch.load(model_path, map_location=self.device, weights_only=True))
74 |
75 |
76 | if(tokenizer_path is not None):
77 | with open(tokenizer_path, "rb") as file:
78 | self.text_vectorizer = pickle.load(file)
79 |
80 | # Initialize the LanguageModel
81 | self.LM = LanguageModel(self.text_vectorizer, self.model, device=self.device)
82 |
83 |
84 | elif self.mode == 'transformer':
85 | self.model = ByT5Model(model_path, device=self.device)
86 | self.model.eval()
87 |
88 |
89 | def __call__(self, doc: Document) -> Document:
90 | """
91 | Processes a document to perform Greeklish to Greek conversion.
92 |
93 | Args:
94 | doc (Document): The document to process.
95 |
96 | Returns:
97 | Document: The document with text converted using the specified model.
98 | """
99 |
100 |         # If the text is already in Greek, skip the g2g conversion
101 | if(detect_language(doc.text) == 'greek'):
102 | return doc
103 |
104 |
105 |
106 | # Perform G2G conversion based on the mode
107 | if(self.mode == 'LSTM'):
108 | doc.text = self.LM.translate([doc.text], self.beam_size)[0]
109 | elif(self.mode == 'transformer'):
110 | doc.text = self.model(doc.text)
111 |
112 | return doc
--------------------------------------------------------------------------------
/tests/test_pipeline/test_pipeline.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from gr_nlp_toolkit.configs.dp_labels import dp_labels
4 | from gr_nlp_toolkit.configs.ner_labels import ner_labels
5 | from gr_nlp_toolkit.configs.pos_labels import pos_labels, pos_properties
6 | from gr_nlp_toolkit.pipeline.pipeline import Pipeline
7 |
8 |
9 |
10 | class TestPipeline(unittest.TestCase):
11 |     # These unit tests use the transformer-based g2g processor. To test the LSTM-based processor, change 'g2g' to 'g2g_lite'
12 | def test_using_all_processors(self):
13 | nlp = Pipeline('dp,pos,ner,g2g')
14 |
15 | sentences = ["Η Ιταλία κέρδισε την Αγγλία στον τελικό του Euro το 2021",
16 | "Το ποιηματάκι το έγραψε ο διάσημος ποιητής, Νίκος Νικολαϊδης",
17 | "Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun"]
18 | for sent in sentences:
19 | doc = nlp(sent)
20 |
21 | for token in doc.tokens:
22 | print(token.text, token.ner, token.upos, token.feats, token.head, token.deprel)
23 | self.assertIsNotNone(token.ner)
24 | self.assertTrue(token.ner in ner_labels)
25 | self.assertIsNotNone(token.head)
26 | self.assertIsNotNone(token.deprel)
27 |                 # We add one because the [CLS] token is removed, so head indices run from 0 to len(doc.tokens)
28 | self.assertTrue(token.head in range(0, len(doc.tokens) + 1))
29 | self.assertTrue(token.deprel in dp_labels)
30 | self.assertIsNotNone(token.upos)
31 | self.assertTrue(token.upos in pos_labels['upos'])
32 |
33 | self.assertIsNotNone(token.feats)
34 | self.assertEqual(len(list(token.feats.keys())), len(pos_properties[token.upos]))
35 |
36 | for feat, value in token.feats.items():
37 | self.assertTrue(feat in pos_properties[token.upos])
38 | self.assertTrue(value in pos_labels[feat])
39 | print(token.text, token.ner, token.upos, token.feats, token.head, token.deprel)
40 | self.assertIsNotNone(token.ner)
41 | self.assertTrue(token.ner in ner_labels)
42 | self.assertIsNotNone(token.head)
43 | self.assertIsNotNone(token.deprel)
44 |                 # We add one because the [CLS] token is removed, so head indices run from 0 to len(doc.tokens)
45 | self.assertTrue(token.head in range(0, len(doc.tokens) + 1))
46 | self.assertTrue(token.deprel in dp_labels)
47 | self.assertIsNotNone(token.upos)
48 | self.assertTrue(token.upos in pos_labels['upos'])
49 |
50 | def test_annotations_are_same_with_multiple_configurations(self):
51 | nlp = Pipeline('dp,pos,ner,g2g')
52 | doc = nlp("Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun")
53 |
54 | deprels_preds = []
55 | upos_preds = []
56 | ner_preds = []
57 | for token in doc.tokens:
58 | deprels_preds.append(token.deprel)
59 | upos_preds.append(token.upos)
60 | ner_preds.append(token.ner)
61 |
62 | nlp = Pipeline('dp,g2g')
63 | doc = nlp("Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun")
64 | new_deprels_preds = []
65 |
66 | for token in doc.tokens:
67 | new_deprels_preds.append(token.deprel)
68 |
69 | nlp = Pipeline('pos,g2g')
70 | doc = nlp("Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun")
71 | new_upos_preds =[]
72 |
73 | for token in doc.tokens:
74 | new_upos_preds.append(token.upos)
75 |
76 | nlp = Pipeline('ner,g2g')
77 | doc = nlp("Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou xanoun")
78 | new_ner_preds =[]
79 | for token in doc.tokens:
80 | new_ner_preds.append(token.ner)
81 |
82 | self.assertEqual(new_deprels_preds, deprels_preds)
83 | self.assertEqual(new_upos_preds, upos_preds)
84 | self.assertEqual(new_ner_preds, ner_preds)
85 |
86 |
87 |
88 | def test_using_only_one_processor(self):
89 | nlp = Pipeline('ner')
90 | doc = nlp("Η Ιταλία κέρδισε την Αγγλία στον τελικό του Euro το 2021")
91 |
92 | for token in doc.tokens:
93 | self.assertIsNotNone(token.ner)
94 | self.assertTrue(token.ner in ner_labels)
95 | self.assertIsNone(token.head)
96 | self.assertIsNone(token.deprel)
97 | self.assertFalse(token.head in range(0, len(doc.tokens)))
98 | self.assertFalse(token.deprel in dp_labels)
99 | self.assertIsNone(token.upos)
100 | self.assertFalse(token.upos in pos_labels['upos'])
101 |
102 | for feat, value in token.feats.items():
103 | self.assertFalse(feat in pos_properties[token.upos])
104 | self.assertFalse(value in pos_labels[feat])
105 |
106 |
107 | if __name__ == '__main__':
108 | unittest.main()
109 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/dp_model.py:
--------------------------------------------------------------------------------
1 | from torch.nn import LeakyReLU
2 |
3 | from gr_nlp_toolkit.models.util import create_mask_from_length
4 |
5 | from torch import nn
6 | import torch
7 |
8 |
9 | class DPModel(nn.Module):
10 | """
11 | Dependency Parsing model.
12 |
13 | Attributes:
14 | numrels (int): Number of dependency relation labels.
15 | _bert_model (nn.Module): The BERT model.
16 | _dp (nn.Dropout): Dropout layer.
17 | arc_head (nn.Linear): Linear layer for arc head representation.
18 | arc_dep (nn.Linear): Linear layer for arc dependent representation.
19 | rel_head (nn.Linear): Linear layer for relation head representation.
20 | rel_dep (nn.Linear): Linear layer for relation dependent representation.
21 | arc_bias (nn.Parameter): Bias parameter for arc representation.
22 | rel_bias (nn.Parameter): Bias parameter for relation representation.
23 | u_rel (nn.Parameter): Parameter for relation representation.
24 | w_arc (nn.Parameter): Parameter for arc representation.
25 | w_rel_head (nn.Parameter): Parameter for relation head representation.
26 | w_rel_dep (nn.Parameter): Parameter for relation dependent representation.
27 | deprel_linear_2 (nn.Linear): Linear layer for dependency relation labels.
28 | relu (LeakyReLU): LeakyReLU activation function.
29 | """
30 |
31 | def __init__(self, bert_model, deprel_i2l, dp):
32 | """
33 | Initialize the DPModel.
34 |
35 | Args:
36 | bert_model (nn.Module): The BERT model.
37 | deprel_i2l (list): List of dependency relation labels.
38 | dp (float): The dropout probability.
39 |
40 | """
41 | super(DPModel, self).__init__()
42 | self.numrels = len(deprel_i2l)
43 | self._bert_model = bert_model
44 | self._dp = nn.Dropout(dp)
45 |
46 | self.arc_head = nn.Linear(768, 768)
47 | self.arc_dep = nn.Linear(768, 768)
48 |
49 | self.rel_head = nn.Linear(768, 768)
50 | self.rel_dep = nn.Linear(768, 768)
51 |
52 | self.arc_bias = nn.Parameter(torch.zeros(1, 768, 1))
53 | self.rel_bias = nn.Parameter(torch.zeros(1, 1, 1, self.numrels))
54 |
55 | self.u_rel = nn.Parameter(torch.zeros(1, 768, self.numrels * 768))
56 |
57 | self.w_arc = nn.Parameter(torch.zeros(1, 768, 768))
58 | self.w_rel_head = nn.Parameter(torch.zeros(1, 1, 768, self.numrels))
59 | self.w_rel_dep = nn.Parameter(torch.zeros(1, 1, 768, self.numrels))
60 |
61 | self.deprel_linear_2 = nn.Linear(768, len(deprel_i2l) * 768)
62 |
63 | self.relu = LeakyReLU(1)
64 |
65 |
66 | def forward(self, text, text_len):
67 | """
68 | Forward pass of the DPModel.
69 |
70 | Args:
71 | text (Tensor): Input text.
72 | text_len (Tensor): Length of the input text.
73 |
74 | Returns:
75 | output (dict): Dictionary containing the output of the model.
76 |
77 | """
78 | output = {}
79 |
80 | attention_mask = create_mask_from_length(text_len, text.shape[1])
81 | bert = self._bert_model(text, attention_mask=attention_mask)
82 |
83 | # output size bs , mseq , 768
84 | bert_output = self._dp(bert[0])
85 | bs = bert_output.shape[0]
86 | mseq = bert_output.shape[1]
87 |
88 | # Specialized vector representations
89 | arc_head = self.relu(self.arc_head(bert_output)) # bs,mseq,768
90 | arc_dep = self.relu(self.arc_dep(bert_output)) # bs,mseq,768
91 | rel_head = self.relu(self.rel_head(bert_output)) # bs,mseq,768
92 | rel_dep = self.relu(self.rel_dep(bert_output)) # bs,mseq,768
93 |
94 | # bs,mseq,768 @ bs,768,mseq + bs,mseq,768 @ 1,768,1
95 | output_linear_head = arc_head @ (arc_dep @ self.w_arc).transpose(1, 2) + arc_head @ self.arc_bias
96 |         # arc_dep @ self.w_arc = (bs,mseq,768) @ (1,768,768) = (bs,mseq,768)
97 |
98 |         # (bs,mseq,768) @ (1,768,768*numrels) = (bs,mseq,768*numrels), reshaped below to (bs,mseq,numrels,768)
99 |         label_biaffine = rel_dep @ self.u_rel  # bs,mseq,768*numrels
100 | label_biaffine = label_biaffine.reshape(bs,mseq,self.numrels,768)
101 | label_biaffine = label_biaffine @ rel_head.transpose(1,2).unsqueeze(1) # bs,mseq,numrel,mseq
102 | label_biaffine = label_biaffine.transpose(2,3)
103 |
104 | label_head_affine = (rel_head.unsqueeze(2) @ self.w_rel_head)
105 | label_dep_affine = (rel_dep.unsqueeze(2) @ self.w_rel_dep)
106 | label_bias = self.rel_bias
107 |
108 | output_linear_rel = label_biaffine + label_head_affine + label_dep_affine + label_bias
109 | # (bs,mseq,1,768) @ (1 , 1 , 768 ,numrels) + ( 1, 1 , 1, numrels)
110 | # (bs,mseq,1,numrels)
111 |
112 | output['heads'] = output_linear_head
113 | output['deprels'] = output_linear_rel.reshape(bs, mseq, mseq, self.numrels)
114 |
115 | selected_arcs = output_linear_head.argmax(-1) # bs,mseq (indexes in [0,mseq) )
116 | selected_arcs = selected_arcs.unsqueeze(-1).repeat(1, 1, mseq) # bs,mseq,mseq
117 | selected_arcs = selected_arcs.unsqueeze(-1).repeat(1, 1, 1, self.numrels) # bs,mseq,mseq, numrels
118 |
119 | deprels_output = torch.gather(output_linear_rel, dim=2, index=selected_arcs) # bs,mseq,mseq,numrels
120 | # dim 2 is redundant so must be deleted ( there is only one head for every token)
121 | deprels_output = deprels_output.narrow(2, 0, 1) # bs,mseq,1,numrels
122 | deprels_output = deprels_output.squeeze(2) # bs , mseq,numrels
123 | output['gathered_deprels'] = deprels_output
124 |
125 | return output
--------------------------------------------------------------------------------
/gr_nlp_toolkit/pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | from gr_nlp_toolkit.domain.document import Document
2 | from gr_nlp_toolkit.processors.dp import DP
3 | from gr_nlp_toolkit.processors.ner import NER
4 | from gr_nlp_toolkit.processors.pos import POS
5 | from gr_nlp_toolkit.processors.g2g import G2G
6 |
7 | from gr_nlp_toolkit.processors.tokenizer import Tokenizer
8 | from huggingface_hub import hf_hub_download
9 |
10 | from typing import Literal
11 | import torch
12 |
13 | from transformers import logging
14 | logging.set_verbosity_error()
15 |
16 | def get_device_name() -> Literal["mps", "cuda", "cpu"]:
17 | """
18 | Returns the name of the device where this module is running.
19 |
20 | This is a simple implementation that doesn't cover cases when more powerful GPUs are available
21 | and not a primary device ('cuda:0') or MPS device is available but not configured properly:
22 | https://pytorch.org/docs/master/notes/mps.html
23 |
24 | Returns:
25 | Literal["mps", "cuda", "cpu"]: Device name, like 'cuda' or 'cpu'.
26 |
27 | Examples:
28 | >>> torch.cuda.is_available = lambda: True
29 | >>> torch.backends.mps.is_available = lambda: False
30 | >>> get_device_name()
31 | 'cuda'
32 |
33 | >>> torch.cuda.is_available = lambda: False
34 | >>> torch.backends.mps.is_available = lambda: True
35 | >>> get_device_name()
36 | 'mps'
37 |
38 | >>> torch.cuda.is_available = lambda: False
39 | >>> torch.backends.mps.is_available = lambda: False
40 | >>> get_device_name()
41 | 'cpu'
42 | """
43 | if torch.cuda.is_available():
44 | return "cuda"
45 | elif torch.backends.mps.is_available():
46 | return "mps"
47 | else:
48 | return "cpu"
49 |
50 |
51 | class Pipeline:
52 | """
53 |     The central class of the toolkit. A pipeline is created after a list of processors is specified. The user can
54 |     then annotate a document by using the __call__ method of the Pipeline.
55 |
56 | Attributes:
57 | _processors: A list of the processors that will be used in the pipeline
58 | _processor_cache: A ProcessorCache object that is used to download the processors
59 | device: The device where the pipeline will run
60 |
61 | """
62 |
63 | def __init__(self, processors: str, use_cpu: bool = False):
64 | """
65 | Initializes the pipeline with the specified processors
66 |
67 | Args:
68 |             processors: A comma-separated string with the names of the processors you want to load; available values: 'ner', 'pos', 'dp', 'g2g', 'g2g_lite'
69 | use_cpu: A boolean that specifies if the pipeline will run on the CPU
70 | """
71 |
72 | # if the user wants to use the CPU, we set the device to 'cpu'
73 | if(use_cpu):
74 | self.device = "cpu"
75 | else:
76 | self.device = get_device_name()
77 |
78 | self._processors = []
79 |
80 | processors = set(processors.split(","))
81 |
82 | # ner: Named Entity Recognition Processor
83 | # pos: Part of Speech Recognition Processor
84 | # dp: Dependency Parsing
85 | # g2g: Greeklish to Greek Transliteration Processor (ByT5 model)
86 | # g2g_lite: Greeklish to Greek Transliteration Processor (LSTM model)
87 | available_processors = ['ner', 'pos', 'dp', 'g2g_lite', 'g2g']
88 |
89 |
90 | # Adding the g2g processor, which must be the first in the pipeline
91 | if("g2g_lite" in processors):
92 | self._processors.append(G2G(mode="LSTM", model_path="gr_nlp_toolkit/RBNLM_weights/LSTM_LM_50000_char_120_32_512.pt", tokenizer_path="gr_nlp_toolkit/RBNLM_weights/RBNLMtextVectorizer.pkl", device=self.device))
93 | processors.remove("g2g_lite")
94 | elif("g2g" in processors):
95 | self._processors.append(G2G(mode="transformer", model_path="AUEB-NLP/ByT5_g2g", device=self.device))
96 | processors.remove("g2g")
97 |
98 |
99 | # Adding the tokenizer processor
100 | self._processors.append(Tokenizer())
101 | for p in processors:
102 | if p == available_processors[0]:
103 | ner_path = hf_hub_download(repo_id="AUEB-NLP/gr-nlp-toolkit", filename="ner_processor")
104 | self._processors.append(NER(model_path=ner_path, device=self.device))
105 | elif p == available_processors[1]:
106 | pos_path = hf_hub_download(repo_id="AUEB-NLP/gr-nlp-toolkit", filename="pos_processor")
107 | self._processors.append(POS(model_path=pos_path, device=self.device))
108 | elif p == available_processors[2]:
109 | dp_path = hf_hub_download(repo_id="AUEB-NLP/gr-nlp-toolkit", filename="dp_processor")
110 | self._processors.append(DP(model_path=dp_path, device=self.device))
111 | else:
112 | raise Exception(f"Invalid processor name, please choose one of {available_processors}")
113 |
114 | def __call__(self, text: str) -> Document:
115 |
116 | """
117 | Annotate a text with the processors present in the pipeline
118 |
119 | Args:
120 | text: The text that will be annotated
121 | """
122 |
123 | # Create a document from the text
124 | self._doc = Document(text)
125 |
126 | # Pass the document through every processor
127 | for processor in self._processors:
128 | # print(processor)
129 | processor(self._doc)
130 |
131 | return self._doc
132 |
133 | if __name__ == "__main__":
134 |
135 |
136 | nlp = Pipeline("g2g,ner,dp,pos")
137 |
138 | txts = ["Uparxoun autoi pou kerdizoun apo mia katastash kai autoi pou hanoun",
139 | "o volos kai h larisa einai poleis ths thessalias",
140 | "Η Αθήνα είναι η μεγαλύτερη πόλη της Ελλάδας"]
141 |
142 | for txt in txts:
143 |
144 | doc = nlp(txt)
145 |
146 | print(doc.text)
147 | for token in doc.tokens:
148 |             print(f"{token.text}: {token.ner}, {token.upos}, {token.feats}, {token.head}, {token.deprel}") # the token's text and its annotations
149 |
150 |
151 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/configs/dictionary_tables.py:
--------------------------------------------------------------------------------
1 | # Without intonation
2 | greek_to_greeklish = {'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'],
3 | 'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'],
4 | 'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'],
5 | 'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'χ': ['x', 'h'], 'φ': ['f'], 'ψ': ['ps'],
6 |
7 | 'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'I'],
8 | 'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'],
9 | 'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'],
10 | 'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['P'],
11 |
12 | 'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'],
13 | 'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'],
14 |
15 | 'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'],
16 | 'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D']}
17 |
18 | greeklish_to_greek = {'A': ['Α'], 'Ai': ['Αι'], 'B': ['Β', 'Μπ'], 'D': ['Δ', 'Ντ'], 'E': ['Ε', 'Αι'], 'Ei': ['Ει'],
19 | 'F': ['Φ'], 'G': ['Γ'], 'H': ['Η', 'Χ'], 'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι'], 'K': ['Κ'],
20 | 'Ks': ['Ξ'], 'L': ['Λ'], 'M': ['Μ'], 'Mp': ['Μπ'], 'N': ['Ν'], 'Nt': ['Ντ'], 'O': ['Ο', 'Ω'],
21 | 'Oi': ['Οι'], 'Ou': ['Ου'], 'P': ['Π', 'Ψ'], 'Q': ['Θ'], 'R': ['Ρ'], 'S': ['Σ'], 'T': ['Τ'],
22 | 'Th': ['Θ'], 'U': ['Θ', 'Ου', 'Y'], 'V': ['Β'], 'W': ['Ω'], 'X': ['Ξ', 'Χ'], 'Y': ['Υ'], 'Yi': ['Υι'],
23 | 'Z': ['Ζ'], 'a': ['α'], 'ai': ['αι'], 'b': ['β', 'μπ'], 'd': ['δ', 'ντ'],
24 | 'e': ['ε', 'αι'], 'ei': ['ει'], 'f': ['φ'], 'g': ['γ'], 'h': ['η', 'χ'],
25 | 'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι'], 'k': ['κ'], 'ks': ['ξ'], 'l': ['λ'], 'm': ['μ'],
26 | 'mp': ['μπ'], 'n': ['ν'], 'nt': ['ντ'], 'o': ['ο', 'ω'], 'oi': ['οι'], 'ou': ['ου'], 'p': ['π'],
27 | 'ps': ['ψ'], 'r': ['ρ'], 's': ['σ', 'ς'], 't': ['τ'], 'th': ['θ'], 'u': ['υ', 'θ', 'ου'],
28 | 'ui': ['υι'], 'v': ['β'], 'w': ['ω'], 'x': ['ξ', 'χ'], 'y': ['υ'], 'z': ['ζ']}
29 |
30 | # With intonation
31 | greek_to_greeklish_intonated = {'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'],
32 | 'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'],
33 | 'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'],
34 | 'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'φ': ['f'], 'χ': ['x', 'h'], 'ψ': ['ps'],
35 |
36 | 'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'U', 'I'],
37 | 'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'],
38 | 'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'],
39 | 'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['P'],
40 |
41 | 'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'],
42 | 'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'],
43 |
44 | 'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'],
45 | 'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D'],
46 |
47 | 'ά': ['a'], 'έ': ['e'], 'ή': ['h', 'i'], 'ί': ['i'], 'ό': ['o'], 'ύ': ['u', 'y', 'i'],
48 | 'ώ': ['w', 'o'],
49 | 'Ά': ['A'], 'Έ': ['E'], 'Ή': ['H', 'I'], 'Ί': ['I'], 'Ό': ['O'], 'Ύ': ['Y', 'U', 'I'],
50 | 'Ώ': ['W', 'O'],
51 |
52 | 'εί': ['ei', 'i'], 'οί': ['oi', 'i'], 'ού': ['ou', 'u'], 'αί': ['ai', 'e'],
53 | 'υί': ['ui', 'i'],
54 | 'Εί': ['Ei', 'I'], 'Οί': ['Oi', 'I'], 'Ού': ['Ou', 'U'], 'Αί': ['Ai', 'E'],
55 | 'Υί': ['Yi', 'I'],
56 | }
57 |
58 |
59 | greeklish_to_greek_intonated = {'A': ['Α', 'Ά'], 'Ai': ['Αι', 'Αί'], 'B': ['Β', 'Μπ'], 'D': ['Δ', 'Ντ'],
60 | 'E': ['Ε', 'Αι', 'Έ', 'Αί'], 'Ei': ['Ει', 'Εί'], 'F': ['Φ'], 'G': ['Γ'],
61 | 'H': ['Η', 'Χ', 'Ή'],
62 | 'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι', 'Ή', 'Ί', 'Ύ', 'Εί', 'Οί', 'Υί'],
63 | 'K': ['Κ'], 'Ks': ['Ξ'], 'L': ['Λ'], 'M': ['Μ'], 'Mp': ['Μπ'], 'N': ['Ν'],
64 | 'Nt': ['Ντ'], 'O': ['Ο', 'Ω', 'Ό', 'Ώ'], 'Oi': ['Οι', 'Οί'], 'Ou': ['Ου', 'Ού'],
65 | 'P': ['Π', 'Ψ'], 'Q': ['Θ'], 'R': ['Ρ'], 'S': ['Σ'], 'T': ['Τ'], 'Th': ['Θ'],
66 | 'U': ['Θ', 'Ου', 'Ού', 'Υ', 'Ύ'], 'V': ['Β'], 'W': ['Ω', 'Ώ'], 'X': ['Ξ', 'Χ'], 'Y': ['Υ', 'Ύ'],
67 | 'Yi': ['Υι', 'Υί'], 'Z': ['Ζ'], 'a': ['α', 'ά'], 'ai': ['αι', 'αί'], 'b': ['β', 'μπ'],
68 | 'd': ['δ', 'ντ'], 'e': ['ε', 'αι', 'έ', 'αί'], 'ei': ['ει', 'εί'], 'f': ['φ'],
69 | 'g': ['γ'], 'h': ['η', 'χ', 'ή'],
70 | 'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι', 'ή', 'ί', 'ύ', 'εί', 'οί', 'υί'],
71 | 'k': ['κ'], 'ks': ['ξ'], 'l': ['λ'], 'm': ['μ'], 'mp': ['μπ'], 'n': ['ν'],
72 | 'nt': ['ντ'], 'o': ['ο', 'ω', 'ό', 'ώ'], 'oi': ['οι', 'οί'], 'ou': ['ου', 'ού'],
73 | 'p': ['π'], 'ps': ['ψ'], 'r': ['ρ'], 's': ['σ', 'ς'], 't': ['τ'], 'th': ['θ'],
74 | 'u': ['υ', 'θ', 'ου', 'ύ', 'ού'], 'ui': ['υι', 'υί'], 'v': ['β'], 'w': ['ω', 'ώ'],
75 | 'x': ['ξ', 'χ'], 'y': ['υ', 'ύ'], 'z': ['ζ']}
76 |
77 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/processors/tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Dict
2 | from torch.utils.data import DataLoader, Dataset
3 |
4 | from gr_nlp_toolkit.domain.dataset import DatasetImpl
5 | from gr_nlp_toolkit.domain.document import Document
6 | from gr_nlp_toolkit.processors.abstract_processor import AbstractProcessor
7 |
8 | import unicodedata
9 |
10 | from gr_nlp_toolkit.domain.token import Token
11 |
12 | from transformers import AutoTokenizer
13 |
14 | tokenizer_greek = AutoTokenizer.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
15 |
16 |
17 | def strip_accents_and_lowercase(s: str) -> str:
18 | """
19 | Strips accents from a string and converts it to lowercase.
20 |
21 | Args:
22 | s: A string from which to strip accents.
23 |
24 | Returns:
25 | A new string with accents removed and converted to lowercase.
26 | """
27 | return ''.join(c for c in unicodedata.normalize('NFD', s)
28 | if unicodedata.category(c) != 'Mn').lower()
29 |
30 |
31 | def create_ids(text: str) -> List[int]:
32 | """
33 | Encodes a given text into a list of token IDs using a tokenizer.
34 |
35 | Args:
36 | text: A string to encode.
37 |
38 | Returns:
39 | A list of token IDs.
40 | """
41 | return tokenizer_greek.encode(text)
42 |
43 |
44 | def create_text(ids: List[int]) -> str:
45 | """
46 | Decodes a list of token IDs back into a text string.
47 |
48 | Args:
49 | ids: A list of token IDs to decode.
50 |
51 | Returns:
52 | A decoded string with special tokens skipped.
53 | """
54 | return tokenizer_greek.decode(ids, skip_special_tokens=True)
55 |
56 |
57 | def convert_to_tokens(input_ids: List[int]) -> List[str]:
58 | """
59 | Converts a list of token IDs into their corresponding token strings.
60 |
61 | Args:
62 | input_ids: A list of token IDs.
63 |
64 | Returns:
65 | A list of token strings with special tokens skipped.
66 | """
67 | return tokenizer_greek.convert_ids_to_tokens(input_ids, skip_special_tokens=True)
68 |
69 |
70 | def remove_special_tokens(input_ids: List[int]) -> List[int]:
71 | """
72 | Removes special tokens from a list of token IDs.
73 |
74 | Args:
75 | input_ids: A list of token IDs.
76 |
77 | Returns:
78 | A new list of token IDs with special tokens removed.
79 | """
80 | input_ids_without_special_tokens = []
81 | for input_id in input_ids:
82 | if input_id not in tokenizer_greek.all_special_ids:
83 | input_ids_without_special_tokens.append(input_id)
84 | return input_ids_without_special_tokens
85 |
86 |
87 |
88 | def create_mask_and_tokens(input_tokens: List[str], input_ids: List[int]) -> Tuple[List[bool], List[Token], Dict]:
89 | """
90 | Creates a mask, tokens, and subword-to-word mapping from input tokens and IDs.
91 |
92 | Args:
93 | input_tokens: A list of input token strings.
94 | input_ids: A list of input token IDs.
95 |
96 | Returns:
97 | A tuple containing:
98 |         - A list of booleans, one per word-piece, that is True when the piece starts a new word and False when it is a sub-word continuation.
99 | - A list of Token objects.
100 | - A dictionary mapping subword indices to word indices.
101 | """
102 | mask = []
103 | tokens = []
104 | subword2word = {}
105 |
106 | word = 0
107 | # for each token
108 | for j, input in enumerate(zip(input_tokens, input_ids), 1):
109 | t = input[0]
110 | i = input[1]
111 | # it isn't a sub-word
112 | if not t.startswith("##"):
113 | # create a token object
114 | tokenObj = Token([t])
115 | tokenObj.ids.append(i)
116 | tokens.append(tokenObj)
117 | mask.append(True)
118 | word = word + 1
119 | else:
120 |
121 | # add sub-words to token
122 | tokenObj.subwords.append(t)
123 | tokenObj.ids.append(i)
124 | mask.append(False)
125 | subword2word[j] = word
126 |
127 | # create text
128 | for token in tokens:
129 | token.text = create_text(token.ids)
130 |
131 | # Adding a 0-0 mapping to subword2word
132 | subword2word[0] = 0
133 |
134 | return mask, tokens, subword2word
135 |
136 |
137 | def create_dataset_and_dataloader(input_ids) -> Tuple[Dataset, DataLoader]:
138 | """
139 | Creates a dataset and dataloader from input IDs.
140 |
141 | Args:
142 | input_ids: A list of input token IDs.
143 |
144 | Returns:
145 | A tuple containing:
146 | - A Dataset object.
147 | - A DataLoader object.
148 | """
149 | dataset = DatasetImpl([input_ids])
150 | dataloader = DataLoader(dataset)
151 | return dataset, dataloader
152 |
153 |
154 |
155 | class Tokenizer(AbstractProcessor):
156 | """
157 | Tokenizer class that takes a document as an input with the text field set, tokenizes and returns a document with
158 | all fields set
159 | """
160 |
161 | def __call__(self, doc: Document) -> Document:
162 | """
163 | Processes a document by tokenizing its text and setting relevant fields.
164 |
165 | Args:
166 | doc: A Document object with the text field set.
167 |
168 | Returns:
169 | A Document object with the following fields set:
170 | - text: The original text stripped of accents and converted to lowercase.
171 | - input_ids: List of token IDs created from the text.
172 |             - token_mask: List of booleans that are True for word-pieces starting a new word and False for sub-word continuations.
173 | - tokens: List of Token objects.
174 | - subword2word: Dictionary mapping subword indices to word indices.
175 | - dataset: A Dataset object created from the input IDs.
176 | - dataloader: A DataLoader object created from the dataset.
177 | """
178 | # get document's text and strip accent and lowercase
179 | doc.text = strip_accents_and_lowercase(doc.text)
180 | # create ids
181 | doc.input_ids = create_ids(doc.text)
182 | # create mask and tokens
183 | doc.token_mask, doc.tokens, doc.subword2word = create_mask_and_tokens(convert_to_tokens(doc.input_ids),
184 | remove_special_tokens(doc.input_ids))
185 |
186 | # create dataloader
187 | doc.dataset, doc.dataloader = create_dataset_and_dataloader(doc.input_ids)
188 | return doc
189 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/data/util.py:
--------------------------------------------------------------------------------
1 | import string
2 | from collections import Counter
3 | from torch.utils.data import Dataset
4 | import torch
5 | import torch.nn as nn
6 |
7 | class TextVectorizer:
8 | """
9 | Used to vectorize given text based on a learned vocabulary.
10 | After training the vocabulary on a corpus, the resulting encoding is:
11 | 1 : Padding
12 | [1 to max_vocab_size+1] : tokens learnt in the vocab. (This could be smaller than the actual max number provided)
13 | len(vocab)-1 : [SOS] symbol
14 | len(vocab) : OOV tokens
15 | """
16 | def __init__(self, mode):
17 | """
18 | vocab: dictionary (token --> index)
19 |         idx2token: list (idx --> token)
20 | :param mode:
21 | """
22 | assert mode in {"word", "char"}
23 | self.vocab = dict()
24 | self.idx2token = []
25 | self.mode = mode
26 |
27 | def build_vocab(self, corpus, max_size=25000):
28 | """
29 | Builds the vocabulary from a corpus of sentences. The words get encoded by
30 | count of appearances in the data.
31 | :param corpus: A list of sentences as strings.
32 |         :param max_size: The max number of words that can be encoded, excluding the codes reserved for padding, the [SOS] symbol and OOV tokens.
33 | """
34 | counts = Counter()
35 | self.vocab[""] = 0
36 | self.idx2token.append("")
37 | idx = 1
38 | if self.mode == "word":
39 | # In the case of words, we remove punctuation and split on whitespaces
40 | for line in corpus:
41 | # Remove punctuation
42 | line = line.translate(str.maketrans("", "", string.punctuation))
43 | # Split the line in whitespaces to get the words
44 | tokens = line.split()
45 | # Update counts
46 | counts.update(tokens)
47 | # mode == "char"
48 | else:
49 | # Here we do not do any regularization, and split on every character.
50 | for line in corpus:
51 | tokens = [char for char in line]
52 | counts.update(tokens)
53 | # Add the most frequent tokens to the vocabulary.
54 | for (name, count) in counts.most_common(max_size):
55 | self.vocab[name] = idx
56 | self.idx2token.append(name)
57 | idx += 1
58 | # Add [SOS] token.
59 | self.vocab[""] = idx
60 | self.idx2token.append("")
61 |
62 |
63 | def encode_dataset(self, corpus):
64 | """
65 | Takes as input a corpus of sentences, generates source/target training pairs
66 | and encodes them based on the vocabulary. Then it returns the pairs as tuples of tensors.
67 | :param corpus: Array of sentences in the form of strings.
68 | :return: list of pairs of torch.LongTensor objects
69 | """
70 | # We start by tokenizing the corpus.
71 | tokenized_dataset = []
72 | for line in corpus:
73 | if self.mode == "word":
74 | # Strip punctuation and split on whitespaces.
75 | tokens = line.translate(str.maketrans("", "", string.punctuation)).split()
76 | else:
77 | # No regularization applied for characters.
78 | tokens = [char for char in line]
79 | # Also find the length of the longest sequence, taking into account the addition of a [SOS]/[EOS] symbol.
80 | tokenized_dataset.append(tokens)
81 | # Make source & target sentences and encode them based on the dictionary.
82 | source_vecs, target_vecs = [], []
83 | for sequence in tokenized_dataset:
84 | # Ignore strings that may be reduced to empty after stripping punctuation & whitespaces
85 | # (only happens if mode == "word")
86 | if not sequence:
87 | continue
88 |             # Initialize source vectorized sentence with the [SOS] token.
89 | source_vector = [self.vocab[""]]
90 | target_vector = []
91 | for idx in range(len(sequence)-1):
92 | source_vector.append(self.vocab.get(sequence[idx], len(self.vocab)))
93 | target_vector.append(self.vocab.get(sequence[idx], len(self.vocab)))
94 | target_vector.append(self.vocab.get(sequence[-1], len(self.vocab)))
95 | # Add to sources/targets.
96 | source_vecs.append(source_vector)
97 | target_vecs.append(target_vector)
98 |
99 | """# Get the length for each sequence in the data
100 | source_lengths = torch.LongTensor(list(map(len, source_vecs)))
101 | target_lengths = torch.LongTensor(list(map(len, target_vecs)))"""
102 | # Convert data to LongTensors.
103 | for i in range(len(source_vecs)):
104 | source_vecs[i] = torch.LongTensor(source_vecs[i])
105 | target_vecs[i] = torch.LongTensor(target_vecs[i])
106 | # Pad & Sort sequences
107 | source_tensors = nn.utils.rnn.pad_sequence(source_vecs, batch_first=True)
108 | target_tensors = nn.utils.rnn.pad_sequence(target_vecs, batch_first=True)
109 | # Create Dataset object
110 | dataset = GreekDataset(source_tensors, target_tensors)
111 | # Return the Dataset & the sequence lengths (to be used for packing)
112 | return dataset
113 |
114 | def split_sequence(self, sequence):
115 | """
116 | Splits a sequence based on the tokenization mode configured, and returns it without indexing it.
117 | """
118 | if self.mode == "word":
119 | tokens = sequence.translate(str.maketrans("", "", string.punctuation)).split()
120 | else:
121 | tokens = [char for char in sequence]
122 |
123 |         return tokens
124 |
125 | def input_tensor(self, sequence):
126 | """
127 | Takes a sentence and returns its encoding, based on the vocabulary, to be used for inference.
128 | :param sequence: (String) The sentence to be encoded.
129 | :return: Encoded sentence in form of a torch.Longtensor object.
130 | """
131 | if self.mode == "word":
132 | tokens = sequence.translate(str.maketrans("", "", string.punctuation)).split()
133 | else:
134 | tokens = [char for char in sequence]
135 | vectorized_input = []
136 | for token in tokens:
137 | vectorized_input.append(self.vocab.get(token, len(self.vocab)))
138 |
139 | # Convert to tensor
140 | vectorized_input = torch.LongTensor(vectorized_input)
141 |
142 | return vectorized_input
143 |
144 |
145 | class GreekDataset(Dataset):
146 |
147 | def __init__(self, source_vecs, target_vecs):
148 | """
149 | Gets two arrays of source and target vectors and outputs a Dataset object of those arrays
150 | :param source_vecs: array of source vectors
151 | :param target_vecs: array of target vectors
152 | """
153 | self.n_samples = source_vecs.size(0)
154 | self.x_data = source_vecs
155 | self.y_data = target_vecs
156 |
157 |
158 | def __getitem__(self, index):
159 | return self.x_data[index], self.y_data[index]
160 |
161 | def __len__(self):
162 | return self.n_samples
--------------------------------------------------------------------------------
/gr_nlp_toolkit/domain/textVectorizer.py:
--------------------------------------------------------------------------------
1 | import string
2 | from collections import Counter
3 | from torch.utils.data import Dataset
4 | import torch
5 | import torch.nn as nn
6 | import pickle
7 |
8 | class TextVectorizer:
9 | """
10 | Used to vectorize given text based on a learned vocabulary.
11 | It is used by the RBNLM model to encode the text into numbers that can be then fed into the LSTM model.
12 | After training the vocabulary on a corpus, the resulting encoding is:
13 | 1 : Padding
14 | [1 to max_vocab_size+1] : tokens learnt in the vocab. (This could be smaller than the actual max number provided)
15 | len(vocab)-1 : [SOS] symbol
16 | len(vocab) : OOV tokens
17 |
18 | Attributes:
19 | vocab (dict): A dictionary mapping tokens to their indices. (token --> index)
20 |         idx2token (list): A list mapping indices to tokens. (idx --> token)
21 | mode (str): The tokenization mode, either 'word' or 'char'.
22 | """
23 | def __init__(self, mode):
24 | """
25 | Initializes the TextVectorizer with the specified mode.
26 |
27 | Args:
28 | mode (str): The tokenization mode, either 'word' or 'char'.
29 | """
30 | assert mode in {"word", "char"}
31 | self.vocab = dict()
32 | self.idx2token = []
33 | self.mode = mode
34 |
35 | def build_vocab(self, corpus, max_size=25000):
36 | """
37 | Builds the vocabulary from a corpus of sentences. The words get encoded by
38 | count of appearances in the data.
39 |
40 | Args:
41 |             corpus (list): A list of sentences as strings.
42 |             max_size (int): The max number of words that can be encoded, excluding the codes reserved for padding, the [SOS] symbol and OOV tokens.
43 | """
44 | counts = Counter()
45 | self.vocab[""] = 0
46 | self.idx2token.append("")
47 | idx = 1
48 | if self.mode == "word":
49 | # In the case of words, we remove punctuation and split on whitespaces
50 | for line in corpus:
51 | # Remove punctuation
52 | line = line.translate(str.maketrans("", "", string.punctuation))
53 | # Split the line in whitespaces to get the words
54 | tokens = line.split()
55 | # Update counts
56 | counts.update(tokens)
57 | # mode == "char"
58 | else:
59 | # Here we do not do any regularization, and split on every character.
60 | for line in corpus:
61 | tokens = [char for char in line]
62 | counts.update(tokens)
63 | # Add the most frequent tokens to the vocabulary.
64 | for (name, count) in counts.most_common(max_size):
65 | self.vocab[name] = idx
66 | self.idx2token.append(name)
67 | idx += 1
68 | # Add [SOS] token.
69 | self.vocab[""] = idx
70 | self.idx2token.append("")
71 |
72 |
73 |
74 | def encode_dataset(self, corpus):
75 | """
76 | Takes as input a corpus of sentences, generates source/target training pairs
77 | and encodes them based on the vocabulary. Then it returns the pairs as tuples of tensors.
78 |
79 | Args:
80 | corpus (list): Array of sentences in the form of strings.
81 |
82 | Returns:
83 |             dataset (GreekDataset): A Dataset wrapping the (source, target) pairs of torch.LongTensor objects
84 | """
85 | # We start by tokenizing the corpus.
86 | tokenized_dataset = []
87 | for line in corpus:
88 | if self.mode == "word":
89 | # Strip punctuation and split on whitespaces.
90 | tokens = line.translate(str.maketrans("", "", string.punctuation)).split()
91 | else:
92 | # No regularization applied for characters.
93 | tokens = [char for char in line]
94 | # Also find the length of the longest sequence, taking into account the addition of a [SOS]/[EOS] symbol.
95 | tokenized_dataset.append(tokens)
96 | # Make source & target sentences and encode them based on the dictionary.
97 | source_vecs, target_vecs = [], []
98 | for sequence in tokenized_dataset:
99 | # Ignore strings that may be reduced to empty after stripping punctuation & whitespaces
100 | # (only happens if mode == "word")
101 | if not sequence:
102 | continue
103 | # Initialize source vectorized sentence with token.
104 | source_vector = [self.vocab[""]]
105 | target_vector = []
106 | for idx in range(len(sequence)-1):
107 | source_vector.append(self.vocab.get(sequence[idx], len(self.vocab)))
108 | target_vector.append(self.vocab.get(sequence[idx], len(self.vocab)))
109 | target_vector.append(self.vocab.get(sequence[-1], len(self.vocab)))
110 | # Add to sources/targets.
111 | source_vecs.append(source_vector)
112 | target_vecs.append(target_vector)
113 |
114 | """# Get the length for each sequence in the data
115 | source_lengths = torch.LongTensor(list(map(len, source_vecs)))
116 | target_lengths = torch.LongTensor(list(map(len, target_vecs)))"""
117 | # Convert data to LongTensors.
118 | for i in range(len(source_vecs)):
119 | source_vecs[i] = torch.LongTensor(source_vecs[i])
120 | target_vecs[i] = torch.LongTensor(target_vecs[i])
121 | # Pad & Sort sequences
122 | source_tensors = nn.utils.rnn.pad_sequence(source_vecs, batch_first=True)
123 | target_tensors = nn.utils.rnn.pad_sequence(target_vecs, batch_first=True)
124 | # Create Dataset object
125 | dataset = GreekDataset(source_tensors, target_tensors)
126 | # Return the Dataset & the sequence lengths (to be used for packing)
127 | return dataset
128 |
129 | def split_sequence(self, sequence):
130 | """
131 | Splits a sequence based on the tokenization mode configured, and returns it without indexing it.
132 |
133 | Args:
134 | sequence (str): The sentence to be split.
135 |
136 | Returns:
137 |             tokens (list): The sequence split into tokens, based on the tokenization mode.
138 | """
139 | if self.mode == "word":
140 | tokens = sequence.translate(str.maketrans("", "", string.punctuation)).split()
141 | else:
142 | tokens = [char for char in sequence]
143 |
144 |         return tokens
145 |
146 | def input_tensor(self, sequence):
147 | """
148 | Takes a sentence and returns its encoding, based on the vocabulary, to be used for inference.
149 |
150 | Args:
151 | sequence (String): The sentence to be encoded.
152 |
153 | Returns:
154 | vectorized_input (torch.LongTensor): Encoded sentence in form of a torch.Longtensor object.
155 | """
156 | if self.mode == "word":
157 | tokens = sequence.translate(str.maketrans("", "", string.punctuation)).split()
158 | else:
159 | tokens = [char for char in sequence]
160 | vectorized_input = []
161 | for token in tokens:
162 | vectorized_input.append(self.vocab.get(token, len(self.vocab)))
163 |
164 | # Convert to tensor
165 | vectorized_input = torch.LongTensor(vectorized_input)
166 |
167 | return vectorized_input
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `gr-nlp-toolkit`
2 |
3 |
4 |
5 |
6 |
7 | `gr-nlp-toolkit` is a Python toolkit with state-of-the-art performance in (modern) Greek, supporting the following functionalities:
8 | 1. Named Entity Recognition (NER)
9 | 2. Part-of-Speech Tagging (POS Tagging)
10 | 3. Morphological tagging
11 | 4. Dependency parsing
12 | 5. Greeklish to Greek transliteration ("kalimera" -> "καλημερα")
13 |
14 | ## Web Demo 🤗
15 |
16 | Apart from the python library (details below), you can also interact with `gr-nlp-toolkit` in a no-code fashion by visiting our web playground here: https://huggingface.co/spaces/AUEB-NLP/greek-nlp-toolkit-demo
17 |
18 | Thanks to HuggingFace 🤗 for the GPUs.
19 |
20 | ## Installation
21 | The toolkit is supported for Python 3.9+.
22 |
23 | You can install it from PyPI by executing the following in the command line:
24 |
25 | ```sh
26 | pip install gr-nlp-toolkit
27 | ```
28 |
29 | ## Usage
30 |
31 | ### Available Processors/Features
32 |
33 | To use the toolkit, first initialize a `Pipeline` specifying which task processors you need. Each processor
34 | annotates the text with a specific task's annotations.
35 |
36 | For example:
37 | - To obtain Part-of-Speech and Morphological Tagging annotations, add the `pos` processor
38 | - To obtain Named Entity Recognition annotations, add the `ner` processor
39 | - To obtain Dependency Parsing annotations, add the `dp` processor
40 | - To enable the transliteration from Greeklish to Greek, add the `g2g` processor or the `g2g_lite` processor for a lighter but less accurate model
41 | (Greeklish to Greek transliteration example: "thessalonikh" -> "θεσσαλονίκη")
42 |
43 | ### Example Usage Scenarios
44 |
45 | - DP, POS, NER processors (input text in Greek)
46 |
47 | ```python
48 | from gr_nlp_toolkit import Pipeline
49 |
50 | nlp = Pipeline("pos,ner,dp") # Instantiate the Pipeline with the DP, POS and NER processors
51 | doc = nlp("Η Ιταλία κέρδισε την Αγγλία στον τελικό του Euro 2020.") # Apply the pipeline to a sentence in Greek
52 |
53 | ```
54 |
55 | A `Document` object is created and annotated. The original text is tokenized
56 | and split into tokens:
57 |
58 | ```python
59 | # Iterate over the generated tokens
60 | for token in doc.tokens:
61 | print(token.text) # the text of the token
62 |
63 | print(token.ner) # the named entity label in IOBES encoding : str
64 |
65 | print(token.upos) # the UPOS tag of the token
66 | print(token.feats) # the morphological features for the token
67 |
68 | print(token.head) # the head of the token
69 | print(token.deprel) # the dependency relation between the current token and its head
70 | ```
71 |
72 | `token.ner` is set by the `ner` processor, `token.upos` and `token.feats` are set by the `pos` processor
73 | and `token.head` and `token.deprel` are set by the `dp` processor.
74 |
75 | A small detail is that to get the `Token` object that is the head of another token you need to access
76 | `doc.tokens[head-1]`. The reason for this is that the enumeration of the tokens starts from 1, and when the
77 | field `token.head` is set to 0, the token is the root of the sentence, as in the sketch below.
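
For example, here is a minimal sketch that resolves each token's head (reusing the `doc` from the DP/POS/NER example above):

```python
# Resolve the head Token of every token; head indices are 1-based, 0 denotes the root
for token in doc.tokens:
    if token.head == 0:
        print(f"{token.text} is the root of the sentence")
    else:
        head_token = doc.tokens[token.head - 1]
        print(f"{token.text} --{token.deprel}--> {head_token.text}")
```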
78 |
79 | - Greeklish to Greek Conversion (input text in Greeklish)
80 |
81 | ```python
82 | from gr_nlp_toolkit import Pipeline
83 | nlp = Pipeline("g2g") # Instantiate the pipeline with the g2g processor
84 |
85 | doc = nlp("O Volos kai h Larisa einai sth Thessalia") # Apply the pipeline to a sentence in Greeklish
86 | print(doc.text) # Access the transliterated text, which is "ο Βόλος και η Λάρισα είναι στη Θεσσαλία"
87 | ```
88 | - Use all the processors together (input text in Greeklish)
89 |
90 | ```python
91 | from gr_nlp_toolkit import Pipeline
92 | nlp = Pipeline("pos,ner,dp,g2g") # Instantiate the Pipeline with the G2G, DP, POS and NER processors
93 |
94 | doc = nlp("O Volos kai h Larisa einai sthn Thessalia") # Apply the pipeline to a sentence in Greeklish
95 |
96 | print(doc.text) # Print the transliterated text
97 |
98 | # Iterate over the generated tokens
99 | for token in doc.tokens:
100 | print(token.text) # the text of the token
101 |
102 | print(token.ner) # the named entity label in IOBES encoding : str
103 |
104 | print(token.upos) # the UPOS tag of the token
105 | print(token.feats) # the morphological features for the token
106 |
107 | print(token.head) # the head of the token
108 | print(token.deprel) # the dependency relation between the current token and its head
109 | ```
110 |
111 | ## Paper
112 | The software was presented as a paper at COLING 2025.
113 | Read the full technical report/paper here: [https://aclanthology.org/2025.coling-demos.17/](https://aclanthology.org/2025.coling-demos.17/)
114 |
115 | If you use our toolkit, please cite it:
116 | ```bibtex
117 | @inproceedings{loukas-etal-coling2025-greek-nlp-toolkit,
118 | title = "{GR}-{NLP}-{TOOLKIT}: An Open-Source {NLP} Toolkit for {M}odern {G}reek",
119 | author = "Loukas, Lefteris and
120 | Smyrnioudis, Nikolaos and
121 | Dikonomaki, Chrysa and
122 | Barbakos, Spiros and
123 | Toumazatos, Anastasios and
124 | Koutsikakis, John and
125 | Kyriakakis, Manolis and
126 | Georgiou, Mary and
127 | Vassos, Stavros and
128 | Pavlopoulos, John and
129 | Androutsopoulos, Ion",
130 | editor = "Rambow, Owen and
131 | Wanner, Leo and
132 | Apidianaki, Marianna and
133 | Al-Khalifa, Hend and
134 | Eugenio, Barbara Di and
135 | Schockaert, Steven and
136 | Mather, Brodie and
137 | Dras, Mark",
138 | booktitle = "Proceedings of the 31st International Conference on Computational Linguistics: System Demonstrations",
139 | month = jan,
140 | year = "2025",
141 | address = "Abu Dhabi, UAE",
142 | publisher = "Association for Computational Linguistics",
143 | url = "https://aclanthology.org/2025.coling-demos.17/",
144 | pages = "174--182",
145 | }
146 | ```
147 |
148 | ----
149 | ### Technical Notes:
150 |
151 | - The *first* time you use a processor, the models are downloaded from Hugging Face and stored into the .cache folder. The NER, DP and POS processors are each about 500 MB, while the G2G processor is about 1.2 GB in size.
152 | - If the input text is already in Greek, the G2G (Greeklish-to-Greek) processor is skipped.
153 | - If your machine has an accelerator but you want to run the pipeline on the CPU, you can pass the flag `use_cpu=True` to the `Pipeline` object (see the example after these notes). By default, `use_cpu` is set to *False*.
154 | - The Greeklish-to-Greek transliteration processor (ByT5) weights can be found in HuggingFace: [https://huggingface.co/AUEB-NLP/ByT5_g2g](https://huggingface.co/AUEB-NLP/ByT5_g2g)
155 | - The NER/POS/DP processors/weights can be found in HuggingFace: [https://huggingface.co/AUEB-NLP/gr-nlp-toolkit](https://huggingface.co/AUEB-NLP/gr-nlp-toolkit)
156 |
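A minimal sketch of forcing CPU execution (processors and input sentence reused from the examples above):

```python
from gr_nlp_toolkit import Pipeline

# Run on the CPU even if a CUDA/MPS accelerator is available
nlp = Pipeline("pos,ner,dp", use_cpu=True)
doc = nlp("Η Αθήνα είναι η μεγαλύτερη πόλη της Ελλάδας")
print([(token.text, token.upos) for token in doc.tokens])
```
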
157 | ## References
158 | While many methodology details are shared in the [GR-NLP-TOOLKIT paper publication @ COLING 2025 (see above)](https://arxiv.org/abs/2412.08520), additional research details can be found here:
159 | 1. C. Dikonimaki, "A Transformer-based natural language processing toolkit for Greek -- Part of speech tagging and dependency parsing", BSc thesis, Department of Informatics, Athens University of Economics and Business, 2021. http://nlp.cs.aueb.gr/theses/dikonimaki_bsc_thesis.pdf *(POS/DP/Morphological tagging processor)*
160 |
161 | 2. N. Smyrnioudis, "A Transformer-based natural language processing toolkit for Greek -- Named entity recognition and multi-task learning", BSc thesis, Department of Informatics, Athens University of Economics and Business, 2021. http://nlp.cs.aueb.gr/theses/smyrnioudis_bsc_thesis.pdf *(NER processor)*
162 |
163 | 3. A. Toumazatos, J. Pavlopoulos, I. Androutsopoulos, & S. Vassos, "Still All Greeklish to Me: Greeklish to Greek Transliteration." In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024) (pp. 15309–15319). https://aclanthology.org/2024.lrec-main.1330/ *(Greeklish-to-Greek processor)*
164 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2020 AUEB NLP Group
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/gr_nlp_toolkit/models/g2g_RBNLM_model.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | from torch.utils.data import DataLoader
5 | from gr_nlp_toolkit.configs.dictionary_tables import greeklish_to_greek_intonated
6 |
7 | class LSTM_LangModel(nn.Module):
8 | """
9 | LSTM-based language model
10 |
11 | Attributes:
12 | hidden_size (int): The size of the hidden layer
13 | embed (nn.Embedding): The embedding layer
14 | lstm (nn.LSTM): The LSTM layer
15 | dense (nn.Linear): The dense layer
16 | dropout (nn.Dropout): The dropout layer
17 | """
18 | def __init__(self, input_size, embed_size, hidden_size, output_size):
19 | """
20 | Initializes the LSTM_LangModel with the specified parameters.
21 |
22 | Args:
23 | input_size (int): The size of the input layer
24 | embed_size (int): The size of the embedding layer
25 | hidden_size (int): The size of the hidden layer
26 | output_size (int): The size of the output layer
27 | """
28 | super(LSTM_LangModel, self).__init__()
29 | self.hidden_size = hidden_size
30 |
31 | self.embed = nn.Embedding(input_size, embed_size, padding_idx=0)
32 | self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
33 | self.dense = nn.Linear(hidden_size, output_size)
34 | self.dropout = nn.Dropout(0.5)
35 |
36 | def forward(self, x, h0=None, c0=None):
37 | """
38 | Forward pass of the LSTM_LangModel
39 |
40 | Args:
41 | x (Tensor): The input tensor
42 | h0 (Tensor): The initial hidden state
43 | c0 (Tensor): The initial cell state
44 |
45 | Returns:
46 | output (Tensor): The output tensor
47 | h (Tensor): The hidden state
48 | c (Tensor): The cell state
49 |
50 | """
51 | input_embedded = self.embed(x)
52 | if h0 is None and c0 is None:
53 | output_lstm, (h, c) = self.lstm(input_embedded)
54 | else:
55 | output_lstm, (h, c) = self.lstm(input_embedded, (h0, c0))
56 | output = self.dropout(output_lstm)
57 | output = self.dense(output)
58 | return output, h, c
59 |
60 |
61 | class State():
62 | """
63 | Container class for the attributes of each candidate replacement.
64 |
65 | Attributes:
66 | translated (list): List of tokens already translated to the desired language.
67 | remaining (str): The remaining sentence to be translated.
68 | out (tensor): The last output of the LSTM for that particular state. Contains
69 | logits that will become probabilities with the application of a softmax.
70 | hidden (tuple): Contains the hidden states of the candidate.
71 | score (float): The score given to the translation based on the language model.
72 | """
73 |
74 | def __init__(self, translated, remaining, out, hidden, score):
75 |
76 | """
77 | Initializes the State object with the specified parameters.
78 |
79 | Args:
80 | translated (list): List of tokens already translated to the desired language.
81 | remaining (str): The remaining sentence to be translated.
82 | out (tensor): The last output of the LSTM for that particular state. Contains
83 | logits that will become probabilities with the application of a softmax.
84 | hidden (tuple): Contains the hidden states of the candidate.
85 | score (float): The score given to the translation based on the language model.
86 | """
87 | self.translated = translated
88 | self.remaining = remaining
89 | self.hidden = hidden
90 | self.output = out
91 | self.score = score
92 |
93 | def __eq__(self, other):
94 | """
95 | Equality operator, needed for eliminating duplicate states.
96 | """
97 | if isinstance(other, State):
98 | if (self.translated == other.translated and
99 | self.remaining == other.remaining and
100 | self.score == other.score):
101 | return True
102 |
103 | return False
104 |
105 | class LanguageModel:
106 | """
107 | Language model for Greeklish to Greek conversion.
108 |
109 | Attributes:
110 | vectorizer (TextVectorizer): The vectorizer used to convert tokens to indices.
111 | model (LSTM_LangModel): The language model used for translation.
112 | device (str): The device to run the model on.
113 | softmax (nn.LogSoftmax): The log version of the softmax function.
114 | """
115 |
116 | def __init__(self, vectorizer, model, device='cpu'):
117 | """
118 | Initializes the LanguageModel with the specified parameters.
119 |
120 | Args:
121 | vectorizer: (TextVectorizer) The vectorizer used to convert tokens to indices.
122 | model: (LSTM_LangModel) The language model used for translation.
123 | device: (str) The device to run the model on.
124 | """
125 | self.vectorizer = vectorizer
126 | self.model = model
127 | self.mode = vectorizer.mode
128 | self.device = torch.device(device)
129 | self.model.to(self.device)
130 |
131 |         # Use the log version of Softmax so that scores can be summed instead of multiplied, avoiding numerical underflow.
132 | self.softmax = nn.LogSoftmax(dim=1)
133 |
134 | def load_model(self, path):
135 | """
136 | Load a pre-trained model as a state dictionary.
137 |
138 | Args:
139 | path (str): The path to the pre-trained model.
140 | """
141 | self.model.load_state_dict(torch.load(path, weights_only=True))
142 |
143 | def translate(self, sentences, beams):
144 | """
145 | Takes a list of sentences and translates them.
146 |
147 | Args:
148 | sentences: (list) Sentences you want to translate
149 |             beams: (int) The number of beams to keep at each step of the beam search
150 |
151 | Returns:
152 | translated_sentences: (list) Translated sentences
153 |
154 | """
155 | # Don't forget to put the model in eval mode.
156 | self.model.eval()
157 | with torch.no_grad():
158 | translated_sentences = []
159 | for sentence in sentences:
160 | translated = []
161 | remaining = sentence
162 |                 # We start from a single initial state.
163 |                 # The format of a state is: (translated, remaining, out, (h0, c0), score)
164 | # --------------------------------------------------------------------------------
165 |                 # First, we need to "prep" the char model. This is done by feeding the network the
166 |                 # start-of-sequence input and saving the output hidden states for the initial State() object.
167 | start_input = self.vectorizer.input_tensor("")
168 | out, h_n, c_n = self.model(start_input.to(self.device), None, None)
169 |                 # The score of the initial state is 0, because we use LogSoftmax instead of regular Softmax.
170 | initial_state = State(translated, remaining, out, (h_n, c_n), 0)
171 | states = [initial_state]
172 | for i in range(len(sentence)):
173 | candidates = []
174 | # Look through the current states.
175 | for state in states:
176 | # Produce the next-char-candidates from each state, along with their probabilities.
177 | new_states = self.get_candidates(state)
178 | candidates += new_states
179 |
180 | # Remove any duplicate candidates
182 | candidates_set = []
183 | [candidates_set.append(cand) for cand in candidates if cand not in candidates_set]
184 | candidates = candidates_set
185 | # Get the best states, according to the number of beams in the search.
186 | best_candidates = []
187 | for j in range(beams):
188 | if candidates:
189 | # Probabilities of each candidate new state.
190 | probabilities = [cand.score for cand in candidates]
191 | # Get the best candidate and remove from the list
192 | best_cand = candidates.pop(probabilities.index(max(probabilities)))
193 | # Add that candidate to the list of best candidates
194 | best_candidates.append(best_cand)
195 | # Make the list of the best candidates the new states.
196 | states = best_candidates
197 |
198 | # Once the sentence is over, the state with the highest probability is the best translation.
199 | probs = [state.score for state in states]
200 | # Extract the sentence from the state
201 | sent = states[probs.index(max(probs))]
202 | # Convert the list of translated tokens to a sentence.
203 | translation = ""
204 | for i in sent.translated:
205 | translation += i
206 |
207 | translated_sentences.append(translation)
208 | return translated_sentences
209 |
210 | def get_candidates(self, state):
211 | """
212 | Get the next candidates for the translation.
213 |
214 | Args:
215 | state (State): The current state of the translation.
216 |
217 | Returns:
218 | candidates (list): A list of the next candidates for the translation.
219 |
220 | """
221 |
222 |
223 | # If the state is already a final state (no remaining text to translate),
224 | # it returns only itself as a candidate.
225 | if not state.remaining:
226 | return [state]
227 |
228 | # If it is not a final state, generate the next candidate states
229 | candidates = []
230 |
231 |         # Look at both the first character and the first two characters of the remaining sentence,
232 |         # as some Greek characters are represented by two characters in Greeklish.
233 | for length in [1, 2]:
234 | if len(state.remaining) >= length:
235 | # Fetch the valid replacements from the dictionary.
236 | if length == 2:
237 | token = state.remaining[0] + state.remaining[1]
238 | replacements = greeklish_to_greek_intonated.get(token, [])
239 | else:
240 |                         # If the look-up misses (e.g. the token is a space, a digit or punctuation),
241 |                         # fall back to the token itself.
242 | token = state.remaining[0]
243 | replacements = greeklish_to_greek_intonated.get(token, [token])
244 |
245 | # For each candidate replacement, get the probability from the LM
246 | for item in replacements:
247 | h_n, c_n = state.hidden[0], state.hidden[1]
248 | out = state.output
249 | score = state.score
250 | for token in item:
251 | # Apply softmax to the model's output and get the prob based on the index from vocab
252 | probs = self.softmax(out)
253 | idx = self.vectorizer.vocab.get(token, len(self.vectorizer.vocab))
254 | # Update score.
255 | score = score + probs[0][idx].item()
256 | # Feed the token to the model to get the next output and hidden states
257 | input = self.vectorizer.input_tensor(token)
258 | out, h_n, c_n = self.model(input.to(self.device), h_n, c_n)
259 |
260 | translated_tokens = [token for token in item]
261 |
262 | new_candidate = State(state.translated+translated_tokens,
263 | state.remaining[length:],
264 | out, (h_n, c_n), score)
265 | candidates.append(new_candidate)
266 |
267 | return candidates
--------------------------------------------------------------------------------
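For reference, a minimal usage sketch of the module above follows. It is illustrative only: the layer sizes, file paths, and example output are assumptions, and the vectorizer is assumed to be a pickled object exposing the `vocab`, `mode`, and `input_tensor()` members that `LanguageModel` relies on; the checkpoint actually shipped with the toolkit may expect different dimensions.

    # Hypothetical usage sketch for LSTM_LangModel / LanguageModel (paths and sizes are placeholders).
    import pickle

    from gr_nlp_toolkit.models.g2g_RBNLM_model import LSTM_LangModel, LanguageModel

    # Load a pickled character-level vectorizer (hypothetical path).
    with open("path/to/char_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)

    # Assumed sizes: one extra output slot for out-of-vocabulary characters, matching the
    # vocab.get(token, len(vocab)) fallback used in get_candidates(); they must match the checkpoint.
    vocab_size = len(vectorizer.vocab) + 1
    net = LSTM_LangModel(input_size=vocab_size, embed_size=32,
                         hidden_size=512, output_size=vocab_size)

    lm = LanguageModel(vectorizer, net, device="cpu")
    lm.load_model("path/to/char_lm_checkpoint.pt")  # hypothetical checkpoint path

    # Beam search over a Greeklish sentence with 5 beams.
    print(lm.translate(["kalimera sas"], beams=5))  # e.g. ["καλημέρα σας"]

Because the scores are log-probabilities (LogSoftmax), a larger beam width trades decoding speed for a better chance of recovering the intended Greek spelling; beams=1 degenerates to greedy decoding.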