├── training_data.zip
├── Human evaluation.pdf
├── cleansing_operations.pdf
├── test_and_validation_data.zip
├── Run.py
├── inference.py
├── .gitignore
├── train.py
├── README.md
├── LICENCE
└── utils.py
/training_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattia-decao/hiero-transformer/HEAD/training_data.zip
--------------------------------------------------------------------------------
/Human evaluation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattia-decao/hiero-transformer/HEAD/Human evaluation.pdf
--------------------------------------------------------------------------------
/cleansing_operations.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattia-decao/hiero-transformer/HEAD/cleansing_operations.pdf
--------------------------------------------------------------------------------
/test_and_validation_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattia-decao/hiero-transformer/HEAD/test_and_validation_data.zip
--------------------------------------------------------------------------------
/Run.py:
--------------------------------------------------------------------------------
1 | # Load environment and model
2 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3 | import torch
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("mattiadc/hiero-transformer")
6 | model = AutoModelForSeq2SeqLM.from_pretrained("mattiadc/hiero-transformer").to('cuda:0').eval()
7 |
8 | # Translation
9 | #@title Translation
10 |
11 | language_input = 'tnt' #@param ["ea", "tnt"]
12 | language_output = 'de' #@param ["de", "en", "tnt", "lKey", "wordClass"]
13 | sentence_input = '*ra m p,t' #@param {type:"string"}
14 | # resulted_input_tnt = '' #@param {type:"string"}
15 | all_outputs = True #@param {type:"boolean"}
16 |
17 | # To capitalize a letter in the transliteration (e.g. in proper names), add an asterisk * directly before it
18 |
19 | if language_input == 'tnt':
20 | sentence_input = (sentence_input
21 |
22 | .replace('*X', 'H̱')
23 | .replace('*S', 'Š')
24 | .replace('*T', 'Ṯ')
25 | .replace('*D', 'Ḏ')
26 | .replace('*A', 'Ꜣ')
27 | .replace('*H', 'Ḥ')
28 |
29 | .replace('X', 'ẖ')
30 | .replace('S', 'š')
31 | .replace('T', 'ṯ')
32 | .replace('D', 'ḏ')
33 | .replace('A', 'ꜣ')
34 | .replace('H', 'ḥ')
35 |
36 | .replace('*j', 'J')
37 | .replace('*i', 'I')
38 | .replace('*y', 'Y')
39 | .replace('*a', 'Ꜥ')
40 | .replace('*w', 'W')
41 | .replace('*b', 'B')
42 | .replace('*p', 'P')
43 | .replace('*f', 'F')
44 | .replace('*m', 'M')
45 | .replace('*n', 'N')
46 | .replace('*r', 'R')
47 | .replace('*h', 'H')
48 | .replace('*x', 'Ḫ')
49 | .replace('*s', 'S')
50 | .replace('*z', 'Z')
51 | .replace('*q', 'Q')
52 | .replace('*k', 'K')
53 | .replace('*g', 'G')
54 | .replace('*t', 'T')
55 | .replace('*d', 'D')
56 | .replace('a', 'ꜥ')
57 | .replace('x', 'ḫ')
58 | .replace('i', 'i̯')
59 |
60 | )
61 | print(sentence_input)
62 |
63 | # Map each task language onto an M2M100 language ID (same mapping as utils.py)
64 | lang_to_m2m_lang_id = {
65 | 'ea': 'ar',
66 | 'tnt': 'lo',
67 | 'en': 'en',
68 | 'de': 'de',
69 | 'lKey': 'my',
70 | 'wordClass': 'th',
71 | }
72 |
73 | langs = [
74 | ('ea', 'de'),
75 | ('ea', 'en'),
76 | ('ea', 'tnt'),
77 | ('ea', 'lKey'),
78 | ('ea', 'wordClass'),
79 | ('tnt', 'de'),
80 | ('tnt', 'en'),
81 | ('tnt', 'lKey'),
82 | ('tnt', 'wordClass'),
83 | ]
84 |
85 | def get_translation(language_input, language_output, sentence_input):
86 | with torch.no_grad():
87 | with torch.cuda.amp.autocast():
88 | tokenizer.src_lang = lang_to_m2m_lang_id[language_input]
89 | tokenizer.tgt_lang = lang_to_m2m_lang_id[language_output]
90 |
91 | model_inputs = tokenizer([sentence_input], return_tensors="pt").to(model.device)
92 | generated_tokens = model.generate(
93 | **model_inputs,
94 | num_beams=10,
95 | max_length=32,
96 | forced_bos_token_id=tokenizer.get_lang_id(lang_to_m2m_lang_id[language_output]))
97 | return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
98 |
99 | if not all_outputs:
100 | assert (language_input, language_output) in langs, 'Invalid language pair'
101 | result = get_translation(language_input, language_output, sentence_input)
102 | else:
103 | result = {
104 | language_output: get_translation(language_input, language_output, sentence_input)
105 | for language_input_tmp, language_output in langs if language_input == language_input_tmp
106 | }
107 | print(result)
108 |
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
1 | import string
2 |
3 | import datasets
4 | import pandas as pd
5 | import torch
6 | from tqdm.auto import tqdm
7 | from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
8 |
9 | from utils import lang_to_m2m_lang_id, load_data_from_folder, processed_data
10 |
11 | # Load data
12 | test_data = load_data_from_folder("test_data")
13 |
14 | # Filter and extract data
15 | # Dict[str, Dict[str, List[Dict[str, str]]]]
16 | # {src_lang: {tgt_lang: [{'source': ..., 'target': ...}]}}
17 | test_data = processed_data(test_data)
18 |
19 |
20 | # Load model to generate predictions
21 | model = M2M100ForConditionalGeneration.from_pretrained("ea9all").to("cuda:0").eval()
22 | tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
23 |
24 | # Produce predictions
25 | for src_lang, values in test_data.items():
26 | for tgt_lang, data in values.items():
27 | for element in tqdm(data):
28 | with torch.no_grad():
29 | with torch.cuda.amp.autocast():
30 | tokenizer.src_lang = lang_to_m2m_lang_id[src_lang]
31 | tokenizer.tgt_lang = lang_to_m2m_lang_id[tgt_lang]
32 |
33 | model_inputs = tokenizer(
34 | [element["source"]], return_tensors="pt"
35 | ).to(model.device)
36 | generated_tokens = model.generate(
37 | **model_inputs,
38 | num_beams=10,
39 | forced_bos_token_id=tokenizer.get_lang_id(
40 | lang_to_m2m_lang_id[tgt_lang]
41 | )
42 | )
43 | element["prediction"] = tokenizer.batch_decode(
44 | generated_tokens, skip_special_tokens=True
45 | )[0]
46 |
47 | # Calculate metrics
48 | metrics = {
49 | src_lang: {
50 | tgt_lang: {m: datasets.load_metric(m) for m in ("sacrebleu", "rouge")}
51 | for tgt_lang, _ in values.items()
52 | }
53 | for src_lang, values in test_data.items()
54 | }
55 | for src_lang, values in test_data.items():
56 | for tgt_lang, data in values.items():
57 | for element in data:
58 | for metric in metrics[src_lang][tgt_lang].values():
59 | metric.add_batch(
60 | predictions=[
61 | element["prediction"].strip(string.punctuation).lower().split()
62 | ],
63 | references=[
64 | [element["target"].strip(string.punctuation).lower().split()]
65 | ],
66 | )
67 |
68 | metrics = {
69 | src_lang: {
70 | tgt_lang: {name: metric.compute() for name, metric in metrics.items()}
71 | for tgt_lang, metrics in values.items()
72 | }
73 | for src_lang, values in metrics.items()
74 | }
75 |
76 | # Compute tables
77 | tables = {
78 | "sacrebleu": {
79 | src_lang: {
80 | tgt_lang: metric["sacrebleu"]["score"]
81 | for tgt_lang, metric in values.items()
82 | }
83 | for src_lang, values in metrics.items()
84 | },
85 | "rougeL": {
86 | src_lang: {
87 | tgt_lang: 100 * metric["rouge"]["rougeL"].mid.fmeasure
88 | for tgt_lang, metric in values.items()
89 | }
90 | for src_lang, values in metrics.items()
91 | },
92 | }
93 |
94 | print("sacrebleu")
95 | print(pd.DataFrame(tables["sacrebleu"]).T)
96 | print("rougeL")
97 | print(pd.DataFrame(tables["rougeL"]).T)
98 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | import shutil
4 |
5 | import numpy as np
6 | import torch
7 | from tqdm.auto import tqdm
8 | from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
9 |
10 | from utils import (
11 | batch_it,
12 | clean_data,
13 | lang_to_m2m_lang_id,
14 | load_data_from_folder,
15 | processed_data,
16 | training_step,
17 | validation_step,
18 | )
19 |
20 | # Epochs, batch, periods variables
21 | epochs = 20
22 | batch_size = 16
23 | eval_period = 1000
24 | total_steps = 0
25 | best_eval_loss = float("inf")
26 | max_models = 1
27 | topk_models = []
28 |
29 | # Choose the pairs of languages to train and validate
30 | langs = [
31 | ("ea", "de"),
32 | ("ea", "en"),
33 | # ('ea', 'tnt'),
34 | # ('ea', 'lKey'),
35 | # ('ea', 'wordClass'),
36 | # ('tnt', 'de'),
37 | # ('tnt', 'en'),
38 | # ('tnt', 'lKey'),
39 | # ('tnt', 'wordClass'),
40 | ]
41 |
42 |
43 | # Load data
44 | training_data = load_data_from_folder("training_data")
45 | validation_data = load_data_from_folder("validation_data")
46 |
47 | # Clean data
48 | training_data = clean_data(training_data)
49 |
50 | # Filter and extract data
51 | # Dict[str, Dict[str, List[Dict[str, str]]]]
52 | # {src_lang: {tgt_lang: [{'source': ..., 'target': ...}]}}
53 | training_data = processed_data(training_data)
54 | validation_data = processed_data(validation_data)
55 |
56 |
57 | # Add English targets by translating the German corpus and vocabulary
58 |
59 | with open("translations_de2en.json", encoding="utf-8") as f:
60 | translations = json.load(f)
61 |
62 | for lang in ("ea", "tnt"):
63 | ids_sentence = {
64 | element["metadata"]["id_sentence"]
65 | for element in training_data[lang]["en"]
66 | if "id_sentence" in element["metadata"]
67 | }
68 |
69 | for element in training_data[lang]["de"]:
70 | if (
71 | "id_sentence" in element["metadata"]
72 | and element["metadata"]["id_sentence"] not in ids_sentence
73 | ):
74 | new_element = copy.deepcopy(element)
75 | new_element["target"] = translations[element["target"]]
76 | new_element["metadata"]["target_lang"] = "en"
77 | training_data[lang]["en"].append(new_element)
78 |
79 | print(
80 | f'{lang} -> en: After translation we have {len(training_data[lang]["en"])} datapoints.'
81 | )
82 |
83 | # Load model and tokenizer
84 | model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(
85 | "cuda:0"
86 | )
87 | tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
88 | optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
89 |
90 |
91 | # Training
92 | validation_losses = {}
93 | validation_data_batched = [
94 | (src_lang, trg_lang, batch)
95 | for src_lang, values in validation_data.items()
96 | for trg_lang, data in values.items()
97 | for batch in batch_it(data, batch_size)
98 | if (src_lang, trg_lang) in langs
99 | ]
100 |
101 | for epoch in range(epochs):
102 | print(f"Starting epoch {epoch + 1}")
103 |
104 | for src_lang, values in training_data.items():
105 | for data in values.values():
106 | np.random.shuffle(data)
107 |
108 | training_data_batched = [
109 | (src_lang, trg_lang, batch)
110 | for src_lang, values in training_data.items()
111 | for trg_lang, data in values.items()
112 | for batch in batch_it(data, batch_size)
113 | if (src_lang, trg_lang) in langs
114 | ]
115 |
116 | np.random.shuffle(training_data_batched)
117 |
118 | iterator = tqdm(training_data_batched)
119 | for src_lang, tgt_lang, batch in iterator:
120 | loss = training_step(
121 | batch,
122 | model,
123 | tokenizer,
124 | optimizer,
125 | lang_to_m2m_lang_id[src_lang],
126 | lang_to_m2m_lang_id[tgt_lang],
127 | )
128 | total_steps += 1
129 | iterator.set_postfix(
130 | total_steps=total_steps, loss=loss, src_lang=src_lang, tgt_lang=tgt_lang
131 | )
132 |
133 | if total_steps % eval_period == 0 and total_steps != 0:
134 | total_eval_loss = 0
135 | total_eval_tokens = 0
136 |
137 | for src_lang, tgt_lang, batch in validation_data_batched:
138 | loss, tokens = validation_step(
139 | batch,
140 | model,
141 | tokenizer,
142 | lang_to_m2m_lang_id[src_lang],
143 | lang_to_m2m_lang_id[tgt_lang],
144 | )
145 | total_eval_loss += loss * tokens
146 | total_eval_tokens += tokens
147 |
148 | validation_losses[total_steps] = total_eval_loss
149 | with open("validation_losses.json", "w") as f:
150 | json.dump(validation_losses, f)
151 |
152 | if total_eval_loss < best_eval_loss:
153 | print(
154 | f"The model improved! Old loss={best_eval_loss}, new loss={total_eval_loss}"
155 | )
156 | fname = f"checkpoint_total_steps={total_steps}_loss={total_eval_loss / total_eval_tokens:.2f}"
157 | model.save_pretrained(fname)
158 | topk_models.append(fname)
159 | best_eval_loss = total_eval_loss
160 |
161 | if len(topk_models) > max_models:
162 | fname = topk_models.pop(0)
163 | shutil.rmtree(fname)
164 | print(f"Removing {fname}")
165 |
166 | # Last check before the end
167 | total_eval_loss = 0
168 | total_eval_tokens = 0
169 |
170 | for src_lang, tgt_lang, batch in validation_data_batched:
171 | loss, tokens = validation_step(
172 | batch,
173 | model,
174 | tokenizer,
175 | lang_to_m2m_lang_id[src_lang],
176 | lang_to_m2m_lang_id[tgt_lang],
177 | )
178 | total_eval_loss += loss * tokens
179 | total_eval_tokens += tokens
180 |
181 | validation_losses[total_steps] = total_eval_loss
182 | with open("validation_losses.json", "w") as f:
183 | json.dump(validation_losses, f)
184 |
185 | if total_eval_loss < best_eval_loss:
186 | print(
187 | f"The model improved! Old loss={best_eval_loss}, new loss={total_eval_loss}"
188 | )
189 | fname = f"checkpoint_total_steps={total_steps}_loss={total_eval_loss / total_eval_tokens:.2f}"
190 | model.save_pretrained(fname)
191 | topk_models.append(fname)
192 | best_eval_loss = total_eval_loss
193 |
194 | if len(topk_models) > max_models:
195 | fname = topk_models.pop(0)
196 | shutil.rmtree(fname)
197 | print(f"Removing {fname}")
198 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hiero-transformer
2 | This repository collects additional information for our article _Deep Learning Meets Egyptology: a Hieroglyphic Transformer for Translating Ancient Egyptian_ (De Cao et al. 2024). In particular, here you can find:
3 | - the complete list of the cleansing operations used to clean the data before the model training;
4 | - advice on how to enter input, both as hieroglyphs and as transliteration;
5 | - the examples and the analysis of the human evaluation;
6 | - the code we used to clean the data and to train, evaluate and run the model.
7 |
8 | The model is also available on Hugging Face: https://huggingface.co/mattiadc/hiero-transformer
9 |
10 | ## Explanation of ".py" files
11 |
12 | Four files collect our code: Run.py, inference.py, train.py, utils.py.
13 |
14 | - **Run.py:** Collects the code to load the environment and the model, as well as an input form we created to make entering input to the model easier. When using Run.py, be sure to run the environment loading separately from the input form (a minimal usage sketch follows this list).
15 | - **inference.py:** Collects the code we used to load the test data, generate the predictions and calculate the metrics.
16 | - **train.py:** Collects the code we used to load the model, the variables and the data, and to train the model.
17 | - **utils.py:** Collects the training helper functions, along with the code we used to process, filter and clean the data.
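
As a quick reference, here is a minimal sketch of loading the model and translating one sentence, condensed from Run.py (the sentence is the Unicode form of Run.py's default example; a CUDA device is assumed):

```python
# Minimal sketch condensed from Run.py; assumes a CUDA device.
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("mattiadc/hiero-transformer")
model = AutoModelForSeq2SeqLM.from_pretrained("mattiadc/hiero-transformer").to("cuda:0").eval()

# The model reuses M2M100 language IDs: "ar" stands for hieroglyphs ("ea"),
# "lo" for transliteration ("tnt"); "de" and "en" keep their usual meaning.
tokenizer.src_lang = "lo"   # transliteration ("tnt") input
tokenizer.tgt_lang = "de"   # German output
sentence = "Rꜥ m p,t"       # Unicode form of Run.py's default example

with torch.no_grad():
    inputs = tokenizer([sentence], return_tensors="pt").to(model.device)
    tokens = model.generate(
        **inputs,
        num_beams=10,
        max_length=32,
        forced_bos_token_id=tokenizer.get_lang_id("de"),
    )
print(tokenizer.batch_decode(tokens, skip_special_tokens=True)[0])
```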
18 |
19 | ## Cleansing operations
20 |
21 | Every cleansing operation was meticulously documented, along with a concise description of its purpose, its implementation, and the rationale behind it. These operations were compiled into tables, using the regular-expression pattern ".*?" to denote an arbitrary sequence of words, numbers, and/or graphic symbols.
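
For instance, one of the operations in utils.py uses exactly this kind of pattern to drop editorial "(= ...)" glosses from translations (the sample string below is made up for illustration):

```python
import re

# ".*?" matches the shortest arbitrary sequence between the delimiters.
text = "Der Schreiber (= ein Beispiel) des Königs"
print(re.sub(r"\(=.*?\)", "", text))  # -> "Der Schreiber  des Königs"
```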
22 |
23 | Furthermore, any text found in the _Subject_ section was retained in its entirety, including spaces, while all of our annotations were enclosed within brackets that do not occur in the TLA dataset, specifically "(£" and "£)".
24 |
25 | The meanings of the cleaning procedures were derived from the _Manuel de Codage_ (Buurman et al. 1988; Van Den Berg) conventions, the _Berlin Text System 3.1 (V 3.0) user manual_ (Kupreyev and Sperveslage 2011), or defined by us.
26 |
27 | The "cleansing_operations.pdf" file contains the management of translations, transliterations, Gardiner code and part-of-speech tags.
28 |
29 |
30 | ## Model functioning tips
31 | Hiero-transformer is a useful tool, but it can generate inaccurate results, especially if the input provided isn't correct. Users need to be aware of this and able to recognize potential machine-generated mistakes. Here are some tips to help you get better output from Hiero-transformer.
32 |
33 | ### Hieroglyphic input
34 | You will need to use the Gardiner code to provide hieroglyphs to Hiero-transformer. This code requires some preparation.
35 | - **Cleaning:** Remove any brackets, graphic signs, or letters that are not part of the hieroglyph codes themselves, such as those you might see when working with JSesh (Rosmorduc).
36 | - **Separation:** Use spaces to separate individual hieroglyphs and remove any other characters (see the sketch below).
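
As an illustration, here is a drastically simplified sketch of this preparation (the JSesh-style input string is hypothetical; the full pipeline is the clean_graphics function in utils.py):

```python
# Hypothetical JSesh-style group; "*" and ":" encode sign layout.
jsesh = "N5:Z1-G17-Q3*X1:N1"

# Turn layout operators into separators and then into spaces,
# mirroring a few of the steps in clean_graphics (utils.py).
prepared = jsesh.replace("*", "-").replace(":", "-").replace("-", " ")
print(prepared)  # -> "N5 Z1 G17 Q3 X1 N1"
```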
37 |
38 | Remember that the model is trained on Old and Middle Egyptian hieroglyphs. It might struggle with later stages of the language or grammatical forms developed after the Second Intermediate Period.
39 | For best results, we recommend using a sign list like Gardiner's (Gardiner 1957) or, even better, Allen's (Allen 2014).
40 |
41 |
42 | ### Transliteration input
43 | To provide Hiero-transformer with transliteration, you will need to use the same conventions used by the TLA.
44 | - **Capitalization:** Proper nouns need to be capitalized.
45 | - **Hyphens:** You need to use hyphens (-) to separate individual words within proper nouns (e.g., _sḥtp-jb-rꜥ_) or concepts (e.g., _wꜣḏ-wr_). Otherwise, the model will translate them as separate words.
46 | - **Suffix pronouns:** When using the _=_ sign to indicate a suffix pronoun, always add a space before the sign, with the suffix-pronoun letters directly following it (e.g., _zꜣ =f m pr_).
47 | - **Yod:** The consonant _j_ is used for the strong radical yod, while _i̯_ represents the weak radical yod.
48 | - **Dots:** Use a dot to separate the verb root and the suffixes (other than pronouns). For example, in the form _sḏm.n =f_, the dot separates _sḏm_ (root) from _n_ (suffix other than pronoun). Dots may also be used for plural/dual forms.
49 | - **Commas:** Commas are used for the feminine suffix and may also be used for plural/dual forms.
50 |
51 | You can provide transliteration characters either in Unicode (the standard encoding) or in the computer transcription of the Manuel de Codage (a hieroglyph-specific encoding system that does not use special characters). Furthermore, we made sure that a few additional characters can be entered (a conversion sketch follows this list).
52 | - **Capital letters**: Add an asterisk (*) directly before the letter you want to capitalize. For example, using the MdC system, to get a capitalized _ḏ_, type _*D_ (instead of _D_); similarly, to get a capitalized _d_, type _*d_.
53 | - **Weak radical yod ( _i̯_ )**: Type _i_ to insert this character.
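
For example, the input form in Run.py converts MdC input to Unicode through a chain of substitutions along these lines (abridged here to the two rules needed for Run.py's default example):

```python
# Abridged from the substitution chain in Run.py.
mdc = "*ra m p,t"           # MdC input: "*r" = capitalized r, "a" = ayin
unicode_tnt = (
    mdc
    .replace("*r", "R")     # the asterisk capitalizes the following letter
    .replace("a", "ꜥ")      # MdC "a" -> Egyptological ayin
)
print(unicode_tnt)          # -> "Rꜥ m p,t"
```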
54 |
55 |
56 | ## Human evaluation examples and analysis
57 | As soon as possible, we will add a PDF file in which we have analyzed all the examples we worked on.
58 |
59 |
60 | ## References
61 |
62 | Mattia De Cao, Nicola De Cao, Angelo Colonna, and Alessandro Lenci. 2024. Deep Learning Meets Egyptology: a Hieroglyphic Transformer for Translating Ancient Egyptian. In _Proceedings of the 1st Workshop on Machine Learning for Ancient Languages (ML4AL 2024)_, pages 71–86, Hybrid in Bangkok, Thailand and online. Association for Computational Linguistics.
63 |
64 | Jan Buurman, Nicolas-Christophe Grimal, Michel Hainsworth, Jochen Hallof, and Dirk Van der Plas. 1988. _Inventaire des signes hiéroglyphiques en vue de leur saisie informatique: Manuel de codage des textes hiéroglyphiques en vue de leur saisie sur ordinateur_, volume 2 of _Informatique et égyptologie_. Imprimerie Lienharte et Cie.; Diffusion Boccard, Paris.
65 |
66 | Hans Van Den Berg, _“Manuel de Codage” A standard system for the computer encoding of Egyptian transliteration and hieroglyphic texts_ (last access: 28 July 2023).
67 |
68 | Maxim Kupreyev and Gunnar Sperveslage. 2011. _Berlin Text System 3.1 User Manual: Editorial Software of the Thesaurus Linguae Aegyptiae Project_.
69 |
70 | Alan H. Gardiner. 1957. _Egyptian Grammar, Being an Introduction to the Study of Hieroglyphs_, third edition. Griffith Institute, Oxford.
71 |
72 | James P. Allen. 2014. _Middle Egyptian: An Introduction to the Language and Culture of Hieroglyphs_, third edition. Cambridge University Press.
73 |
74 | Serge Rosmorduc, _JSesh Documentation_ (last access: 9 September 2023).
75 |
76 | ## Data source reference
77 |
78 | Database snapshot of project "Strukturen und Transformationen des Wortschatzes der ägyptischen Sprache" (excerpt from January 2018), 2018,
79 | ed. by Tonio Sebastian Richter & Ingelore Hafemann on behalf of the Berlin-Brandenburgische Akademie der Wissenschaften and Hans-Werner Fischer-Elfert & Peter Dils on behalf of the Sächsische Akademie der Wissenschaften zu Leipzig,
80 | urn:nbn:de:kobv:b4-opus4-29190, https://nbn-resolving.org/urn:nbn:de:kobv:b4-opus4-29190 (CC BY-SA 4.0 Int.)
81 |
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | Attribution-ShareAlike 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution-ShareAlike 4.0 International Public
58 | License
59 |
60 | By exercising the Licensed Rights (defined below), You accept and agree
61 | to be bound by the terms and conditions of this Creative Commons
62 | Attribution-ShareAlike 4.0 International Public License ("Public
63 | License"). To the extent this Public License may be interpreted as a
64 | contract, You are granted the Licensed Rights in consideration of Your
65 | acceptance of these terms and conditions, and the Licensor grants You
66 | such rights in consideration of benefits the Licensor receives from
67 | making the Licensed Material available under these terms and
68 | conditions.
69 |
70 |
71 | Section 1 -- Definitions.
72 |
73 | a. Adapted Material means material subject to Copyright and Similar
74 | Rights that is derived from or based upon the Licensed Material
75 | and in which the Licensed Material is translated, altered,
76 | arranged, transformed, or otherwise modified in a manner requiring
77 | permission under the Copyright and Similar Rights held by the
78 | Licensor. For purposes of this Public License, where the Licensed
79 | Material is a musical work, performance, or sound recording,
80 | Adapted Material is always produced where the Licensed Material is
81 | synched in timed relation with a moving image.
82 |
83 | b. Adapter's License means the license You apply to Your Copyright
84 | and Similar Rights in Your contributions to Adapted Material in
85 | accordance with the terms and conditions of this Public License.
86 |
87 | c. BY-SA Compatible License means a license listed at
88 | creativecommons.org/compatiblelicenses, approved by Creative
89 | Commons as essentially the equivalent of this Public License.
90 |
91 | d. Copyright and Similar Rights means copyright and/or similar rights
92 | closely related to copyright including, without limitation,
93 | performance, broadcast, sound recording, and Sui Generis Database
94 | Rights, without regard to how the rights are labeled or
95 | categorized. For purposes of this Public License, the rights
96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
97 | Rights.
98 |
99 | e. Effective Technological Measures means those measures that, in the
100 | absence of proper authority, may not be circumvented under laws
101 | fulfilling obligations under Article 11 of the WIPO Copyright
102 | Treaty adopted on December 20, 1996, and/or similar international
103 | agreements.
104 |
105 | f. Exceptions and Limitations means fair use, fair dealing, and/or
106 | any other exception or limitation to Copyright and Similar Rights
107 | that applies to Your use of the Licensed Material.
108 |
109 | g. License Elements means the license attributes listed in the name
110 | of a Creative Commons Public License. The License Elements of this
111 | Public License are Attribution and ShareAlike.
112 |
113 | h. Licensed Material means the artistic or literary work, database,
114 | or other material to which the Licensor applied this Public
115 | License.
116 |
117 | i. Licensed Rights means the rights granted to You subject to the
118 | terms and conditions of this Public License, which are limited to
119 | all Copyright and Similar Rights that apply to Your use of the
120 | Licensed Material and that the Licensor has authority to license.
121 |
122 | j. Licensor means the individual(s) or entity(ies) granting rights
123 | under this Public License.
124 |
125 | k. Share means to provide material to the public by any means or
126 | process that requires permission under the Licensed Rights, such
127 | as reproduction, public display, public performance, distribution,
128 | dissemination, communication, or importation, and to make material
129 | available to the public including in ways that members of the
130 | public may access the material from a place and at a time
131 | individually chosen by them.
132 |
133 | l. Sui Generis Database Rights means rights other than copyright
134 | resulting from Directive 96/9/EC of the European Parliament and of
135 | the Council of 11 March 1996 on the legal protection of databases,
136 | as amended and/or succeeded, as well as other essentially
137 | equivalent rights anywhere in the world.
138 |
139 | m. You means the individual or entity exercising the Licensed Rights
140 | under this Public License. Your has a corresponding meaning.
141 |
142 |
143 | Section 2 -- Scope.
144 |
145 | a. License grant.
146 |
147 | 1. Subject to the terms and conditions of this Public License,
148 | the Licensor hereby grants You a worldwide, royalty-free,
149 | non-sublicensable, non-exclusive, irrevocable license to
150 | exercise the Licensed Rights in the Licensed Material to:
151 |
152 | a. reproduce and Share the Licensed Material, in whole or
153 | in part; and
154 |
155 | b. produce, reproduce, and Share Adapted Material.
156 |
157 | 2. Exceptions and Limitations. For the avoidance of doubt, where
158 | Exceptions and Limitations apply to Your use, this Public
159 | License does not apply, and You do not need to comply with
160 | its terms and conditions.
161 |
162 | 3. Term. The term of this Public License is specified in Section
163 | 6(a).
164 |
165 | 4. Media and formats; technical modifications allowed. The
166 | Licensor authorizes You to exercise the Licensed Rights in
167 | all media and formats whether now known or hereafter created,
168 | and to make technical modifications necessary to do so. The
169 | Licensor waives and/or agrees not to assert any right or
170 | authority to forbid You from making technical modifications
171 | necessary to exercise the Licensed Rights, including
172 | technical modifications necessary to circumvent Effective
173 | Technological Measures. For purposes of this Public License,
174 | simply making modifications authorized by this Section 2(a)
175 | (4) never produces Adapted Material.
176 |
177 | 5. Downstream recipients.
178 |
179 | a. Offer from the Licensor -- Licensed Material. Every
180 | recipient of the Licensed Material automatically
181 | receives an offer from the Licensor to exercise the
182 | Licensed Rights under the terms and conditions of this
183 | Public License.
184 |
185 | b. Additional offer from the Licensor -- Adapted Material.
186 | Every recipient of Adapted Material from You
187 | automatically receives an offer from the Licensor to
188 | exercise the Licensed Rights in the Adapted Material
189 | under the conditions of the Adapter's License You apply.
190 |
191 | c. No downstream restrictions. You may not offer or impose
192 | any additional or different terms or conditions on, or
193 | apply any Effective Technological Measures to, the
194 | Licensed Material if doing so restricts exercise of the
195 | Licensed Rights by any recipient of the Licensed
196 | Material.
197 |
198 | 6. No endorsement. Nothing in this Public License constitutes or
199 | may be construed as permission to assert or imply that You
200 | are, or that Your use of the Licensed Material is, connected
201 | with, or sponsored, endorsed, or granted official status by,
202 | the Licensor or others designated to receive attribution as
203 | provided in Section 3(a)(1)(A)(i).
204 |
205 | b. Other rights.
206 |
207 | 1. Moral rights, such as the right of integrity, are not
208 | licensed under this Public License, nor are publicity,
209 | privacy, and/or other similar personality rights; however, to
210 | the extent possible, the Licensor waives and/or agrees not to
211 | assert any such rights held by the Licensor to the limited
212 | extent necessary to allow You to exercise the Licensed
213 | Rights, but not otherwise.
214 |
215 | 2. Patent and trademark rights are not licensed under this
216 | Public License.
217 |
218 | 3. To the extent possible, the Licensor waives any right to
219 | collect royalties from You for the exercise of the Licensed
220 | Rights, whether directly or through a collecting society
221 | under any voluntary or waivable statutory or compulsory
222 | licensing scheme. In all other cases the Licensor expressly
223 | reserves any right to collect such royalties.
224 |
225 |
226 | Section 3 -- License Conditions.
227 |
228 | Your exercise of the Licensed Rights is expressly made subject to the
229 | following conditions.
230 |
231 | a. Attribution.
232 |
233 | 1. If You Share the Licensed Material (including in modified
234 | form), You must:
235 |
236 | a. retain the following if it is supplied by the Licensor
237 | with the Licensed Material:
238 |
239 | i. identification of the creator(s) of the Licensed
240 | Material and any others designated to receive
241 | attribution, in any reasonable manner requested by
242 | the Licensor (including by pseudonym if
243 | designated);
244 |
245 | ii. a copyright notice;
246 |
247 | iii. a notice that refers to this Public License;
248 |
249 | iv. a notice that refers to the disclaimer of
250 | warranties;
251 |
252 | v. a URI or hyperlink to the Licensed Material to the
253 | extent reasonably practicable;
254 |
255 | b. indicate if You modified the Licensed Material and
256 | retain an indication of any previous modifications; and
257 |
258 | c. indicate the Licensed Material is licensed under this
259 | Public License, and include the text of, or the URI or
260 | hyperlink to, this Public License.
261 |
262 | 2. You may satisfy the conditions in Section 3(a)(1) in any
263 | reasonable manner based on the medium, means, and context in
264 | which You Share the Licensed Material. For example, it may be
265 | reasonable to satisfy the conditions by providing a URI or
266 | hyperlink to a resource that includes the required
267 | information.
268 |
269 | 3. If requested by the Licensor, You must remove any of the
270 | information required by Section 3(a)(1)(A) to the extent
271 | reasonably practicable.
272 |
273 | b. ShareAlike.
274 |
275 | In addition to the conditions in Section 3(a), if You Share
276 | Adapted Material You produce, the following conditions also apply.
277 |
278 | 1. The Adapter's License You apply must be a Creative Commons
279 | license with the same License Elements, this version or
280 | later, or a BY-SA Compatible License.
281 |
282 | 2. You must include the text of, or the URI or hyperlink to, the
283 | Adapter's License You apply. You may satisfy this condition
284 | in any reasonable manner based on the medium, means, and
285 | context in which You Share Adapted Material.
286 |
287 | 3. You may not offer or impose any additional or different terms
288 | or conditions on, or apply any Effective Technological
289 | Measures to, Adapted Material that restrict exercise of the
290 | rights granted under the Adapter's License You apply.
291 |
292 |
293 | Section 4 -- Sui Generis Database Rights.
294 |
295 | Where the Licensed Rights include Sui Generis Database Rights that
296 | apply to Your use of the Licensed Material:
297 |
298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
299 | to extract, reuse, reproduce, and Share all or a substantial
300 | portion of the contents of the database;
301 |
302 | b. if You include all or a substantial portion of the database
303 | contents in a database in which You have Sui Generis Database
304 | Rights, then the database in which You have Sui Generis Database
305 | Rights (but not its individual contents) is Adapted Material,
306 | including for purposes of Section 3(b); and
307 |
308 | c. You must comply with the conditions in Section 3(a) if You Share
309 | all or a substantial portion of the contents of the database.
310 |
311 | For the avoidance of doubt, this Section 4 supplements and does not
312 | replace Your obligations under this Public License where the Licensed
313 | Rights include other Copyright and Similar Rights.
314 |
315 |
316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
317 |
318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
322 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
328 |
329 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
338 |
339 | c. The disclaimer of warranties and limitation of liability provided
340 | above shall be interpreted in a manner that, to the extent
341 | possible, most closely approximates an absolute disclaimer and
342 | waiver of all liability.
343 |
344 |
345 | Section 6 -- Term and Termination.
346 |
347 | a. This Public License applies for the term of the Copyright and
348 | Similar Rights licensed here. However, if You fail to comply with
349 | this Public License, then Your rights under this Public License
350 | terminate automatically.
351 |
352 | b. Where Your right to use the Licensed Material has terminated under
353 | Section 6(a), it reinstates:
354 |
355 | 1. automatically as of the date the violation is cured, provided
356 | it is cured within 30 days of Your discovery of the
357 | violation; or
358 |
359 | 2. upon express reinstatement by the Licensor.
360 |
361 | For the avoidance of doubt, this Section 6(b) does not affect any
362 | right the Licensor may have to seek remedies for Your violations
363 | of this Public License.
364 |
365 | c. For the avoidance of doubt, the Licensor may also offer the
366 | Licensed Material under separate terms or conditions or stop
367 | distributing the Licensed Material at any time; however, doing so
368 | will not terminate this Public License.
369 |
370 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
371 | License.
372 |
373 |
374 | Section 7 -- Other Terms and Conditions.
375 |
376 | a. The Licensor shall not be bound by any additional or different
377 | terms or conditions communicated by You unless expressly agreed.
378 |
379 | b. Any arrangements, understandings, or agreements regarding the
380 | Licensed Material not stated herein are separate from and
381 | independent of the terms and conditions of this Public License.
382 |
383 |
384 | Section 8 -- Interpretation.
385 |
386 | a. For the avoidance of doubt, this Public License does not, and
387 | shall not be interpreted to, reduce, limit, restrict, or impose
388 | conditions on any use of the Licensed Material that could lawfully
389 | be made without permission under this Public License.
390 |
391 | b. To the extent possible, if any provision of this Public License is
392 | deemed unenforceable, it shall be automatically reformed to the
393 | minimum extent necessary to make it enforceable. If the provision
394 | cannot be reformed, it shall be severed from this Public License
395 | without affecting the enforceability of the remaining terms and
396 | conditions.
397 |
398 | c. No term or condition of this Public License will be waived and no
399 | failure to comply consented to unless expressly agreed to by the
400 | Licensor.
401 |
402 | d. Nothing in this Public License constitutes or may be interpreted
403 | as a limitation upon, or waiver of, any privileges and immunities
404 | that apply to the Licensor or You, including from the legal
405 | processes of any jurisdiction or authority.
406 |
407 |
408 | =======================================================================
409 |
410 | Creative Commons is not a party to its public
411 | licenses. Notwithstanding, Creative Commons may elect to apply one of
412 | its public licenses to material it publishes and in those instances
413 | will be considered the “Licensor.” The text of the Creative Commons
414 | public licenses is dedicated to the public domain under the CC0 Public
415 | Domain Dedication. Except for the limited purpose of indicating that
416 | material is shared under a Creative Commons public license or as
417 | otherwise permitted by the Creative Commons policies published at
418 | creativecommons.org/policies, Creative Commons does not authorize the
419 | use of the trademark "Creative Commons" or any other trademark or logo
420 | of Creative Commons without its prior written consent including,
421 | without limitation, in connection with any unauthorized modifications
422 | to any of its public licenses or any other arrangements,
423 | understandings, or agreements concerning use of licensed material. For
424 | the avoidance of doubt, this paragraph does not form part of the
425 | public licenses.
426 |
427 | Creative Commons may be contacted at creativecommons.org.
428 |
429 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import re
4 |
5 | import torch
6 |
7 | lang_to_m2m_lang_id = {
8 | "ea": "ar",
9 | "tnt": "lo",
10 | "en": "en",
11 | "de": "de",
12 | "lKey": "my",
13 | "wordClass": "th",
14 | }
15 |
16 | # Processing and filtering functions
17 |
18 |
19 | # load all files from folder
20 | def load_data_from_folder(folder):
21 | data = []
22 | files = os.listdir(folder)
23 | print(f"Ci sono {len(files)} files.")
24 |
25 | for fname in files:
26 | if fname.endswith(".json"):
27 | with open(os.path.join(folder, fname), encoding="utf-8") as f:
28 | data += json.load(f)
29 |
30 | print(f"Caricati {len(data)} datapoints.")
31 |
32 | return data
33 |
34 |
35 | def extract_data_standard(data, src_lang, tgt_lang):
36 | # Filter out datapoints without an ea -> target pair
37 | data = filter(
38 | lambda datapoint: (
39 | datapoint["metadata"]["source_lang"] == src_lang
40 | and datapoint["metadata"]["target_lang"] == tgt_lang
41 | and datapoint["source"] != ""
42 | and datapoint["target"] != ""
43 | ),
44 | data,
45 | )
46 |
47 | data = map(
48 | lambda datapoint: {
49 | "source": datapoint["source"],
50 | "target": datapoint["target"],
51 | "metadata": datapoint["metadata"],
52 | },
53 | data,
54 | )
55 |
56 | data = list(data)
57 |
58 | print(f"{src_lang} -> {tgt_lang}: Dopo i filtri abbiamo {len(data)} datapoints.")
59 | return data
60 |
61 |
62 | # Extract ea as source and transliteration as target
63 | def extract_data_transliteration_target(data, src_lang):
64 | # Filter out datapoints without an ea -> transliteration pair
65 | data = filter(
66 | lambda datapoint: (
67 | datapoint["metadata"]["source_lang"] == src_lang
68 | and datapoint["source"] != ""
69 | and datapoint["transliteration"] != ""
70 | ),
71 | data,
72 | )
73 |
74 | data = map(
75 | lambda datapoint: {
76 | "source": datapoint["source"],
77 | "target": datapoint["transliteration"],
78 | "metadata": datapoint["metadata"],
79 | },
80 | data,
81 | )
82 |
83 | data = list(data)
84 |
85 | print(f"{src_lang} -> tnt: After filtering we have {len(data)} datapoints.")
86 | return data
87 |
88 |
89 | # Extract transliteration as source and translation as target
90 | def extract_data_transliteration_source(data, trg_lang):
91 | # Filter out datapoints without a transliteration -> translation pair
92 | data = filter(
93 | lambda datapoint: (
94 | datapoint["metadata"]["target_lang"] == trg_lang
95 | and datapoint["target"] != ""
96 | and datapoint["transliteration"] != ""
97 | ),
98 | data,
99 | )
100 |
101 | data = map(
102 | lambda datapoint: {
103 | "source": datapoint["transliteration"],
104 | "target": datapoint["target"],
105 | "metadata": datapoint["metadata"],
106 | },
107 | data,
108 | )
109 |
110 | data = list(data)
111 |
112 | print(f"tnt -> {trg_lang}: After filtering we have {len(data)} datapoints.")
113 | return data
114 |
115 |
116 | # Extract ea as source and lKey/wordClass as target
117 | def extract_data_ea_lKey_or_wordClass(data, lKey_or_wordClass):
118 | assert lKey_or_wordClass in ("lKey", "wordClass")
119 |
120 | # Filter out datapoints without an ea -> lKey_or_wordClass pair
121 | data = filter(
122 | lambda datapoint: (
123 | datapoint["metadata"]["source_lang"] == "ea"
124 | and datapoint["source"] != ""
125 | and datapoint[lKey_or_wordClass] != ""
126 | and "/" not in datapoint[lKey_or_wordClass]
127 | ),
128 | data,
129 | )
130 |
131 | data = map(
132 | lambda datapoint: {
133 | "source": datapoint["source"],
134 | "target": datapoint[lKey_or_wordClass],
135 | "metadata": datapoint["metadata"],
136 | },
137 | data,
138 | )
139 |
140 | data = list(data)
141 |
142 | print(f"ea -> {lKey_or_wordClass}: After filtering we have {len(data)} datapoints.")
143 | return data
144 |
145 |
146 | # Extract transliteration as source and lKey/wordClass as target
147 | def extract_data_transliteration_lKey_or_wordClass(data, lKey_or_wordClass):
148 | assert lKey_or_wordClass in ("lKey", "wordClass")
149 |
150 | # Filter out datapoints without a tnt -> lKey_or_wordClass pair
151 | data = filter(
152 | lambda datapoint: (
153 | datapoint["transliteration"] != ""
154 | and datapoint[lKey_or_wordClass] != ""
155 | and "/" not in datapoint[lKey_or_wordClass]
156 | ),
157 | data,
158 | )
159 |
160 | data = map(
161 | lambda datapoint: {
162 | "source": datapoint["transliteration"],
163 | "target": datapoint[lKey_or_wordClass],
164 | "metadata": datapoint["metadata"],
165 | },
166 | data,
167 | )
168 |
169 | data = list(data)
170 |
171 | print(f"tnt -> {lKey_or_wordClass}: After filtering we have {len(data)} datapoints.")
172 | return data
173 |
174 |
175 | # Processing data
176 | def processed_data(data):
177 | return {
178 | "ea": {
179 | "de": extract_data_standard(data, "ea", "de"),
180 | "en": extract_data_standard(data, "ea", "en"),
181 | "tnt": extract_data_transliteration_target(data, "ea"),
182 | "lKey": extract_data_ea_lKey_or_wordClass(data, "lKey"),
183 | "wordClass": extract_data_ea_lKey_or_wordClass(data, "wordClass"),
184 | },
185 | "tnt": {
186 | "de": extract_data_transliteration_source(data, "de"),
187 | "en": extract_data_transliteration_source(data, "en"),
188 | "lKey": extract_data_transliteration_lKey_or_wordClass(data, "lKey"),
189 | "wordClass": extract_data_transliteration_lKey_or_wordClass(
190 | data, "wordClass"
191 | ),
192 | },
193 | }
194 |
195 |
196 | # Cleaning functions
197 |
198 |
199 | # Hieroglyphs cleaning
200 | def clean_graphics(text: str) -> str:
201 | # Start from double spaces and sentences to delete
202 | text = " ".join(text.split())
203 | if "{m1}〈S29〉" in text:
204 | text = ""
205 | if "geschrieben" in text:
206 | text = ""
207 | if "SandhiForm" in text:
208 | text = ""
209 | if "Det.-von" in text:
210 | text = ""
211 | if "erhalten" in text:
212 | text = ""
213 | if text == "//":
214 | text = ""
215 | # Comments
216 | text = text.replace('"sic"', "")
217 | text = text.replace('"var"', "")
218 | text = text.replace('"Var"', "")
219 | text = text.replace('"var."', "")
220 | text = text.replace("-var", "")
221 | text = text.replace("-vae", "")
222 | text = text.replace("-+lvar+s", "")
223 | text = text.replace("-+linverted+s", "")
224 | text = text.replace('"ein Vogel"', "/")
225 | text = text.replace('"unleserliches Zeichen"', "/")
226 | text = text.replace('"lb"', "")
227 | text = text.replace('" lb"', "")
228 | text = text.replace('"lb', "")
229 | text = text.replace('"b"', "")
230 | text = text.replace('"hierat"', "")
231 | text = text.replace('"monogr"', "")
232 | text = text.replace('"monogram"', "")
233 | text = text.replace('"Spuren"', "")
234 | text = text.replace('"large"', "")
235 | text = text.replace('"hiero"', "")
236 | text = text.replace('"mutil"', "")
237 | text = text.replace('"composite"', "")
238 | text = text.replace('"vacat"', "")
239 | text = text.replace('"traces"', "")
240 | text = text.replace('"senkrechte Zeichenspur"', "")
241 | text = text.replace('"senkrechtes Zeichen"', "")
242 | # Jsesh graphic elements
243 | text = text.replace("**", "-")
244 | text = text.replace("*", "-")
245 | text = text.replace("//", "/")
246 | text = text.replace("h/", "/")
247 | text = text.replace("v/", "/")
248 | text = text.replace("#b-/#e", "/")
249 | text = text.replace("-:", "-")
250 | text = text.replace(":", "-")
251 | text = text.replace("[?", "").replace("?]", "")
252 | text = text.replace('"⸮"', "").replace('"?"', "")
253 | text = text.replace("\"'⸮'\"", "").replace("\"'?'\"", "")
254 | text = text.replace("[[", "").replace("]]", "")
255 | text = text.replace("[{*", "").replace("*}]", "")
256 | text = text.replace("[{-", "").replace("-}]", "")
257 | text = text.replace("[[*", "").replace("*]]", "")
258 | text = text.replace("[[-", "").replace("-]]", "")
259 | text = text.replace("[(-", "").replace("-)]", "")
260 | text = text.replace("(", "").replace(")", "")
261 | text = text.replace("$", "")
262 | text = text.replace("<1-0>-", "").replace("-<0-2>", "")
263 | text = text.replace("<1-", "").replace("-2>", "")
264 | text = text.replace("-<1", "")
265 | text = text.replace("<2-", "").replace("-1>", "")
266 | text = text.replace("<0-", "").replace("-0>", "")
267 | text = text.replace("<-", "").replace("->", "")
268 | text = text.replace("<", "").replace(">", "")
269 | text = text.replace('⸮"', "")
270 | text = text.replace("##", "")
271 | text = text.replace("v", "")
272 | # Specific phrase elements
273 | text = text.replace("ss", "S29")
274 | text = text.replace("nn", "M22-M22")
275 | text = text.replace('"lc"', "")
276 | # text = text.replace('"tr"', '')
277 | text = text.replace("prwn", "O1")
278 | text = text.replace("rf", "D21-I9")
279 | text = text.replace("ZeA", "Z2A")
280 | text = text.replace("j", "M17")
281 | text = text.replace("y1", "Y1")
282 | text = text.replace("z2", "Z2")
283 | # text = text.replace('-?9', '')
284 | text = text.replace("b1", "B1")
285 | text = text.replace("pS", "F22")
286 | # text = text.replace('-?', '')
287 | text = text.replace("_", "")
288 | # text = text.replace('{{89,263,62}}', '')
289 | # text = text.replace('{{267,6,97}}', '')
290 | text = text.replace('"⸮h"', "")
291 | text = text.replace("!", "")
292 | # [& parenthesis and cleaning residues
293 | text = text.replace('"', "")
294 | text = text.replace("[&", "").replace("&]", "")
295 | text = text.replace("&", "-")
296 | text = re.sub(r"-+", "-", text)
297 | text = text.replace("- ", " ")
298 | text = text.replace(" -", " ")
299 | text = text.strip("-")
300 | # \\Rx, cartouche, \\, space at end and beginning
301 | text = re.sub(r"\\\\R.*?(-|\s|$)", r"\1", text)
302 | text = re.sub(r"\\\\.*?(-|\s|$)", r"\1", text)
303 | text = re.sub(r"\\.*?(-|\s|$)", r"\1", text)
304 | text = re.sub(r"\((.*?)\)\|", r"\1", text)
305 | text = text.replace("\\", "")
306 | text = text.strip()
307 | # Double spaces again and -
308 | text = text.replace("-", " ")
309 | text = " ".join(text.split())
310 | return text
311 |
312 |
313 | # Translation cleaning
314 | def clean_traduction(text):
315 | # Start from double spaces and sentences to delete
316 | text = " ".join(text.split())
317 | if text.endswith("..."):
318 | text = text[:-3].strip()
319 | if text == "?":
320 | text = text.replace("?", "")
321 | if "-??-" in text:
322 | text = ""
323 | text = re.sub(r"--.*?--", "--zerstört--", text)
324 | if "--zerstört--" in text:
325 | text = ""
326 | if "..." in text:
327 | text = ""
328 | if "…" in text:
329 | text = ""
330 | if ". . ." in text:
331 | text = ""
332 | if "_" in text:
333 | text = ""
334 | if "⸮_?" in text:
335 | text = ""
336 | if "?_?" in text:
337 | text = ""
338 | if "---?---" in text:
339 | text = ""
340 | if "---" in text:
341 | text = ""
342 | if "--" in text:
343 | text = ""
344 | if "keine Übersetzung vorhanden" in text:
345 | text = ""
346 | if "Keine Übersetzung möglich" in text:
347 | text = ""
348 | if "--- LEER GEFUNDEN ---" in text:
349 | text = ""
350 | if "---LEER GEFUNDEN---" in text:
351 | text = ""
352 | text = re.sub(r"\(=.*?\)", "", text)
353 | # if text == 'The':
354 | # text = ''
355 | if "[---]" in text:
356 | text = ""
357 | # LHG acronym, other languages, special brackets and chapter numbers
358 | text = re.sub(r"\(\((.*?)\)\)", r"\1", text)
359 | text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
360 | text = text.replace('"arbustes à épines"', "dornige Sträucher")
361 | text = text.replace("rôdeurs", "plünderer")
362 | text = re.sub('\\"(.*?)"', r"\1", text)
363 | text = re.sub(r"(\/[\w+ÄäÖöẞßÜü]+)", " ", text)
364 | text = text.replace("- LHG -", " Leben, Heil, Gesundheit ")
365 | text = text.replace("- LHG", " Leben, Heil, Gesundheit ")
366 | text = text.replace("-LHG", " Leben, Heil, Gesundheit ")
367 | text = text.replace("- {LHG} LHG -", " Leben, Heil, Gesundheit ")
368 | text = text.replace("LHG", "Leben, Heil, Gesundheit")
369 | text = text.replace("l.h.g.", "Leben, Heil, Gesundheit")
370 | text = text.replace("l.h,.g.", "Leben, Heil, Gesundheit")
371 | text = text.replace("l.h-g", "Leben, Heil, Gesundheit")
372 | text = text.replace("l.h.g .", "Leben, Heil, Gesundheit")
373 | text = text.replace("l.h.g -", "Leben, Heil, Gesundheit")
374 | text = text.replace("l.h.g", "Leben, Heil, Gesundheit")
375 | text = text.replace("l.p.h.", "Life, Prosperity, Health")
376 | text = text.replace("LPH", "Life, Prosperity, Health")
377 | text = text.replace("„", "").replace("“", "").replace("”", "")
378 | text = text.replace("⸢", "").replace("⸣", "")
379 | text = re.sub(r"\$\[.*?\]\$", "", text)
380 | text = text.replace("[", "").replace("]", "")
381 | text = text.replace("<", "").replace(">", "")
382 | text = text.replace("𓉘", "").replace("𓊂", "")
383 | text = text.replace("𓍹", "").replace("𓍺", "")
384 | text = text.replace("‚", "").replace("‘", "")
385 | text = re.sub(r"⸮(.*?)\?", r"\1", text)
386 | # text = re.sub('\((.*?)\)[^\|]', ' ', text) !Attention! Problems with other parentheses
387 | text = re.sub(r"\((.*?)\)\|", r"\1", text)
388 | text = text.replace("|", "")
389 | text = re.sub(r"\[§[0-9]+\]", "", text)
390 | text = re.sub(r"\[§[0-9]+\w+\]", "", text)
391 | text = re.sub(r"§[0-9]+(\s|\.|$|\,|\:|.*?)", r"\1", text)
392 | text = re.sub(r"§\s[0-9]+(\s|\.|$|\,|\:|.*?)", r"\1", text)
393 | text = re.sub(r"§\s[0-9]+-[0-9]+(\s|\.|$|\,|\:|.*?)", r"\1", text)
394 | text = re.sub(r"\-\s(Variante)(.*?)\-", "", text)
395 | text = re.sub(r"^(Variante)(.*?)$", r"\2", text)
396 | text = re.sub(r"(Variante)(.*?)$", "", text)
397 | # und, von, O.Äg., U.Äg. abbreviations and comments inside parentheses
398 | text = text.replace("u.", "und")
399 | text = text.replace("v.", "von")
400 | text = text.replace(". ---", "")
401 | text = text.replace("--NN--", "").replace("|NN|", "").replace("NN", "")
402 | text = re.sub(r"\(wört.*?\)", "", text)
403 | text = re.sub(r"\(wört.*?$", "", text)
404 | text = re.sub(r"\[ältere Fassung.*?\]", "", text)
405 | text = re.sub(r"\(älterer Text.*?\)", "", text)
406 | text = re.sub(r"\(oder.*?\)", "", text)
407 | text = re.sub(r"^\[Beischrift.*?\]:", "", text)
408 | text = re.sub(r"\[Beischrift.*?\]", "", text)
409 | text = re.sub(r"\[.*?Beischrift.*?\]", "", text)
410 | text = re.sub(r"(O.?Äg?\.?)", "Oberägypten", text)
411 | text = re.sub(r"(U.?Äg?\.?)", "Unterägypten", text)
412 | text = text.strip("'").strip('"')
413 | text = text.strip()
414 | text = text.lstrip(".")
415 | # 〈〉 and {} brackets, and other elements
416 | text = re.sub(r"\{(.*?)\}\〈(.*?)\〉", r"\1", text)
417 | text = re.sub(r"\〈(.*?)\〉\{(.*?)\}", r"\2", text)
418 | text = text.replace("〈〈", "").replace("〉〉", "")
419 | text = text.replace("{{", "").replace("}}", "")
420 | text = re.sub(r"(\{.*?\}\s+[\wÄäÖöẞßÜü.,=:]+\s+)\〈(.*?)\〉", r"\1", text)
421 | text = re.sub(r"\〈(.*?)\〉(\s+[\wÄäÖöẞßÜü.,=:]+\s+\{.*?\})", r"\2", text)
422 | text = re.sub(
423 | r"(\{.*?\}[\wÄäÖöẞßÜü.,=:]+\s+[\wÄäÖöẞßÜü.,=:]+\s+)\〈(.*?)\〉", r"\1", text
424 | )
425 | text = re.sub(
426 | r"\〈(.*?)\〉([\wÄäÖöẞßÜü.,=:]+\s+[\wÄäÖöẞßÜü.,=:]+\s+\{.*?\})", r"\2", text
427 | )
428 | text = re.sub(
429 | r"(\{.*?\}\s+[\wÄäÖöẞßÜü.,=:]+\s+[\wÄäÖöẞßÜü.,=:]+\s+)\〈(.*?)\〉", r"\1", text
430 | )
431 | text = re.sub(
432 | r"\〈(.*?)\〉(\s+[\wÄäÖöẞßÜü.,=:]+\s+[\wÄäÖöẞßÜü.,=:]+\s+\{.*?\})", r"\2", text
433 | )
434 | text = re.sub(r"\〈(.*?)\〉(\s+[\wÄäÖöẞßÜü.,=:]+\{.*?\})", r"\2", text)
435 | text = re.sub(r"(\{.*?\}[\wÄäÖöẞßÜü.,=:]+\s+)\〈(.*?)\〉", r"\1", text)
436 | text = re.sub(r"\〈(.*?)\〉([\wÄäÖöẞßÜü.,=:]+\s+\{.*?\})", r"\2", text)
437 | text = re.sub(r"\{(.*?)\}\s\〈(.*?)\〉", r"\1", text)
438 | text = re.sub(r"\〈(.*?)\〉\s\{(.*?)\}", r"\2", text)
439 | text = re.sub(r'"(.*?)\/(.*?)"', r"\1", text)
440 | text = text.replace("〈", "").replace("〉", "")
441 | text = text.replace("{", "").replace("}", "")
442 | text = text.replace("Ꜥ", "ꜥ")
443 | text = text.replace("`", "'")
444 | text = text.replace("#", "")
445 | text = text.replace("≡", "=")
446 | text = text.replace("&", "und")
447 | text = text.replace("$", "")
448 | text = text.replace("(?)", "")
449 | text = re.sub(r"\.\s(oder[\s\wÄäÖöẞßÜü.,=:]+)", "", text)
450 | text = text.replace("*", "")
451 | text = text.replace('"', "")
452 | text = re.sub(r"\(.*?\)", "", text)
453 | text = re.sub(r"\(d\.h\.\s[\s\wÄäÖöẞßÜü.,=:]+", "", text)
454 | # Double spaces again
455 | text = " ".join(text.split())
456 | return text
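
# Example, traced through the substitutions above: the LHG formula is expanded
# and the trailing uncertainty marker is dropped.
#   >>> clean_traduction("Der König - LHG - sagt (?)")
#   'Der König Leben, Heil, Gesundheit sagt'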
457 |
458 |
459 | # Transliteration cleaning
460 | def clean_wChar(text):
461 | # Start with double spaces and sentences to be deleted
462 | text = " ".join(text.split())
463 | if "..." in text:
464 | text = ""
465 | if "_" in text:
466 | text = ""
467 | if "-??-" in text:
468 | text = ""
469 | # (()), [[]], ⸮? brackets, and two other elements
470 | text = re.sub(r"\(\((.*?)\)\)", r"\1", text)
471 | text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
472 | text = text.replace("⸮", "").replace("?", "")
473 | text = text.replace("~", "")
474 | text = (
475 | text.replace(".pl.", "")
476 | .replace(".pl", "")
477 | .replace(".{pl}", "")
478 | .replace("{.pl}", "")
479 | .replace(",pl", "")
480 | .replace(".Pl", "")
481 | .replace("pl", "")
482 | )
483 | # text = text.replace('{(ꜥnḫ-wḏꜣ-snb)} ꜥnḫ', 'ꜥnḫ')
484 | text = text.replace("[", "").replace("]", "")
485 | text = text.replace("-(Zahl)-", "")
486 | text = text.replace("oder ḫr =s", "")
487 | text = text.replace("ON", "").replace("GN", "")
488 | text = text.replace("a", "")
489 | text = text.replace("Zahl", "")
490 | text = text.replace("(", "").replace(")", "")
491 | text = text.replace("⸢", "").replace("⸣", "")
492 | text = text.replace("..1Q..", "/")
493 | text = text.replace("..2Q..", "/ /")
494 | # Inside 〈〉 and {} brackets
495 | text = re.sub(
496 | r"(\〈[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\〉.*?\〈[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\〉)(.*?\{[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\}.*?\{[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\})",
497 | r"\2",
498 | text,
499 | )
500 | text = re.sub(r"(\{.*?\}\s\{.*?\})\s(\〈.*?\〉\s\〈.*?\〉)", r"\1", text)
501 | text = re.sub(r"(\〈.*?\〉\s\〈.*?\〉)\s(\{.*?\}\s\{.*?\})", r"\2", text)
502 | text = re.sub(r"(\{.*?\}\s\{.*?\})\s\〈(.*?)\〉", r"\1", text)
503 | text = re.sub(r"\〈(.*?)\〉\s(\{.*?\}\s\{.*?\})", r"\2", text)
504 | text = re.sub(r"(\{.*?\})\s(\〈.*?\〉\s\〈.*?\〉)", r"\2", text)
505 | text = re.sub(r"(\〈.*?\〉\s\〈.*?\〉)\s(\{.*?\})", r"\1", text)
506 | text = re.sub(r"(\{.*?\}\s.*?\s\{.*?\})\s(\〈.*?\〉)", r"\1", text)
507 | text = re.sub(r"(\〈.*?\〉\s.*?\s\〈.*?\〉)\s(\{.*?\})", r"\1", text)
508 | text = re.sub(r"\{(.*?)\}[^\s]\〈(.*?)\〉", r"\1", text)
509 | text = re.sub(r"\〈(.*?)\〉[^\s]\{(.*?)\}", r"\2", text)
510 | text = re.sub(r"\{(.*?)\}\s[^\s]\〈(.*?)\〉", r"\1", text)
511 | text = re.sub(r"\〈(.*?)\〉\s[^\s]\{(.*?)\}", r"\2", text)
512 | text = re.sub(r"(\{.*?\}[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+)\〈(.*?)\〉", r"\1", text)
513 | text = re.sub(r"(\〈.*?\〉)([a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\{.*?\})", r"\2", text)
514 | text = re.sub(r"(\{.*?\}\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+)\〈(.*?)\〉", r"\1", text)
515 | text = re.sub(r"(\〈.*?\〉)(\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\{.*?\})", r"\2", text)
516 | text = re.sub(r"(\{.*?\}\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s)\〈(.*?)\〉", r"\1", text)
517 | text = re.sub(r"(\〈.*?\〉)(\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s\{.*?\})", r"\2", text)
518 | text = re.sub(r"(\{.*?\}[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s)\〈(.*?)\〉", r"\1", text)
519 | text = re.sub(r"(\〈.*?\〉)([a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s\{.*?\})", r"\2", text)
520 | text = re.sub(
521 | r"(\{.*?\}\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+)\〈(.*?)\〉",
522 | r"\1",
523 | text,
524 | )
525 | text = re.sub(
526 | r"(\〈.*?\〉)(\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\{.*?\})",
527 | r"\2",
528 | text,
529 | )
530 | text = re.sub(
531 | r"(\{.*?\}[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+)\〈(.*?)\〉",
532 | r"\1",
533 | text,
534 | )
535 | text = re.sub(
536 | r"(\〈.*?\〉)([a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\{.*?\})",
537 | r"\2",
538 | text,
539 | )
540 | text = re.sub(
541 | r"(\{.*?\}[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s)\〈(.*?)\〉",
542 | r"\1",
543 | text,
544 | )
545 | text = re.sub(
546 | r"(\〈.*?\〉)([a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s\{.*?\})",
547 | r"\2",
548 | text,
549 | )
550 | text = re.sub(
551 | r"(\{.*?\}[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s)\〈(.*?)\〉",
552 | r"\1",
553 | text,
554 | )
555 | text = re.sub(
556 | r"\〈(.*?)\〉(\{.*?\}[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s)",
557 | r"\2",
558 | text,
559 | )
560 | text = re.sub(
561 | r"(\{.*?\}\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s)\〈(.*?)\〉",
562 | r"\1",
563 | text,
564 | )
565 | text = re.sub(
566 | r"(\〈.*?\〉)(\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s\{.*?\})",
567 | r"\2",
568 | text,
569 | )
570 | text = re.sub(
571 | r"(\{.*?\}\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+)\〈(.*?)\〉",
572 | r"\1",
573 | text,
574 | )
575 | text = re.sub(
576 | r"(\〈.*?\〉)(\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\{.*?\})",
577 | r"\2",
578 | text,
579 | )
580 | text = re.sub(
581 | r"(\{.*?\}\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s)\〈(.*?)\〉",
582 | r"\1",
583 | text,
584 | )
585 | text = re.sub(
586 | r"\〈(.*?)\〉(\{.*?\}\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s[a-zA-Z0-9ḤḥḪḫẖꜣꜥḏṯš.,:=i̯]+\s)",
587 | r"\2",
588 | text,
589 | )
590 | text = re.sub(r"\{(.*?)\}\〈(.*?)\〉", r"\1", text)
591 | text = re.sub(r"\〈(.*?)\〉\-\{(.*?)\}", r"\2", text)
592 | text = re.sub(r"\{(.*?)\}\\-〈(.*?)\〉", r"\1", text)
593 | text = re.sub(r"\〈(.*?)\〉\s\{(.*?)\}", r"\2", text)
594 | text = re.sub(r"\{(.*?)\}\s\〈(.*?)\〉", r"\1", text)
595 | text = re.sub(r"\〈(.*?)\〉\{(.*?)\}", r"\2", text)
596 | # # Fractions
597 | # text = re.sub('\〈\w+\/\w+\〉\s\〈\w+\/\w+\〉\s.*?(\{\w+\/\w+\}\s\{\w+\/\w+\})', r'\1', text)
598 | # 〈〉 and {} brackets and other elements
599 | text = text.replace("〈", "").replace("〉", "")
600 | text = text.replace("{", "").replace("}", "")
601 | text = text.replace(":", "")
602 | text = text.replace(".du", "").replace(",du", "")
603 | text = text.replace("≡", "=")
604 | text = text.replace("-Lücke-", "")
605 | text = text.replace("Lücke", "")
606 | text = text.replace("-", " ")
607 | text = text.replace("+", "")
608 | text = text.replace("!", "")
609 | text = text.replace("ø", "")
610 | text = text.replace("𓍹", "").replace("𓍺", "")
611 | text = text.replace("⁝", "")
612 | text = text.replace("Präp.", "")
613 | text = text.replace("𓊆", "").replace("𓊇", "")
614 | # text = text.replace('ð', '')
615 | # text = text.replace('ṯb;w,t', 'ṯbw,t')
616 | text = text.replace("t'", "tꜥ").replace("jmj-r'", "jmj-rꜥ")
617 | text = text.replace("ʾ", "ꜥ")
618 | text = text.strip()
619 | # Double spaces again
620 | text = " ".join(text.split())
621 | return text
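
# Example: ⸮...? uncertainty markers are stripped from a transliteration.
#   >>> clean_wChar("zꜣ=f ⸮nfr?")
#   'zꜣ=f nfr'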
622 |
623 |
624 | # # wordClass simplification
625 | # def clean_wordClass(text):
626 | # text = text.replace('title', 'title_epithet').replace('epith_god', 'title_epithet').replace('epith_king', 'title_epithet').replace('epitheton_title', 'title_epithet')
627 | # text = text.replace('prepositional_adverb', 'adverb')
628 | # text = text.replace('nisbe_adjective_preposition', 'adjective').replace('nisbe_adjective_substantive', 'adjective')
629 | # text = text.replace('substantive_fem', 'substantive').replace('substantive_masc', 'substantive').replace('animal_name', 'substantive').replace('artifact_name', 'substantive')
630 | # text = text.replace('entity_name', 'substantive').replace('gods_name', 'substantive').replace('kings_name', 'substantive').replace('org_name', 'substantive')
631 | # text = text.replace('person_name', 'substantive').replace('place_name', 'substantive').replace('root', 'substantive')
632 | # text = text.replace('cardinal', 'numeral').replace('ordinal', 'numeral')
633 | # text = text.replace('particle_enclitic', 'particle').replace('particle_nonenclitic', 'particle').replace('interjection', 'particle')
634 | # text = text.replace('personal_pronoun', 'pronoun').replace('demonstrative_pronoun', 'pronoun').replace('relative_pronoun', 'pronoun').replace('interrogative_pronoun', 'pronoun')
635 | # text = text.replace('verb_2-gem', 'verb').replace('verb_2-lit', 'verb').replace('verb_3-gem', 'verb').replace('verb_3-inf', 'verb').replace('verb_3-lit', 'verb')
636 | # text = text.replace('verb_4-inf', 'verb').replace('verb_4-lit', 'verb').replace('verb_5-inf', 'verb').replace('verb_5-lit', 'verb').replace('verb_6-lit', 'verb')
637 | # text = text.replace('verb_caus_2-gem', 'verb').replace('verb_caus_2-lit', 'verb').replace('verb_caus_3-gem', 'verb').replace('verb_caus_3-inf', 'verb')
638 | # text = text.replace('verb_caus_3-lit', 'verb').replace('verb_caus_4-inf', 'verb').replace('verb_caus_4-lit', 'verb').replace('verb_caus_5-lit', 'verb')
639 | # text = text.replace('verb_irr', 'verb')
640 | # return text
641 |
642 |
643 | # Function that applies all cleaning steps to the data
644 | def clean_data(data):
645 | for datapoint in data:
646 | datapoint["source"] = clean_graphics(datapoint["source"])
647 | datapoint["transliteration"] = clean_wChar(datapoint["transliteration"])
648 | datapoint["target"] = clean_traduction(datapoint["target"])
649 | # datapoint['wordClass'] = clean_wordClass(datapoint['wordClass'])
650 | return data
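
# Usage sketch (hypothetical datapoint; the keys match those cleaned above):
#   data = [{"source": "M17-M17", "transliteration": "jj", "target": "oh (?)"}]
#   data = clean_data(data)  # cleans all three fields in place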
651 |
652 |
653 | # Training function definitions: batch_it, tokenize_batch, training_step, validation_step
654 | def batch_it(sequence, batch_size=1, return_last=True):
655 | if batch_size <= 0:
656 | raise ValueError(
657 | f"Batch size cannot be nonpositive. Passed `batch_size = {batch_size}`"
658 | )
659 |
660 | batch = []
661 | for item in sequence:
662 | if len(batch) == batch_size:
663 | yield batch
664 | batch = []
665 | batch.append(item)
666 |
667 | if batch and return_last:
668 | yield batch
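
# Example: the trailing partial batch is kept unless return_last=False.
#   >>> list(batch_it(range(5), batch_size=2))
#   [[0, 1], [2, 3], [4]]
#   >>> list(batch_it(range(5), batch_size=2, return_last=False))
#   [[0, 1], [2, 3]]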
669 |
670 |
671 | def tokenize_batch(model, batch, tokenizer, src_lang, tgt_lang):
672 | tokenizer.src_lang = src_lang
673 | tokenizer.tgt_lang = tgt_lang
674 |
675 | tokenized_batch = tokenizer(
676 | [element["source"] for element in batch],
677 | text_target=[element["target"] for element in batch],
678 | max_length=64,
679 | padding=True,
680 | truncation=True,
681 | return_tensors="pt",
682 | ).to(model.device)
683 |
684 | tokenized_batch["labels"] = torch.where(
685 | tokenized_batch["labels"] == tokenizer.pad_token_id,
686 | torch.full_like(tokenized_batch["labels"], -100),
687 | tokenized_batch["labels"],
688 | )
689 |
690 | return tokenized_batch
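
# Note: -100 is the default ignore_index of torch.nn.CrossEntropyLoss, so the
# padded label positions masked above do not contribute to the model loss.
# Usage sketch (hypothetical batch; "ar" and "de" are M2M-100 language codes):
#   batch = [{"source": "M17 M17", "target": "oh"}]
#   tok = tokenize_batch(model, batch, tokenizer, src_lang="ar", tgt_lang="de")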
691 |
692 |
693 | def training_step(batch, model, tokenizer, optimizer, src_lang, tgt_lang):
694 | with torch.cuda.amp.autocast():
695 | tokenized_batch = tokenize_batch(model, batch, tokenizer, src_lang, tgt_lang)
696 | loss = model(**tokenized_batch).loss
697 |
698 | loss.backward()
699 | optimizer.step()
700 | optimizer.zero_grad()
701 |
702 | return loss.item()
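
# Minimal loop sketch (hypothetical setup; model, tokenizer and train_data are
# assumed to be created elsewhere in this script):
#   optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
#   for batch in batch_it(train_data, batch_size=16):
#       loss = training_step(batch, model, tokenizer, optimizer, "ar", "de")
# Note: autocast runs without a torch.cuda.amp.GradScaler here; with float16
# autocast, a GradScaler is usually paired with loss.backward() to avoid underflow.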
703 |
704 |
705 | def validation_step(batch, model, tokenizer, src_lang, tgt_lang):
706 | with torch.no_grad():
707 | with torch.cuda.amp.autocast():
708 | tokenized_batch = tokenize_batch(
709 | model, batch, tokenizer, src_lang, tgt_lang
710 | )
711 | loss = model(**tokenized_batch).loss
712 |
713 | return loss.item(), (tokenized_batch["labels"] != -100).sum().item()
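
# The second return value counts non-masked label tokens, so the per-batch mean
# losses can be combined into a token-weighted average (sketch; val_data is
# hypothetical):
#   pairs = [validation_step(b, model, tokenizer, "ar", "de")
#            for b in batch_it(val_data, 16)]
#   val_loss = sum(l * n for l, n in pairs) / sum(n for _, n in pairs)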
714 |
--------------------------------------------------------------------------------