├── tests ├── __init__.py ├── expected │ ├── luwian_unicode.yaml │ ├── luwian_tokenize_transliteration.yaml │ ├── linear_b_transliteration.yaml │ ├── hittite_unicode.yaml │ ├── linear_a_transliteration.yaml │ ├── hittite_tokenize_transliteration.yaml │ ├── linear_a_tokenize_transliteration.yaml │ ├── linear_a_unicode_regularized.yaml │ ├── linear_a_unicode.yaml │ ├── arabic_unicode.yaml │ ├── linear_b_tokenize_transliteration.yaml │ ├── errors.txt │ └── linear_b_unicode.yaml ├── test_data.py ├── test_script.py ├── test_arabic.py ├── test_luwian.py ├── data.py ├── test_hittite.py ├── test_linear_b.py ├── test_linear_a.py └── test_main.py ├── potnia ├── data │ ├── akkadian.yaml │ ├── arabic.yaml │ ├── potnia.bib │ ├── hittite.yaml │ ├── luwian.yaml │ ├── linear_a.yaml │ └── linear_b.yaml ├── scripts │ ├── __init__.py │ ├── luwian.py │ ├── akkadian.py │ ├── linear_b.py │ ├── hittite.py │ ├── arabic.py │ └── linear_a.py ├── __init__.py ├── enums.py ├── data.py ├── main.py └── script.py ├── docs ├── contributing.rst ├── _static │ └── img │ │ ├── Csign_har.png │ │ ├── Csign_me3.png │ │ ├── LBsign-qa.png │ │ ├── erc-logo.jpg │ │ ├── mappings.png │ │ ├── PotniaLogo.png │ │ ├── potnia-gui.png │ │ ├── potnia-banner.jpg │ │ ├── potnia-example.png │ │ ├── syllabograms.png │ │ ├── Csign_har_large.png │ │ ├── Csign_me3_large.png │ │ ├── LBsign_qa_large.png │ │ ├── LBsign_qa_large2.png │ │ └── downstream-example.png ├── credits.rst ├── quickstart.rst ├── Makefile ├── fonts.rst ├── index.rst ├── make.bat ├── api.rst ├── conf.py ├── additions.rst └── linear_b.md ├── .coveragerc ├── mkdocs.sh ├── example.py ├── paper.sh ├── .github └── workflows │ ├── testing.yml │ ├── joss-draft-pdf.yml │ ├── publish.yml │ └── docs.yml ├── pyproject.toml ├── CONTRIBUTING.rst ├── .gitignore ├── CODE_OF_CONDUCT.md ├── README.rst ├── LICENSE ├── paper.md └── paper.bib /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /potnia/data/akkadian.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /potnia/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.rst -------------------------------------------------------------------------------- /tests/expected/luwian_unicode.yaml: -------------------------------------------------------------------------------- 1 | há : 𔓟 -------------------------------------------------------------------------------- /tests/expected/luwian_tokenize_transliteration.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = potnia 3 | 4 | [report] 5 | precision = 2 6 | 7 | -------------------------------------------------------------------------------- /mkdocs.sh: -------------------------------------------------------------------------------- 1 | sphinx-build -b html docs docshtml -E -a 2 | echo docshtml/index.html 3 | 
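# Usage (assumed invocation, inferred from the two commands above): run `sh mkdocs.sh` from the repository root; # sphinx-build writes the rendered HTML to docshtml/ and the echo above prints the index page path to open in a browser.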
-------------------------------------------------------------------------------- /docs/_static/img/Csign_har.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/Csign_har.png -------------------------------------------------------------------------------- /docs/_static/img/Csign_me3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/Csign_me3.png -------------------------------------------------------------------------------- /docs/_static/img/LBsign-qa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/LBsign-qa.png -------------------------------------------------------------------------------- /docs/_static/img/erc-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/erc-logo.jpg -------------------------------------------------------------------------------- /docs/_static/img/mappings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/mappings.png -------------------------------------------------------------------------------- /docs/_static/img/PotniaLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/PotniaLogo.png -------------------------------------------------------------------------------- /docs/_static/img/potnia-gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/potnia-gui.png -------------------------------------------------------------------------------- /docs/_static/img/potnia-banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/potnia-banner.jpg -------------------------------------------------------------------------------- /docs/_static/img/potnia-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/potnia-example.png -------------------------------------------------------------------------------- /docs/_static/img/syllabograms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/syllabograms.png -------------------------------------------------------------------------------- /tests/expected/linear_b_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "𐀀𐀪𐀵𐀍" : "a-ri-to-jo" 2 | "𐀀𐀪𐀵𐀍 𐀀𐀪𐀵𐀍" : "a-ri-to-jo a-ri-to-jo" -------------------------------------------------------------------------------- /docs/_static/img/Csign_har_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/Csign_har_large.png -------------------------------------------------------------------------------- /docs/_static/img/Csign_me3_large.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/Csign_me3_large.png -------------------------------------------------------------------------------- /docs/_static/img/LBsign_qa_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/LBsign_qa_large.png -------------------------------------------------------------------------------- /docs/_static/img/LBsign_qa_large2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/LBsign_qa_large2.png -------------------------------------------------------------------------------- /docs/_static/img/downstream-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/downstream-example.png -------------------------------------------------------------------------------- /docs/credits.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Credits 3 | ======================= 4 | 5 | .. include:: ../README.rst 6 | :start-after: start-credits 7 | :end-before: end-credits 8 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Quickstart 3 | ======================= 4 | 5 | .. include:: ../README.rst 6 | :start-after: start-quickstart 7 | :end-before: end-quickstart 8 | -------------------------------------------------------------------------------- /potnia/scripts/luwian.py: -------------------------------------------------------------------------------- 1 | # from dataclasses import dataclass 2 | # from ..hittite import Hittite 3 | 4 | # @dataclass 5 | # class Luwian(Hittite): 6 | # config:str = "luwian.yaml" 7 | 8 | 9 | # luwian = Luwian() -------------------------------------------------------------------------------- /potnia/scripts/akkadian.py: -------------------------------------------------------------------------------- 1 | # from dataclasses import dataclass 2 | # from ..script import Script 3 | 4 | 5 | # @dataclass 6 | # class Akkadian(Script): 7 | # config:str = "akkadian" 8 | 9 | 10 | # akkadian = Akkadian() -------------------------------------------------------------------------------- /tests/expected/hittite_unicode.yaml: -------------------------------------------------------------------------------- 1 | "ḫe]-en-ku-un šu-me-eš ma-ni-ia-aḫ-ḫi-eš-ke-et-tén" : "𒄭]𒂗𒆪𒌦 𒋗𒈨𒐁 𒈠𒉌𒅀𒄴𒄭𒐁𒆠𒀉𒁷" 2 | 3 | "a-ši KÁ-aš ku-iš pa-it nu DINGIR (MEŠ) 2 ḪUR.SAG (MEŠ)" : "𒀀𒅆 𒆍𒀸 𒆪𒅖 𒉺𒀉 𒉡 𒀭 𒌍 2 𒄯𒊕 𒌍" -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | from potnia.data import read_data_yaml_cached 2 | 3 | def test_read_data_yaml_cached(): 4 | result = read_data_yaml_cached("does-not-exists.yaml") 5 | assert isinstance(result, dict) 6 | assert len(result) == 0 -------------------------------------------------------------------------------- /tests/test_script.py: -------------------------------------------------------------------------------- 1 | from potnia.script import Script 2 | 3 | 4 | 5 | 
def test_tokenize_unicode(): 6 | script = Script(config=dict(dummy="dummy")) 7 | 8 | result = script.tokenize_unicode("text") 9 | assert result == ["t", "e", "x", "t"] 10 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from potnia import linear_b 2 | text = "po-ti-ni-ja" 3 | 4 | tokens = linear_b.tokenize_transliteration(text) 5 | print(tokens) # Output: ['po', 'ti', 'ni', 'ja'] 6 | 7 | unicode_text = linear_b(text) 8 | print(unicode_text) # Output: 𐀡𐀴𐀛𐀊 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/expected/linear_a_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "𐙒 𐙿 𐙇" : "*180 A339 *100" # https://sigla.phis.me/document/PH%2012a/ 2 | "𐀏𐀇 𐙒 𐙿 𐙇" : "ka-di *180 A339 *100" 3 | "𐀏𐀇 𐙍 3 𐀓𐀫 𐙍 78 𐛿 17" : "ka-di *131a 3 ku-ro *131a 78 A594 17" # https://sigla.phis.me/document/ZA%2015b/. Uses *131a instead of AB131/VINa 4 | -------------------------------------------------------------------------------- /tests/expected/hittite_tokenize_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "ḫe]‑en‑ku‑un šu‑me‑eš ma‑ni‑ia‑aḫ‑ḫi‑eš‑ke‑et‑tén" : 2 | - "ḫe" 3 | - "]" 4 | - "en" 5 | - "ku" 6 | - "un" 7 | - " " 8 | - "šu" 9 | - "me" 10 | - "eš" 11 | - " " 12 | - "ma" 13 | - "ni" 14 | - "ia" 15 | - "aḫ" 16 | - "ḫi" 17 | - "eš" 18 | - "ke" 19 | - "et" 20 | - "tén" 21 | -------------------------------------------------------------------------------- /tests/test_arabic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from potnia import arabic 3 | from .data import expected 4 | 5 | 6 | @pytest.mark.parametrize("test_input,expected", expected("arabic_unicode")) 7 | def test_arabic_unicode(test_input, expected): 8 | result = arabic(test_input) 9 | assert result == expected, f"Expected: arabic('{test_input}') to produce '{expected}' but got '{result}'" 10 | 11 | 12 | -------------------------------------------------------------------------------- /potnia/__init__.py: -------------------------------------------------------------------------------- 1 | from .script import Script 2 | from .scripts.linear_a import linear_a, LinearA 3 | from .scripts.linear_b import linear_b, LinearB 4 | from .scripts.hittite import hittite, Hittite 5 | from .scripts.arabic import arabic, Arabic 6 | 7 | 8 | # Luwian is currently a work in progress 9 | # from .luwian import luwian 10 | 11 | # Akkadian is currently a work in progress 12 | # from .akkadian import akkadian 13 | -------------------------------------------------------------------------------- /potnia/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class BibliographyFormat(str, Enum): 5 | plaintext = "plaintext" 6 | html = "html" 7 | latex = "latex" 8 | markdown = "markdown" 9 | 10 | def __str__(self): 11 | return self.value 12 | 13 | 14 | class BibliographyStyle(str, Enum): 15 | plain = "plain" 16 | unsrt = "unsrt" 17 | alpha = "alpha" 18 | unsrtalpha = "unsrtalpha" 19 | 20 | def __str__(self): 21 | return self.value 22 | -------------------------------------------------------------------------------- /tests/test_luwian.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | # from potnia 
import luwian 3 | from .data import expected 4 | 5 | 6 | # @pytest.mark.parametrize("test_input,expected", expected("luwian_unicode")) 7 | # def test_luwian_unicode(test_input, expected): 8 | # assert luwian(test_input) == expected 9 | 10 | 11 | # @pytest.mark.parametrize("test_input,expected", expected("luwian_tokenize_transliteration")) 12 | # def test_tokenize_transliteration_luwian(test_input, expected): 13 | # assert luwian.tokenize_transliteration(test_input) == expected 14 | -------------------------------------------------------------------------------- /tests/data.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from pathlib import Path 3 | 4 | EXPECTED_DIR = Path(__file__).parent / "expected" 5 | 6 | def read_expected(filename: str) -> dict[str, str]: 7 | path = EXPECTED_DIR / filename 8 | if not path.suffix: 9 | path = path.with_suffix(".yaml") 10 | 11 | if not path.exists(): 12 | return dict() 13 | 14 | with open(path, encoding='utf8') as f: 15 | result = yaml.safe_load(f) 16 | return result or dict() 17 | 18 | 19 | def expected(filename: str) -> list[tuple[str, str]]: 20 | return list(read_expected(filename).items()) -------------------------------------------------------------------------------- /potnia/data/arabic.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | b : ب 3 | t : ت 4 | ṯ : ث 5 | g : ج 6 | j : ج 7 | ǧ : ج 8 | ḥ : ح 9 | ḫ : خ 10 | d : د 11 | ḏ : ذ 12 | r : ر 13 | z : ز 14 | s : س 15 | š : ش 16 | ṣ : ص 17 | ḍ : ض 18 | ṭ : ط 19 | ẓ : ظ 20 | ʿ : ع 21 | ġ : غ 22 | f : ف 23 | q : ق 24 | k : ك 25 | l : ل 26 | m : م 27 | n : ن 28 | h : ه 29 | w : و 30 | y : ي 31 | ỳ : ى 32 | ā : ا 33 | ī : ي 34 | ū : و 35 | ʾ : ء 36 | 37 | a: َ # Fatha (short 'a' sound) 38 | i: ِ # Kasra (short 'i' sound) 39 | u: ُ # Damma (short 'u' sound) -------------------------------------------------------------------------------- /tests/expected/linear_a_tokenize_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "]ta-pi ]ki[ ]a-ra[ ]a-su-mi-*118[ a-pa-[?][ ]mi-ki-sa-ne[": 2 | - "]" 3 | - "ta" 4 | - "pi" 5 | - " " 6 | - "]" 7 | - "ki" 8 | - "[" 9 | - " " 10 | - "]" 11 | - "a" 12 | - "ra" 13 | - "[" 14 | - " " 15 | - "]" 16 | - "a" 17 | - "su" 18 | - "mi" 19 | - "*118" 20 | - "[" 21 | - " " 22 | - "a" 23 | - "pa" 24 | - "[?]" 25 | - "[" 26 | - " " 27 | - "]" 28 | - "mi" 29 | - "ki" 30 | - "sa" 31 | - "ne" 32 | - "[" 33 | 34 | "pi[?]": 35 | - "pi" 36 | - "[?]" 37 | 38 | "[?]pi": 39 | - "[?]" 40 | - "pi" -------------------------------------------------------------------------------- /paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Instructions here: https://joss.readthedocs.io/en/latest/submitting.html#docker 4 | docker run --rm -it \ 5 | -v $PWD:/data \ 6 | -u $(id -u):$(id -g) \ 7 | --env JOURNAL=joss \ 8 | openjournals/inara:latest \ 9 | -o pdf \ 10 | paper.md 11 | 12 | 13 | echo Generating preprint 14 | docker run --rm -it \ 15 | -v $PWD:/data \ 16 | -u $(id -u):$(id -g) \ 17 | --env JOURNAL=joss \ 18 | openjournals/inara:latest \ 19 | -o preprint \ 20 | paper.md 21 | 22 | # TODO replace "docs/_static/img/" in paths in paper.preprint.tex with root directory 23 | cat paper.preprint.tex | sed "s/docs\/_static\/img\///g" > tmp 24 | mv tmp paper.preprint.tex -------------------------------------------------------------------------------- /docs/Makefile:
-------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/fonts.rst: -------------------------------------------------------------------------------- 1 | Fonts 2 | ====================== 3 | 4 | If the glyphs for Linear A, Linear B, or other ancient scripts do not display correctly in your terminal, code editor, web browser, or Jupyter notebook, it is likely due to missing fonts. Install the following Noto Sans fonts from Google Fonts for proper rendering: 5 | 6 | - `Noto Sans Linear A `_ 7 | - `Noto Sans Linear B `_ 8 | 9 | Ensure your editor, terminal, or application is set to use these fonts. If you are browsing the repository online, **Firefox** typically offers better glyph support compared to **Google Chrome** or **Microsoft Edge**. 10 | -------------------------------------------------------------------------------- /potnia/data.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from pathlib import Path 3 | from functools import cache 4 | 5 | DATA_DIR = Path(__file__).parent / "data" 6 | 7 | @cache 8 | def read_data_yaml_cached(filename: str) -> dict[str, str]: 9 | path = DATA_DIR / filename 10 | if not path.suffix: 11 | path = path.with_suffix(".yaml") 12 | 13 | if not path.exists(): 14 | return dict() 15 | 16 | with open(path, encoding='utf8') as f: 17 | result = yaml.safe_load(f) 18 | return result or dict() 19 | 20 | 21 | def read_data(*filenames) -> dict[str, str]: 22 | result = dict() 23 | for filename in filenames: 24 | result.update(read_data_yaml_cached(filename)) 25 | 26 | return result 27 | -------------------------------------------------------------------------------- /tests/test_hittite.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from potnia import hittite 3 | from .data import expected 4 | 5 | 6 | @pytest.mark.parametrize("test_input,expected", expected("hittite_unicode")) 7 | def test_hittite_unicode(test_input, expected): 8 | result = hittite(test_input) 9 | assert result == expected, f"Expected: hittite('{test_input}') to produce '{expected}' but got '{result}'" 10 | 11 | 12 | @pytest.mark.parametrize("test_input,expected", expected("hittite_tokenize_transliteration")) 13 | def test_tokenize_transliteration_hittite(test_input, expected): 14 | result = hittite.tokenize_transliteration(test_input) 15 | assert result == expected, f"Expected: hittite.tokenize_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 16 | -------------------------------------------------------------------------------- /tests/expected/linear_a_unicode_regularized.yaml: -------------------------------------------------------------------------------- 1 | 
# Test that syllabograms and logograms are correctly converted to Unicode, that spaces work as word separators, and that hyphens act to join syllables within a word 2 | "ka-di AB131/VINa 3 ku-ro AB131/VINa 78 A594 17" : "𐀏𐀇 𐙍 3 𐀓𐀫 𐙍 78 𐛿 17" # https://sigla.phis.me/document/ZA%2015b/ 3 | 4 | # Test that ']' and '[' are correctly converted to wildcard characters ('%') 5 | # Test that '[?]' is correctly converted to a wildcard character ('%') 6 | "]pa-ri-de ]a-si-*118 ]ku-ka-[?][" : "%𐀞𐀪𐀆 %𐀀𐀯𐙈 %𐀓𐀏%%" # https://sigla.phis.me/document/KH%2099/index-7.html 7 | 8 | # Test that '[unclassified]' is correctly converted to a wildcard character ('%') 9 | "[unclassified] *180 A339 *100" : "% 𐙒 𐙿 𐙇" # https://sigla.phis.me/document/PH%2012a/ -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: testing 2 | 3 | on: [push] 4 | jobs: 5 | build: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version: ["3.10", "3.11", "3.12"] 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Install poetry 14 | run: pipx install poetry 15 | - name: Initialise Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | cache: "poetry" 20 | - name: Install dependencies for Python ${{ matrix.python-version }} 21 | run: | 22 | poetry env use "${{ matrix.python-version }}" 23 | poetry install 24 | - name: Testing 25 | run: | 26 | poetry env info 27 | poetry run pytest -v -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. potnia documentation master file, created by 2 | sphinx-quickstart on Mon Jul 22 12:22:08 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to potnia's documentation! 7 | ================================== 8 | 9 | .. image:: https://raw.githubusercontent.com/AncientNLP/potnia/main/docs/_static/img/PotniaLogo.png 10 | 11 | .. include:: ../README.rst 12 | :start-after: start-summary 13 | :end-before: end-summary 14 | 15 | ..
toctree:: 16 | :maxdepth: 2 17 | :caption: Contents: 18 | 19 | quickstart 20 | fonts 21 | linear_b 22 | additions 23 | api 24 | contributing 25 | credits 26 | 27 | Indices and tables 28 | ================== 29 | 30 | * :ref:`genindex` 31 | * :ref:`modindex` 32 | * :ref:`search` -------------------------------------------------------------------------------- /tests/expected/linear_a_unicode.yaml: -------------------------------------------------------------------------------- 1 | # Test that syllabograms and logograms are correctly converted to Unicode, that spaces work as word separators, and that hyphens act to join syllables within a word 2 | "ka-di AB131/VINa 3 ku-ro AB131/VINa 78 A594 17" : "𐀏𐀇 𐙍 3 𐀓𐀫 𐙍 78 𐛿 17" # https://sigla.phis.me/document/ZA%2015b/ 3 | 4 | # Test that ']' and '[' are correctly tokenized as individual tokens 5 | # Test that '[?]' is correctly converted to a wildcard character ('%') 6 | "]pa-ri-de ]a-si-*118 ]ku-ka-[?][" : "]𐀞𐀪𐀆 ]𐀀𐀯𐙈 ]𐀓𐀏[?][" # https://sigla.phis.me/document/KH%2099/index-7.html 7 | 8 | # Test that '[unclassified]' is correctly tokenized as a single token 9 | "[unclassified] *180 A339 *100" : "[unclassified] 𐙒 𐙿 𐙇" # https://sigla.phis.me/document/PH%2012a/ 10 | 11 | "ka-di[unclassified] *180 A339 *100" : "𐀏𐀇[unclassified] 𐙒 𐙿 𐙇" -------------------------------------------------------------------------------- /.github/workflows/joss-draft-pdf.yml: -------------------------------------------------------------------------------- 1 | name: joss-draft-pdf 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - paper 8 | 9 | jobs: 10 | paper: 11 | runs-on: ubuntu-latest 12 | name: Paper Draft 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v4 16 | - name: Build draft PDF 17 | uses: openjournals/openjournals-draft-action@master 18 | with: 19 | journal: joss 20 | # This should be the path to the paper within your repo. 21 | paper-path: paper.md 22 | - name: Upload 23 | uses: actions/upload-artifact@v4 24 | with: 25 | name: paper 26 | # This is the output path where Pandoc will write the compiled 27 | # PDF. Note, this should be the same directory as the input 28 | # paper.md 29 | path: paper.pdf -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /potnia/data/potnia.bib: -------------------------------------------------------------------------------- 1 | @article{potnia, 2 | author = {Emily Tour and Kabir Manandhar Shrestha and Robert Turnbull}, 3 | title = {{Potnia: A Python library for the conversion of transliterated ancient texts to Unicode}}, 4 | year = {2025}, 5 | journal = {Journal of Open Source Software}, 6 | publisher = {The Open Journal}, 7 | volume = {10}, 8 | number = {108}, 9 | pages = {7725}, 10 | doi = {10.21105/joss.07725}, 11 | url = {https://doi.org/10.21105/joss.07725} 12 | } 13 | @misc{potnia_release, 14 | author = {Emily Tour and Kabir Manandhar Shrestha and Robert Turnbull}, 15 | title = {{Potnia: A Python library for the conversion of transliterated ancient texts to Unicode}}, 16 | year = {2025}, 17 | url = {https://doi.org/10.26188/28721354.v1}, 18 | note = {Version 0.4.0, Apache License 2.0}, 19 | doi = {10.26188/28721354.v1} 20 | } 21 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | API Reference 3 | ======================= 4 | 5 | 6 | Abstract Base Class: `Script` 7 | ============================= 8 | 9 | .. autoclass:: potnia.script.Script 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | 14 | 15 | Scripts Available 16 | ================== 17 | 18 | Linear A 19 | -------- 20 | 21 | .. autoclass:: potnia.scripts.linear_a.LinearA 22 | :members: 23 | :undoc-members: 24 | :inherited-members: 25 | 26 | 27 | Linear B 28 | -------- 29 | 30 | .. autoclass:: potnia.scripts.linear_b.LinearB 31 | :members: 32 | :undoc-members: 33 | :inherited-members: 34 | 35 | 36 | Arabic 37 | ------- 38 | 39 | .. autoclass:: potnia.scripts.arabic.Arabic 40 | :members: 41 | :undoc-members: 42 | :inherited-members: 43 | 44 | 45 | Hittite 46 | ------- 47 | 48 | .. 
autoclass:: potnia.scripts.hittite.Hittite 49 | :members: 50 | :undoc-members: 51 | :inherited-members: 52 | 53 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | on: 3 | push: 4 | tags: 5 | - 'v*.*.*' 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ["3.11"] 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Install poetry 16 | run: pipx install poetry 17 | - name: Initialise Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | cache: "poetry" 22 | - name: Install dependencies for Python ${{ matrix.python-version }} 23 | run: | 24 | poetry env use "${{ matrix.python-version }}" 25 | poetry install 26 | - name: Build library 27 | run: poetry build 28 | - name: Publish library 29 | env: 30 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 31 | run: | 32 | poetry config pypi-token.pypi $PYPI_TOKEN 33 | poetry publish 34 | -------------------------------------------------------------------------------- /tests/expected/arabic_unicode.yaml: -------------------------------------------------------------------------------- 1 | "al-kitāb" : "الكِتاب" 2 | "al-salām" : "السَلام" 3 | naʿam: "نَعَم" # Yes 4 | lā: "لا" # No 5 | ṣabāḥu al-ḫayr: "صَباحُ الخَير" # Good morning 6 | masāʾu al-ḫayr: "مَساءُ الخَير" # Good evening 7 | ṣadīq: "صَديق" # Friend (male) 8 | kayfa ḥāluka: "كَيفَ حالُكَ" # How are you (to a male) 9 | kayfa ḥāluki: "كَيفَ حالُكِ" # How are you (to a female) 10 | marḥaban: "مَرحَبًا" # Hello 11 | šukran: "شُكرًا" # Thank you 12 | ʿafwan: "عَفوًا" # You're welcome 13 | ʿindī suʾālun: "عِندي سؤالٌ" # I have a question 14 | mā ismuka: "ما اسمُكَ" # What is your name (to a male) 15 | mā ismuki: "ما اسمُكِ" # What is your name (to a female) 16 | ʾanā min: "أنا مِن" # I am from 17 | ʾayna al-ḥammāmu: "أينَ الحَمّامُ" # Where is the bathroom 18 | hal tatakallamu al-inglīziyya: "هَل تَتَكَلَّمُ الإنجليزِيَّ" # Do you speak English (to a male) 19 | hal tatakallamīna al-inglīziyya: "هَل تَتَكَلَّمينَ الإنجليزِيَّ" # Do you speak English (to a female) 20 | mā al-ʿamalu: "ما العَمَلُ" # What is the work 21 | anā atakallamu al-ʿarabiyya: "أنا أتَكَلَّمُ العَرَبِيَّ" # I speak Arabic 22 | ṣadīqatun: "صَديقَةٌ" # Friend (female) 23 | 24 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'potnia' 10 | copyright = '2024, Emily Tour, Kabir Manandhar Shrestha, Robert Turnbull' 11 | author = 'Emily Tour, Kabir Manandhar Shrestha, Robert Turnbull' 12 | release = '0.1.1' 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = [ 18 | "sphinx_rtd_theme", 19 | "myst_parser", 20 | "sphinx.ext.mathjax", 21 | "sphinx.ext.githubpages", 22 | "sphinx.ext.autodoc", 23 | "sphinx.ext.coverage", 24 | "sphinx.ext.napoleon", 25 | "sphinx_copybutton", 26 | "sphinx.ext.graphviz", 27 | ] 28 | 29 | templates_path = ['_templates'] 30 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 31 | 32 | 33 | 34 | # -- Options for HTML output ------------------------------------------------- 35 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 36 | 37 | html_theme = 'sphinx_rtd_theme' 38 | html_static_path = ['_static'] 39 | -------------------------------------------------------------------------------- /tests/test_linear_b.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from potnia import linear_b 3 | from .data import expected 4 | 5 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_unicode")) 6 | def test_linear_b_unicode(test_input, expected): 7 | result = linear_b(test_input) 8 | assert result == expected, f"Expected: linear_b('{test_input}') to produce '{expected}' but got '{result}'" 9 | 10 | 11 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_unicode_regularized")) 12 | def test_linear_b_unicode_regularized(test_input, expected): 13 | result = linear_b(test_input, regularize=True) 14 | assert result == expected, f"Expected: linear_b('{test_input}', regularize=True) to produce '{expected}' but got '{result}'" 15 | 16 | 17 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_transliteration")) 18 | def test_linear_b_transliteration(test_input, expected): 19 | result = linear_b.to_transliteration(test_input) 20 | assert result == expected, f"Expected: linear_b.to_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 21 | 22 | 23 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_tokenize_transliteration")) 24 | def test_tokenize_transliteration_linear_b(test_input, expected): 25 | result = linear_b.tokenize_transliteration(test_input) 26 | assert result == expected, f"Expected: linear_b.tokenize_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 27 | -------------------------------------------------------------------------------- /tests/test_linear_a.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from potnia import linear_a 3 | from .data import expected 4 | 5 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_unicode")) 6 | def test_linear_a_unicode(test_input, expected): 7 | result = linear_a(test_input) 8 | assert result == expected, f"Expected: linear_a('{test_input}') to produce '{expected}' but got '{result}'" 9 | 
10 | 11 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_transliteration")) 12 | def test_linear_a_transliteration(test_input, expected): 13 | result = linear_a.to_transliteration(test_input) 14 | assert result == expected, f"Expected: linear_a.to_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 15 | 16 | 17 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_unicode_regularized")) 18 | def test_linear_a_unicode_regularized(test_input, expected): 19 | result = linear_a(test_input, regularize=True) 20 | assert result == expected, f"Expected: linear_a('{test_input}', regularize=True) to produce '{expected}' but got '{result}'" 21 | 22 | 23 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_tokenize_transliteration")) 24 | def test_tokenize_transliteration_linear_a(test_input, expected): 25 | result = linear_a.tokenize_transliteration(test_input) 26 | assert result == expected, f"Expected: linear_a.tokenize_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 27 | 28 | 29 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "potnia" 3 | version = "0.4.1" 4 | description = "Potnia is an open-source Python library designed to convert Romanized transliterations of ancient texts into Unicode representations of their respective native scripts." 5 | authors = ["Emily Tour ", "Kabir Manandhar Shrestha", "Robert Turnbull"] 6 | license = "Apache-2.0" 7 | readme = "README.rst" 8 | repository = "https://github.com/AncientNLP/potnia/" 9 | documentation = "https://AncientNLP.github.io/potnia" 10 | homepage = "https://github.com/AncientNLP/potnia/" 11 | keywords = ["linear a", "linear b", "sumerian", "akkadian", "cuneiform", "unicode"] 12 | # For classifiers see https://pypi.org/classifiers/ 13 | classifiers = [ 14 | "License :: OSI Approved :: Apache Software License", 15 | "Intended Audience :: Science/Research", 16 | "Topic :: Software Development :: Libraries :: Python Modules", 17 | ] 18 | 19 | [tool.poetry.dependencies] 20 | python = ">=3.10,<3.13" 21 | pyyaml = "^6.0.1" 22 | typer = "^0.12.5" 23 | guigaga = ">=0.0.5" 24 | numpy = "<2" 25 | pybtex = ">=0.24.0" 26 | pybtexnbib = ">=0.1.1" 27 | setuptools = "^75.8.0" 28 | 29 | 30 | [tool.poetry.group.dev.dependencies] 31 | pytest = "^7.4.4" 32 | coverage = ">=7.4.3" 33 | Sphinx = ">=5.0.0" 34 | sphinx-rtd-theme = ">=1.0.0" 35 | sphinx-autobuild = ">=2021.3.14" 36 | sphinx-copybutton = ">=0.4.0" 37 | myst-parser = "^3.0.1" 38 | 39 | 40 | [build-system] 41 | requires = ["poetry-core"] 42 | build-backend = "poetry.core.masonry.api" 43 | 44 | [tool.pytest.ini_options] 45 | filterwarnings = ["ignore::DeprecationWarning"] 46 | 47 | [tool.poetry.scripts] 48 | potnia = "potnia.main:app" -------------------------------------------------------------------------------- /potnia/scripts/linear_b.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | from ..script import Script 4 | 5 | @dataclass 6 | class LinearB(Script): 7 | """ 8 | Class for handling text transliteration and unicode conversion for Linear B.
9 | 10 | To use the singleton instance, import like so: 11 | ``from potnia import linear_b`` 12 | 13 | Designed especially for texts from DĀMOS (Database of Mycenaean at Oslo): https://damos.hf.uio.no/ 14 | and LiBER (Linear B Electronic Resources): https://liber.cnr.it/ 15 | 16 | Attributes: 17 | config (str): Path to the configuration file or configuration data in string format. 18 | By default, it uses the 'linear_b.yaml' file in the 'data' directory. 19 | """ 20 | config:str = "linear_b" 21 | 22 | def regularize(self, text: str) -> str: 23 | """ 24 | Applies regularization rules to a given string. 25 | 26 | Args: 27 | text (str): Text string to be regularized. 28 | 29 | Returns: 30 | str: Regularized text string. 31 | """ 32 | text = super().regularize(text) 33 | 34 | # Ensure there are informative characters left in the text 35 | informative_chars = set(list(re.sub(r'[%\s]', "", text))) 36 | if len(informative_chars) == 0: 37 | return "" 38 | 39 | return text 40 | 41 | def tokenize_unicode(self, text:str) -> list[str]: 42 | """ 43 | Tokenizes a unicode string by splitting and joining words with dashes. 44 | 45 | Args: 46 | text (str): Input text in unicode format. 47 | 48 | Returns: 49 | list[str]: List of tokenized strings. 50 | """ 51 | words = ['-'.join(word) for word in text.split()] 52 | text = ' '.join(words) 53 | return list(text) 54 | 55 | 56 | linear_b = LinearB() -------------------------------------------------------------------------------- /tests/expected/linear_b_tokenize_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "]ra-ma-na , / e-ne-ra MUL[": [']', 'ra', 'ma', 'na', ' ', ',', ' ', '/', ' ', 'e', 'ne', 'ra', ' ', 'MUL', '['] 2 | "] ko-wo / m\u0323e\u0323[-zo] 1 ko-wo / me-wi-jo 2 [": [']', ' ', 'ko', 'wo', ' ', '/', ' ', 'me', '[', 'zo', ']', ' ', '1', ' ', 'ko', 'wo', ' ', '/', ' ', 'me', 'wi', 'jo', ' ', '2', ' ', '['] 3 | "]wa VIR 1 MUL 2 'ko-wa 1' ko-wo 1": [']', 'wa', ' ', 'VIR', ' ', '1', ' ', 'MUL', ' ', '2', ' ', "'", 'ko', 'wa', ' ', '1', "'", ' ', 'ko', 'wo', ' ', '1'] 4 | "]qa-ra / re-me-to * 168 + SE 28": [']', 'qa', 'ra', ' ', '/', ' ', 're', 'me', 'to', ' ', '*168+SE', ' ', '28'] 5 | "da-pu₂-ri-to-jo , / po-ti-ni-ja 'me-ri' * 209 VAS 1": ['da', 'pu₂', 'ri', 'to', 'jo', ' ', ',', ' ', '/', ' ', 'po', 'ti', 'ni', 'ja', ' ', "'", 'me', 'ri', "'", ' ', '*209VAS', ' ', '1'] 6 | "po-*34-wi-do ⟦TUN⟧ BIG[": ['po', '*34', 'wi', 'do', ' ', '⟦', 'TUN', '⟧', ' ', 'BIG', '['] 7 | "inf . mut .": ['inf.', ' ', 'mut.'] 8 | "] vacat [": [']', ' ', 'vacat', ' ', '['] 9 | "] vest ., / su-ri-mo , u-ta-jo-jo , o OVIS m 85[\u00a0] vac .": [']', ' ', 'vest.', ',', ' ', '/', ' ', 'su', 'ri', 'mo', ' ', ',', ' ', 'u', 'ta', 'jo', 'jo', ' ', ',', ' ', 'o', ' ', 'OVISm', ' ', '85', '[', ' ', ']', ' ', 'vac.'] 10 | "su-ma-no / ti-ri-to [ vestigia? ] vacat": ['su', 'ma', 'no', ' ', '/', ' ', 'ti', 'ri', 'to', ' ', '[', ' ', 'vestigia', '?', ' ', ']', ' ', 'vacat'] 11 | "v.": ["v."] 12 | "l . s .": ['l.', ' ', 's.'] 13 | "l\u0323a\u0323t\u0323 . i\u0323n\u0323f\u0323 .": ['lat.', ' ', 'inf.'] 14 | "l .
i .": ["l."," ","i."] 15 | "] Graffito [": [']', ' ', 'Graffito', ' ', '['] 16 | "CAP f 130 SUS 17 SUS f 41 BOS m 2 BOS f 4": ['CAPf', ' ', '130', ' ', 'SUS', ' ', '17', ' ', 'SUSf', ' ', '41', ' ', 'BOSm', ' ', '2', ' ', 'BOSf', ' ', '4'] 17 | "i[-qi-ja?": ['i', '[', 'qi', 'ja', '?'] 18 | "]2 TELA 2 + PU 90": ["]","2"," ","TELA2+PU"," ", "90"] 19 | "]TELA 1 1 LANA 3": ["]", "TELA1", " ", "1", " ", "LANA"," ","3"] -------------------------------------------------------------------------------- /potnia/scripts/hittite.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from ..script import Script 3 | 4 | 5 | @dataclass 6 | class Hittite(Script): 7 | """ 8 | Class for handling text transliteration and unicode conversion to Hittite. 9 | 10 | To use the singleton instance, import like so: 11 | ``from potnia import hittite`` 12 | 13 | Designed especially for texts from the Catalog der Texte der Hethiter (CTH): https://www.hethport.uni-wuerzburg.de/CTH/index.php 14 | 15 | Attributes: 16 | config (str): Path to the configuration file or configuration data in string format. 17 | By default, it uses the 'hittite.yaml file in the 'data' directory. 18 | """ 19 | config:str = "hittite" 20 | 21 | def tokenize_transliteration(self, input_string:str) -> list[str]: 22 | """ 23 | Tokenizes transliterated text according to specific patterns. 24 | 25 | Args: 26 | text (str): Input text in transliterated format. 27 | 28 | Returns: 29 | list[str]: List of tokens 30 | """ 31 | tokens = [] 32 | token = "" 33 | i = 0 34 | 35 | while i < len(input_string): 36 | char = input_string[i] 37 | 38 | # Handle characters ']', '[', and ' ' 39 | if char in '[] ': 40 | if token: 41 | tokens.append(token) 42 | token = "" 43 | tokens.append(char) 44 | # Handle other characters 45 | elif char in ['-','‑']: 46 | if token: 47 | tokens.append(token) 48 | token = "" 49 | else: 50 | token += char 51 | i += 1 52 | 53 | # Add the last token if it exists 54 | if token: 55 | tokens.append(token) 56 | 57 | return tokens 58 | 59 | 60 | 61 | 62 | hittite = Hittite() -------------------------------------------------------------------------------- /tests/expected/errors.txt: -------------------------------------------------------------------------------- 1 | # ISSUE 1 2 | 3 | linear_b.tokenize_transliteration("]-o-pe-ro *209VAS 'ME<±RI>' 5 [") 4 | # We want to see MERI as a single token 5 | [']', 'o', 'pe', 'ro', ' ', '*209VAS', ' ', "'", 'ME', '<', '±RI', '>', "'", ' ', '5', ' ', '['] 6 | 7 | linear_b("]-o-pe-ro *209VAS 'ME<±RI>' 5 [", regularize=True) 8 | '%𐀃𐀟𐀫 𐃨 𐀕±RI 5 %' 9 | 10 | # ISSUE 2 11 | 12 | linear_b("]-i-to , / da-nwa ME±RI *209VAS+A 16 *172 8", regularize=True) 13 | '%𐀂𐀵 𐀅𐁅 𐂙 𐃨+𐀀 16 𐂴 8' but should get "%𐀂𐀵 𐀅𐁅 𐂙 𐃨+𐀀 16 𐂹 8" 14 | 15 | # ISSUE 3 16 | 17 | linear_b.tokenize_transliteration("pa-si-te-o-i / me-ri *209VAS 1 da-pu2-ri-to-jo , / po-ti-ni-ja 'me-ri' *209VAS 1") 18 | ['pa', 'si', 'te', 'o', 'i', ' ', '/', ' ', 'me', 'ri', ' ', '*209VAS', ' ', '1', ' ', 'da', 'pu2', 'ri', 'to', 'jo', ' ', ',', ' ', '/', ' ', 'po', 'ti', 'ni', 'ja', ' ', "'", 'me', 'ri', "'", ' ', '*209VAS', ' ', '1'] 19 | 20 | linear_b("pa-si-te-o-i / me-ri *209VAS 1 da-pu2-ri-to-jo , / po-ti-ni-ja 'me-ri' *209VAS 1", regular 21 | ize=True) 22 | '𐀞𐀯𐀳𐀃𐀂 𐀕𐀪 𐃨 1 𐀅pu2𐀪𐀵𐀍 𐀡𐀴𐀛𐀊 𐀕𐀪 𐃨 1' pu2 is not mapped? 
23 | 24 | # ISSUE 4 25 | 26 | linear_b("a-ka--[ ]--jo-jo , me-no-[ da-pu2-ri-[-to-jo ]-po-ti-ni-ja ri *166+WE 22-[", regularize=Tr 27 | ue) 28 | '𐀀𐀏% %𐀍𐀍 𐀕𐀜% 𐀅pu2𐀪%𐀵𐀍 %𐀡𐀴𐀛𐀊 𐀪 𐂮+𐀸 22%' Again pu2 not mapped? 29 | 30 | # ISSUE 5 31 | linear_b("a-ka--[ ]--jo-jo , me-no-[ da-pu2-ri-[-to-jo ]-po-ti-ni-ja ri *166+WE 22-[", regularize=Tr 32 | ue) 33 | '𐀀𐀏% %𐀍𐀍 𐀕𐀜% 𐀅pu2𐀪%𐀵𐀍 %𐀡𐀴𐀛𐀊 𐀪 𐂮+𐀸 22%' 34 | 35 | # ISSUE 6 36 | linear_b("] e-ko-so OVIS:m 100 LANA [ ]-da-ro , / X LANA [ lat. inf. ]-a3 [", regularize=True) 37 | '% 𐀁𐀒𐀰 𐂇 100 𐂝 % %𐀅𐀫 𐂝 % %a3 %' is a3 mapping broken? 38 | 39 | # ISSUE 7 40 | linear_b("fragmentum A fragmentum B vacat [ sup. mut. e-me-si-jo-jo-[ ] 3-[ pa-na-so GRA 100-[ ]-vac.-[ ta-ra-qo GRA [ inf. mut. ta-u-pa-du-we GRA-[ a-ro-ja-[ pu-na-so-[ inf. mut.", regularize=True) 41 | '𐀀 % 𐀁𐀕𐀯𐀍𐀍% % 3% 𐀞𐀙𐀰 𐂎 100% %% 𐀲𐀨𐀦 𐂎 % 𐀲𐀄𐀞𐀉𐀸 𐂎% 𐀀𐀫𐀊% 𐀢𐀙𐀰%' but should be "% % 𐀁𐀕𐀯𐀍𐀍% % 3% 𐀞𐀙𐀡 𐂎 100% %% 𐀲𐀨𐀦 𐂎 % 𐀲𐀄𐀞𐀉𐀸 𐂎% 𐀀𐀫𐀊% 𐀢𐀙𐀰%" 42 | 43 | # ISSUE 8 44 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | on: 4 | push: 5 | branches: main 6 | 7 | # Allows you to run this workflow manually from the Actions tab 8 | workflow_dispatch: 9 | 10 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 11 | permissions: 12 | contents: read 13 | pages: write 14 | id-token: write 15 | 16 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 17 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 18 | concurrency: 19 | group: "pages" 20 | cancel-in-progress: false 21 | 22 | jobs: 23 | build: 24 | environment: 25 | name: github-pages 26 | url: ${{ steps.deployment.outputs.page_url }} 27 | runs-on: ubuntu-latest 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | python-version: ["3.10"] 32 | steps: 33 | - uses: actions/checkout@v3 34 | - name: Install poetry 35 | run: pipx install poetry 36 | - name: Install dependencies for Python ${{ matrix.python-version }} 37 | uses: actions/setup-python@v3 38 | with: 39 | python-version: ${{ matrix.python-version }} 40 | cache: 'poetry' 41 | - run: | 42 | poetry env use "${{ matrix.python-version }}" 43 | poetry install 44 | - name: Docs 45 | run: | 46 | poetry run sphinx-build -b html docs gh-pages 47 | - name: Coverage 48 | run: | 49 | poetry run coverage run -m pytest 50 | echo "COVERAGE=$(poetry run coverage report --precision 2 | grep TOTAL | tr -s ' ' | cut -f 4 -d " ")" >> $GITHUB_ENV 51 | poetry run coverage html --directory gh-pages/coverage 52 | - name: Setup Pages 53 | uses: actions/configure-pages@v3 54 | - name: Upload artifact 55 | uses: actions/upload-pages-artifact@v3 56 | with: 57 | path: 'gh-pages' 58 | - name: Deploy to GitHub Pages 59 | id: deployment 60 | uses: actions/deploy-pages@v4 61 | - name: Create Coverage Badge 62 | uses: schneegans/dynamic-badges-action@v1.1.0 63 | with: 64 | auth: ${{ secrets.GIST_SECRET }} 65 | gistID: e640f26fb59e39e3051de8fbf020de62 66 | filename: coverage-badge.json 67 | label: coverage 68 | message: ${{ env.COVERAGE }} 69 | color: green 70 | -------------------------------------------------------------------------------- /potnia/scripts/arabic.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | from ..script import Script 4 | 5 | @dataclass 6 | class 
Arabic(Script): 7 | """ 8 | Class for handling text transliteration and unicode conversion to Arabic. 9 | 10 | To use the singleton instance, import like so: 11 | ``from potnia import arabic`` 12 | 13 | Uses the DIN 31635 standard for Arabic transliteration. 14 | 15 | If you need the Tim Buckwalter transliteration system, then use the PyArabic library. 16 | 17 | Attributes: 18 | config (str): Path to the configuration file or configuration data in string format. 19 | By default, it uses the 'arabic.yaml' file in the 'data' directory. 20 | """ 21 | config:str = "arabic" 22 | 23 | def to_unicode(self, text:str, regularize:bool=False) -> str: 24 | """ 25 | Converts transliterated text to unicode format. 26 | 27 | Args: 28 | text (str): Input text in transliterated format. 29 | regularize (bool, optional): Whether to apply regularization. Defaults to False. 30 | 31 | Returns: 32 | str: Text converted to unicode format, optionally regularized. 33 | """ 34 | # if word ends with 'atun' then make it damataan with taa marbuta 35 | text = re.sub(r'(\w\w)atun\b', r'\1'+'َ\u0629\u064C', text) 36 | # if word has uʾ then make it a hamza on top of waw 37 | text = re.sub(r'uʾ', '\u0624', text) 38 | # if word ends with 'un' then make it damataan 39 | text = re.sub(r'(\w\w)un\b', r'\1'+'\u064C', text) 40 | # if word ends with 'in' then make it kasrataan 41 | text = re.sub(r'(\w\w)in\b', r'\1'+'\u064D', text) 42 | # if word ends with 'an' then make it fatatan 43 | text = re.sub(r'(\w\w)an\b', r'\1'+'\u064Bا', text) 44 | # if word starts with 'i' or 'a' then make it an alif with hamza 45 | text = re.sub(r'\b[i]', 'إ', text) 46 | text = re.sub(r'-[i]', "-إ", text) 47 | text = re.sub(r'\b[a]', 'أ', text) 48 | text = re.sub(r'-[a]', "-أ", text) 49 | 50 | text = re.sub(r'\bʾa', 'أ', text) 51 | 52 | # definite article 53 | text = re.sub(r'أl-', "ال", text) 54 | 55 | text = super().to_unicode(text, regularize) 56 | 57 | # fix the word 'اسم' if it is written as 'إسم' 58 | text = re.sub(r"إسم", "اسم", text) 59 | 60 | arabic_consonants_with_shadda = [ 61 | 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 62 | 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 63 | 'ه', 'و', 'ي' 64 | ] 65 | for consonant in arabic_consonants_with_shadda: 66 | text = re.sub(f'{consonant}{consonant}', f'{consonant}\u0651', text) 67 | 68 | return text 69 | 70 | 71 | arabic = Arabic() -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from typer.testing import CliRunner 4 | from potnia.main import app 5 | from unittest.mock import patch 6 | from .data import expected 7 | 8 | runner = CliRunner() 9 | 10 | 11 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_unicode")) 12 | def test_linear_a_main(test_input, expected): 13 | result = runner.invoke(app, ["linear-a", test_input]) 14 | assert expected in result.stdout 15 | 16 | 17 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_unicode")) 18 | def test_linear_b_main(test_input, expected): 19 | result = runner.invoke(app, ["linear-b", test_input]) 20 | assert expected in result.stdout 21 | 22 | 23 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_unicode_regularized")) 24 | def test_linear_b_main_regularized(test_input, expected): 25 | result = runner.invoke(app, ["linear-b", test_input, "--regularize"]) 26 | assert expected in result.stdout 27 | 28 | 29 |
@pytest.mark.parametrize("test_input,expected", expected("hittite_unicode")) 30 | def test_hittite_main(test_input, expected): 31 | result = runner.invoke(app, ["hittite", test_input]) 32 | assert expected in result.stdout 33 | 34 | 35 | # @pytest.mark.parametrize("test_input,expected", expected("luwian_unicode")) 36 | # def test_luwian_main(test_input, expected): 37 | # result = runner.invoke(app, ["luwian", test_input]) 38 | # assert expected in result.stdout 39 | 40 | 41 | @pytest.mark.parametrize("test_input,expected", expected("arabic_unicode")) 42 | def test_arabic_main(test_input, expected): 43 | result = runner.invoke(app, ["arabic", test_input]) 44 | assert expected in result.stdout 45 | 46 | 47 | def test_bibtex(): 48 | result = runner.invoke(app, ["bibtex"]) 49 | assert "Journal of Open Source Software" in result.stdout 50 | assert "10.21105/joss.07725" in result.stdout 51 | assert "2025" in result.stdout 52 | 53 | 54 | def test_bibliography(): 55 | result = runner.invoke(app, ["bibliography"]) 56 | assert "Emily Tour" in result.stdout 57 | assert "Kabir" in result.stdout 58 | assert "Turnbull" in result.stdout 59 | assert "2025" in result.stdout 60 | 61 | 62 | def test_gui_launch(): 63 | # Mock the GUIGAGA class and its launch method to avoid actually launching the GUI during the test 64 | with patch("guigaga.guigaga.GUIGAGA") as MockGUIGAGA: 65 | # Create a mock instance for the GUIGAGA class 66 | mock_gui = MockGUIGAGA.return_value 67 | mock_gui.launch.return_value = None 68 | 69 | # Run the CLI command 70 | result = runner.invoke(app, ["gui"]) 71 | 72 | # Assert that the command ran successfully (exit code 0) 73 | assert result.exit_code == 0 74 | 75 | # Assert that the GUIGAGA instance was created and launch was called 76 | MockGUIGAGA.assert_called_once() 77 | mock_gui.launch.assert_called_once() -------------------------------------------------------------------------------- /docs/additions.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | Adding New Scripts to Potnia 3 | ============================ 4 | 5 | Potnia allows for the easy integration of new ancient scripts by using a single YAML file per script. This file will contain the mappings for syllabograms, logograms (if applicable), transliteration rules, and regularization patterns. Below are the steps for adding a new script, along with examples. 6 | 7 | Steps to Add a New Script 8 | ---------------------------- 9 | 10 | 1. **Create a Single YAML Mapping and Rules File**: Define the mappings for syllabograms, logograms (if applicable), and the rules for transliteration and regularization. Here's an example for Linear B: 11 | 12 | .. code-block:: yaml 13 | 14 | mappings: 15 | a: 𐀀 16 | e: 𐀁 17 | i: 𐀂 18 | # logograms 19 | VIR: 𐂀 # man 20 | MUL: 𐂁 # woman 21 | transliteration: 22 | - ['ro2', '𐁊'] 23 | regularization: 24 | - ['\\[•~\\]', ''] # Remove uncertain readings 25 | - ['\\bqs\\b', '%'] # Handle missing elements 26 | 27 | 2. **Add the New Script Class**: Create a `Script` class that points to the new YAML file (usually in the `scripts` directory). For example: 28 | 29 | .. code-block:: python 30 | 31 | from dataclasses import dataclass 32 | from ..script import Script 33 | 34 | @dataclass 35 | class NewScript(Script): 36 | config: str = "new_script" # Refers to the YAML file name 37 | 38 | new_script = NewScript() 39 | 40 | 3. **Add to __init__.py**: Add the new script to the ``__init__.py`` file. For example: 41 | 42 | ..
code-block:: python 43 | 44 | from .scripts.new_script import new_script, NewScript 45 | 46 | 4. **Write Test Cases**: Add test cases to ensure that the new script's transliteration and Unicode mapping work as expected. Example: 47 | 48 | .. code-block:: yaml 49 | 50 | test_newscript_unicode.yaml: 51 | "a-e-i": "𐀀𐀁𐀂" 52 | "VIR MUL": "𐂀𐂁" 53 | 54 | Then, write a test function to check the output of the new script: 55 | 56 | .. code-block:: python 57 | 58 | @pytest.mark.parametrize("test_input,expected", expected("test_newscript_unicode")) 59 | def test_test_newscript_unicode(test_input, expected): 60 | result = new_script(test_input) 61 | assert result == expected, f"Expected: new_script('{test_input}') to produce '{expected}' but got '{result}'" 62 | 63 | 64 | 5. **Usage Example**: Once the new script is added, it can be used as follows: 65 | 66 | .. code-block:: python 67 | 68 | from potnia import new_script 69 | 70 | # Convert transliterated text to Unicode 71 | new_script("a-e-i") 72 | 73 | # Regularize text 74 | new_script("a-[•~]", regularize=True) 75 | 76 | This approach centralizes all configuration for a given script into a single YAML file, simplifying the process of adding new scripts while maintaining Potnia's flexible and modular design. -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Contributing 3 | ======================= 4 | 5 | These practices are subject to change based on the decisions of the team. 6 | 7 | - Use clear and explicit variable names. The variable names are typically more verbose than those in fastai. 8 | - Python code should be formatted using black with the settings in pyproject.toml. The maximum line length is 120 characters. 9 | - Contributions should be committed to a new branch and will be merged with main only after tests and documentation are complete. 10 | 11 | Installation 12 | ================== 13 | 14 | To install Potnia for development, run the following command: 15 | 16 | .. code-block:: bash 17 | 18 | git clone https://github.com/AncientNLP/potnia.git 19 | cd potnia 20 | 21 | Make sure that poetry is installed on your system. If not, see the `instructions `_. 22 | 23 | Then install the dependencies using poetry: 24 | 25 | .. code-block:: bash 26 | 27 | poetry install 28 | 29 | 30 | Testing 31 | ================== 32 | 33 | - All tests must be passing before merging with the ``main`` branch. 34 | - Tests are automatically included in the CI/CD pipeline using Github actions. 35 | 36 | Git Commits 37 | =========== 38 | 39 | We use the `git3moji `_ standard for expressive git commit messages. 40 | Use one of the following five short emojis at the start of your git commit messages: 41 | 42 | - ``:zap:`` ⚡️ – Features and primary concerns 43 | - ``:bug:`` 🐛 – Bugs and fixes 44 | - ``:tv:`` 📺 – CI, tooling, and configuration 45 | - ``:cop:`` 👮 – Tests and linting 46 | - ``:abc:`` 🔤 – Documentation 47 | 48 | As far as possible, please keep your git commits granular and focussed on one thing at a time. 49 | Please cite the number of a Github issue if it relates to your commit.
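For example, a granular documentation-only commit might read ``:abc: fix typo in fonts guide`` (a hypothetical message, shown only to illustrate the format).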
50 | 51 | Documentation 52 | ================== 53 | 54 | - Docstrings for Python functions should use the Google docstring convention (https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) 55 | - Documentation is generated using Sphinx and automatically deployed as part of the CI/CD pipeline. 56 | - Docs should be written in reStructuredText. 57 | 58 | Files need to start with a heading for the section. The convention used here is to use the equals sign above and below the heading:: 59 | 60 |     =============== 61 |     Section Heading 62 |     =============== 63 | 64 | Subsections also use an equals sign but just below the heading:: 65 | 66 |     Subsection Heading 67 |     ================== 68 | 69 | Subsubsections have a single dash below the heading:: 70 | 71 |     Subsubsection Heading 72 |     --------------------- 73 | 74 | Try not to have any other sections within this, but if it is necessary, use tildes below the heading:: 75 | 76 |     Further Subsection Headings 77 |     ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 78 | 79 | Other information for using reStructuredText in Sphinx can be found here: https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-primer and https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html. 80 | 81 | Code of Conduct 82 | ================== 83 | 84 | We follow the `Contributor Covenant Code of Conduct `_ 85 | for all community contributions. -------------------------------------------------------------------------------- /potnia/scripts/linear_a.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | from ..script import Script 4 | 5 | @dataclass 6 | class LinearA(Script): 7 |     """ 8 |     Class for handling text transliteration and unicode conversion for Linear A. 9 | 10 |     To use the singleton instance, import like so: 11 |     ``from potnia import linear_a`` 12 | 13 |     Attributes: 14 |         config (str): Path to the configuration file or configuration data in string format. 15 |             By default, it uses the 'linear_a.yaml' file in the 'data' directory. 16 |     """ 17 |     config:str = "linear_a.yaml" 18 | 19 |     def tokenize_transliteration(self, input_string: str) -> list[str]: 20 |         """ 21 |         Tokenizes transliterated text according to specific patterns. 22 | 23 |         Args: 24 |             input_string (str): Input text in transliterated format. 25 | 26 |         Returns: 27 |             list[str]: List of tokens 28 |         """ 29 |         tokens = [] 30 |         token = "" 31 |         i = 0 32 | 33 |         while i < len(input_string): 34 |             char = input_string[i] 35 | 36 |             # Check for special sequences like "[?]" and "[unclassified]" 37 |             if char == '[': 38 |                 if input_string[i:i + 3] == '[?]': 39 |                     if token: 40 |                         tokens.append(token) 41 |                     tokens.append("[?]") 42 |                     token = "" 43 |                     i += 3 # Skip past "[?]" 44 |                     continue 45 |                 elif input_string[i:i + 14] == '[unclassified]': 46 |                     if token: 47 |                         tokens.append(token) 48 |                     tokens.append("[unclassified]") 49 |                     token = "" 50 |                     i += 14 # Skip past "[unclassified]" 51 |                     continue 52 | 53 |             # Handle characters ']', '[', and ' ' 54 |             if char in '[] ': 55 |                 if token: 56 |                     tokens.append(token) 57 |                     token = "" 58 |                 tokens.append(char) 59 |             # Handle hyphens, which separate signs but are not kept as tokens 60 |             elif char == '-': 61 |                 if token: 62 |                     tokens.append(token) 63 |                     token = "" 64 |             # Accumulate any other character into the current token 65 |             else: 66 |                 token += char 67 |             i += 1 68 | 69 |         # Add the last token if it exists 70 |         if token: 71 |             tokens.append(token) 72 | 73 |         return tokens 74 | 75 |     def tokenize_unicode(self, text:str) -> list[str]: 76 |         """ 77 |         Tokenizes a unicode string by splitting and joining words with dashes. 
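        For example (illustrative): two adjacent Linear A signs "𐘇𐘈" are tokenized as ["𐘇", "-", "𐘈"].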
77 | 78 | Args: 79 | text (str): Input text in unicode format. 80 | 81 | Returns: 82 | list[str]: List of tokenized strings. 83 | """ 84 | def is_aegean(char): 85 | return "\U00010000" <= char <= "\U0001007F" or "\U00010600" <= char <= "\U0001077F" 86 | 87 | # Insert hyphens between consecutive Linear B characters 88 | modified_text = "" 89 | prev_was_aegean = False 90 | 91 | for char in text: 92 | if is_aegean(char): 93 | if prev_was_aegean: 94 | modified_text += "-" # Add hyphen if previous character was also Linear B 95 | modified_text += char 96 | prev_was_aegean = True 97 | else: 98 | modified_text += char 99 | prev_was_aegean = False # Reset flag on encountering a non-Linear B character 100 | 101 | return list(modified_text) 102 | 103 | 104 | linear_a = LinearA() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | docshtml/ 163 | paper.pdf 164 | jats/ 165 | paper.preprint.tex 166 | paper.html -------------------------------------------------------------------------------- /potnia/main.py: -------------------------------------------------------------------------------- 1 | import typer 2 | from pybtex import PybtexEngine 3 | from potnia import linear_a as linear_a_script 4 | from potnia import linear_b as linear_b_script 5 | from potnia import hittite as hittite_script 6 | from potnia import arabic as arabic_script 7 | # from potnia import luwian as luwian_script 8 | # from potnia import akkadian as akkadian_script 9 | from rich.console import Console 10 | 11 | from .enums import BibliographyStyle, BibliographyFormat 12 | from .data import DATA_DIR 13 | 14 | BIBTEX_PATH = DATA_DIR / "potnia.bib" 15 | 16 | app = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]}) 17 | 18 | TEXT_ARGUMENT = typer.Argument(help="The transliterated text to be converted to Unicode.") 19 | REGULARIZATION_DEFAULT = typer.Option(False, help="Whether or not to regularize the output.") 20 | 21 | 22 | @app.command() 23 | def linear_a(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 24 | """ Converts a Linear A text to Unicode. """ 25 | if isinstance(text, list): 26 | text = " ".join(text) 27 | print(linear_a_script(text, regularize=regularize)) 28 | 29 | 30 | @app.command() 31 | def linear_b(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 32 | """ Converts a Linear B text to Unicode. """ 33 | if isinstance(text, list): 34 | text = " ".join(text) 35 | print(linear_b_script(text, regularize=regularize)) 36 | 37 | 38 | @app.command() 39 | def hittite(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 40 | """ Converts a Hittite text to Unicode. """ 41 | if isinstance(text, list): 42 | text = " ".join(text) 43 | print(hittite_script(text, regularize=regularize)) 44 | 45 | 46 | # @app.command() 47 | # def luwian(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 48 | # """ Converts a Luwian text to Unicode. 
""" 49 | # if isinstance(text, list): 50 | # text = " ".join(text) 51 | # print(luwian_script(text, regularize=regularize)) 52 | 53 | 54 | # @app.command() 55 | # def akkadian(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 56 | # """ Converts a Akkadian text to Unicode. """ 57 | # if isinstance(text, list): 58 | # text = " ".join(text) 59 | # print(akkadian_script(text, regularize=regularize)) 60 | 61 | 62 | @app.command() 63 | def arabic(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 64 | """ Converts a Arabic text to Unicode. """ 65 | if isinstance(text, list): 66 | text = " ".join(text) 67 | print(arabic_script(text, regularize=regularize)) 68 | 69 | 70 | @app.command() 71 | def bibtex(): 72 | """ Prints the BibTeX entry for this software package. """ 73 | bibtex_str = BIBTEX_PATH.read_text() 74 | print(bibtex_str) 75 | 76 | 77 | @app.command() 78 | def bibliography( 79 | style:BibliographyStyle="plain", 80 | output:BibliographyFormat="plaintext", 81 | ): 82 | """ Displays the bibliography. """ 83 | engine = PybtexEngine() 84 | bibliography_string = engine.format_from_files( 85 | bib_files_or_filenames=[BIBTEX_PATH], 86 | style=str(style), 87 | output_backend=str(output), 88 | ) 89 | print(bibliography_string) 90 | 91 | 92 | @app.callback() 93 | def potnia(): 94 | """ 95 | Potnia Logo 96 | 97 | Potnia is an open-source Python library designed to convert Romanized transliterations of ancient texts into Unicode representations of ther respective native scripts. 98 | """ 99 | 100 | 101 | @app.command() 102 | def gui(ctx: typer.Context, share:bool=False): 103 | """ Launches the Potnia GUI. """ 104 | import gradio as gr 105 | from guigaga.guigaga import GUIGAGA 106 | theme = gr.themes.Soft( 107 | primary_hue="rose", 108 | secondary_hue="pink", 109 | text_size="lg", 110 | ) 111 | gui = GUIGAGA( 112 | typer.main.get_group(app), 113 | click_context=ctx, 114 | theme=theme, 115 | allow_file_download=False, 116 | ) 117 | gui.launch(launch_kwargs={"share": share}) 118 | 119 | 120 | @app.command() 121 | def bibtex(): 122 | """ Prints the BibTeX entry for this software package. """ 123 | bibtex_str = BIBTEX_PATH.read_text() 124 | print(bibtex_str) 125 | -------------------------------------------------------------------------------- /docs/linear_b.md: -------------------------------------------------------------------------------- 1 | # Linear B Conversion Rules 2 | 3 | This document outlines the rules used in the conversion process for Linear B texts. The process involves tokenization, regularization, and handling of special patterns to prepare the text for further analysis. 4 | 5 | ## 1. Tokenization Rules 6 | 7 | Tokenization is the process of breaking down the text into individual elements or tokens. For Linear B, this involves handling various special cases and patterns. 8 | 9 | ### a) Space Normalization 10 | 11 | Replaces non-breaking spaces and certain diacritic marks with regular spaces and empty strings respectively, cleaning up the text for uniform processing. 12 | 13 | - Replace non-breaking spaces (`\u00a0`) with regular spaces. 14 | - Remove combining dot below (`\u0323`) to simplify character representation. 
15 | 16 | ### b) Special Pattern Handling 17 | 18 | | Pattern | Regex | Description | 19 | |---------|-------|-------------| 20 | | Combine terms with 'm' or 'f' | `r'\b({})\s([mf])\b'.format('\|'.join(['BOS', 'SUS', 'OVIS', 'CAP', 'EQU']))` to `r'\1\2'`| Combines terms like 'BOS', 'SUS', 'OVIS', 'CAP', 'EQU' with following 'm' or 'f' to form a single token, facilitating cleaner tokenization. | 21 | | Add hyphen after ']' | `r'\](?=[^\s])'` to ` r']-'` | Adds a hyphen right after ']' when it is followed by a non-space character, maintaining syntax integrity in tokenization. | 22 | | Add hyphen before '[' | `r'(?<=[^\s])\['` to `r'-['` | Inserts a hyphen right before '[' when it is preceded by a non-space character, ensuring consistent formatting for special handling. | 23 | | TELA Number Combination | `r"TELA\s+(\d+)"` to `r'TELA\1'` | Combines the term "TELA" with following numbers without spaces. | 24 | | Combine '*' with numeral | `r'\* (\d+)'` to `r'*\1'` | Directly attaches '*' to the following numeral without a space, aiding in recognizing these combinations as distinct tokens. | 25 | | Combine '+' with ideograms | `r'\+ ([^\s]+)'` and `r'([^\s]) \+'` | Merges '+' with adjacent ideograms without space, preserving semantic units in tokenization. | 26 | | Attach 'VAS' | `r'([^\s]+) VAS'` to `r'\1VAS'` | Attaches 'VAS' directly to the preceding term without space, ensuring that it is processed as a single token. | 27 | | Handle abbreviations | `*[(rf'\b{term}\s?\.', term + '.') for term in ['vac', 'vest', 'l', 's', 'lat', 'inf', 'mut', 'sup', 'i']]` | Ensures common abbreviations (like 'vac', 'inf', etc.) are correctly punctuated with a period if missing, standardizing text format. | 28 | 29 | Iterates over each pattern-replacement pair, applying them sequentially to the text to ensure all intended formatting and corrections are made. 30 | 31 | ### c) Space Handling : 32 | 33 | Uses a placeholder character to temporarily replace spaces, facilitating token splitting based on special characters and preserved spaces. 34 | 35 | ```python 36 | space_placeholder = "\uE000" # Placeholder for spaces 37 | text = re.sub(r' ', space_placeholder, text) 38 | ``` 39 | 40 | ### d) Tokenization with Space Placeholder 41 | 42 | Splits the text based on special characters and the space placeholder, ensuring that meaningful elements like brackets, commas, and quotation marks are preserved as separate tokens. 43 | 44 | ```python 45 | special_chars_pattern = r'(\[|\]|\,|\'|\u27e6|\u27e7|-|\?|\u2e24|\u2e25|' + re.escape(space_placeholder) + ')' 46 | tokens = re.split(special_chars_pattern, text) 47 | ``` 48 | 49 | ### e) Final Tokenization 50 | 51 | Replace the placeholder with actual spaces and filter empty tokens: 52 | 53 | ```python 54 | tokenized = [tok if tok != space_placeholder else " " for tok in tokens if tok and tok != "-"] 55 | ``` 56 | 57 | These rules form the core of the Linear B conversion process, handling various special cases in the transliteration, tokenization, and regularization of the text. The process aims to preserve important linguistic features while standardizing the format for further processing or analysis. This standardization is crucial for consistent treatment of texts across different sources and editions. 58 | 59 | ## 2. Regularization Rules: 60 | 61 | This list of regular expressions identifies various patterns in the text that should be tokenised as is in the previous step, but then either removed or handled as a special case during subsequent regularization. 
62 | 63 | - `lat., l., inf., i., sup., s., dex., mut, verso, v., v.→, v.↓, Graffito, vacat, vac., deest, α, β, γ, supra sigillum, reliqua pars sine regulis`: Various annotations related to epigraphic features of the document, which should be removed at this step. 64 | - `fragmentum, qs, vestigia, vest.` and `][•`: Various annotations or specific punctuation denoting undetermined text parts, which should be handled as wildcards at this step (i.e. converted to `%`). 65 | - `/,'?⸤⸥<>`: Specific punctuation and bracket types, which should be removed at this step. 66 | - `⟦.*?⟧`: Matches text within these special double brackets, which indicate text erasures. Both punctuation and included text should be removed at this step. 67 | 68 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 |   and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 |   overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 |   advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 |   address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 |   professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement by email. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series 85 | of actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or 92 | permanent ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within 112 | the community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.0, available at 118 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 119 | 120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 121 | enforcement ladder](https://github.com/mozilla/diversity). 122 | 123 | [homepage]: https://www.contributor-covenant.org 124 | 125 | For answers to common questions about this code of conduct, see the FAQ at 126 | https://www.contributor-covenant.org/faq. Translations are available at 127 | https://www.contributor-covenant.org/translations. 
-------------------------------------------------------------------------------- /potnia/script.py: -------------------------------------------------------------------------------- 1 | import re 2 | from functools import reduce 3 | from pathlib import Path 4 | from dataclasses import dataclass 5 | 6 | from .data import read_data 7 | 8 | @dataclass 9 | class Script(): 10 | """ 11 | The abstract base class for handling text transliteration and unicode conversion. 12 | 13 | Attributes: 14 | config (str): Path to the configuration file or configuration data in YAML format. 15 | """ 16 | config:str 17 | 18 | def __post_init__(self): 19 | """Initializes configuration and sets up mappings, patterns, and regularization rules.""" 20 | 21 | if isinstance(self.config, (Path,str)): 22 | self.config = read_data(self.config) 23 | assert self.config, f"Configuration not found" 24 | 25 | self.transliteration_to_unicode_dict = self.config.get('mappings', {}) 26 | self.unicode_to_transliteration_dict = {} 27 | for k, v in self.transliteration_to_unicode_dict.items(): 28 | if v not in self.unicode_to_transliteration_dict: 29 | self.unicode_to_transliteration_dict[v] = k 30 | 31 | # Load patterns to ignore 32 | patterns_to_ignore = self.config.get('patterns_to_ignore', []) 33 | self.regex_to_ignore = [re.compile(pattern) for pattern in patterns_to_ignore] 34 | 35 | # Load regularization rules 36 | self.regularization_regex = [ 37 | (re.compile(re.sub(r'\\\\', r'\\', pattern)), replacement) 38 | for pattern, replacement in self.config.get('regularization', []) 39 | ] 40 | 41 | # Load transliteration rules 42 | self.transliteration_patterns = [ 43 | (re.compile(pattern),replacement) 44 | for pattern, replacement in self.config.get('tokenization', []) 45 | ] 46 | self.complex_symbols = self.config.get('complex_symbols', {}) 47 | self.special_chars_pattern = re.compile(self.config.get('special_chars_pattern', '')) 48 | self.restore_patterns = [ 49 | (re.compile(pattern),replacement) 50 | for pattern, replacement in self.config.get('restore_patterns', []) 51 | ] 52 | 53 | # Reverse the complex_symbols dictionary 54 | self.reversed_symbols = {v: k for k, v in self.complex_symbols.items()} 55 | 56 | def tokenize_unicode(self, text:str) -> list[str]: 57 | """ 58 | Tokenizes unicode text according to specific patterns. 59 | 60 | By default, it tokenizes each character as a separate token. 61 | This method can be overridden in subclasses to provide more complex tokenization. 62 | 63 | Args: 64 | text (str): Input text in unicode format. 65 | 66 | Returns: 67 | list[str]: List of tokens 68 | """ 69 | return list(text) 70 | 71 | def tokenize_transliteration(self, text:str) -> list[str]: 72 | """ 73 | Tokenizes transliterated text according to specific patterns. 74 | 75 | Args: 76 | text (str): Input text in transliterated format. 
77 | 78 | Returns: 79 | list[str]: List of tokens 80 | """ 81 | # Replace complex symbols with placeholders 82 | for symbol, placeholder in self.complex_symbols.items(): 83 | text = text.replace(symbol, placeholder) 84 | 85 | # Apply each pattern replacement in order 86 | for pattern, replacement in self.transliteration_patterns: 87 | text = pattern.sub(replacement, text) 88 | 89 | # Handle space replacement with a placeholder 90 | space_placeholder = "\uE000" # Placeholder for spaces 91 | text = text.replace(" ", space_placeholder) 92 | 93 | # Tokenize using the special characters pattern 94 | tokens = self.special_chars_pattern.split(text) 95 | 96 | # Apply processing to each token and filter out empty tokens 97 | tokenized = [ 98 | " " if tok == space_placeholder else 99 | reduce(lambda t, p: p[0].sub(p[1], t), self.restore_patterns, tok) 100 | for tok in tokens if tok and tok != "-" 101 | ] 102 | 103 | # Restore complex symbols using the reversed dictionary 104 | for placeholder, symbol in self.reversed_symbols.items(): 105 | tokenized = [tok.replace(placeholder, symbol) for tok in tokenized] 106 | 107 | return tokenized if tokenized else [""] 108 | 109 | def to_transliteration(self, text:str) -> str: 110 | """ 111 | Converts unicode text to transliteration format. 112 | 113 | NB. This function may not work as expected for all scripts/languages 114 | because there may not be a one-to-one mapping between unicode and transliteration. 115 | 116 | Args: 117 | text (str): Input text in unicode format. 118 | 119 | Returns: 120 | str: Transliterated text. 121 | """ 122 | tokens = self.tokenize_unicode(text) 123 | return "".join( 124 | [ 125 | self.unicode_to_transliteration_dict.get(token, token) 126 | for token in tokens 127 | ] 128 | ) 129 | 130 | def to_unicode(self, text:str, regularize:bool=False) -> str: 131 | """ 132 | Converts transliterated text to unicode format. 133 | 134 | Args: 135 | text (str): Input text in transliterated format. 136 | regularize (bool, optional): Whether to apply regularization. Defaults to False. 137 | 138 | Returns: 139 | str: Text converted to unicode format, optionally regularized. 140 | """ 141 | tokens = self.tokenize_transliteration(text) 142 | result = "".join([self.transliteration_to_unicode_dict.get(token, token) for token in tokens]) 143 | if regularize: 144 | result = self.regularize(result) 145 | return result 146 | 147 | def __call__(self, text:str, regularize:bool=False) -> str: 148 | """ 149 | Allows the class instance to be called as a function for unicode conversion. 150 | 151 | Args: 152 | text (str): Input text in transliterated format. 153 | regularize (bool, optional): Whether to apply regularization. Defaults to False. 154 | 155 | Returns: 156 | str: Text converted to unicode format, optionally regularized. 157 | """ 158 | return self.to_unicode(text, regularize=regularize) 159 | 160 | def regularize(self, string: str) -> str: 161 | """ 162 | Applies regularization rules to a given string. 163 | 164 | Args: 165 | string (str): Text string to be regularized. 166 | 167 | Returns: 168 | str: Regularized text string. 
169 | """ 170 | for pattern, replacement in self.regularization_regex: 171 | string = pattern.sub(replacement, string) 172 | 173 | for regex in self.regex_to_ignore: 174 | string = regex.sub("", string) 175 | string = re.sub(r'\s+', ' ', string) 176 | string = re.sub('mut','',string) 177 | return string.strip() 178 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================================================ 2 | Potnia 3 | ================================================================ 4 | 5 | .. image:: https://raw.githubusercontent.com/AncientNLP/potnia/main/docs/_static/img/PotniaLogo.png 6 | 7 | .. start-summary 8 | 9 | |pypi badge| |testing badge| |coverage badge| |docs badge| |git3moji badge| |black badge| |JOSS badge| 10 | 11 | .. |pypi badge| image:: https://img.shields.io/pypi/v/potnia 12 | :target: https://pypi.org/project/potnia/ 13 | 14 | .. |testing badge| image:: https://github.com/AncientNLP/potnia/actions/workflows/testing.yml/badge.svg 15 | :target: https://github.com/AncientNLP/potnia/actions 16 | 17 | .. |coverage badge| image:: https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/rbturnbull/e640f26fb59e39e3051de8fbf020de62/raw/coverage.json 18 | :target: https://ancientnlp.github.io/potnia/coverage/ 19 | 20 | .. |docs badge| image:: https://github.com/AncientNLP/potnia/actions/workflows/docs.yml/badge.svg 21 | :target: https://ancientnlp.github.io/potnia 22 | 23 | .. |black badge| image:: https://img.shields.io/badge/code%20style-black-000000.svg 24 | :target: https://github.com/psf/black 25 | 26 | .. |git3moji badge| image:: https://img.shields.io/badge/git3moji-%E2%9A%A1%EF%B8%8F%F0%9F%90%9B%F0%9F%93%BA%F0%9F%91%AE%F0%9F%94%A4-fffad8.svg 27 | :target: https://robinpokorny.github.io/git3moji/ 28 | 29 | .. |JOSS badge| image:: https://joss.theoj.org/papers/7641150c49e996a21fa0f4dc3aadb258/status.svg 30 | :target: https://joss.theoj.org/papers/7641150c49e996a21fa0f4dc3aadb258 31 | 32 | 33 | 34 | 35 | 36 | Potnia is an open-source Python library designed to convert Romanized transliterations of ancient texts into Unicode representations of ther respective native scripts. 37 | 38 | Currently, the scripts supported by Potnia are: 39 | 40 | - Linear A 41 | - Linear B 42 | - Hittite cuneiform 43 | - Arabic 44 | 45 | Functionality for Luwian hieroglyphs, Sumero-Akkadian cuneiform, Lydian and Etruscan is in development. 46 | 47 | Contributions are welcome! Please see the `CONTRIBUTING.rst `_ file for more information. 48 | 49 | .. end-summary 50 | 51 | 52 | .. start-quickstart 53 | 54 | Installation 55 | ==================== 56 | 57 | To install Potnia, run the following command: 58 | 59 | .. code-block:: bash 60 | 61 | pip install potnia 62 | 63 | To install the latest version from the repository, you can use this command: 64 | 65 | .. code-block:: bash 66 | 67 | pip install git+https://github.com/AncientNLP/potnia.git 68 | 69 | You can also install Potnia by cloning the repository and installing using poetry. 70 | This will install all the dependencies required for Potnia from the with the version numbers pinned in the ``poetry.lock`` file. 71 | Make sure that poetry is installed on your system. If not, see the `instructions `_. 72 | Then follow these steps: 73 | 74 | .. 
code-block:: bash 75 | 76 | git clone https://github.com/AncientNLP/potnia.git 77 | cd potnia 78 | poetry install 79 | 80 | You can test that Potnia is working by running ``pytest``. 81 | 82 | .. note:: 83 | 84 | For proper display of ancient script glyphs, please refer to the `Fonts `_ section. 85 | 86 | Usage 87 | ==================== 88 | 89 | To convert transliterated Linear B to Linear B Unicode, use the following code: 90 | 91 | .. code-block:: python 92 | 93 | >>> from potnia import linear_b 94 | >>> linear_b("a-ri-to-jo") 95 | '𐀀𐀪𐀵𐀍' 96 | 97 | 98 | If you wish to regularize the text to remove additional annotations present in the `LiBER `_ 99 | and `DĀMOS `_ transliteration, use the following code: 100 | 101 | .. code-block:: python 102 | 103 | >>> linear_b("e-ke-qe ]-o-na-to , ke-ke-me-na⌞ ⌟ko-to-na GRA qs ] vac.", regularize=True) 104 | '𐀁𐀐𐀤 %𐀃𐀙𐀵 𐀐𐀐𐀕𐀙 𐀒𐀵𐀙 𐂎 %' 105 | 106 | Note that uncertain/missing signs or sections of text are presently being replaced with a wildcard '%' character. 107 | 108 | To tokenize transliterated Linear B texts without converting it to Unicode, use the following code: 109 | 110 | .. code-block:: python 111 | 112 | >>> linear_b.tokenize_transliteration("]wa VIR 1 MUL 2 'ko-wa 1' ko-wo 1") 113 | [']', 'wa', ' ', 'VIR', ' ', '1', ' ', 'MUL', ' ', '2', ' ', "'", 'ko', 'wa', ' ', '1', "'", ' ', 'ko', 'wo', ' ', '1'] 114 | 115 | Command Line Interface (CLI) 116 | ============================ 117 | 118 | Potnia also provides a command line interface (CLI). 119 | 120 | To convert transliterated Linear B to Unicode, use the following command: 121 | 122 | .. code-block:: bash 123 | 124 | potnia linear-b "a-ri-to-jo" 125 | 126 | To regularize the text, use the following command: 127 | 128 | .. code-block:: bash 129 | 130 | potnia linear-b "e-ke-qe ]-o-na-to , ke-ke-me-na⌞ ⌟ko-to-na GRA qs ] vac." --regularize 131 | 132 | To see the full set of commands available in the CLI, use the following command: 133 | 134 | .. code-block:: bash 135 | 136 | potnia --help 137 | 138 | Graphical User Interface (GUI) 139 | ============================== 140 | 141 | .. image:: https://raw.githubusercontent.com/AncientNLP/potnia/main/docs/_static/img/potnia-gui.png 142 | 143 | Potnia also provides a graphical user interface (GUI). To start it, run: 144 | 145 | .. code-block:: bash 146 | 147 | potnia gui 148 | 149 | This will show a link in the terminal that you can click on to open the GUI in your browser. 150 | 151 | 152 | .. end-quickstart 153 | 154 | Credits 155 | ==================== 156 | 157 | .. start-credits 158 | 159 | Potnia is developed by: 160 | 161 | - Emily Tour (University of Melbourne) 162 | - `Kabir Manandhar Shrestha `_ (Melbourne Data Analytics Platform, University of Melbourne) 163 | - `Dr Robert Turnbull `_ (Melbourne Data Analytics Platform, University of Melbourne) 164 | 165 | To cite Potnia, use this reference: 166 | 167 | Tour, Emily, Kabir Manandhar Shrestha, and Robert Turnbull. 'Potnia: A Python Library for the Conversion of Transliterated Ancient Texts to Unicode.' *Journal of Open Source Software* 10, no. 108 (2025): 7725. `doi:10.21105/joss.07725 `_ 168 | 169 | You can also use the following BibTeX entries: 170 | 171 | .. 
code-block:: bibtex 172 | 173 | @article{potnia, 174 | author = {Emily Tour and Kabir Manandhar Shrestha and Robert Turnbull}, 175 | title = {{Potnia: A Python library for the conversion of transliterated ancient texts to Unicode}}, 176 | year = {2025}, 177 | journal = {Journal of Open Source Software}, 178 | publisher = {The Open Journal}, 179 | volume = {10}, 180 | number = {108}, 181 | pages = {7725}, 182 | doi = {10.21105/joss.07725}, 183 | url = {https://doi.org/10.21105/joss.07725} 184 | } 185 | 186 | @misc{potnia_release, 187 | author = {Emily Tour and Kabir Manandhar Shrestha and Robert Turnbull}, 188 | title = {{Potnia: A Python library for the conversion of transliterated ancient texts to Unicode}}, 189 | year = {2025}, 190 | url = {https://doi.org/10.26188/28721354.v1}, 191 | note = {Version 0.4.0, Apache License 2.0}, 192 | doi = {10.26188/28721354.v1} 193 | } 194 | 195 | We acknowledge support from Wytamma Wirth, Brent Davis, Kim Doyle, Man-Hua (Kate) Chu, Anhui (Ellie) Situ, Ekaterina Vylomova, Chris Guest and Stavroula (Stephie) Nikoloudis. This research was supported by The University of Melbourne’s Research Computing Services. Robert Turnbull completed part of this work through the BICROSS project, which has received funding from the European Research Council (ERC) under the European Union’s Horizon Europe research and innovation programme (grant agreement no. 101043730 – BICROSS – ERC-2021-COG). 196 | 197 | .. image:: https://raw.githubusercontent.com/AncientNLP/potnia/main/docs/_static/img/erc-logo.jpg 198 | :alt: ERC logo 199 | :align: center 200 | :width: 200px 201 | 202 | .. end-credits 203 | -------------------------------------------------------------------------------- /potnia/data/hittite.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | # V syllabograms 3 | "a": 𒀀 4 | "e": 𒂊 5 | "i": 𒄿 6 | "u": 𒌋 7 | "ú": 𒌑 8 | "ia": 𒅀 9 | 10 | # CV syllabograms 11 | "ba": 𒁀 12 | "be": 𒁁 13 | "bi": 𒁉 14 | "bu": 𒁍 15 | 16 | "pa": 𒉺 17 | "pé": 𒁉 18 | "pí": 𒁉 19 | "pu": 𒁍 20 | 21 | "da": 𒁕 22 | "de": 𒁲 23 | "di": 𒁲 24 | "du": 𒁺 25 | 26 | "ta": 𒋫 27 | "te": 𒋼 28 | "ti": 𒋾 29 | "tu": 𒌅 30 | 31 | "ga": 𒂵 32 | "ge": 𒄀 33 | "gi": 𒄀 34 | "gu": 𒄖 35 | 36 | "ka": 𒅗 37 | "ke": 𒆠 38 | "ki": 𒆠 39 | "ku": 𒆪 40 | 41 | "ḫa": 𒄩 42 | "ḫe": 𒄭 43 | "ḫé": 𒃶 44 | "ḫi": 𒄭 45 | "ḫu": 𒄷 46 | 47 | "la": 𒆷 48 | "le": 𒇷 49 | "li": 𒇷 50 | "lu": 𒇻 51 | 52 | "ma": 𒈠 53 | "me": 𒈨 54 | "mé": 𒈪 55 | "mi": 𒈪 56 | "mu": 𒈬 57 | 58 | "na": 𒈾 59 | "ne": 𒉈 60 | "né": 𒉌 61 | "ni": 𒉌 62 | "nu": 𒉡 63 | 64 | "ra": 𒊏 65 | "re": 𒊑 66 | "ri": 𒊑 67 | "ru": 𒊒 68 | 69 | "ša": 𒊭 70 | "še": 𒊺 71 | "ši": 𒅆 72 | "šu": 𒋗 73 | "šú": 𒋙 74 | 75 | "wa": 𒉿 76 | "wi5": 𒃾 77 | 78 | "ya": 𒅀 79 | 80 | "za": 𒍝 81 | "ze": 𒍣 82 | "zé": 𒍢 83 | "zi": 𒍣 84 | "zu": 𒍪 85 | 86 | # VC syllabograms 87 | "ab": 𒀊 88 | "ap": 𒀊 89 | "eb": 𒅁 90 | "ep": 𒅁 91 | "ib": 𒅁 92 | "ip": 𒅁 93 | "ub": 𒌒 94 | "up": 𒌒 95 | 96 | "ad": 𒀜 97 | "at": 𒀜 98 | "ed": 𒀉 99 | "et": 𒀉 100 | "id": 𒀉 101 | "it": 𒀉 102 | "ud": 𒌓 103 | "ut": 𒌓 104 | 105 | "ag": 𒀝 106 | "ak": 𒀝 107 | "eg": 𒅅 108 | "ek": 𒅅 109 | "ig": 𒅅 110 | "ik": 𒅅 111 | "ug": 𒊌 112 | "uk": 𒊌 113 | 114 | "aḫ": 𒄴 115 | "eḫ": 𒄴 116 | "iḫ": 𒄴 117 | "uḫ": 𒄴 118 | 119 | "al": 𒀠 120 | "el": 𒂖 121 | "il": 𒅋 122 | "ul": 𒌌 # Check 123 | 124 | "am": 𒄠 125 | "em": 𒅎 126 | "im": 𒅎 127 | "um": 𒌝 128 | 129 | "an": 𒀭 130 | "en": 𒂗 131 | "in": 𒅔 132 | "un": 𒌦 133 | 134 | "ar": 𒅈 135 | "er": 𒅕 136 | "ir": 𒅕 137 | "ur": 𒌨 138 | "úr": 𒌫 139 | 140 | "aš": 𒀸 141 | "eš": 𒌍 142 | "eš": 𒐁 
143 | "iš": 𒅖 144 | "uš": 𒍑 145 | 146 | "az": 𒊍 147 | "ez": 𒄑 148 | "iz": 𒄑 149 | "uz": 𒊻 150 | 151 | # CVC syllabograms 152 | "ḫal": 𒄬 153 | "ḫab": 𒆸 154 | "ḫap": 𒆸 155 | "ḫaš": 𒋻 156 | "ḫad": 𒉺 157 | "ḫat": 𒉺 158 | "PA": 𒉺 # sceptre 159 | "ḫul": "??" 160 | "ḪUL": "??" # evil # Find 161 | "ḫub": 𒄽 162 | "ḫup": 𒄽 163 | "ḫar": 𒄯 164 | "ḪAR": 𒄯 # ring 165 | "ḫur": 𒄯 166 | "ḪUR": 𒄯 # thick 167 | "MUR": 𒄯 # lung 168 | "gal": 𒃲 169 | "GAL": 𒃲 # great 170 | "kal": 𒆗 171 | "gal9": 𒆗 172 | "kam": 𒄰 173 | "gám": 𒄰 174 | "TU7": 𒄰 # soup 175 | "kán": 𒃷 176 | "gán": 𒃷 177 | "GÁN": 𒃷 # field 178 | "kab": 𒆏 179 | "kap": 𒆏 180 | "gáb": 𒆏 181 | "gáp": 𒆏 182 | "KAB": 𒆏 # left 183 | "kar": "??" # find 184 | "KAR": "??" # find # But actually find 185 | "kàr": 𒃼 186 | "gàr": 𒃼 187 | "kaš": 𒁉 188 | "gáš": 𒁉 189 | "KAŠ": 𒁉 # beer 190 | "kad": 𒃰 191 | "gad": 𒃰 192 | "gat": 𒃰 193 | "GAD": 𒃰 # linen 194 | "gaz": 𒄤 195 | "GAZ": 𒄤 # kill 196 | "kib": "??" # Find 197 | "kir": 𒄫 198 | "gir": 𒄫 199 | "kiš": 𒆧 200 | "KIŠ": 𒆧 # world 201 | "kid": 𒃰 202 | "t9": 𒃰 203 | "kal": 𒆗 204 | "KAL": 𒆗 # strong 205 | "kul": 𒆰 206 | "KUL": 𒆰 # offspring 207 | "kúl": 𒄢 208 | "gul": 𒄢 209 | "GUL": 𒄢 # break 210 | "kum": 𒄣 211 | "gum": 𒄣 212 | "kur": 𒆳 213 | "KUR": 𒆳 # land 214 | "kùr": 𒄥 215 | "gur": 𒄥 216 | "lal": 𒇲 217 | "LAL": 𒇲 # bind 218 | "lam": 𒇴 219 | "lig": 𒌨 220 | "lik": 𒌨 221 | "liš": 𒇺 222 | "LIŠ": 𒇺 # spoon 223 | "luḫ": 𒈛 224 | "LUḪ": 𒈛 # minister 225 | "lum": 𒈝 226 | "maḫ": 𒈤 227 | "MAḪ": 𒈤 # great 228 | "man": "??" # Find 229 | "mar": 𒈥 230 | "maš": 𒈦 231 | "MAŠ": 𒈦 # half 232 | "meš": "𒌍" # check 233 | "mil": 𒅖 234 | "mel": 𒅖 235 | "miš": 𒈩 236 | "mur": 𒄯 237 | "mut": 𒅜 238 | "MUD": 𒅜 # blood 239 | "nam": 𒉆 240 | "NAM": 𒉆 # district 241 | "nab": 𒀮 242 | "nap": 𒀮 243 | "nir": 𒉪 244 | "niš": "??" # Find 245 | "pal": 𒁄 246 | "bal": 𒁄 247 | "pár": 𒈦 248 | "bar": 𒈦 249 | "paš": "??" # Find 250 | "pád": 𒁁 251 | "pát": 𒁁 252 | "píd": 𒁁 253 | "pít": 𒁁 254 | "píl": 𒉋 255 | "bíl": 𒉋 256 | "GIBIL": 𒉋 # new 257 | "pir": "??" # Find 258 | "piš": 𒄫 259 | "biš": 𒄫 260 | "pùš": 𒄫 261 | "pur": "??" # Find 262 | "bur": "??" # Find 263 | "rad": 𒋥 264 | "rat": 𒋥 265 | "riš": 𒊕 266 | "šaḫ": 𒋚 267 | "ŠUBUR": 𒋚 # pig 268 | "šag": 𒊕 269 | "šak": 𒊕 270 | "SAG": 𒊕 # head 271 | "šal": 𒊩 272 | "MUNUS": 𒊩 # woman 273 | "šam": 𒌑 274 | "šàm": "??" # Find 275 | "šab": "??" # Find 276 | "šap": "??" # Find 277 | "šar": 𒊬 278 | "SAR": 𒊬 # plant 279 | "šìp": "??" # Find 280 | "šir": 𒋓 281 | "ŠIR": 𒋓 # testicles 282 | "šum": 𒋳 283 | "šur": 𒋩 284 | "taḫ": 𒈭 285 | "daḫ": 𒈭 286 | "túḫ": 𒈭 287 | "tág": 𒁖 288 | "ták": 𒁖 289 | "dag": 𒁖 290 | "dak": 𒁖 291 | "tal": 𒊑 292 | "dal": 𒊑 293 | "tám": 𒁮 294 | "dam": 𒁮 295 | "DAM": 𒁮 # wife 296 | "tan": 𒆗 297 | "dan": 𒆗 298 | "tab": 𒋰 299 | "tap": 𒋰 300 | "dáb": 𒋰 301 | "dáp": 𒋰 302 | "TAB": 𒋰 # 2 303 | "tar": 𒋻 304 | "táš": 𒁹 305 | "dáš": 𒁹 306 | "tiš": 𒁹 307 | "diš": 𒁹 308 | "tàš": 𒀾 309 | "tin": 𒁷 310 | "tén": 𒁷 311 | "tim": 𒁴 312 | "dim": 𒁴 313 | "dir": 𒊑 314 | "DIR": 𒊑 # red 315 | "tir": 𒌁 316 | "ter": 𒌁 317 | "TIR": 𒌁 # forest 318 | "tíš": "??" 
# Find 319 | "túl": 𒇥 320 | "tum": 𒌈 321 | "dum": 𒌈 322 | "tub": 𒁾 323 | "dub": 𒁾 324 | "dup": 𒁾 325 | "DUB": 𒁾 # clay tablet 326 | "túr": 𒄙 327 | "dur": 𒄙 328 | "DUR": 𒄙 # strip 329 | "zul": 𒂄 330 | "zum": 𒍮 331 | 332 | # Logograms 333 | "DIŠ": 𒁹 # (ᵐ) # male personal names 334 | "DIDLI": 𒀸 # (suffixed) # plural or collective 335 | "DIDLI ḪI.A": 𒀸𒄭𒀀 # (suffixed) # plural 336 | "DINGIR": 𒀭 # (ᴰ) # "deity" 337 | "DUG": 𒂁 # "vessel" 338 | "É": 𒂍 # "house" 339 | "GAD": 𒃰 # "linen, cloth" 340 | "GI": 𒄀 # "tube; reed" 341 | "GIŠ": 𒄑 # "wood" 342 | "GUD": 𒄞 # "bovid" 343 | "ḪUR.SAG": 𒄯𒊕 # "mountain" 344 | "ÍD": 𒀀𒇉 # "river" 345 | "IM": 𒅎 # "clay" 346 | "ITU": 𒌚 # "month" 347 | "KÁ": 𒆍 348 | "KU6": 𒄩 # "fish" 349 | "KUR": 𒆳 # "land" 350 | "KUŠ": 𒋢 # "hide, fur" 351 | "LÚ": 𒇽 # "man" 352 | "MEŠ": 𒌍 # (suffixed) # plural 353 | "MEŠ ḪI.A": 𒌍𒄭𒀀 # (suffixed) # plural 354 | "MUL": 𒀯 # "star" 355 | "MUNUS": 𒊩 # (ᶠ) # "woman" # female personal name 356 | "MUŠ": 𒈲 # "serpent" 357 | "MUŠEN": 𒄷 # (suffixed) # "bird" 358 | "NA₄": 𒉌𒌓 # "stone" 359 | "NINDA": 𒃻 # "bread" 360 | "PÚ": 𒇥 # "source" 361 | "SAR": 𒊬 # (suffixed) # "plant" 362 | "SI": 𒋛 # "horn" 363 | "SÍG": 𒋠 # "wool" 364 | "TU7": 𒄰 # "soup" 365 | "TÚG": 𒌆 # "garment" 366 | "Ú": 𒌑 # "plant" 367 | "URU": 𒌷 # "city" 368 | "URUDU": 𒍐 # "copper" 369 | "UZU": 𒍜 # "meat" 370 | 371 | # Determinatives 372 | 373 | "(DIŠ)": 𒁹 # (ᵐ) # male personal names 374 | "(DIDLI)": 𒀸 # (suffixed) # plural or collective 375 | "(DIDLI ḪI.A)": 𒀸𒄭𒀀 # (suffixed) # plural 376 | "(DINGIR)": 𒀭 # (ᴰ) # "deity" 377 | "(DUG)": 𒂁 # "vessel" 378 | "(É)": 𒂍 # "house" 379 | "(GAD)": 𒃰 # "linen, cloth" 380 | "(GI)": 𒄀 # "tube; reed" 381 | "(GIŠ)": 𒄑 # "wood" 382 | "(GUD)": 𒄞 # "bovid" 383 | "(ḪI.A)": 𒄭𒀀 # (suffixed) # plural 384 | "(ḪUR.SAG)": 𒄯𒊕 # "mountain" 385 | "(ÍD)": 𒀀𒇉 # "river" 386 | "(IM)": 𒅎 # "clay" 387 | "(ITU)": 𒌚 # "month" 388 | "(KAM)": 𒄰 # (suffixed) # numerals 389 | "(KI)": 𒆠 # (suffixed) # in 0.6% of toponyms[5] 390 | "(KU6)": 𒄩 # "fish" 391 | "(KUR)": 𒆳 # "land" 392 | "(KUŠ)": 𒋢 # "hide, fur" 393 | "(LÚ)": 𒇽 # "man" 394 | "(MEŠ)": 𒌍 # (suffixed) # plural 395 | "(MEŠ ḪI.A)": 𒌍𒄭𒀀 # (suffixed) # plural 396 | "(MUL)": 𒀯 # "star" 397 | "(MUNUS)": 𒊩 # (ᶠ) # "woman" # female personal name 398 | "(MUŠ)": 𒈲 # "serpent" 399 | "(MUŠEN)": 𒄷 # (suffixed) # "bird" 400 | "(NA₄)": 𒉌𒌓 # "stone" 401 | "(NINDA)": 𒃻 # "bread" 402 | "(PÚ)": 𒇥 # "source" 403 | "(SAR)": 𒊬 # (suffixed) # "plant" 404 | "(SI)": 𒋛 # "horn" 405 | "(SÍG)": 𒋠 # "wool" 406 | "(TU7)": 𒄰 # "soup" 407 | "(TÚG)": 𒌆 # "garment" 408 | "(Ú)": 𒌑 # "plant" 409 | "(URU)": 𒌷 # "city" 410 | "(URUDU)": 𒍐 # "copper" 411 | "(UZU)": 𒍜 # "meat" -------------------------------------------------------------------------------- /potnia/data/luwian.yaml: -------------------------------------------------------------------------------- 1 | # WORK IN PROGRESS 2 | 3 | mappings: 4 | a: 𔗷 5 | á: 𔐓 6 | aₓ : 𔗨 # uncertain sound value 7 | i: 𔓯 8 | í: 𔕐 9 | u: 𔑻 10 | 11 | ha: 𔓷 12 | ha: 𔔁 # uncertain sound value 13 | há: 𔓟 14 | haₓ: 𔕡 15 | hi: 𔗒 16 | hí: 𔕘 17 | hu: 𔕙 18 | hú: 𔖈 19 | 20 | hwa: 𔘰 21 | hwi: 𔘰 22 | hwiₓ: 𔓎 23 | 24 | ka: 𔗧 25 | ká: 𔐾 26 | ki: 𔗳 27 | ki₄: 𔔓 28 | kiₓ: 𔔓 29 | ku: 𔗜 30 | kwa: 𔕰 31 | kwi: 𔕰 32 | 33 | la: 𔓊 34 | la: 𔗲 # issue with duplicate; to verify 35 | laₓ: 𔗽 36 | li: 𔔹 37 | li: 𔗲 # issue with duplicate; to verify 38 | lí: 𔒖 39 | lì: 𔕇 40 | lu: 𔗲 41 | 42 | ma: 𔒅 43 | má: 𔖘 44 | mà: 𔕖 45 | maₓ: 𔕖, 𔘅 # split into two 46 | mi: 𔖻 47 | mí: 𔗘 48 | mì: 𔖷 49 | mu: 𔑿, 𔖛, 𔑾, 𔒀 # split into multiple? 
unclear 50 | 51 | na: 𔐤 52 | ná: 𔕵 53 | ni: 𔗐 54 | ní: 𔓵 55 | nì: 𔐽 56 | niₓ: 𔗴 57 | nu: 𔒴 58 | nú: 𔖿 59 | 60 | pa: 𔕸, 𔔁 ? 61 | pá: 𔘅 62 | paₓ: 𔓐 63 | pi: 𔑉 64 | pu: 𔕯 65 | pú: 𔗣 66 | ra: 𔖱 67 | ri: 𔖱 68 | ru: 𔗑 69 | rú: 𔑳, 𔑵 70 | ur: 𔖙 71 | 72 | sa: 𔗔 73 | sá: 𔗦 74 | sà: 𔑷 75 | sa₄: 𔗆 76 | sa₅: 𔕮 77 | sa₆: 𔔀 78 | sa₇: 𔕣 79 | sa₈: 𔖭 80 | si: 𔓉 81 | sí ?: 𔗾 82 | su: 𔖢 83 | sú: 𔒂 84 | sù: 𔗵 85 | us: 𔗚 86 | 87 | ta: 𔑰 88 | tá: 𔐞 89 | tà: 𔐬 90 | ta₄: 𔕦 91 | ta₅: 𔓇 92 | ta₆: 𔑛 93 | taₓ: 𔐭 94 | ti: 𔑣 95 | tí: 𔘟 96 | tì ?: 𔕦 97 | ti₄ ?: 𔓇 98 | tu: 𔑡, 𔑢 99 | tú: 𔕬 100 | tù: 𔕭 101 | tu₄: 𔔈 102 | 103 | wa: 𔗬 104 | wá: 𔓁 105 | wà: 𔓀 106 | wa₄: 𔓬 107 | wa₅: 𔓩 108 | wa₆: 𔓤 109 | wa₇: 𔕁 110 | wa₉: 𔔻 111 | wi: 𔒻 112 | wi: 𔗬 113 | wí: 𔓁 114 | wì: 𔓀 115 | wi₄: 𔓬 116 | wi₅: 𔓩 117 | wi₆: 𔓤 118 | wi₇: 𔕁 119 | wi₉: 𔔻 120 | 121 | ia: 𔓱 122 | iá: 𔕑 123 | ià: 𔖬 124 | 125 | za: 𔖪, 𔖩 126 | zá: 𔕹 127 | zà: 𔕼 128 | za₄: 𔒈 129 | zaₓ: 𔕽 130 | zi: 𔖩 131 | zí: 𔕠 132 | zì: 𔕻 133 | zi₄: 𔒚 134 | zu ?: 𔗥, 𔕀 135 | zú: 𔗵 136 | 137 | a+ra: 𔗸 138 | a+ri: 𔗸 139 | a+tá: 𔐷 140 | ara: 𔒟 141 | ara: 𔒠 142 | ari: 𔒟 143 | # ari: 𔒠 144 | hara: 𔕆 145 | hari: 𔕆 146 | hur: 𔗹 147 | 148 | i+ra: 𔓰 149 | # i+ri: 𔓰 150 | # kar: 𔕢 151 | "la+ra+a": 𔓍 152 | pari: 𔐎 153 | ra+a: 𔗸 154 | ri+i: 𔓰 155 | sara: 𔕕 156 | sari: 𔕕 157 | tal: 𔖞 158 | tana: 𔗢 159 | tapa: 𔒋 160 | tár: 𔖤 161 | taraₓ: 𔖤 162 | tariₓ: 𔖤 163 | tara: 𔖹 164 | tari: 𔖹 165 | zuwa: 𔕀 166 | 167 | IUDEX+ra: 𔖤 168 | IUDEX+ri: 𔖤 169 | 170 | 171 | ADORARE: 𔐅 172 | AEDIFICARE: 𔔘, 𔒐 173 | AEDIFICIUM : 𔔖 174 | AEDIFICIUM.PONERE : 𔔘, 𔒐 175 | #AEDIFICIUM+MINUS : VASTUS) : 𔔗 176 | ALA : 𔑗 177 | AMPLECTI : 𔐈, 𔗱 178 | ANIMAL : 𔗈 179 | ANNUS : 𔕺 180 | ANNUS+ANNUS : 𔖁 181 | AQUA : 𔓳, 𔓴 182 | AQUILA : 𔒟 183 | ARGENTUM : 𔔣, 𔔤, 𔔦 184 | ARHA : 𔓸, 𔓹 185 | ASCIA : 𔔼 186 | ASINUS : 𔑯, 𔒍 187 | ASINUS₂ : 𔑱 188 | AUDIRE : 𔑒, 𔓅 189 | AURIGA : 𔕄 190 | AURIS+TU+MI : 𔑒, 𔓅 191 | AVIS : 𔒚 192 | AVIS₂ : 𔒞 193 | AVIS₃ : 𔒜 194 | AVIS₄ : 𔒟 195 | AVIS₅ : 𔒝 196 | AVISₓ : 𔒡 197 | AVUS : 𔕳 198 | BESTIA : 𔑫 199 | BIBERE : 𔐇 200 | BONUS : 𔕧 (2nd mil.), 𔓀 201 | BONUS₂ : 𔖢 202 | BOS : 𔑺 203 | BOS₂ : 𔑼 204 | BOS+MI : 𔑾 205 | BOS.MI : 𔒀 206 | BOS₂.MI : 𔒁 207 | BRACCHIUM : 𔐡 208 | CAELUM : 𔓑 209 | CANIS : 𔑬 210 | CANIS₂ : 𔑭 211 | CAPERE : 𔐫 212 | CAPERE+SCALPRUM : 𔕲 213 | CAPERE₂ : 𔐮, 𔒣 214 | CAPERE₂.CAPERE₂ : 𔐭 215 | CAPRA : 𔑶 216 | CAPRA₂ : 𔑸 217 | CAPRA2A : 𔑹 218 | CAPUT : 𔐉 219 | CAPUT+SCALPRUM : 𔐊 220 | CASTRUM : 𔔉, 𔔊, 𔔋 221 | CENTUM : 𔗃, 𔕂, 𔕔 222 | CERVUS : 𔑳 223 | CERVUS₂ : 𔑴 224 | CERVUS₃ : 𔑵 225 | CONTRACTUS : 𔖅 226 | COR : 𔖂 227 | CORNU : 𔒂 228 | CORNU+CAPUT : 𔙀 229 | CRUS : 𔑛 230 | CRUS₂ : 𔑝 231 | CRUS.CRUS : 𔑟, 𔑠 232 | CRUS+FLUMEN : 𔑜 233 | CRUX : 𔕛 234 | CUBITUM : 𔔕 235 | CULTER : 𔕿 236 | CUM : 𔑀 237 | CURRUS : 𔕃 238 | DARE : 𔑈 239 | DARE.DARE : 𔑊 240 | DECEM : 𔗁 241 | DELERE : 𔔚 242 | DEUS : 𔖖 243 | DEUS.DOMUS : 𔔛 244 | (DEUS)VIA+TERRA : 𔓧 245 | DIES : 𔖓, 𔖔, 𔖕 246 | DOMINA : 𔐏 247 | DOMINUS : 𔖺 248 | DOMUS : 𔔙 249 | DOMUS+MINUS : 𔔚 250 | DOMUS+SCALA : 𔔞, 𔔟 251 | DOMUS+X : 𔔝 252 | EDERE : 𔐆 253 | EGO : 𔐀, 𔘞 ? 254 | EGO₂ : 𔐁 255 | ENSIS : 𔐻 256 | EQUUS : 𔑮 257 | EUNUCHUS : 𔘑, 𔘐 258 | EUNUCHUS₂ : ?? 259 | EXERCITUS : 𔔰 260 | FALX ? : 𔘝 261 | FEMINA : 𔑘, 𔗌 262 | FILIA : 𔐱 263 | FILIUS : 𔐰 264 | FILIUS.NEPOS : 𔕒 265 | FINES : 𔓸 266 | FINES+ha : 𔓹 267 | FLAMMAE ? 
: 𔘔, 𔗅, 𔘖 268 | FLUMEN : 𔓳, 𔓴 269 | FONS : 𔓶 270 | FORTIS : 𔐝 271 | FRATER : 𔐰 272 | FRATER₂ : 𔔷 273 | FRONS : 𔐚, 𔒉 274 | FULGUR : 𔓣 275 | FUSUS : 𔕗 276 | GAZELLA : 𔑶 277 | GENUFLECTERE : 𔑞 278 | GRYLLUS : 𔒑 279 | HÁ+LI : 𔓠 280 | HALA : 𔕈 281 | HALI : 𔕈 282 | HALPA : 𔑞 283 | HANA : 𔘮 284 | HASTARIUS : 𔓈 285 | HATTI : 𔓟 286 | HATTI+li : 𔓠 287 | HEROS : 𔐕 288 | HORDEUM : 𔓎, 𔗻, 𔗼 289 | HORREUM ? : 𔔡, 𔔢 290 | HUR : 𔗹 291 | HWI : 𔘰 292 | IANUS : 𔒯 293 | INFANS : 𔐰 294 | INFRA : 𔐾, 𔐿 295 | IRA : 𔐘 296 | IŠUWA(URBS) : 𔔃 297 | IUDEX : 𔖣 298 | IUDEX.LA : 𔔸 299 | IUSTITIA : 𔖣 300 | IUSTITIA.LA : 𔔸 301 | LA+LA : 𔓋 302 | LAPIS : 𔔮 303 | LAPIS+SCALPRUM : 𔔭 304 | LECTUS : 𔕓 305 | LEO : 𔑪 306 | LEO₂ : 𔑫 307 | LEO+MONS.TU+LEO : 𔓭 308 | LEPUS : 𔒋 309 | LEPUS₂ : 𔒌 310 | LIₓ : 𔒗 311 | LIBARE : 𔐜 312 | LIBATIO : 𔒤 313 | LIGARE : 𔐠 314 | LIGNUM : 𔖰, 𔓄 315 | LINGERE : 𔒈 316 | LINGUA : 𔓊 317 | LINGUA+CLAVUS : 𔓌 318 | LIS : 𔐘 319 | LITUUS : 𔖫 320 | LITUUS+Á/LITUUS+á : 𔐔 321 | LITUUS+na : 𔐥 322 | LITUUS+u : 𔒊 323 | LOCUS : 𔓤, 𔕝 324 | LOQUI : 𔐖 325 | LUNA : 𔓜 326 | MAₓ : 𔒃 327 | MAGNUS : 𔖙 328 | MAGNUS.DOMINA : 𔐐 329 | MAGNUS.DOMUS : 𔔜 330 | MAGNUS.FILIA : 𔐴 331 | MAGNUS.REX : 𔐒 332 | MALLEUS : 𔔻 333 | MALUS : 𔖟 334 | MALUS₂ : 𔖠 335 | MANDARE : 𔑊 336 | MANUS : 𔑁, 𔑂, 𔑂 337 | MANUS.CULTER : 𔐺 338 | MANUS+CULTER : 𔐻 339 | MANUS+MINUS ? (LONGUS) : 𔑄, 𔑍 340 | MATER : 𔑘, 𔗌 341 | MENSA : 𔕊 342 | MENSA₂ : 𔕋 343 | MÍ.REGIO : 𔔇 344 | MILLE : 𔗄 345 | MINISTRARE ? : 𔓐 346 | MINUS : 𔖮 347 | MONS : 𔓬 348 | MONS₂ : 𔐃 349 | MONS.SARPA : 𔕍, 𔕎 350 | MORI : 𔖯 351 | MURUS ? : 𔔎 352 | NEG : 𔕴 353 | NEG₂ : 𔕵 354 | NEG₃ : 𔕶 355 | NEPOS : 𔕒 356 | OCCIDENS : 𔖬 357 | OCULUS : 𔐙 358 | OMNIS(+MI) : 𔖝 359 | OMNIS₂ : 𔗣 360 | ORIENS : 𔓛 361 | OVIS : 𔒇 362 | OVIS₂ : 𔘺 363 | PANIS : 𔓐 364 | PANIS.SCUTELLA : 𔗛 365 | PASTOR : 𔗫 366 | PES : 𔑣 367 | PES₂ : 𔑦 368 | PES₂.PES : 𔑩 369 | PES₂.PES₂ : 𔑨 370 | PES.SCALA.ROTAE : 𔑤, 𔑥, 𔑧 371 | PINCERNA : 𔖆, 𔖍, 𔖎, 𔖏, 𔘻 372 | PISCIS : 𔒥 373 | PITHOS : 𔕾 374 | PITHOS.SCUTELLA/PITHOS : 𔕺 375 | POCULUM : 𔖇 376 | PODIUM : 𔔪 377 | PONERE : 𔑇 378 | PORTA : 𔔏, 𔔐 379 | PORTA₂ : 𔔑 380 | POST : 𔐣 381 | PRAE : 𔐍, 𔐎 382 | PROPHETA ? : 𔙀 383 | PUGNUS : 𔐨, 𔐪, 𔐯 384 | PUGNUS+PUGNUS : 𔐠 385 | PUGNUS+URBS : 𔐹 386 | PUGNUS+X : 𔐩 387 | PURUS : 𔕩, 𔕪 388 | REGIO : 𔔆 389 | REL : 𔕰 390 | REX : 𔐑 391 | REX.FILIA : 𔐳 392 | REX.FILIUS : 𔐲 393 | REX.INFANS : 𔐲 394 | ROTA : 𔕈 395 | SACERDOS : 𔖐 396 | SACERDOS₂ : 𔖥 397 | SARA : 𔕕 398 | SARI : 𔕕 399 | SARMA : 𔑙, 𔑚 400 | SARMA₂ : 𔑙, 𔑚 401 | SARPA : 𔕋 402 | SCALPRUM : 𔔯 403 | SCRIBA : 𔕭 404 | SCUTELLA : 𔗆 405 | SCUTUM : 𔔳 406 | SERVUS : 𔖷 407 | SIGILLUM : 𔕮 408 | SOL : 𔓚, 𔘈, 𔘊 409 | SOL₂ : 𔓙 410 | SOL₂.MENSA : 𔕌 411 | SOL₂.THRONUS : 𔕌 412 | SOLIUM : 𔕐 413 | SPHINX : 𔒒 414 | STATUA : 𔐌 415 | STELE : 𔔭 416 | SUB : 𔐾, 𔐿 417 | SUPER : 𔔱 (earlier variant), 𔑏 418 | TÁ (?) : 𔐞 419 | TAL (?) : 𔖞 420 | TALA (?) : 𔖞 421 | TANA (?) : 𔗢 422 | TELIPINU : 𔒲 423 | TERRA : 𔓤, 𔕝 424 | TEŠUB : 𔕥 425 | THRONUS : 𔕊 426 | THRONUS : 𔕋 427 | THRONUS₂ : 𔕏 428 | TONITRUS : 𔓢 429 | TURRIS ? 
: 𔔍 430 | UNGULA : 𔒗 431 | UNUS : 𔖭 432 | UR : 𔖙 433 | URBS : 𔔂 434 | URBS+li : 𔔅 435 | URBS-li : 𔔅 436 | URBS-RA+li : 𔔄 437 | URBS-RI?+li : 𔔄 438 | URBS+RA-li : 𔔄 439 | URBS+RI?-li : 𔔄 440 | URCEUS : 𔖆, 𔖍, 𔖎, 𔖏, 𔘻 441 | US : 𔗚 442 | # VACUUS : : 𔔗 443 | 444 | VAS : 𔖂 445 | VASTUS : 𔔗 446 | VIA : 𔓾, 𔑕, 𔓿 447 | VIA+TERRA.SCALPRUM : 𔓥 448 | VIA+TERRA+SCALPRUM : 𔓦 449 | VINUM : 𔒻 450 | VIR : 𔕟 (earlier variant), 𔕠 451 | VIR₂ : 𔖶 (word separator) 452 | VIR₂.MINUS : 𔖯 453 | VITA : 𔖡 454 | VITELLUS : 𔒃 455 | VITIS : 𔒻 456 | 457 | 2 : 𔖳 458 | 3 : 𔖸 459 | 4 : 𔖻 460 | 5 : 𔖼 461 | 8 : 𔖽 462 | 9 : 𔖿 463 | 12 : 𔘍 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /potnia/data/linear_a.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | ############################################################ 3 | ###### Syllabograms Common to Linear A and Linear B ######## 4 | ############################################################ 5 | a: 𐀀 6 | e: 𐀁 7 | i: 𐀂 8 | o: 𐀃 9 | u: 𐀄 10 | da: 𐀅 11 | de: 𐀆 12 | di: 𐀇 13 | do: 𐀈 14 | du: 𐀉 15 | ja: 𐀊 16 | je: 𐀋 17 | jo: 𐀍 18 | ju: 𐀎 19 | ka: 𐀏 20 | ke: 𐀐 21 | ki: 𐀑 22 | ko: 𐀒 23 | ku: 𐀓 24 | ma: 𐀔 25 | me: 𐀕 26 | mi: 𐀖 27 | mo: 𐀗 28 | mu: 𐀘 29 | na: 𐀙 30 | ne: 𐀚 31 | ni: 𐀛 32 | "no": 𐀜 33 | nu: 𐀝 34 | nwa: 𐁅 35 | pa: 𐀞 36 | pe: 𐀟 37 | pi: 𐀠 38 | po: 𐀡 39 | pu: 𐀢 40 | pu₂: 𐁆 41 | pu2: 𐁆 42 | qa: 𐀣 43 | qe: 𐀤 44 | qi: 𐀥 #𐘏? 45 | qif: 𐘐 46 | qo: 𐀦 47 | ra: 𐀨 48 | ra₂: 𐁈 49 | ra2: 𐁈 50 | re: 𐀩 51 | ri: 𐀪 52 | ro: 𐀫 53 | ru: 𐀬 54 | sa: 𐀭 55 | se: 𐀮 56 | si: 𐀯 57 | so: 𐀰 58 | su: 𐀱 59 | ta: 𐀲 60 | ta₂: 𐁋 61 | ta2: 𐁋 62 | te: 𐀳 63 | ti: 𐀴 64 | to: 𐀵 65 | tu: 𐀶 66 | wa: 𐀷 67 | we: 𐀸 68 | wi: 𐀹 69 | wo: 𐀺 70 | za: 𐀼 71 | ze: 𐀽 72 | zo: 𐀿 73 | 74 | "*22": 𐁒 75 | "*22f": 𐘓 76 | "*34": 𐁓 77 | "*47": 𐁔 78 | "*49": 𐁕 79 | "*56": 𐘰 80 | "*72": "?" #FIND # 𐘽 ? 81 | "*79": 𐙀 82 | "*82": 𐙃 83 | "*86": 𐁜 84 | "*118": 𐙈 85 | 86 | ############################################################ 87 | ###### Syllabograms Unique to Linear A ######## 88 | ############################################################ 89 | "*100": 𐙇 90 | "*123": 𐙌 91 | "*131a": 𐙍 92 | "*131b": 𐙎 93 | "*164a": 𐙐 94 | "*180": 𐙒 95 | "*188": 𐙓 96 | A301: 𐙕 97 | A305: 𐙙 98 | A309a: 𐙝 99 | A309b: 𐙞 100 | A309c: 𐙟 101 | A310: 𐙠 102 | A312: 𐙢 103 | A314: 𐙦 104 | A320: 𐙬 105 | A321: 𐙭 106 | A322: 𐙮 107 | A323: 𐙯 108 | A324: 𐙰 109 | A325: 𐙱 110 | A327: 𐙳 111 | A328: 𐙴 112 | A329: 𐙵 113 | A331: 𐙷 114 | A333: 𐙹 115 | A340: 𐚀 116 | A342: 𐚂 117 | A345: 𐚅 118 | A349: 𐚉 119 | A350: 𐚊 120 | A352: 𐚌 121 | A353: 𐚍 122 | A354: 𐚎 123 | A355: 𐚏 124 | A356: 𐚐 125 | A357: 𐚑 126 | A358: 𐚒 127 | A359: 𐚓 128 | A360: 𐚔 129 | A361: 𐚕 130 | A362: 𐚖 131 | A363: 𐚗 132 | A364: 𐚘 133 | 134 | ############################################################ 135 | ##### Logograms Common Between Linear A and Linear B ####### 136 | ############################################################ 137 | 138 | # LOGOGRAMS attested in Linear A and Linear B (SigLA codes) 139 | AB01: 𐘀 140 | AB02: 𐘁 141 | AB03: 𐘂 142 | AB04: 𐘃 143 | AB05: 𐘄 144 | AB06: 𐘅 145 | AB07: 𐘆 146 | AB08: 𐘇 147 | AB09: 𐘈 148 | AB10: 𐘉 149 | AB11: 𐘊 150 | AB13: 𐘋 151 | AB16: 𐘌 152 | AB17: 𐘍 153 | AB20: 𐘎 154 | "AB21/OVIS": 𐘏 #OVIS 155 | "AB21/OVISf": 𐘐 #OVIS 156 | "AB21/OVISm": 𐘑 #OVIS 157 | "AB22/CAP": 𐘒 #CAP 158 | "AB22/CAPf": 𐘓 #CAP 159 | "AB22/CAPm": 𐘔 #CAP 160 | "AB23/BOS": 𐘕 #BOS 161 | "AB23/BOSm": 𐘖 #BOS 162 | AB24: 𐘗 163 | AB26: 𐘘 164 | AB27: 𐘙 165 | AB28: 𐘚 166 | AB28B: 𐘛 167 | AB29: 𐘜 168 | "AB30/FIC": 𐘝 #FIC 169 | AB31: 𐘞 170 | AB34: 𐘟 171 | AB37: 𐘠 172 | AB38: 𐘡 173 | AB39: 𐘢 174 | AB40: 𐘣 175 | AB41: 𐘤 176 | AB44: 𐘥 177 | AB45: 𐘦 178 | AB46: 𐘧 179 | AB47: 𐘨 180 | AB48: 𐘩 181 | AB49: 𐘪 182 | AB50: 𐘫 183 | AB51: 𐘬 184 | AB53: 𐘭 185 | "AB54/TELA": 𐘮 #TELA 186 | AB55: 𐘯 187 | AB56: 𐘰 188 | AB57: 𐘱 189 | AB58: 𐘲 190 | AB59: 𐘳 191 | AB60: 𐘴 192 | AB61: 𐘵 193 | AB65: 𐘶 194 | AB66: 𐘷 195 | AB67: 𐘸 196 | AB69: 𐘹 197 | AB70: 𐘺 198 | AB73: 𐘻 199 | AB74: 𐘼 200 | AB76: 𐘽 201 | AB77: 𐘾 202 | AB78: 𐘿 203 | AB79: 𐙀 204 | AB80: 𐙁 205 | AB81: 𐙂 206 | AB82: 𐙃 207 | "AB85/SUS": 𐙄 208 | AB86: 𐙅 209 | AB87: 𐙆 210 
| AB100/VIR: 𐙇 211 | AB118: 𐙈 212 | "AB120:/GRA": 𐙉 213 | "AB120/GRAb": 𐙊 214 | "AB122/OLIV": 𐙋 215 | AB123: 𐙌 #AROM 216 | AB131/VINa: 𐙍 217 | AB131/VINb: 𐙎 218 | AB131/VINc: 𐙏 219 | AB164a: 𐙐 220 | AB164b: 𐙐 221 | AB164c: 𐙐 222 | AB164d: 𐙐 223 | AB171: 𐙑 224 | AB180: 𐙒 225 | AB188: 𐙓 226 | AB191: 𐙔 227 | AB302/OLE: 𐙖 228 | 229 | # LOGOGRAMS only attested in Linear A 230 | A302: 𐙖 231 | A303: 𐙗 232 | A304: 𐙘 233 | A306: 𐙚 234 | A308: 𐙜 235 | A315: 𐙧 236 | A316: 𐙨 237 | A317: 𐙩 238 | A332: 𐙸 239 | A334: 𐙺 240 | A335: 𐙻 241 | A336: 𐙼 242 | A338: 𐙾 243 | A339: 𐙿 244 | A343: 𐚃 245 | A344: 𐚄 246 | A347: 𐚇 247 | A365: 𐚙 248 | A366: 𐚚 249 | A367: 𐚛 250 | A368: 𐚜 251 | A369: 𐚝 252 | A370: 𐚞 253 | A371: 𐚟 254 | ## Vases 255 | A400VAS: 𐚠 256 | A401VAS: 𐚡 257 | A402VAS: 𐚢 258 | A403VAS: 𐚣 259 | A404VAS: 𐚤 260 | A405VAS: 𐚥 261 | A406VAS: 𐚦 262 | A407VAS: 𐚧 263 | A408VAS: 𐚨 264 | A409VAS: 𐚩 265 | A410VAS: 𐚪 266 | A411VAS: 𐚫 267 | A411VASa: 𐚫 268 | A411VASb: 𐚫 269 | A411VASc: 𐚫 270 | A412VAS: 𐚬 271 | A413VAS: 𐚭 272 | A414VAS: 𐚮 273 | A415VAS: 𐚯 274 | A416VAS: 𐚰 275 | A417VAS: 𐚱 276 | A418VAS: 𐚲 277 | 278 | # TRANSACTIONAL SIGNS 279 | A307: 𐙛 280 | A318: 𐙪 281 | A319: 𐙫 282 | A326: 𐙲 283 | A346: 𐚆 284 | 285 | # FRACTIONS 286 | A701: 𐝀 287 | # A: 𐝀 288 | A702: 𐝁 289 | # B: 𐝁 290 | A703: 𐝂 291 | # D: 𐝂 292 | A704: 𐝃 293 | # E: 𐝃 294 | A705: 𐝄 295 | # F: 𐝄 296 | A706: 𐝅 297 | # H: 𐝅 298 | A707: 𐝆 299 | # J: 𐝆 300 | A708: 𐝇 301 | # K: 𐝇 302 | A709: 𐝈 303 | # L: 𐝈 304 | A7092: 𐝉 305 | # L2: 𐝉 306 | A7093: 𐝊 307 | # L3: 𐝊 308 | A7094: 𐝋 309 | # L4: 𐝋 310 | A7096: 𐝌 311 | # L6: 𐝌 312 | A710: 𐝍 313 | # W: 𐝍 314 | A711: 𐝎 315 | # X: 𐝎 316 | A712: 𐝏 317 | # Y: 𐝏 318 | A713: 𐝐 319 | OMEGA: 𐝐 # CHECK 320 | A714: 𐝑 321 | # ABB: 𐝑 322 | A715: 𐝒 323 | # BB: 𐝒 324 | A717: 𐝓 325 | # DD: 𐝓 326 | A726: 𐝔 327 | # EYYY: 𐝔 328 | A732: 𐝕 329 | #bJE: 𐝕 330 | 331 | # LIGATURES 332 | A311: 𐙡 333 | A313a: 𐙣 334 | A313b: 𐙤 335 | A313c: 𐙥 336 | A330: 𐙶 337 | A337: 𐙽 338 | A341: 𐚁 339 | A348: 𐚈 340 | A351: 𐚋 341 | 342 | # COMPOUND SIGNS 343 | A501: 𐚳 344 | A502: 𐚴 345 | A503: 𐚵 346 | A504: 𐚶 347 | A505: 𐚷 348 | A506: 𐚸 349 | A508: 𐚹 350 | A509: 𐚺 351 | A510: 𐚻 352 | A511: 𐚼 353 | A512: 𐚽 354 | A513: 𐚾 355 | A515: 𐚿 356 | A516: 𐛀 357 | A520: 𐛁 358 | A521: 𐛂 359 | A523: 𐛃 360 | A524: 𐛄 361 | A525: 𐛅 362 | A526: 𐛆 363 | A527: 𐛇 364 | A528: 𐛈 365 | A529: 𐛉 366 | A530: 𐛊 367 | A531: 𐛋 368 | A532: 𐛌 369 | A534: 𐛍 370 | A535: 𐛎 371 | A536: 𐛏 372 | A537: 𐛐 373 | A538: 𐛑 374 | A539: 𐛒 375 | A540: 𐛓 376 | A541: 𐛔 377 | A542: 𐛕 378 | A545: 𐛖 379 | A547: 𐛗 380 | A548: 𐛘 381 | A549: 𐛙 382 | A550: 𐛚 383 | A551: 𐛛 384 | A552: 𐛜 385 | A553: 𐛝 386 | A554: 𐛞 387 | A555: 𐛟 388 | A556: 𐛠 389 | A557: 𐛡 390 | A559: 𐛢 391 | A563: 𐛣 392 | A564: 𐛤 393 | A565: 𐛥 394 | A566: 𐛦 395 | A568: 𐛧 396 | A569: 𐛨 397 | A570: 𐛩 398 | A571: 𐛪 399 | A572: 𐛫 400 | A573: 𐛬 401 | A574: 𐛭 402 | A575: 𐛮 403 | A576: 𐛯 404 | A577: 𐛰 405 | A578: 𐛱 406 | A579: 𐛲 407 | A580: 𐛳 408 | A581: 𐛴 409 | A582: 𐛵 410 | A583: 𐛶 411 | A584: 𐛷 412 | A585: 𐛸 413 | A586: 𐛹 414 | A587: 𐛺 415 | A588: 𐛻 416 | A589: 𐛼 417 | A591: 𐛽 418 | A592: 𐛾 419 | A594: 𐛿 420 | A595: 𐜀 421 | A596: 𐜁 422 | A598: 𐜂 423 | A600: 𐜃 424 | A601: 𐜄 425 | A602: 𐜅 426 | A603: 𐜆 427 | A604: 𐜇 428 | A606: 𐜈 429 | A608: 𐜉 430 | A609: 𐜊 431 | A610: 𐜋 432 | A611: 𐜌 433 | A612: 𐜍 434 | A613: 𐜎 435 | A614: 𐜏 436 | A615: 𐜐 437 | A616: 𐜑 438 | A617: 𐜒 439 | A618: 𐜓 440 | A619: 𐜔 441 | A620: 𐜕 442 | A621: 𐜖 443 | A622: 𐜗 444 | A623: 𐜘 445 | A624: 𐜙 446 | A626: 𐜚 447 | A627: 𐜛 448 | A628: 𐜜 449 | A629: 𐜝 450 | A634: 𐜞 451 | A637: 𐜟 
452 | A638: 𐜠 453 | A640: 𐜡 454 | A642: 𐜢 455 | A643: 𐜣 456 | A644: 𐜤 457 | A645: 𐜥 458 | A646: 𐜦 459 | ## Vases 460 | A648VAS: 𐜧 461 | A649VAS: 𐜨 462 | A651VAS: 𐜩 463 | A652VAS: 𐜪 464 | A653VAS: 𐜫 465 | A654VAS: 𐜬 466 | A655VAS: 𐜭 467 | A656VAS: 𐜮 468 | A657VAS: 𐜯 469 | A658VAS: 𐜰 470 | A659VAS: 𐜱 471 | A660VAS: 𐜲 472 | A661VAS: 𐜳 473 | A662VAS: 𐜴 474 | A663VAS: 𐜵 475 | A664VAS: 𐜶 476 | 477 | # ADDITIONAL SIGNS 478 | A800: 𐝠 479 | A801: 𐝡 480 | A802: 𐝢 481 | A803: 𐝣 482 | A804: 𐝤 483 | A805: 𐝥 484 | A806: 𐝦 485 | A807: 𐝧 486 | 487 | patterns_to_ignore: 488 | - "vacat\\s*\\.?" 489 | - "lat\\s*\\." 490 | - "inf\\s*\\." 491 | - "i\\s*\\." 492 | - "mut\\s*\\." 493 | - "sup\\s*\\." 494 | - "vac\\s*\\." 495 | - "v\\s*\\." 496 | - "vestigia" 497 | - "l\\s*\\." 498 | - "s\\s*\\." 499 | - "Graffito" 500 | - "[\\/\\,\\'\\?]" 501 | - "⟦.*?⟧" 502 | - "deest" 503 | - "[⸤⸥]" 504 | - "[\\u231e\\u231f]" # Ignore characters ⌞ and ⌟ 505 | 506 | 507 | regularization: 508 | - ['\\[\\?\\]', '%'] 509 | - ['\\[unclassified\\]', '%'] # Maps `[unclassified]` to `%` 510 | - ['[?]','%'] 511 | - ['\\|', ''] 512 | - [':', ''] 513 | - ['r\\.', ''] 514 | - ['\\[•~\\]', ''] 515 | - ['⌜', ''] 516 | - ['⌝', ''] 517 | - ['mutila', ''] 518 | - ['\\[?•~•~•~•\\]?', '%%%%'] 519 | - ['\\[?•~•~•\\]?', '%%%'] 520 | - ['\\[?•~•~\\]?', '%%'] 521 | - ['\\[?•~•\\]?', '%%'] # Corrected this line 522 | - ['\\<|\\>', ''] 523 | - ['\\[ \\]', '[ ]'] 524 | - ['ro2', '𐁊'] 525 | - ['vestigia', '%'] 526 | - ['\\bqs\\b', '%'] 527 | - ['vest\\s*\\.', '%'] 528 | - ['\\[•\\]', '%'] 529 | - ['supra sigillum|CMS \\w+\\d+[A-Z]* \\d+', ''] 530 | - ['reliqua pars sine regulis', ''] 531 | - ['[αβγ]', ''] 532 | - ['v\\.→', ''] 533 | - ['v\\.↓', ''] 534 | - ['v\\.', ''] 535 | - ['\\b(vacat|sup. mut.|inf. mut.|deest|X|fragmentum A|fragmentum B|graffito|angustum|prior pars sine regulis|fragmentum C|fragmentum D|fragmentum separatum|α|β|γ|δ|sigillum|)\\b', ''] # Corrected this line 536 | - ['\\b(x|m|f)\\b', ''] 537 | - ['[\\[\\]]', '%'] 538 | - ['=[^ ]*', ''] 539 | - ['•', '%'] 540 | - ['●', ''] 541 | - ['dex.', ''] 542 | - ['sin.', ''] 543 | 544 | tokenization: 545 | - ['\u00a0', ' '] # Replace non-breaking space with regular space 546 | - ['\u0323', ''] # Remove specific character (e.g., dot below) 547 | - ['', ''] # Remove HTML closing emphasis tag 548 | - ['', ''] # Remove HTML opening emphasis tag 549 | - ['\|([^|]+)\|', '|\1|'] # Special handling to ensure pipes are treated as separate tokens 550 | - ['ME<±RI>', 'ME±RI'] # Handle specific compound tokens like 'ME<±RI>' 551 | - ['--', '-'] # Normalize the text by replacing double dashes with a single dash 552 | - ['\b(EQU|SUS|OVIS|BOS|CAP)\s+(x|m|f)\b', '\1\2'] # Combine animal ideograms followed by 'x', 'm', or 'f' without space 553 | - ['⌜', ' ⌜ '] # Explicit tokenization for half brackets 554 | - ['⌝', ' ⌝ '] 555 | - ['mutila', ' mutila '] # Handle 'mutila' 556 | - ['fragmentum A', 'fragmentum_A'] # Preprocess 'fragmentum A' and 'fragmentum B' to ensure they are not split 557 | - ['fragmentum B', 'fragmentum_B'] 558 | - ['\b(BOS|SUS|OVIS|CAP|EQU)\s([mf])\b', '\1\2'] # Combine terms with 'm' or 'f' 559 | - ['\](?=[^\s])', ']-'] # Pre-process ']' and '[' for special handling 560 | - ['(?<=[^\s])\[', '-['] 561 | - ['TELA\s+(?=[1234x]\b)', 'TELA'] # Handle specific cases 562 | - ['TELA\s+(\d+)', 'TELA \1'] # Handle other numbers with space 563 | - ['\* (\d+)', '*\1'] # Combine '*' with the following numeral 564 | - ['\+ ([^\s]+)', '+\1'] # Combine '+' with surrounding ideograms 565 | - ['([^\s]) \+', 
'\1+'] # Ensure '+' is properly attached 566 | - ['([^\s]+) VAS', '\1VAS'] # Attach 'VAS' properly 567 | - ['\b(vac|vest|l|s|lat|inf|mut|sup|i)\s?\.', '\1.'] # Ignore or modify specific patterns 568 | - ['\b(supra sigillum|reliqua pars sine regulis|vacat)\b', '\1'] # Explicit tokenization 569 | 570 | complex_symbols: 571 | 'TELA-[;1+TE': 'PLACEHOLDER_TELA1' 572 | 'TELA;1+TE': 'PLACEHOLDER_TELA2' 573 | 'TELA-[;1]-+TE': 'PLACEHOLDER_TELA3' 574 | 'OVIS]-:m': 'PLACEHOLDER_OVIS' 575 | 576 | special_chars_pattern: "(\\[|\\]|\\,|\\'|\\u27e6|\\u27e7|-|\\?|<|>|⌞|⌟|⸤|⸥|\\||\ue000)" 577 | 578 | restore_patterns: 579 | - ['fragmentum_A', 'fragmentum A'] 580 | - ['fragmentum_B', 'fragmentum B'] 581 | - ['ME±RI', 'ME<±RI>'] -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Potnia: A Python library for the conversion of transliterated ancient texts to Unicode' 3 | tags: 4 | - Python 5 | - Unicode 6 | - ancient texts 7 | - ancient scripts 8 | - ancient languages 9 | - transliteration 10 | - machine learning 11 | authors: 12 | - name: Emily Tour 13 | orcid: 0000-0001-5212-1427 14 | equal-contrib: true # (This is how you can denote equal contributions between multiple authors) 15 | corresponding: true 16 | affiliation: "1" 17 | - name: Kabir Manandhar Shrestha 18 | orcid: 0009-0001-2059-1683 19 | equal-contrib: true 20 | affiliation: 2 21 | - name: Robert Turnbull 22 | orcid: 0000-0003-1274-6750 23 | corresponding: false 24 | equal-contrib: true 25 | affiliation: 2 26 | affiliations: 27 | - name: University of Melbourne, Australia 28 | index: 1 29 | - name: Melbourne Data Analytics Platform, University of Melbourne, Australia 30 | index: 2 31 | date: 23 September 2024 32 | bibliography: paper.bib 33 | --- 34 | 35 | # Summary 36 | 37 | Potnia is an open-source Python library designed to convert Romanized transliterations of ancient texts into their respective Unicode representations. Significant progress has been made in the digitization of ancient language corpora. However, many of these datasets are solely presented in transliterated form, even though the necessary Unicode blocks exist to render them using their native script. This restriction to using transliterated datasets for certain ancient scripts has the potential to limit the precision of linguistic analysis via machine learning. 38 | 39 | Potnia bridges this gap by providing a flexible framework for converting transliterations into Unicode. By enabling tokenization and processing in the original script, Potnia can optimize tasks such as textual restoration and machine learning-based analysis. The library currently supports: 40 | 41 | - Linear A 42 | - Linear B 43 | - Hittite cuneiform 44 | - Arabic 45 | 46 | While Linear B has the most comprehensive test cases and is the most robust, the tool can also be used effectively for the other scripts. The architectural flexibility of Potnia makes it easy to accommodate additional scripts, offering significant value to both computational linguistics and digital humanities by enabling researchers to work with ancient texts in their native scripts. 
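As a brief illustration of the intended usage, the following sketch converts a Romanized Linear B transliteration into Unicode. It is a minimal sketch, assuming the per-script callable interface suggested by the `potnia.scripts` modules and the `__call__` method described under Implementation below; the expected output is taken from the project's own test data rather than verified here.

```python
from potnia import linear_b  # assumed export of the Linear B Script instance

# 'a-ri-to-jo' should map to '𐀀𐀪𐀵𐀍' (see tests/expected/linear_b_unicode.yaml)
print(linear_b("a-ri-to-jo"))
```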
47 | 48 | # Statement of Need 49 | 50 | While machine learning has increasingly been applied to the study of ancient texts [@sommerschieldMachineLearningAncient2023], much of this progress has involved working with transliterated texts, rather than native script formats [@luoNeuralDeciphermentMinimumcost2019; @papavassileiouGenerativeModelMycenaean2023; @fetayaRestorationFragmentaryBabylonian2020; @peronocacciafocoNewApproachDecipherment2021]. Although Unicode standards exist for many ancient scripts, transliterated texts remain prevalent due to historical digitization practices. 51 | 52 | Transliteration is the process of converting text from its original script into a different script according to a systematic set of rules. It allows those who can understand the secondary script to comprehend the orthography and the approximate pronunciation of the original text. Before the gradual introduction of the relevant Unicode blocks from the 1990s onwards [@Hossain2024], transliteration was also usually necessary for representing non-Latin scripts on Western computational systems, which were largely confined to letters of the Latin alphabet and a small number of special characters. 53 | 54 | Transliteration has an important place in aiding new learners of an ancient script to understand the pronunciation and orthography of the underlying language it represents (particularly for non-alphabetic scripts, where beginners need to grasp a vast repertoire of unfamiliar signs). However, it is well recognized that this process can only provide an approximate, and often unsatisfactory or disputed, representation of the original text [@weinbergTransliterationDocumentation1974; @odishoTransliteratingEnglishArabic1992; @martinetProjectTransliterationClassical1953]. In particular, a lack of standardized approaches to transliteration can introduce considerable ambiguity and noise into the dataset in a variety of ways, including: 55 | 56 | - the use of various notation systems, with different transliterations representing the exact same sign in distinct ways (e.g. where ![Cuneiform sign 'mè'](docs/_static/img/Csign_me3_large.png){height="11pt"} in Akkadian cuneiform can be represented as either 'mè' or 'me\textsubscript{3}'); 57 | - changing opinions on particular sign values over time, introducing possible differences between older and newer transliterations (e.g. ![Linear B sign 'qa'](docs/_static/img/LBsign_qa_large2.png){height="11pt"} in the Linear B script changing from the previously suggested value of 'pa\textsubscript{2}' to 'qa') [@chadwickDocumentsMycenaeanGreek1973, pp. 389, 391]; 58 | - and the way in which transliteration obscures polyvalency in scripts, where a single sign can represent multiple different values (e.g. ![Cuneiform sign 'ḫar'](docs/_static/img/Csign_har_large.png){height="11pt"} in Hittite cuneiform can represent three different syllables, transliterated as 'ḫar', 'ḫur' and 'mur', as well as acting as a logogram for three different words, 'ring', 'thick' and 'lung'). 59 | 60 | For language modelling tasks, we therefore suggest that representations of texts in their native form are preferable to achieve the most accurate results. A number of digitized corpora for well-resourced and widely studied ancient languages are now available in Unicode representations of their native script, including a corpus of ancient Greek [@canonicalgreek], classical Hebrew [@sefaria_project], Syriac [@digital_syriac] and Arabic [@openiti].
However, many other online text corpora remain restricted to Romanized transliterations (despite the availability of relevant Unicode standards), presumably due to considerations around ease, system limitations and accessibility, e.g. Linear B [@auroraDAMOSDatabaseMycenaean2015], Ugaritic [@prosser2019ras] and Sumero-Akkadian cuneiform [@cdli2024]. For this latter group of scripts, existing tools capable of converting transliterated ancient texts to the corresponding Unicode appear limited to a handful of single-script utilities, such as the various implementations of 'Cuneify' [@tinney2019cuneify] that handle Sumero-Akkadian cuneiform, the PHP script 'UnicodeConverter' for Egyptian hieroglyphs [@ilintomich2021unicodeconverter], or the 'Anatolian Hieroglyphics (Luwian) generation' tool for Luwian hieroglyphs [@senior2023anatoliangenerator], the latter available only through a basic online graphical user interface. While the PyArabic package [@Zerrouki2023] is able to convert Arabic text to and from the popular Timothy Buckwalter transliteration system, Potnia provides complementary functionality for the DIN 31635 transliteration system [@DIN31635], which is widely used in academic literature. 61 | 62 | In addition, such transliterations of ancient texts are often heavily annotated, with special characters used to denote a range of features including uncertain readings, missing or damaged elements, erasures, non-textual marks, and annotations by modern transliterators pertaining to structural or physical elements of the document. If not removed or handled appropriately, these have the potential to introduce further noise into language models. 63 | 64 | These are the primary gaps we have aimed to address through the development of Potnia. The library's focus on ancient scripts and its extensible architecture make it a valuable asset for researchers working with digitized ancient corpora. It is also equipped to provide specific handling of these elements, with tailored tokenization and regularization rules pertaining to both script-specific and corpus-specific conventions. Potnia therefore enables a key pre-processing step in the language modelling pipeline, with the resulting Unicode outputs supporting more accurate and nuanced computational analysis of these texts in downstream modelling tasks. 65 | 66 | # Implementation 67 | 68 | Potnia is implemented in Python with an extensible architecture centered around the `Script` class, which converts transliterated texts into Unicode representations. It is designed to handle the complexities of ancient scripts through a flexible and customizable framework. 69 | 70 | ## Key Features 71 | 72 | 1. **YAML-Based Mapping and Rule Specification:** 73 | Each script in Potnia (e.g. Linear A, Linear B, Arabic, Hittite cuneiform) is configured via a single YAML file that contains syllabograms, logograms, and rules for transliteration and regularization. This unified structure simplifies updates, scales easily for new scripts, and eliminates the need for hardcoded source files (fig. \ref{fig:mappings}). 74 | 75 | ![Example of YAML mapping specification.\label{fig:mappings}](docs/_static/img/mappings.png){ width=50% } 76 | 77 | 2. **Tokenization:** The `tokenize_transliteration` method applies complex symbol replacements and regular expressions to transliterated text based on the rules specified in the YAML file.
This tokenization process ensures that the text is split accurately into its meaningful components, handling special symbols and spacing using placeholders, and preparing the text for Unicode conversion. 78 | 79 | 3. **Transliteration to Unicode:** Potnia uses the `__call__` method to convert the transliterated text to its Unicode representation (fig. \ref{fig:potnia-example}). 80 | 81 | ![Example of using Potnia.\label{fig:potnia-example}](docs/_static/img/potnia-example.png){ width=80% } 82 | 83 | 4. **Regularization of Text:** The `regularize` method applies a series of regular expression rules to clean and normalize the Unicode output. It removes unnecessary tags, ignores patterns specified in the YAML file (e.g. annotations or uncertain characters), and retains only the essential characters, leaving the output refined and ready for downstream tasks. 84 | 85 | 5. **Comprehensive Testing:** Pytest fixtures allow us to define test cases as lines in YAML files, which let us concisely add over 360 test examples covering a broad range of edge cases. The tests achieve 100% code coverage. 86 | 87 | 6. **Versatile Interface Options:** Users can interact with Potnia as a Python library, through the command-line interface (CLI), or through the graphical user interface (GUI) (fig. \ref{fig:potnia-gui}). 88 | 89 | ![Example of using the Potnia GUI.\label{fig:potnia-gui}](docs/_static/img/potnia-gui.png){ width=80% } 90 | 91 | 92 | # Research Application 93 | 94 | Potnia’s design and functionality address the following challenges in the analysis of ancient texts: 95 | 96 | 1. **Extensibility:** Potnia is designed to be highly extensible, allowing researchers to integrate new scripts by defining script-specific rules for tokenization and conversion. This flexibility makes the library suitable for a wide range of ancient scripts whose digitized corpora are not yet represented in Unicode, providing a valuable tool for researchers across various fields of ancient studies. 97 | 98 | 2. **Integration with Research Workflows:** Researchers can easily incorporate Potnia into their existing workflows. For example, Potnia could be used to preprocess a corpus of Linear B texts before feeding them into a machine learning model for further analysis. 99 | 100 | As part of a broader initiative to develop language models for ancient language research, Potnia serves as a foundational component by converting Romanized transliterations of Linear B texts into Unicode datasets for computational analysis. These datasets enable the development of language-specific models supporting tasks such as text generation, restoration and vector embedding analysis. The library's modular design facilitates its application to additional ancient scripts, contributing to broader research initiatives in computational philology. 101 | 102 | # Availability 103 | 104 | Potnia is open-source software released under the Apache 2.0 license. It is available through PyPI [https://pypi.org/project/potnia/](https://pypi.org/project/potnia/) and GitHub [https://github.com/AncientNLP/potnia](https://github.com/AncientNLP/potnia). We welcome contributions from the community and adhere to the Contributor Covenant Code of Conduct. Documentation is available at [https://ancientnlp.github.io/potnia/](https://ancientnlp.github.io/potnia/).
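To make the preprocessing pipeline concrete, the sketch below chains the tokenization, conversion and regularization steps described under Key Features. The method names are taken from that feature list, but the exact signatures and import path are assumptions rather than confirmed API; the sample input and its expected annotated output come from the project's test data.

```python
from potnia import linear_b  # assumed export, as in the earlier example

line = "]qa-ra / re-me-to * 168 + SE 28"  # annotated transliteration of a tablet line

# Split into sign-level tokens according to the YAML tokenization rules
tokens = linear_b.tokenize_transliteration(line)

# Convert to Unicode via __call__, e.g. ']𐀣𐀨 / 𐀩𐀕𐀵 𐂰+𐀮 28'
# (see tests/expected/linear_b_unicode.yaml)
unicode_text = linear_b(line)

# Strip editorial annotations from the Unicode output for downstream modelling
clean_text = linear_b.regularize(unicode_text)
```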
105 | 106 | # Acknowledgements 107 | 108 | We acknowledge support from Wytamma Wirth, Brent Davis, Kim Doyle, Man-Hua (Kate) Chu, Anhui (Ellie) Situ, Ekaterina Vylomova, Chris Guest and Stavroula (Stephie) Nikoloudis. This research was supported by The University of Melbourne’s Research Computing Services. Robert Turnbull completed part of this work through the BICROSS project, which has received funding from the European Research Council (ERC) under the European Union’s Horizon Europe research and innovation programme (grant agreement no. 101043730 – BICROSS – ERC-2021-COG). 109 | 110 | # References 111 | -------------------------------------------------------------------------------- /potnia/data/linear_b.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | ############################################################ 3 | ###### Syllabograms Common to Linear A and Linear B ######## 4 | ############################################################ 5 | # LB SOUND VALUES KNOWN 6 | a: 𐀀 7 | e: 𐀁 8 | i: 𐀂 9 | o: 𐀃 10 | u: 𐀄 11 | da: 𐀅 12 | de: 𐀆 13 | di: 𐀇 14 | do: 𐀈 15 | du: 𐀉 16 | ja: 𐀊 17 | je: 𐀋 18 | jo: 𐀍 19 | ju: 𐀎 20 | ka: 𐀏 21 | ke: 𐀐 22 | ki: 𐀑 23 | ko: 𐀒 24 | ku: 𐀓 25 | ma: 𐀔 26 | me: 𐀕 27 | mi: 𐀖 28 | mo: 𐀗 29 | mu: 𐀘 30 | na: 𐀙 31 | ne: 𐀚 32 | ni: 𐀛 33 | "no": 𐀜 34 | nu: 𐀝 35 | nwa: 𐁅 36 | pa: 𐀞 37 | pe: 𐀟 38 | pi: 𐀠 39 | po: 𐀡 40 | pu: 𐀢 41 | pu₂: 𐁆 42 | pu2: 𐁆 43 | qa: 𐀣 44 | qe: 𐀤 45 | qi: 𐀥 46 | qo: 𐀦 47 | ra: 𐀨 48 | ra₂: 𐁈 49 | ra2: 𐁈 50 | re: 𐀩 51 | ri: 𐀪 52 | ro: 𐀫 53 | ru: 𐀬 54 | sa: 𐀭 55 | se: 𐀮 56 | si: 𐀯 57 | so: 𐀰 58 | su: 𐀱 59 | ta: 𐀲 60 | ta₂: 𐁋 61 | ta2: 𐁋 62 | te: 𐀳 63 | ti: 𐀴 64 | to: 𐀵 65 | tu: 𐀶 66 | wa: 𐀷 67 | we: 𐀸 68 | wi: 𐀹 69 | wo: 𐀺 70 | za: 𐀼 71 | ze: 𐀽 72 | zo: 𐀿 73 | 74 | # LB SOUND VALUES UNKNOWN 75 | "*22": 𐁒 76 | "*34": 𐁓 77 | "*47": 𐁔 78 | "*49": 𐁕 79 | "*86": 𐁜 80 | 81 | ############################################################ 82 | ###### Syllabograms Unique to Linear B ######## 83 | ############################################################ 84 | 85 | ## Sound values known 86 | a₂: 𐁀 87 | a2: 𐁀 88 | a₃: 𐁁 89 | a3: 𐁁 90 | au: 𐁂 91 | dwe: 𐁃 92 | dwo: 𐁄 93 | pte: 𐁇 94 | ra₃: 𐁉 95 | ra3: 𐁉 96 | ro₂: 𐁊 97 | ro2: 𐁊 98 | twe: 𐁌 99 | two: 𐁍 100 | 101 | ## Sound values unknown 102 | "*18": 𐁐 103 | "*19": 𐁑 104 | # *35 missing unicode sign? 
105 | "*63": 𐁗 106 | "*83": 𐁛 107 | "*89": 𐁝 108 | 109 | # Doubtful sound values 110 | "*56": 𐁖 # potential sound value 'pa₃' 111 | "*64": 𐁘 # potential sound value 'swi' 112 | "*65": 𐀎 # potential sound value 'ju' 113 | "*79": 𐁙 # potential sound value 'zu' 114 | "*82": 𐁚 # potential sound value 'swa' 115 | 116 | ############################################################ 117 | ##### Logograms ####### 118 | ############################################################ 119 | # PEOPLE AND ANIMALS 120 | VIR: 𐂀 # man 121 | MUL: 𐂁 # woman 122 | CERV: 𐂂 # deer 123 | EQU: 𐂃 # horse 124 | EQUx: 𐂃 # horse 125 | EQU:x: 𐂃 # horse 126 | EQUf: 𐂄 # female horse 127 | EQU:f: 𐂄 # female horse 128 | EQUm: 𐂅 # male horse 129 | EQU:m: 𐂅 # male horse 130 | OVIS: 𐀥 # sheep 131 | OVISx: 𐀥 # sheep 132 | OVIS:x: 𐀥 # sheep 133 | OVISf: 𐂆 # female sheep 134 | OVIS:f: 𐂆 # female sheep 135 | OVISm: 𐂇 # male sheep 136 | OVIS:m: 𐂇 # male sheep 137 | CAP: 𐁒 # goat 138 | CAPx: 𐁒 # goat 139 | CAP:x: 𐁒 # goat 140 | CAPf: 𐂈 # female goat 141 | CAP:f: 𐂈 # female goat 142 | CAPm: 𐂉 # male goat 143 | CAP:m: 𐂉 # male goat 144 | SUS: 𐁂 # pig 145 | SUSx: 𐁂 # pig 146 | SUS:x: 𐁂 # pig 147 | SUSf: 𐂊 # female pig 148 | SUS:f: 𐂊 # female pig 149 | SUSm: 𐂋 # male pig 150 | SUS:m: 𐂋 # male pig 151 | BOS: 𐀘 # cattle 152 | BOSx: 𐀘 # cattle 153 | BOS:x: 𐀘 # cattle 154 | BOSf: 𐂌 # female cattle 155 | BOS:f: 𐂌 # female cattle 156 | BOSm: 𐂍 # male cattle 157 | BOS:m: 𐂍 # male cattle 158 | 159 | # DRY COMMODITIES 160 | GRA: 𐂎 # wheat or barley 161 | HORD: 𐂏 # barley 162 | OLIV: 𐂐 # olive 163 | AROM: 𐂑 # spice 164 | CYP: 𐂒 # cyperus 165 | PYC: 𐂒 # cyperus 166 | KA+PO: 𐂓 # fruit? 167 | KA±PO: 𐂓 # fruit? 168 | KA+NA+KO: 𐂔 # saffron 169 | KA±NA±KO: 𐂔 # saffron 170 | KANAKO: 𐂔 # saffron 171 | CROC: 𐁉 # saffron 172 | FAR: 𐀎 # flour 173 | 174 | # LIQUID COMMODITIES 175 | OLE: 𐂕 # olive oil 176 | VIN: 𐂖 # wine 177 | "*132": 𐂗 178 | A+RE+PA: 𐂘 # ointment 179 | A±RE±PA: 𐂘 # ointment 180 | ME+RI: 𐂙 # honey 181 | ME±RI: 𐂙 # honey 182 | ME<±RI>: 𐂙 # honey 183 | 184 | # METALS 185 | AES: 𐂚 # bronze 186 | AUR: 𐂛 # gold 187 | "*142": 𐂜 188 | 189 | # OTHER MATERIALS AND ITEMS 190 | LANA: 𐂝 # wool 191 | "*146": 𐂞 # linen garment? 192 | "*146;2": 𐂞² # linen garment? 193 | "*146+PE": 𐂞+𐀟 # linen garment? 194 | "*150": 𐂟 195 | CORN: 𐂠 # horn (wild goat) 196 | "*152": 𐂡 # oxhinde 197 | "*153": 𐂢 # sheepskin 198 | "*154": 𐂣 # hide? 199 | TU+RO₂: 𐂤 # cheese 200 | TU±RO2 : 𐂤 # cheese 201 | "*157": 𐂥 202 | "*158": 𐂦 203 | TELA: 𐂧 # cloth 204 | "*160": 𐂨 205 | "*161": 𐂩 206 | TUN: 𐂪 # breastplate 207 | ARM: 𐂫 # armour 208 | "*164": 𐂬 209 | "*165": 𐂭 210 | "*166": 𐂮 211 | "*167": 𐂯 # ingot 212 | "*168": 𐂰 213 | "*169": 𐂱 214 | "*170": 𐂲 215 | "*171": 𐂳 216 | "*172": 𐂴 # honeycomb? 217 | LUNA: 𐂵 # month's ration? 218 | "*174": 𐂶 219 | ARB: 𐂷 # tree 220 | "*177": 𐂸 221 | "*178": 𐂹 222 | "*179": 𐂺 223 | "*180": 𐂻 # parchment? 224 | "*181": 𐂼 225 | "*182": 𐂽 226 | "*183": 𐂾 227 | "*184": 𐂿 228 | "*185": 𐃀 229 | "*189": 𐃁 230 | "*190": 𐃂 231 | GAL: 𐃃 # helmet 232 | "*220": 𐃄 # footstool 233 | ALV: 𐃅 # bathtub 234 | HAS: 𐃆 # spear 235 | SAG: 𐃇 # arrow 236 | "*232": 𐃈 237 | PUG: 𐃉 # dagger 238 | "*234": 𐃊 239 | GUP: 𐃋 # dagger 240 | BIG: 𐃌 # chariot 241 | CUR: 𐃍 # chariot 242 | CAPS: 𐃎 # chariot frame 243 | ROTA: 𐃏 # wheel 244 | "*245": 𐃐 # chariot part? 245 | "*246": 𐃑 # chariot part? 246 | DI+PTE: 𐃒 # parchment? 247 | DI±PTE: 𐃒 # parchment? 
248 | "*248": 𐃓 249 | "*249": 𐃔 250 | "*251": 𐃕 251 | "*252": 𐃖 252 | "*253": 𐃗 253 | JAC: 𐃘 # javelin or dart 254 | "*255": 𐃙 255 | "*256": 𐃚 # bow? 256 | "*257": 𐃛 257 | "*258": 𐃜 258 | "*259": 𐃝 259 | 260 | # VESSELS 261 | "*155VAS": 𐃞 # basket? 262 | "*200VAS": 𐃟 263 | "*201VAS": 𐃠 # tripod 264 | "*202VAS": 𐃡 265 | "*203VAS": 𐃢 266 | "*204VAS": 𐃣 267 | "*205VAS": 𐃤 268 | "*206VAS": 𐃥 # hydria 269 | "*207VAS": 𐃦 270 | "*208VAS": 𐃧 # patera 271 | "*208aVAS": 𐃧 # patera 272 | "*208bVAS": 𐃧 # patera 273 | "*209VAS": 𐃨 # amphora 274 | "*210VAS": 𐃩 275 | "*211VAS": 𐃪 276 | "*212VAS": 𐃫 277 | "*213VAS": 𐃬 # lanx 278 | "*214VAS": 𐃭 279 | "*215VAS": 𐃮 # kylix 280 | "*216VAS": 𐃯 281 | "*217VAS": 𐃰 282 | "*218VAS": 𐃱 283 | "*219VAS": 𐃲 284 | "*221VAS": 𐃳 285 | "*222VAS": 𐃴 286 | "*226VAS": 𐃵 # washing ware shet 287 | "*227VAS": 𐃶 # rhyton 288 | "*228VAS": 𐃷 # ligula 289 | "*229VAS": 𐃸 # ladle 290 | "*250VAS": 𐃹 291 | "*305VAS": 𐃺 292 | 293 | # METRIC SYMBOLS 294 | Z: 𐄿 # volume measure 295 | V: 𐄾 # volume measure 296 | T: 𐄼 # dry measure 297 | S: 𐄽 # liquid measure 298 | Q: 𐄻 # weight measure 299 | P: 𐄺 # weight measure 300 | N: 𐄹 # weight measure 301 | M: 𐄸 # weight measure 302 | L: 𐄷 # talent 303 | 304 | # COMPOSITE IDEOGRAMS WITHOUT DESIGNATED UNICODE SIGNS 305 | OVIS+TA: 𐀥+𐀲 # stabled sheep 306 | SUS+KA: 𐁂+𐀏 # wild boar 307 | SUS+SI: 𐁂+𐀯 # fattened pigs 308 | BOS+SI: 𐀘+𐀯 # fattened cattle 309 | CAP+E: 𐁒+𐀁 # kid (goat) 310 | EQU+QE: 𐂃+𐀤 311 | GRA+Q: 𐂎+𐄻 312 | GRA+PE: 𐂎+𐀟 313 | OLIV+A: 𐂐+𐀀 # wild olive 314 | OLIV+TI: 𐂐+𐀴 # domestic olive 315 | AROM+CYP: 𐂑+𐂒 # cyperus 316 | AROM+PYC: 𐂑+𐂒 # cyperus 317 | AROM+KO: 𐂑+𐀒 # coriander 318 | CYP+KU: 𐂒+𐀓 # cyperus 319 | CYP+O: 𐂒+𐀡 # cyperus (variant) 320 | CYP+PA: 𐂒+𐀞 # cyperus (variant) 321 | CYP+QA: 𐂒+𐀣 # cyperus (variant) 322 | PYC+KU: 𐂒+𐀓 # cyperus 323 | PYC+O: 𐂒+𐀡 # cyperus (variant) 324 | PYC+PA: 𐂒+𐀞 # cyperus (variant) 325 | PYC+QA: 𐂒+𐀣 # cyperus (variant) 326 | OLE+A: 𐂕+𐀀 # wild oil? 327 | OLE+O: 𐂕+𐀃 328 | OLE+PA: 𐂕+𐀞 # sage-scented oil? 329 | OLE+RA: 𐂕+𐀨 330 | OLE+SI: 𐂕+𐀯 331 | OLE+WE: 𐂕+𐀸 # oil suitable for annointing 332 | ROTA+TE: 𐃏+𐀳 # wheel with border or flange 333 | TUN+KI: 𐂪+𐀑 334 | TUN+QE: 𐂪+𐀤 # type of corselet 335 | TUN+RI: 𐂪+𐀪 336 | TELA+KU: 𐂧+𐀓 337 | TELA+PA: 𐂧+𐀞 # pharwos cloth 338 | TELA+PU: 𐂧+𐀢 339 | TELA+TE: 𐂧+𐀳 340 | "*166+WE": 𐂮+𐀸 341 | "*167+PE": 𐂯+𐀟 342 | "*168+SE": 𐂰+𐀮 343 | "*172+KE": 𐂴+𐀐 344 | "*172+KE+RO2": 𐂴+𐀐+𐂤 345 | "*180+DI": 𐂻+𐀇 346 | "*155VAS+DI": 𐃞+𐀇 347 | "*155VAS+NI": 𐃞+𐀛 348 | "*202VAS+DI": 𐃡+𐀇 349 | "*209VAS+A": 𐃨+𐀀 # amphora (plural) 350 | "*210VAS+KA": 𐃩+𐀏 # stirrup jar (plural) 351 | "*211VAS+PO": 𐃪+𐂓 352 | "*212VAS+U": 𐃫+𐀄 353 | "*213VAS+U": 𐃬+𐀄 354 | "*214VAS+U": 𐃭+𐀄 355 | "*214VAS+DI": 𐃭+𐀇 356 | "AUR+*213VAS": 𐂛+𐃬 357 | 358 | # SYLLABOGRAMS AS IDEOGRAMS 359 | A: 𐀀 360 | E: 𐀁 # ginger grass? # kid (goat)? 361 | I: 𐀂 362 | O: 𐀃 363 | U: 𐀄 364 | DA: 𐀅 # male steward 365 | DE: 𐀆 # bundle (used to measure ginger grass) 366 | DI: 𐀇 367 | DO: 𐀈 368 | DU: 𐀉 369 | JA: 𐀊 370 | JE: 𐀋 371 | JO: 𐀍 372 | JU: 𐀎 373 | KA: 𐀏 374 | KE: 𐀐 375 | KI: 𐀑 376 | KO: 𐀒 # coriander # piglet? 
377 | KU: 𐀓 # cumin 378 | MA: 𐀔 # fennel 379 | ME: 𐀕 380 | MI: 𐀖 # mint 381 | MO: 𐀗 # single 382 | MU: 𐀘 383 | NA: 𐀙 384 | NE: 𐀚 385 | NI: 𐀛 # figs 386 | NO: 𐀜 387 | NU: 𐀝 388 | PA: 𐀞 389 | PE: 𐀟 390 | PI: 𐀠 391 | PO: 𐀡 392 | PU: 𐀢 393 | QA: 𐀣 394 | QE: 𐀤 395 | QI: 𐀥 396 | QO: 𐀦 397 | RA: 𐀨 398 | RE: 𐀩 399 | RI: 𐀪 # flax 400 | RO: 𐀫 401 | RU: 𐀬 402 | SA: 𐀭 # sesame 403 | SE: 𐀮 404 | SI: 𐀯 405 | SO: 𐀰 406 | SU: 𐀱 407 | TA: 𐀲 # female steward 408 | TE: 𐀳 409 | TI: 𐀴 410 | TO: 𐀵 411 | TU: 𐀶 412 | WA: 𐀷 413 | WE: 𐀸 # yearling (animal) 414 | WI: 𐀹 415 | WO: 𐀺 416 | ZA: 𐀼 417 | ZE: 𐀽 # pair 418 | ZO: 𐀿 419 | A₂: 𐁀 420 | A2: 𐁀 421 | A₃: 𐁁 422 | A3: 𐁁 423 | AU: 𐁂 424 | DWE: 𐁃 425 | DWO: 𐁄 426 | NWA: 𐁅 427 | PU₂: 𐁆 428 | PU2: 𐁆 429 | PTE: 𐁇 430 | RA₂: 𐁈 431 | RA2: 𐁈 432 | RA₃: 𐁉 433 | RA3: 𐁉 434 | RO₂: 𐁊 435 | RO2: 𐁊 436 | TA₂: 𐁋 437 | TA2: 𐁋 438 | TWE: 𐁌 439 | TWO: 𐁍 440 | 441 | # TELA CASES 442 | TELAx: 𐂧ˣ 443 | TELA;x: 𐂧ˣ 444 | TELA1: 𐂧¹ 445 | TELA;1: 𐂧¹ 446 | TELA2: 𐂧² 447 | TELA;2: 𐂧² 448 | TELA3: 𐂧³ 449 | TELA;3: 𐂧³ 450 | TELA4: 𐂧⁴ 451 | TELA;4: 𐂧⁴ 452 | TELAx+KU: 𐂧ˣ+𐀓 453 | TELA;x+KU: 𐂧ˣ+𐀓 454 | TELA1+KU: 𐂧¹+𐀓 455 | TELA;1+KU: 𐂧¹+𐀓 456 | TELA2+KU: 𐂧²+𐀓 457 | TELA;2+KU: 𐂧²+𐀓 458 | TELA3+KU: 𐂧³+𐀓 459 | TELA;3+KU: 𐂧³+𐀓 460 | TELA4+KU: 𐂧⁴+𐀓 461 | TELA;4+KU: 𐂧⁴+𐀓 462 | TELAx+PA: 𐂧ˣ+𐀞 463 | TELA;x+PA: 𐂧ˣ+𐀞 464 | TELA1+PA: 𐂧¹+𐀞 465 | TELA;1+PA: 𐂧¹+𐀞 466 | TELA2+PA: 𐂧²+𐀞 467 | TELA;2+PA: 𐂧²+𐀞 468 | TELA3+PA: 𐂧³+𐀞 469 | TELA;3+PA: 𐂧³+𐀞 470 | TELA4+PA: 𐂧⁴+𐀞 471 | TELA;4+PA: 𐂧⁴+𐀞 472 | TELAx+PO: 𐂧ˣ+𐀡 473 | TELA;x+PO: 𐂧ˣ+𐀡 474 | TELA1+PO: 𐂧¹+𐀡 475 | TELA;1+PO: 𐂧¹+𐀡 476 | TELA2+PO: 𐂧²+𐀡 477 | TELA;2+PO: 𐂧²+𐀡 478 | TELA3+PO: 𐂧³+𐀡 479 | TELA;3+PO: 𐂧³+𐀡 480 | TELA4+PO: 𐂧⁴+𐀡 481 | TELA;4+PO: 𐂧⁴+𐀡 482 | TELAx+PU: 𐂧ˣ+𐀢 483 | TELA;x+PU: 𐂧ˣ+𐀢 484 | TELA1+PU: 𐂧¹+𐀢 485 | TELA;1+PU: 𐂧¹+𐀢 486 | TELA2+PU: 𐂧²+𐀢 487 | TELA;2+PU: 𐂧²+𐀢 488 | TELA3+PU: 𐂧³+𐀢 489 | TELA;3+PU: 𐂧³+𐀢 490 | TELA4+PU: 𐂧⁴+𐀢 491 | TELA;4+PU: 𐂧⁴+𐀢 492 | TELAx+TE: 𐂧ˣ+𐀳 493 | TELA;x+TE: 𐂧ˣ+𐀳 494 | TELA1+TE: 𐂧¹+𐀳 495 | TELA;1+TE: 𐂧¹+𐀳 496 | TELA2+TE: 𐂧²+𐀳 497 | TELA;2+TE: 𐂧²+𐀳 498 | TELA3+TE: 𐂧³+𐀳 499 | TELA;3+TE: 𐂧³+𐀳 500 | TELA4+TE: 𐂧⁴+𐀳 501 | TELA;4+TE: 𐂧⁴+𐀳 502 | TELAx+ZO: 𐂧ˣ+𐀿 503 | TELA;x+ZO: 𐂧ˣ+𐀿 504 | TELA1+ZO: 𐂧¹+𐀿 505 | TELA;1+ZO: 𐂧¹+𐀿 506 | TELA2+ZO: 𐂧²+𐀿 507 | TELA;2+ZO: 𐂧²+𐀿 508 | TELA3+ZO: 𐂧³+𐀿 509 | TELA;3+ZO: 𐂧³+𐀿 510 | TELA4+ZO: 𐂧⁴+𐀿 511 | TELA;4+ZO: 𐂧⁴+𐀿 512 | 513 | #EXCEPTION SCENARIOS 514 | "TELA;1+": 𐂧¹+ 515 | "TELA[;1+TE": 𐂧¹%+𐀳 516 | "TELA-[;1+TE": 𐂧¹%+𐀳 517 | "TELA[;1]+TE": 𐂧¹%+𐀳 518 | "TELA-[;1]-+TE": 𐂧¹%+𐀳 519 | "TELA[;1": 𐂧¹% 520 | "TELA;6+": 𐂧⁶+ 521 | "TELA;4+": 𐂧⁴+ 522 | "TELA;2+": 𐂧²+ 523 | "TELA;+": 𐂧+ 524 | "TELA;" : 𐂧 525 | "*164;1": 𐂬¹ 526 | "+DI": +𐀇 527 | "+WE": +𐀸 528 | "+TE": +𐀳 529 | "+TA": +𐀲 530 | "OLE+": 𐂕+ 531 | "OVIS[:m": 𐂇 532 | "OVIS]:m": 𐂇 533 | "OVIS]-:m": 𐂇 534 | "OVIS[:f": 𐂆 535 | "OVIS]:f": 𐂆 536 | "OVIS:": 𐀥 537 | "ME<±RI>": 𐂙 538 | 539 | 540 | patterns_to_ignore: 541 | - "vacat\\s*\\.?" 542 | - "lat\\s*\\." 543 | - "inf\\s*\\." 544 | - "i\\s*\\." 545 | - "mut\\s*\\." 546 | - "sup\\s*\\." 547 | - "vac\\s*\\." 548 | - "v\\s*\\." 549 | - "vestigia" 550 | - "l\\s*\\." 551 | - "s\\s*\\." 
552 | - "Graffito" 553 | - "[\\/\\,\\'\\?]" 554 | - "⟦.*?⟧" 555 | - "deest" 556 | - "[⸤⸥]" 557 | - "[\\u231e\\u231f]" # Ignore characters ⌞ and ⌟ 558 | 559 | 560 | regularization: 561 | - ['\\|', ''] 562 | - [':', ''] 563 | - ['r\\.', ''] 564 | - ['\\[•~\\]', ''] 565 | - ['⌜', ''] 566 | - ['⌝', ''] 567 | - ['mutila', ''] 568 | - ['dt',''] 569 | - ['\\[?•~•~•~•\\]?', '%%%%'] 570 | - ['\\[?•~•~•\\]?', '%%%'] 571 | - ['\\[?•~•~\\]?', '%%'] 572 | - ['\\[?•~•\\]?', '%%'] # Corrected this line 573 | - ['\\<|\\>', ''] 574 | - ['\\[ \\]', '[ ]'] 575 | - ['ro2', '𐁊'] 576 | - ['vestigia', '%'] 577 | - ['\\bqs\\b', '%'] 578 | - ['vest\\s*\\.', '%'] 579 | - ['\\[•\\]', '%'] 580 | - ['supra sigillum|CMS \\w+\\d+[A-Z]* \\d+', ''] 581 | - ['reliqua pars sine regulis', ''] 582 | - ['[αβγ]', ''] 583 | - ['v\\.→', ''] 584 | - ['v\\.↓', ''] 585 | - ['v\\.', ''] 586 | - ['\\b(vacat|sup. mut.|inf. mut.|deest|X|fragmentum A|fragmentum B|graffito|angustum|prior pars sine regulis|fragmentum C|fragmentum D|fragmentum separatum|α|β|γ|δ|sigillum|)\\b', ''] # Corrected this line 587 | - ['\\b(x|m|f)\\b', ''] 588 | - ['[\\[\\]]', '%'] 589 | - ['=[^ ]*', ''] 590 | - ['•', '%'] 591 | - ['●', ''] 592 | - ['dex.', ''] 593 | - ['sin.', ''] 594 | - [' p',''] 595 | 596 | tokenization: 597 | 598 | - ['\u00a0', ' '] # Replace non-breaking space with regular space 599 | - ['\u0323', ''] # Remove specific character (e.g., dot below) 600 | - ['', ''] # Remove HTML closing emphasis tag 601 | - ['', ''] # Remove HTML opening emphasis tag 602 | - ['\|([^|]+)\|', '|\1|'] # Special handling to ensure pipes are treated as separate tokens 603 | - ['ME<±RI>', 'ME±RI'] # Handle specific compound tokens like 'ME<±RI>' 604 | - ['--', '-'] # Normalize the text by replacing double dashes with a single dash 605 | - ['\b(EQU|SUS|OVIS|BOS|CAP)\s+(x|m|f)\b', '\1\2'] # Combine animal ideograms followed by 'x', 'm', or 'f' without space 606 | - ['⌜', ' ⌜ '] # Explicit tokenization for half brackets 607 | - ['⌝', ' ⌝ '] 608 | - ['mutila', ' mutila '] # Handle 'mutila' 609 | - ['fragmentum A', 'fragmentum_A'] # Preprocess 'fragmentum A' and 'fragmentum B' to ensure they are not split 610 | - ['fragmentum B', 'fragmentum_B'] 611 | - ['\b(BOS|SUS|OVIS|CAP|EQU)\s([mf])\b', '\1\2'] # Combine terms with 'm' or 'f' 612 | - ['\](?=[^\s])', ']-'] # Pre-process ']' and '[' for special handling 613 | - ['(?<=[^\s])\[', '-['] 614 | - ['TELA\s+(?=[1234x]\b)', 'TELA'] # Handle specific cases 615 | - ['TELA\s+(\d+)', 'TELA \1'] # Handle other numbers with space 616 | - ['\* (\d+)', '*\1'] # Combine '*' with the following numeral 617 | - ['\+ ([^\s]+)', '+\1'] # Combine '+' with surrounding ideograms 618 | - ['([^\s]) \+', '\1+'] # Ensure '+' is properly attached 619 | - ['([^\s]+) VAS', '\1VAS'] # Attach 'VAS' properly 620 | - ['\b(vac|vest|l|s|lat|inf|mut|sup|i)\s?\.', '\1.'] # Ignore or modify specific patterns 621 | - ['\b(supra sigillum|reliqua pars sine regulis|vacat)\b', '\1'] # Explicit tokenization 622 | 623 | complex_symbols: 624 | 'TELA-[;1+TE': 'PLACEHOLDER_TELA1' 625 | 'TELA;1+TE': 'PLACEHOLDER_TELA2' 626 | 'TELA-[;1]-+TE': 'PLACEHOLDER_TELA3' 627 | 'OVIS]-:m': 'PLACEHOLDER_OVIS' 628 | 629 | special_chars_pattern: "(\\[|\\]|\\,|\\'|\\u27e6|\\u27e7|-|\\?|<|>|⌞|⌟|⸤|⸥|\\||\ue000)" 630 | 631 | restore_patterns: 632 | - ['fragmentum_A', 'fragmentum A'] 633 | - ['fragmentum_B', 'fragmentum B'] 634 | - ['ME±RI', 'ME<±RI>'] -------------------------------------------------------------------------------- /paper.bib: 
-------------------------------------------------------------------------------- 1 | @article{sommerschieldMachineLearningAncient2023, 2 | title = {Machine learning for ancient languages: {A} survey.}, 3 | volume = {49}, 4 | issn = {0891-2017, 1530-9312}, 5 | shorttitle = {Machine {Learning} for {Ancient} {Languages}}, 6 | url = {https://direct.mit.edu/coli/article/doi/10.1162/coli_a_00481/116160/Machine-Learning-for-Ancient-Languages-A-Survey}, 7 | doi = {10.1162/coli_a_00481}, 8 | abstract = {Abstract 9 | Ancient languages preserve the cultures and histories of the past. However, their study is fraught with difficulties, and experts must tackle a range of challenging text-based tasks, from deciphering lost languages to restoring damaged inscriptions, to determining the authorship of works of literature. Technological aids have long supported the study of ancient texts, but in recent years advances in artificial intelligence and machine learning have enabled analyses on a scale and in a detail that are reshaping the field of humanities, similarly to how microscopes and telescopes have contributed to the realm of science. This article aims to provide a comprehensive survey of published research using machine learning for the study of ancient texts written in any language, script, and medium, spanning over three and a half millennia of civilizations around the ancient world. To analyze the relevant literature, we introduce a taxonomy of tasks inspired by the steps involved in the study of ancient documents: digitization, restoration, attribution, linguistic analysis, textual criticism, translation, and decipherment. This work offers three major contributions: first, mapping the interdisciplinary field carved out by the synergy between the humanities and machine learning; second, highlighting how active collaboration between specialists from both fields is key to producing impactful and compelling scholarship; third, highlighting promising directions for future work in this field. 
Thus, this work promotes and supports the continued collaborative impetus between the humanities and machine learning.}, 10 | language = {en}, 11 | number = {3}, 12 | urldate = {2023-10-14}, 13 | journal = {Computational Linguistics}, 14 | author = {Sommerschield, Thea and Assael, Yannis and Pavlopoulos, John and Stefanak, Vanessa and Senior, Andrew and Dyer, Chris and Bodel, John and Prag, Jonathan and Androutsopoulos, Ion and Freitas, Nando De}, 15 | year = {2023}, 16 | pages = {1--45}, 17 | file = {Sommerschield-etal_2023_AncientLanguageML-Review.pdf:C\:\\Users\\esmto\\Zotero\\storage\\4K6FRP75\\Sommerschield-etal_2023_AncientLanguageML-Review.pdf:application/pdf}, 18 | } 19 | 20 | @article{Terras_melissa, 21 | author = {Terras, Melissa and Robertson, Paul}, 22 | year = {2005}, 23 | month = {03}, 24 | pages = {}, 25 | title = {Image and Interpretation: Using Artificial Intelligence to Read Ancient Roman Texts}, 26 | volume = {7}, 27 | journal = {Human IT: tidskrift för studier av IT ur ett humanvetenskapligt perspektiv} 28 | } 29 | 30 | @inproceedings{papavassileiouDatasetMycenaeanLinear2020, 31 | title = {A Dataset of {{Mycenaean Linear B}} Sequences}, 32 | booktitle = {Proceedings of the 12th {{Conference}} on {{Language Resources}} and {{Evaluation}} ({{LREC}} 2020)}, 33 | author = {Papavassileiou, Katerina and Owens, Gareth and Kosmopoulos, Dimitrios}, 34 | year = {2020}, 35 | pages = {2552--2561}, 36 | publisher = {European Language Resources Association}, 37 | abstract = {We present a dataset of Mycenaean Linear B sequences gathered from the Mycenaean inscriptions written in the 13th and 14th century B.C. (c. 1400-1200 B.C.). The dataset contains sequences of Mycenaean words and ideograms according to the rules of the Mycenaean Greek language in the Late Bronze Age. Our ultimate goal is to contribute to the study, reading and understanding of ancient scripts and languages. Focusing on sequences, we seek to exploit the structure of the entire language, not just the Mycenaean vocabulary, to analyse sequential patterns. We present an initial experiment on estimating the missing symbols in damaged inscriptions using the dataset.}, 38 | file = {C:\Users\esmto\Zotero\storage\H4APPFAV\Papavassileiou-etal_2020_LinBSequences.pdf} 39 | } 40 | 41 | @ARTICLE{Hossain2024, 42 | author={Hossain, Anushah}, 43 | journal={IEEE Annals of the History of Computing}, 44 | title={{Text Standards for the “Rest of World”: The Making of the Unicode Standard and the OpenType Format}}, 45 | year={2024}, 46 | volume={46}, 47 | number={1}, 48 | pages={20--33}, 49 | keywords={Standards;Computers;Encoding;Writing;History;Keyboards;Visualization;History of Computing;Unicode Standard;OpenType font format;Indic scripts;text stack}, 50 | doi={10.1109/MAHC.2024.3351948} 51 | } 52 | 53 | @article{digital_syriac, 54 | url = {https://doi.org/10.1515/zac-2020-0018}, 55 | title = {{The Digital Syriac Corpus: A Digital Repository for Syriac Texts}}, 56 | author = {James E. 
Walters}, 57 | pages = {109--122}, 58 | volume = {24}, 59 | number = {1}, 60 | journal = {Zeitschrift für Antikes Christentum / Journal of Ancient Christianity}, 61 | doi = {10.1515/zac-2020-0018}, 62 | year = {2020}, 63 | lastchecked = {2024-09-26} 64 | } 65 | 66 | @dataset{openiti, 67 | author = {Nigst, Lorenz and 68 | Romanov, Maxim and 69 | Savant, Sarah Bowen and 70 | Seydi, Masoumeh and 71 | Verkinderen, Peter}, 72 | title = {{OpenITI: a Machine-Readable Corpus of Islamicate 73 | Texts}}, 74 | month = oct, 75 | year = 2023, 76 | publisher = {Zenodo}, 77 | version = {2023.1.8}, 78 | doi = {10.5281/zenodo.10021513}, 79 | url = {https://doi.org/10.5281/zenodo.10021513} 80 | } 81 | 82 | @misc{sefaria_project, 83 | author = {{Sefaria}}, 84 | title = {Sefaria: A Living Library of Jewish Texts Online}, 85 | howpublished = {\url{https://www.sefaria.org}}, 86 | year = {2024}, 87 | } 88 | 89 | 90 | @software{canonicalgreek, 91 | author = {Lisa Cerrato and 92 | Bridget Almas and 93 | TDBuck and 94 | ahanhardt and 95 | srdee and 96 | Alison Babeu and 97 | Thibault Clérice and 98 | Scott Fleischman and 99 | gregorycrane and 100 | Matthew Munson and 101 | Aurélien Berra and 102 | Adiel Mittmann and 103 | Chiara Palladino and 104 | KATEBHN and 105 | Eric Sowell and 106 | Joel Kalvesmaki and 107 | Stephen Scott and 108 | Jeroen Hellingman and 109 | Andrei and 110 | Chris Drymon}, 111 | title = {Canonical Greek Literature}, 112 | month = jul, 113 | year = 2021, 114 | publisher = {Zenodo}, 115 | version = {0.0.2867}, 116 | doi = {10.5281/zenodo.5090923}, 117 | url = {https://doi.org/10.5281/zenodo.5090923} 118 | } 119 | 120 | 121 | @inproceedings{luoNeuralDeciphermentMinimumcost2019, 122 | address = {Florence, Italy}, 123 | title = {Neural decipherment via minimum-cost flow: {From} {Ugaritic} to {Linear} {B}}, 124 | shorttitle = {Neural {Decipherment} via {Minimum}-{Cost} {Flow}}, 125 | url = {https://aclanthology.org/P19-1303}, 126 | doi = {10.18653/v1/P19-1303}, 127 | urldate = {2023-10-26}, 128 | booktitle = {Proceedings of the 57th {Annual} {Meeting} of the {Association} for {Computational} {Linguistics}}, 129 | publisher = {Association for Computational Linguistics}, 130 | author = {Luo, Jiaming and Cao, Yuan and Barzilay, Regina}, 131 | year = {2019}, 132 | pages = {3146--3155}, 133 | file = {Luo-etal_2021_UndecipheredUndersegmentedScripts.pdf:C\:\\Users\\esmto\\Zotero\\storage\\XCM7RBPS\\Luo-etal_2021_UndecipheredUndersegmentedScripts.pdf:application/pdf}, 134 | } 135 | 136 | 137 | @article{papavassileiouGenerativeModelMycenaean2023, 138 | title = {A generative model for the {Mycenaean} {Linear} {B} script and its application in infilling text from ancient tablets}, 139 | volume = {16}, 140 | issn = {1556-4673, 1556-4711}, 141 | url = {https://dl.acm.org/doi/10.1145/3593431}, 142 | doi = {10.1145/3593431}, 143 | language = {en}, 144 | number = {3}, 145 | urldate = {2023-10-14}, 146 | journal = {Journal on Computing and Cultural Heritage}, 147 | author = {Papavassileiou, Katerina and Kosmopoulos, Dimitrios I. 
and Owens, Gareth}, 148 | year = {2023}, 149 | pages = {1--25}, 150 | file = {Papavassileiou-etal_2023_LinBGenerativeModel.pdf:C\:\\Users\\esmto\\Zotero\\storage\\2GFRQVBA\\Papavassileiou-etal_2023_LinBGenerativeModel.pdf:application/pdf}, 151 | } 152 | 153 | 154 | @article{fetayaRestorationFragmentaryBabylonian2020, 155 | title = {Restoration of fragmentary {Babylonian} texts using recurrent neural networks}, 156 | volume = {117}, 157 | issn = {0027-8424, 1091-6490}, 158 | url = {https://pnas.org/doi/full/10.1073/pnas.2003794117}, 159 | doi = {10.1073/pnas.2003794117}, 160 | language = {en}, 161 | number = {37}, 162 | urldate = {2023-10-14}, 163 | journal = {Proceedings of the National Academy of Sciences}, 164 | author = {Fetaya, Ethan and Lifshitz, Yonatan and Aaron, Elad and Gordin, Shai}, 165 | month = sep, 166 | year = {2020}, 167 | pages = {22743--22751}, 168 | file = {Fetaya_2020_BabylonianRNN.pdf:C\:\\Users\\esmto\\Zotero\\storage\\D74YCB4F\\Fetaya_2020_BabylonianRNN.pdf:application/pdf}, 169 | } 170 | 171 | 172 | @inproceedings{peronocacciafocoNewApproachDecipherment2021, 173 | address = {Brest}, 174 | series = {Grapholinguistics and its applications}, 175 | title = {A {New} {Approach} to the {Decipherment} of {Linear} {A}, {Stage} 2 - {Cryptanalysis} and {Language} {Deciphering}: {A} "{Brute} {Force} {Attack}" on an {Undeciphered} {Writing} {System}}, 176 | shorttitle = {A {New} {Approach} to the {Decipherment} of {Linear} {A}, {Stage} 2 - {Cryptanalysis} and {Language} {Deciphering}}, 177 | doi = {10.36824/2020-graf-cacc}, 178 | booktitle = {Grapholinguistics in the 21st {Century} 2020. {Proceedings}}, 179 | publisher = {Fluxus Editions}, 180 | author = {Perono Cacciafoco, Francesco and Loh, Colin Jia Sheng}, 181 | year = {2021}, 182 | pages = {927--943}, 183 | file = {PeronoCacciafoco-Loh_2021_LADeciphermentBruteForce.pdf:C\:\\Users\\esmto\\Zotero\\storage\\5FYYFHK2\\PeronoCacciafoco-Loh_2021_LADeciphermentBruteForce.pdf:application/pdf}, 184 | } 185 | 186 | 187 | @article{auroraDAMOSDatabaseMycenaean2015, 188 | title = {D{ĀMOS} ({Database} of {Mycenaean} at {Oslo}). {Annotating} a fragmentarily attested language}, 189 | volume = {198}, 190 | issn = {18770428}, 191 | url = {https://linkinghub.elsevier.com/retrieve/pii/S187704281504416X}, 192 | doi = {10.1016/j.sbspro.2015.07.415}, 193 | language = {en}, 194 | urldate = {2024-08-25}, 195 | journal = {Procedia - Social and Behavioral Sciences}, 196 | author = {Aurora, Federico}, 197 | month = jul, 198 | year = {2015}, 199 | pages = {21--31}, 200 | } 201 | 202 | 203 | @article{weinbergTransliterationDocumentation1974, 204 | title = {Transliteration in documentation}, 205 | volume = {30}, 206 | issn = {0022-0418}, 207 | url = {https://doi.org/10.1108/eb026567}, 208 | doi = {10.1108/eb026567}, 209 | abstract = {The validity of transliteration in documentation is questioned in light of the resulting loss of precise information. The process is examined from the linguist's, cataloguer's, and user's points of view. The pros and cons of phonetic transcription vs. scientific transliteration are discussed. Specific problems of several non‐Roman alphabets are touched upon. The author advocates development of non‐Latin print chains for computers used for documentation work. Where the cost of this is prohibitive, scientific transliteration is imperative for the purposes of international documentation. 
For library purposes, maintenance of separate catalogues for each script is recommended.}, 210 | number = {1}, 211 | urldate = {2024-09-23}, 212 | journal = {Journal of Documentation}, 213 | author = {Weinberg, Bella}, 214 | year = {1974}, 215 | note = {Publisher: MCB UP Ltd}, 216 | keywords = {Potnia}, 217 | pages = {18--31}, 218 | file = {Weinberg_1974_Transliteration.pdf:C\:\\Users\\esmto\\OneDrive\\01 Education\\02 University of Melbourne\\03 Additional projects\\Weinberg_1974_Transliteration.pdf:application/pdf}, 219 | } 220 | 221 | @article{odishoTransliteratingEnglishArabic1992, 222 | title = {Transliterating {English} in {Arabic}}, 223 | issn = {0170026X}, 224 | url = {http://www.jstor.org/stable/43525603}, 225 | number = {24}, 226 | urldate = {2024-09-23}, 227 | journal = {Zeitschrift für Arabische Linguistik}, 228 | author = {Odisho, Edward Y.}, 229 | year = {1992}, 230 | note = {Publisher: Harrassowitz Verlag}, 231 | keywords = {Potnia}, 232 | pages = {21--34}, 233 | file = {Odisho_1992_Transliteration-ArabicEnglish.pdf:C\:\\Users\\esmto\\OneDrive\\01 Education\\02 University of Melbourne\\03 Additional projects\\Odisho_1992_Transliteration-ArabicEnglish.pdf:application/pdf}, 234 | } 235 | 236 | @article{martinetProjectTransliterationClassical1953, 237 | title = {A {Project} of {Transliteration} of {Classical} {Greek}}, 238 | volume = {9}, 239 | issn = {0043-7956, 2373-5112}, 240 | url = {http://www.tandfonline.com/doi/full/10.1080/00437956.1953.11659466}, 241 | doi = {10.1080/00437956.1953.11659466}, 242 | language = {en}, 243 | number = {2}, 244 | urldate = {2024-09-23}, 245 | journal = {WORD}, 246 | author = {Martinet, André}, 247 | year = {1953}, 248 | keywords = {Potnia}, 249 | pages = {152--161}, 250 | file = {Martinet_1953_Transliteration-ClassicalGreek.pdf:C\:\\Users\\esmto\\OneDrive\\01 Education\\02 University of Melbourne\\03 Additional projects\\Martinet_1953_Transliteration-ClassicalGreek.pdf:application/pdf}, 251 | } 252 | 253 | @book{chadwickDocumentsMycenaeanGreek1973, 254 | address = {Cambridge}, 255 | edition = {2}, 256 | title = {Documents in {Mycenaean} {Greek}}, 257 | publisher = {Cambridge University Press}, 258 | author = {Chadwick, J.}, 259 | year = {1973}, 260 | } 261 | 262 | @misc{tinney2019cuneify, 263 | author = {Steve Tinney}, 264 | title = {Cuneify}, 265 | year = {2019}, 266 | howpublished = {{Oracc: The Open Richly Annotated Cuneiform Corpus}}, 267 | url = {http://oracc.museum.upenn.edu/doc/tools/cuneify/} 268 | } 269 | 270 | @misc{ilintomich2021unicodeconverter, 271 | author = {Alexander Ilin-Tomich}, 272 | title = {UnicodeConverter}, 273 | year = {2019}, 274 | howpublished = {\url{https://github.com/ailintom/UnicodeConverter/}}, 275 | } 276 | 277 | @misc{senior2023anatoliangenerator, 278 | author = {Andrew Senior}, 279 | title = {Anatolian Hieroglyphics (Luwian) generation}, 280 | year = {2023}, 281 | howpublished = {\url{https://andrewsenior.com/luwian/}}, 282 | } 283 | 284 | @misc{prosser2019ras, 285 | author = {Prosser, Miller C. 
and Pardee, Dennis G.}, 286 | title = {{The Ras Shamra Tablet Inventory}}, 287 | year = {2019}, 288 | howpublished = {\url{https://onlinepublications.uchicago.edu/RSTI/}}, 289 | note = {Online Publication Service of the University of Chicago} 290 | } 291 | 292 | @misc{cdli2024, 293 | author = {{CDLI contributors}}, 294 | title = {{Cuneiform Digital Library Initiative}}, 295 | year = {2024}, 296 | howpublished = {\url{https://cdli.mpiwg-berlin.mpg.de/}}, 297 | note = {Cuneiform Digital Library Initiative, September 29, 2024} 298 | } 299 | 300 | @article{Zerrouki2023, 301 | title = {{PyArabic: A Python package for Arabic text}}, 302 | author = {Taha Zerrouki}, 303 | year = 2023, 304 | journal = {Journal of Open Source Software}, 305 | publisher = {The Open Journal}, 306 | volume = 8, 307 | number = 84, 308 | pages = 4886, 309 | doi = {10.21105/joss.04886}, 310 | url = {https://doi.org/10.21105/joss.04886} 311 | } 312 | 313 | @misc{DIN31635, 314 | author = {{Deutsches Institut für Normung}}, 315 | title = {{DIN 31635: Transliteration of the Arabic alphabet}}, 316 | year = {2011}, 317 | howpublished = {Standard published by Deutsches Institut für Normung}, 318 | address = {Berlin}, 319 | } 320 | -------------------------------------------------------------------------------- /tests/expected/linear_b_unicode.yaml: -------------------------------------------------------------------------------- 1 | # Test LB.A.1: 2 | # Scenario: Test that blank spaces act as word separators within text in annotated scenarios (except in specific scenarios, e.g. between a domesticated animal ideogram and a sex indicator). 3 | # Requirements mapping: 4 | # LB.1-x: Tokenise each blank space (including Unicode '\u00a0') and use to distinguish individual words. Represent as is in both annotated and regularized output. 5 | # This should occur in all but the specified exception scenarios (see LB.1-a–d). 6 | # LB.1-e: If a blank space appears after a domesticated animal ideogram (i.e. EQU, SUS, OVIS, BOS or CAP) and before either a lowercase 'm', 'f' or 'x', then remove the space in both the annotated and regularized outputs. 7 | 8 | "CAP f 130 SUS 17 SUS f 41 BOS m 2 BOS f 4": "𐂈 130 𐁂 17 𐂊 41 𐂍 2 𐂌 4" 9 | "]SUS x 4 KO 80[" : "]𐁂 4 𐀒 80[" 10 | 11 | # Test LB.A.2: 12 | # Scenario: Test that hyphens act as sign separators within a word in annotated scenarios. 13 | # Requirements mapping: 14 | # LB.2: Tokenise each instance of '-' and use to recognise whole words in encoding, but do not represent this symbol in either the annotated or the regularized output. 15 | 16 | "a-ri-to-jo" : "𐀀𐀪𐀵𐀍" 17 | 18 | # Test LB.A.3 19 | # Scenario: Test that blank spaces after * and before and after '+' are removed in annotated scenarios. 20 | # Requirements mapping: 21 | # LB.1-a: If a blank space appears after '*', then remove that space in both the annotated and regularized outputs. 22 | # LB.1-b: If a blank space appears before/after '+', then remove both those spaces in both the annotated and regularized outputs. 23 | 24 | "]qa-ra / re-me-to * 168 + SE 28" : "]𐀣𐀨 / 𐀩𐀕𐀵 𐂰+𐀮 28" 25 | 26 | # Test LB.A.4 27 | # Scenario: Test that '--' is treated the same as '-' in annotated scenarios. 28 | # Requirements mapping: 29 | # LB.3: Tokenise each instance of '--' as '-' and use to recognise whole words in encoding, but do not represent this symbol in either the annotated or the regularized output.
30 | 31 | "a-ka--[ ]--jo-jo , me-no-[ da-pu2-ri-[-to-jo ]-po-ti-ni-ja ri *166+WE 22-[" : "𐀀𐀏[ ]𐀍𐀍 , 𐀕𐀜[ 𐀅𐁆𐀪[𐀵𐀍 ]𐀡𐀴𐀛𐀊 𐀪 𐂮+𐀸 22[" # https://liber.cnr.it/tablet/view/124 32 | 33 | # Test LB.A.5 34 | # Scenario: Test that ']', '[', ',' and '/' are correctly printed in annotated scenarios. 35 | # Requirements mapping: 36 | # LB.4: Tokenise each instance of '/'. Represent this symbol as is in the annotated output, but do not include in the regularized output. 37 | # LB.5: Tokenise each instance of '//'. Represent this symbol as is in the annotated output, but do not include in the regularized output. 38 | # LB.6: Tokenise each instance of ','. Represent this symbol as is in the annotated output, but do not include in the regularized output. 39 | # LB.9: Tokenise each instance of '['. Represent this symbol as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 40 | # LB.10: Tokenise each instance of ']'. Represent this symbol as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 41 | 42 | "wo-de-wi-jo-jo , / me-no[ // ]ri-jo-de , ko-no , MA 3 ko-ri[ ]2 pa-de-i , ko-no MA 2 KO T 1[ [ ] pa-si-te-o-i , pa-sa-ja , ko-no , [ ] a-mi-ni-so-de , MA 2 KO T 4" : "𐀺𐀆𐀹𐀍𐀍 , / 𐀕𐀜[ // ]𐀪𐀍𐀆 , 𐀒𐀜 , 𐀔 3 𐀒𐀪[ ]2 𐀞𐀆𐀂 , 𐀒𐀜 𐀔 2 𐀒 𐄼 1[ [ ] 𐀞𐀯𐀳𐀃𐀂 , 𐀞𐀭𐀊 , 𐀒𐀜 , [ ] 𐀀𐀖𐀛𐀰𐀆 , 𐀔 2 𐀒 𐄼 4" 43 | 44 | # Test LB.A.6 45 | # Scenario: Test that ':' is correctly printed in annotated scenarios. 46 | # Requirements mapping: 47 | # LB.7: Tokenise each instance of ':'. Represent this symbol as is in the annotated output, but do not include in the regularized output. 48 | 49 | "a-ta-ti-nu : si-wa-[" : "𐀀𐀲𐀴𐀝 : 𐀯𐀷[" 50 | 51 | # Test LB.A.7 52 | # Scenario: Test that single quotation marks are correctly printed in annotated scenarios. 53 | # Requirements mapping: 54 | # LB.8: Tokenise each instance of '''. Represent this symbol as is in the annotated output, but do not include in regularized output. 55 | 56 | "]wa VIR 1 MUL 2 'ko-wa 1' ko-wo 1" : "]𐀷 𐂀 1 𐂁 2 '𐀒𐀷 1' 𐀒𐀺 1" 57 | 58 | # Test LB.A.8 59 | # Scenario: Test that '\u0323' is not printed in annotated scenarios 60 | # Requirements mapping: 61 | # LB.11: Ignore each instance of 'X̣' (or '\u0323') in tokenisation. Do not represent this symbol in either the annotated or the regularized output. 62 | 63 | "] ko-wo / m\u0323e\u0323[-zo] 1 ko-wo / me-wi-jo 2 [" : "] 𐀒𐀺 / 𐀕[𐀿] 1 𐀒𐀺 / 𐀕𐀹𐀍 2 [" 64 | 65 | # Test LB.A.9 66 | # Scenario: Test that '?' is correctly printed in annoted scenarios. 67 | # Requirements mapping: 68 | # LB.12: Tokenise each instance of '?'. Represent this symbol as is in the annotated output, but do not include in the regularized output. 69 | 70 | "i[-qi-ja?": "𐀂[𐀥𐀊?" 71 | 72 | # Test LB.A.10 73 | # Scenario: Test that \u27e6 and \u27e7 are correctly printed as Scott brackets (i.e. '⟦' and '⟧') in annotated scenarios. 74 | # Requirements mapping: 75 | # LB.13: Tokenise each instance of Scott brackets (i.e. '⟦' and '⟧', or '\u27e6' and '\u27e7'). Represent these symbols as is in the annotated output. Do not include these symbols, or any other text that they contain, in the reguarised ouput. 76 | 77 | "po-*34-wi-do \u27e6TUN\u27e7 BIG[" : "𐀡𐁓𐀹𐀈 ⟦𐂪⟧ 𐃌[" 78 | 79 | # Test LB.A.11 80 | # Scenario: Test that '<' and '>' are correctly printed in annotated scenarios. 81 | # Requirements mapping: 82 | # LB.14: Tokenise each instance of angle brackets (i.e. '<' and '>'). Represent this text as is in the annotated output. Do not include these symbols, or any other text that they contain, in the reguarised ouput. 
83 | 84 | "] 69 OVIS:f 30 [ ]-e-ke-me-de , / tu-ni-ja , pa OVIS:m 1" : "] <𐂇> 69 𐂆 30 [ ]𐀁𐀐𐀕𐀆 , / 𐀶𐀛𐀊 , 𐀞 𐂇 1" # https://liber.cnr.it/tablet/view/3172?wl=12765 85 | 86 | # Test LB.A.12 87 | # Scenario: Test that lower half brackets (i.e. '⸤' and '⸥', or '\u2e24' and '\u2e25') are correctly printed in annotated scenarios. 88 | # Requirements mapping: 89 | # LB.15: Tokenise each instance of lower half brackets (i.e. '⸤' and '⸥', or '\u2e24' and '\u2e25'). Represent these symbols as is in the annotated output. Do not include these symbols in the reguarised ouput. 90 | 91 | "du-to\u2e24 \u2e25 / r\u0323u\u0323-ki-to" : "𐀉𐀵⸤ ⸥ / 𐀬𐀑𐀵" 92 | "e-ke-qe ]-o-na-to , ke-ke-me-na⌞ ⌟ko-to-na GRA qs ] vac.": "𐀁𐀐𐀤 ]𐀃𐀙𐀵 , 𐀐𐀐𐀕𐀙⌞ ⌟𐀒𐀵𐀙 𐂎 qs ] vac." 93 | 94 | # Test LB.A.13 95 | # Scenario: Test that upper half brackets (i.e. '⌜' and '⌝') are correctly printed in annotated scenarios. 96 | # Requirements mapping 97 | # LB.16: Tokenise each instance of upper half brackets (i.e. ' ⌜' and '⌝'). Represent these symbols as is in the annotated output. Do not include these symbols in the reguarised ouput. 98 | 99 | "]2 OLIV T 2 ] OLIV T 1 to]-ko-do-mo HORD[ ]Z 3 VIR 20[ pi-ri-e-te-re HORD[ ]Z 3 VIR 5 pa-te-ko-to⌜ ⌝HORD[ ]V 2 [ vacat qa-ra2-te , o[-pi-me-]ne[ ]OLIV 6 pa-ka , o-pi-me-ne , [ OLIV qs pa-te-ko-to , o-pi-me-ne[ ]HORD 1 [ pi-ri-e-te-si , o-pi-me-ne[ ]HORD 1 T 4[ to-ko-do-mo , o-pi-me-ne[ ]HORD 7[ ]5 vac." : "]2 𐂐 𐄼 2 ] 𐂐 𐄼 1 𐀵]𐀒𐀈𐀗 𐂏[ ]𐄿 3 𐂀 20[ 𐀠𐀪𐀁𐀳𐀩 𐂏[ ]𐄿 3 𐂀 5 𐀞𐀳𐀒𐀵 ⌜ ⌝ 𐂏[ ]𐄾 2 [ vacat 𐀣𐁈𐀳 , 𐀃[𐀠𐀕]𐀚[ ]𐂐 6 𐀞𐀏 , 𐀃𐀠𐀕𐀚 , [ 𐂐 qs 𐀞𐀳𐀒𐀵 , 𐀃𐀠𐀕𐀚[ ]𐂏 1 [ 𐀠𐀪𐀁𐀳𐀯 , 𐀃𐀠𐀕𐀚[ ]𐂏 1 𐄼 4[ 𐀵𐀒𐀈𐀗 , 𐀃𐀠𐀕𐀚[ ]𐂏 7[ ]5 vac." 100 | 101 | # Test LB.A.14 102 | # Scenario: Test that \u2082 is correctly handled as a subscript '2' in annotated scenarios. 103 | # Requirements mapping: 104 | # LB.17: Tokenise '\u2082' together with immediately preceding transliterated sign (as long as no hyphen '-' is between them). Confirm that it is treated correctly as a subscript '2', and expected Unicode sign is printed, as per mapping. 105 | 106 | "da-pu\u2082-ri-to-jo , / po-ti-ni-ja 'me-ri' * 209 VAS 1" : "𐀅𐁆𐀪𐀵𐀍 , / 𐀡𐀴𐀛𐀊 '𐀕𐀪' 𐃨 1" 107 | 108 | # Test LB.A.15 109 | # Scenario: Test that '\u2083' is correctly handled as a subscript '2' in annotated scenarios 110 | # Requirements mapping: 111 | # LB.18: Tokenise '\u2083' together with immediately preceding transliterated sign (as long as no hyphen '-' is between them). Confirm that it is treated correctly as a subscript '3', and expected Unicode sign is printed, as per mapping. 112 | 113 | "pu-ri / a\u2083-zo-ro-qe , po-da-ko-qe BOS m ZE 1[" : "𐀢𐀪 / 𐁁𐀿𐀫𐀤 , 𐀡𐀅𐀒𐀤 𐂍 𐀽 1[" 114 | 115 | # Test LB.A.16 116 | # Scenario: Test that 'mutila' is correctly printed in annotated scenarios. 117 | # Requirements mapping: 118 | # LB 19: Tokenise each instance of 'mutila'. Represent this text as is in the annotated output, but do not include in the regularized output. 119 | 120 | "] GRA[ qs mutila" : "] 𐂎[ qs mutila " 121 | 122 | # Test LB.A.17 123 | # Scenario: Test that 'mut' is correctly printed in annotated scenarios. 124 | # Requirements mapping: 125 | # LB.20: Tokenise each instance of 'mut'. Represent this text as is in the annotated output, but do not include in the regularized output. 126 | 127 | "sup. mut. ]vacat [ ]A 5 A [ ]vest.[ inf. mut" : "sup. mut. ]vacat [ ]𐀀 5 𐀀 [ ]vest.[ inf. mut" 128 | 129 | # Test LB.A.18 130 | # Scenario: Test that 'sup. mut.', 'inf. mut.' and 'vac.' are correctly printed in annotated scenarios. 131 | # Requirements mapping: 132 | # LB.21: Tokenise each instance of 'sup.' and 'mut.'. 
Represent this text as is in the annotated output, but do not include in the regularized output. 133 | # LB.22: Tokenise each instance of 'inf.' and 'mut.'. Represent this text as is in the annotated output, but do not include in the regularized output. 134 | # LB.23: Tokenise each instance of 'vac.'. Represent this text as is in the annotated output, but do not include in the regularized output. 135 | 136 | "sup. mut. ] wo[ ] vac. [ inf. mut." : "sup. mut. ] 𐀺[ ] vac. [ inf. mut." 137 | 138 | # Test LB.A.19 139 | # Scenario: Test that 'vacat' is correctly printed in annotated scenarios. 140 | # Requirements mapping: 141 | # LB.24: Tokenise each instance of 'vacat'. Represent this text as is in the annotated output, but do not include in the regularized output. 142 | 143 | "] vacat [" : "] vacat [" 144 | "] vacat v. ] 1" : "] vacat v. ] 1" 145 | 146 | # Test LB.A.20 147 | # Scenario: Test that 'vest.' is correctly printed in annotated scenarios, and that Unicode '\u00a0' is treated as a blank space. 148 | # Requirements mapping: 149 | # LB.1-x: Tokenise each blank space (including Unicode '\u00a0') and use to distinguish individual words. Represent as is in both annotated and regularized output. 150 | # This should occur in all but the specified exception scenarios (see LB.1-a–d). 151 | # LB.25: Tokenise each instance of 'vest.'. Represent this text as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 152 | 153 | "] vest ., / su-ri-mo , u-ta-jo-jo , o OVIS m 85[\u00a0] vac ." : "] vest., / 𐀱𐀪𐀗 , 𐀄𐀲𐀍𐀍 , 𐀃 𐂇 85[ ] vac." 154 | 155 | # Test LB.A.21 156 | # Scenario: Test that 'vestigia' is correctly printed in annotated scenarios. 157 | # Requirements mapping: 158 | # LB.26: Tokenise each instance of 'vestigia'. Represent this text as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 159 | 160 | "pa-ro , we-u-da-ne-we re-u-ko , a-ko-ro-we-e BOS+SI 2 re[-u-]ko , ma-ra-pi , pe-ko , a-ko-ro-we BOS+SI 1 OVIS:m? ]3 CAP:m 3 WE 3 CAP:m 3 ]vestigia[ ]2 [ ]BOS:x 3 ⟦ ⟧ ] vest. [ ] vest. [ re-u-ko[ ]ma-ra[-pi ]pe-ko , a-ko-ro-we[ OVIS:m 1 CAP:m 1 WE[ ] SUS:x[ ] vacat [ inf. mut." : "𐀞𐀫 , 𐀸𐀄𐀅𐀚𐀸 𐀩𐀄𐀒 , 𐀀𐀒𐀫𐀸𐀁 𐀘+𐀯 2 𐀩[𐀄]𐀒 , 𐀔𐀨𐀠 , 𐀟𐀒 , 𐀀𐀒𐀫𐀸 𐀘+𐀯 1 𐂇? ]3 𐂉 3 𐀸 3 𐂉 3 ]vestigia[ ]2 [ ]𐀘 3 ⟦ ⟧ ] vest. [ ] vest. [ 𐀩𐀄𐀒[ ]𐀔𐀨[𐀠 ]𐀟𐀒 , 𐀀𐀒𐀫𐀸[ 𐂇 1 𐂉 1 𐀸[ ] 𐁂[ ] vacat [ inf. mut." 161 | 162 | # Test LB.A.22 163 | # Scenario: Test that 'vestigia?' is correctly printed in annotated scenarios. 164 | # Requirements mapping: 165 | # LB.27: Tokenise each instance of 'vestigia?'. Represent this text as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 166 | 167 | "su-ma-no / ti-ri-to [ vestigia? ] vacat" : "𐀱𐀔𐀜 / 𐀴𐀪𐀵 [ vestigia? ] vacat" 168 | 169 | # Test LB.A.23 170 | # Scenario: Test that 'qs' (i.e. 'quantum sufficit') is correctly printed in annotated scenarios. 171 | # Requirements mapping: 172 | # LB.28: Tokenise each instance of 'qs'. Represent this text as is in the annotated output, but represent as wildcard (i.e. '%') in regularized output. 173 | 174 | "]-ke-ke-me-na-[ , ko-]-to-na GRA qs ] vac." : "]𐀐𐀐𐀕𐀙[ , 𐀒]𐀵𐀙 𐂎 qs ] vac." 175 | 176 | # Test LB.A.24 177 | # Scenario: Test that 'fragmentum separatum', 'α', 'β', 'γ' and 'δ' are correctly printed in annotated scenarios. 178 | # Requirements mapping: 179 | # LB.29: Tokenise each instance of 'fragmentum separatum'. Represent this text as is in the annotated output, but do not include in the regularized output. 180 | # LB.53: Tokenise each instance of 'α'.
Represent this text as is in the annotated output, but do not include in the regularized output. 181 | # LB.54: Tokenise each instance of 'β'. Represent this text as is in the annotated output, but do not include in the regularized output. 182 | # LB.55: Tokenise each instance of 'γ'. Represent this text as is in the annotated output, but do not include in the regularized output. 183 | # LB.56: Tokenise each instance of 'δ'. Represent this text as is in the annotated output, but do not include in the regularized output. 184 | 185 | "da-we-u[-pi ]a-ko[ da-we-u-pi , a[ da-we-u-pi , ka[ da-we-u-pi , e-[ a3-zo-wo[ da-we[-u-]pi ⌞ ⌟wo[ da-we-u-pi , e-ke[ da-we-u[-pi a-re[ a-zo[ inf. mut. fragmentum separatum α sup. mut. ] OVIS:f X 15 [ fragmentum separatum β ] , ka[ fragmentum separatum γ sup. mut. ]no-wo[ fragmentum separatum δ sup. mut. ]ma-jo-wo-[ inf. mut." : "𐀅𐀸𐀄[𐀠 ]𐀀𐀒[ 𐀅𐀸𐀄𐀠 , 𐀀[ 𐀅𐀸𐀄𐀠 , 𐀏[ 𐀅𐀸𐀄𐀠 , 𐀁[ 𐁁𐀿𐀺[ 𐀅𐀸[𐀄]𐀠 ⌞ ⌟𐀺[ 𐀅𐀸𐀄𐀠 , 𐀁𐀐[ 𐀅𐀸𐀄[𐀠 𐀀𐀩[ 𐀀𐀿[ inf. mut. fragmentum separatum α sup. mut. ] 𐂆 X 15 [ fragmentum separatum β ] , 𐀏[ fragmentum separatum γ sup. mut. ]𐀜𐀺[ fragmentum separatum δ sup. mut. ]𐀔𐀍𐀺[ inf. mut." 186 | 187 | # Test LB.A.25 188 | # Scenario: Test that 'fragmentum A' and 'fragmentum B' are correctly printed in annotated scenarios. 189 | # Requirements mapping: 190 | # LB.30: Tokenise each instance of 'fragmentum A'. Represent this text as is in the annotated output, but do not include in the regularized output. 191 | # LB.31: Tokenise each instance of 'fragmentum B'. Represent this text as is in the annotated output, but do not include in the regularized output. 192 | 193 | "fragmentum A fragmentum B vacat [ sup. mut. e-me-si-jo-jo-[ ] 3-[ pa-na-so GRA 100-[ ]-vac.-[ ta-ra-qo GRA [ inf. mut. ta-u-pa-du-we GRA-[ a-ro-ja-[ pu-na-so-[ inf. mut." : "fragmentum A fragmentum B vacat [ sup. mut. 𐀁𐀕𐀯𐀍𐀍[ ] 3[ 𐀞𐀙𐀰 𐂎 100[ ]vac.[ 𐀲𐀨𐀦 𐂎 [ inf. mut. 𐀲𐀄𐀞𐀉𐀸 𐂎[ 𐀀𐀫𐀊[ 𐀢𐀙𐀰[ inf. mut." 194 | 195 | # Test LB.A.26 196 | # Scenario: Test that 'fragmentum C' and 'fragmentum D' are correctly printed in annotated scenarios. 197 | # Requirements mapping: 198 | # LB.32: Tokenise each instance of 'fragmentum C'. Represent this text as is in the annotated output, but do not include in the regularized output. 199 | # LB.33: Tokenise each instance of 'fragmentum D'. Represent this text as is in the annotated output, but do not include in the regularized output. 200 | 201 | "fragmentum A fragmentum B sup. mut. sup. mut. ]-na 1 i-[ ]so-i-[ ko-wa 1[ ]ku-mi-[•]-du 1[ inf. mut. vac. [ vac. [ fragmentum C fragmentum D sup. mut. sup. mut. ]di-mi[ ]vac. ]*56-za[ ]vac. inf. mut. inf. mut." : "fragmentum A fragmentum B sup. mut. sup. mut. ]𐀙 1 𐀂[ ]𐀰𐀂[ 𐀒𐀷 1[ ]𐀓𐀖[•]𐀉 1[ inf. mut. vac. [ vac. [ fragmentum C fragmentum D sup. mut. sup. mut. ]𐀇𐀖[ ]vac. ]𐁖𐀼[ ]vac. inf. mut. inf. mut." 202 | 203 | # Test LB.A.27 204 | # Scenario: Test that 'deest' (or its abbreviation 'dt') is correctly printed in annotated scenarios. 205 | # Requirements mapping: 206 | # LB.34: Tokenise each instance of 'deest' or 'dt'. Represent this text as is in the annotated output, but do not include in the regularized output. 207 | 208 | "sup. mut. ]-deest-[ inf. mut." : "sup. mut. ]deest[ inf. mut." 209 | "]GRA 37 T 6[ ] vac. [ ]⌞deest⌟ vac. ⌞dt⌟ [" : "]𐂎 37 𐄼 6[ ] vac. [ ]⌞deest⌟ vac. ⌞dt⌟ [" 210 | 211 | # Test LB.A.28 212 | # Scenario: Test that 'prior pars sine regulis' and '•' are correctly printed in annotated scenarios. 213 | # Requirements mapping: 214 | # LB.35: Tokenise each instance of 'prior pars sine regulis'.
Represent this text as is in the annotated output, but do not include in the regularized output. 215 | # LB.57: Tokenise each instance of '•'. Represent this text as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 216 | 217 | "]-ke-ra2-u-na , e-ra[ ]• po-se-da-o-ne⌞ ⌟re-ko-no 6 [ *146 18[ ] LANA 2 M 2[ A±RE±PA V 4[ ]• 1 OVIS:m 1 OVIS:f 1 CAP:f[ qs SUS+KA 2 SUS:f 4[ ]• 1 FAR T 1 V [ qs VIN 5 TELA [ ] 1 TELA+PA 1 vac. vac. vac. [ ]3[ ]-we-e-a2[ inf. mut. v. prior pars sine regulis ]e-ke-me-de , do[ ]du-ru-wo-qo deest vac. vac. vac. vac. vac. vac." : "]𐀐𐁈𐀄𐀙 , 𐀁𐀨[ ]• 𐀡𐀮𐀅𐀃𐀚⌞ ⌟𐀩𐀒𐀜 6 [ 𐂞 18[ ] 𐂝 2 𐄸 2[ 𐂘 𐄾 4[ ]• 1 𐂇 1 𐂆 1 𐂈[ qs 𐁂+𐀏 2 𐂊 4[ ]• 1 𐀎 𐄼 1 𐄾 [ qs 𐂖 5 𐂧 [ ] 1 𐂧+𐀞 1 vac. vac. vac. [ ]3[ ]𐀸𐀁𐁀[ inf. mut. v. prior pars sine regulis ]𐀁𐀐𐀕𐀆 , 𐀈[ ]𐀉𐀬𐀺𐀦 deest vac. vac. vac. vac. vac. vac." 218 | 219 | # Test LB.A.29 220 | # Scenario: Test that 'reliqua pars sine regulis' is correctly printed in annotated scenarios. 221 | # Requirements mapping: 222 | # LB.36: Tokenise each instance of 'reliqua pars sine regulis'. Represent this text as is in the annotated output, but do not include in the regularized output. 223 | 224 | "sup. mut. ]-vest.-[ ]-na-ro GRA 5 ]--do-we-i , ma-so-qe GRA 8 ] vac. ] GRA 402 OLIV+A 52 reliqua pars sine regulis" : "sup. mut. ]vest.[ ]𐀙𐀫 𐂎 5 ]𐀈𐀸𐀂 , 𐀔𐀰𐀤 𐂎 8 ] vac. ] 𐂎 402 𐂐+𐀀 52 reliqua pars sine regulis" 225 | 226 | # Test LB.A.30 227 | # Scenario: Test that 'angustum' and '[•~]' are correctly printed in annotated scenarios. 228 | # Requirements mapping: 229 | # LB.37: Tokenise each instance of 'angustum'. Represent this text as is in the annotated output, but do not include in the regularized output. 230 | # LB.59: Tokenise each instance of '[•~]'. Represent this text as is in the annotated output, and represent as a single wildcard (i.e. '%') in regularized output. 231 | 232 | "a[ ]te VIR[ 1 ]ke-ro-si-ja , a[ ] VIR 1 ke-ro-]si-ja , [•~]me-ka-[•] VIR 1 a-[ ke-ro-]si-ja , o-pa-[ ]vac.[ VIR 1 vac.[ ] vac. vac. [ ] vac. v. ta-we-si-jo-jo , ke-ro-si-ja , te-wa[ VIR 1 ta-]we-si-jo-jo , ke-ro-si-ja , tu-ru-we-u VIR 1 ] angustum ta-]we-si-jo-jo , ke-ro-si VIR 20 a-pi-qo-ta-o , ke-ro-si-ja VIR 17 a-pi-o-to , ke-ro-si-ja VIR [1]8⌟ o-to-wo[-o ke-]ro-si-ja VIR [1]4 angustum [ ] [ ] ka-ma-e[-we] VIR 10" : "𐀀[ ]𐀳 𐂀[ 1 ]𐀐𐀫𐀯𐀊 , 𐀀[ ] 𐂀 1 𐀐𐀫]𐀯𐀊 , [•~]𐀕𐀏[•] 𐂀 1 𐀀[ 𐀐𐀫]𐀯𐀊 , 𐀃𐀞[ ]vac.[ 𐂀 1 vac.[ ] vac. vac. [ ] vac. v. 𐀲𐀸𐀯𐀍𐀍 , 𐀐𐀫𐀯𐀊 , 𐀳𐀷[ 𐂀 1 𐀲]𐀸𐀯𐀍𐀍 , 𐀐𐀫𐀯𐀊 , 𐀶𐀬𐀸𐀄 𐂀 1 ] angustum 𐀲]𐀸𐀯𐀍𐀍 , 𐀐𐀫𐀯 𐂀 20 𐀀𐀠𐀦𐀲𐀃 , 𐀐𐀫𐀯𐀊 𐂀 17 𐀀𐀠𐀃𐀵 , 𐀐𐀫𐀯𐀊 𐂀 [1]8⌟ 𐀃𐀵𐀺[𐀃 𐀐]𐀫𐀯𐀊 𐂀 [1]4 angustum [ ] [ ] 𐀏𐀔𐀁[𐀸] 𐂀 10" 233 | 234 | # Test LB.A.31 235 | # Scenario: Test that 'graffito' is correctly printed in annotated scenarios. 236 | # Requirements mapping: 237 | # LB.38: Tokenise each instance of 'graffito'. Represent this text as is in the annotated output, but do not include in the regularized output. 238 | 239 | "]e-ke , e-u-da-i-ta OVIS:f 39[ ]ki-u-ro , / su-ki-ri-ta-pi o ki OVIS 15 [ v. graffito lat. inf." : "]𐀁𐀐 , 𐀁𐀄𐀅𐀂𐀲 𐂆 39[ ]𐀑𐀄𐀫 , / 𐀱𐀑𐀪𐀲𐀠 𐀃 𐀑 𐀥 15 [ v. graffito lat. inf." 240 | 241 | # Test LB.A.32 242 | # Scenario: Check that 'Graffito' is correctly printed in annotated scenarios. 243 | # Requirements mapping: 244 | # LB.39: Tokenise each instance of 'Graffito'. Represent this text as is in the annotated output, but do not include in the regularized output. 245 | 246 | "] Graffito [": "] Graffito [" 247 | 248 | # Test LB.A.33 249 | # Scenario: Test that 'r.' and 'r.p' are correctly printed in annotated scenarios. 250 | # Requirements mapping: 251 | # LB.40: Tokenise each instance of 'r.'
or 'r.p'. Represent this text as is in the annotated output, but do not include in the regularized output. 252 | 253 | "lat. sup. ] KE [ r. ]VIN 1 S 2[ ]1 ko-ta V[" : "lat. sup. ] 𐀐 [ r. ]𐂖 1 𐄽 2[ ]1 𐀒𐀲 𐄾[" 254 | "v. ]i-je-re-ja TELA+TE[ qs ka-]ra-wi-po-ro TELA+TE[ qs lat. dex. ] ⟦WE 30⟧ r.p vacat vestigia po-se-da-o-ne [ po-de-da-o-ne" : "v. ]𐀂𐀋𐀩𐀊 𐂧+𐀳[ qs 𐀏]𐀨𐀹𐀡𐀫 𐂧+𐀳[ qs lat. dex. ] ⟦𐀸 30⟧ r.p vacat vestigia 𐀡𐀮𐀅𐀃𐀚 [ 𐀡𐀆𐀅𐀃𐀚" 255 | 256 | # Test LB.A.34 257 | # Scenario: Test that 'v.' and 'v.p' are correctly printed in annotated scenarios. 258 | # Requirements mapping: 259 | # LB.41: Tokenise each instance of 'v.' or 'v.p'. Represent this text as is in the annotated output, but do not include in the regularized output. 260 | 261 | "to-re : : : : [ v. di-we si-po-ro ti-mi-to-qo [" : "𐀵𐀩 : : : : [ v. 𐀇𐀸 𐀯𐀡𐀫 𐀴𐀖𐀵𐀦 [" 262 | "ARM 1 me-zo-a2 O 22 me-u-jo-a2 O 12 KO O 4 PA 2 v.p to-mi-re-[ ]wa-[ ]-re-[ ]e-ko-si o-to-pe-da-ko-we-de-[•]-ke[" : "𐂫 1 𐀕𐀿𐁀 𐀃 22 𐀕𐀄𐀍𐁀 𐀃 12 𐀒 𐀃 4 𐀞 2 v.p 𐀵𐀖𐀩[ ]𐀷[ ]𐀩[ ]𐀁𐀒𐀯 𐀃𐀵𐀟𐀅𐀒𐀸𐀆[•]𐀐[" 263 | 264 | # Test LB.A.35 265 | # Scenario: Check that 'v.↓' is correctly printed in annotated scenarios. 266 | # Requirements mapping: 267 | # LB.42: Abbreviated form of verso; indicates the reverse side of the tablet, when inscribed. The arrow indicates the direction in which the record is rotated to reach the verso. 268 | 269 | "qe-te-o TELA;2-[ po-po TELA;2 4 [ v.↓ ⟦a-mi-si-ja TELA;1 12⟧ [" : "𐀤𐀳𐀃 𐂧²[ 𐀡𐀡 𐂧² 4 [ v.↓ ⟦𐀀𐀖𐀯𐀊 𐂧¹ 12⟧ [" 270 | 271 | # Test LB.A.36 272 | # Scenario: Check that 'v.→' is correctly printed in annotated scenarios. 273 | # Requirements mapping: 274 | # LB.43: Tokenise each instance of 'v.→'. Represent this text as is in the annotated output, but do not include in the regularized output. 275 | 276 | "ne-wo , za-we-[ v.→ ] a-ro-we a-nu-to" : "𐀚𐀺 , 𐀼𐀸[ v.→ ] 𐀀𐀫𐀸 𐀀𐀝𐀵" 277 | 278 | # Test LB.A.37 279 | # Scenario: Test that blank spaces are removed before the full stop for 'l .' and 's .', and that the resulting 'l.' and 's.' are correctly printed in annotated scenarios. 280 | # Requirements mapping: 281 | # LB.1-c: If a space appears before a '.' in an annotation (e.g. 'lat .'), then remove that space in both the annotated and regularized outputs. 282 | # LB.44: Tokenise each instance of 'l.'. Represent this text as is in the annotated output, but do not include in the regularized output. 283 | # LB.46: Tokenise each instance of 's.'. Represent this text as is in the annotated output, but do not include in the regularized output. 284 | 285 | "l . s . ]\u27e6 vest . \u27e7[": "l. s. ]⟦ vest. ⟧[" 286 | 287 | # Test LB.A.38 288 | # Scenario: Test that blank spaces are removed before the full stop for 'l .' and 'i .', and that the resulting 'l.' and 'i.' are correctly printed in annotated scenarios. 289 | # Requirements mapping: 290 | # LB.1-c: If a space appears before a '.' in an annotation (e.g. 'lat .'), then remove that space in both the annotated and regularized outputs. 291 | # LB.44: Tokenise each instance of 'l.'. Represent this text as is in the annotated output, but do not include in the regularized output. 292 | # LB.48: Tokenise each instance of 'i.'. Represent this text as is in the annotated output, but do not include in the regularized output. 293 | 294 | "l . i . LANA 250[": "l. i. 𐂝 250[" 295 | 296 | # Test LB.A.39 297 | # Scenario: Check that 'lat.' and 'inf.' are correctly printed in annotated scenarios. 298 | # Requirements mapping: 299 | # LB.45: Tokenise each instance of 'lat.'. Represent this text as is in the annotated output, but do not include in the regularized output.
300 | # LB.49: Tokenise each instance of 'inf.'. Represent this text as is in the annotated output, but do not include in the regularized output. 301 | 302 | "l\u0323a\u0323t\u0323 . i\u0323n\u0323f\u0323 .": "lat. inf." 303 | 304 | # Test LB.A.40 305 | # Scenario: Test that 'lat.' and 'sup.' are correctly printed in annotated scenarios. 306 | # Requirements mapping: 307 | # LB.45: Tokenise each instance of 'lat.'. Represent this text as is in the annotated output, but do not include in the regularized output. 308 | # LB.47: Tokenise each instance of 'sup.'. Represent this text as is in the annotated output, but do not include in the regularized output. 309 | 310 | "] TELA;4+⟦ZO⟧ 1 [ ]LANA M 1[ v.↓ ]-a ra[ lat. sup.]-so-ma [" : "] 𐂧⁴+⟦𐀿⟧ 1 [ ]𐂝 𐄸 1[ v.↓ ]𐀀 𐀨[ lat. sup.]𐀰𐀔 [" 311 | 312 | # Test LB.A.41 313 | # Scenario: Test that 'dex.' is correctly printed in annotated scenarios. 314 | # Requirements mapping: 315 | # LB.50: Tokenise each instance of 'dex.'. Represent this text as is in the annotated output, but do not include in the regularized output. 316 | 317 | "wo-di-je-ja , de-mi-ni-ja 1 ma-no , a-re-ka-sa-da-ra-ka 2 ri-su-ra , qo-ta-qe 2 e-ri-tu-pi-na , te-o-do-ra-'qe' 2 o-to-wo-wi-je tu-ka-te-qe 2 a-ne-a2 , tu-ka-te-qe 2 pi-ro-wo-na ki-ra-qe 2 pu-ka-ro ke-ti-de-qe 2 ]-ri-mo-qe 2 ]ma-ta-qe 2 ]*82 1 ]-qe 2 ] vac. inf. mut. lat. dex. ] , i-ri-[• ]1 ke-ra-so , ki-ra-qe 2" : "𐀺𐀇𐀋𐀊 , 𐀆𐀖𐀛𐀊 1 𐀔𐀜 , 𐀀𐀩𐀏𐀭𐀅𐀨𐀏 2 𐀪𐀱𐀨 , 𐀦𐀲𐀤 2 𐀁𐀪𐀶𐀠𐀙 , 𐀳𐀃𐀈𐀨'𐀤' 2 𐀃𐀵𐀺𐀹𐀋 𐀶𐀏𐀳𐀤 2 𐀀𐀚𐁀 , 𐀶𐀏𐀳𐀤 2 𐀠𐀫𐀺𐀙 𐀑𐀨𐀤 2 𐀢𐀏𐀫 𐀐𐀴𐀆𐀤 2 ]𐀪𐀗𐀤 2 ]𐀔𐀲𐀤 2 ]𐁚 1 ]𐀤 2 ] vac. inf. mut. lat. dex. ] , 𐀂𐀪[• ]1 𐀐𐀨𐀰 , 𐀑𐀨𐀤 2" 318 | 319 | # Test LB.A.42 320 | # Scenario: Test that 'sigillum' is correctly printed in annotated scenarios. 321 | # Requirements mapping: 322 | # LB.51: Tokenise each instance of 'sigillum'. Represent this text as is in the annotated output, but do not include in the regularized output. 323 | 324 | "α sigillum β qe-ti-ja γ vac." : "α sigillum β 𐀤𐀴𐀊 γ vac." 325 | 326 | # Test LB.A.43 327 | # Scenario: Test that 'supra sigillum' is correctly printed in annotated scenarios. 328 | # Requirements mapping: 329 | # LB.52: Tokenise each instance of 'supra sigillum'. Represent this text as is in the annotated output, but do not include in the regularized output. 330 | # If a notation about the seal type is also included (e.g. '=A', indicating that the seal is of type 'A', according to the publication of the Thebes sealings (Olivier et al. 1982)), then also represent this in the annotated output, but do not include in the regularized output. 331 | 332 | "α JAC supra sigillum β o-pa γ pa-ta-ja" : "α 𐃘 supra sigillum β 𐀃𐀞 γ 𐀞𐀲𐀊" 333 | "α OVIS:m supra sigillum=R β vac. γ vac." : "α 𐂇 supra sigillum=R β vac. γ vac." 334 | "α CAP:m supra sigillum=Z=1 β vac. γ ]vac." : "α 𐂉 supra sigillum=Z=1 β vac. γ ]vac." 335 | 336 | # Test LB.A.44 337 | # Scenario: Test that '[•]' (or '[\u2022]') is correctly printed in annotated scenarios. 338 | # Requirements mapping: 339 | # LB.58: Tokenise each instance of '[•]' or '[\u2022]'. Represent this symbol as is in the annotated output, and represent as a single wildcard (i.e. '%') in regularized output. 340 | 341 | "]po-[\u2022] , / [ OVIS m ] 40 o OVIS m 20" : "]𐀡[•] , / [ 𐂇 ] 40 𐀃 𐂇 20" 342 | 343 | # Test LB.A.45 344 | # Scenario: Test that '•~•' is correctly printed in annotated scenarios. 345 | # Requirements mapping: 346 | # LB.60: Tokenise each instance of '•~•'. Represent this text as is in the annotated output, and represent as two wildcards (i.e. '%%') in regularized output.
347 | 348 | "ma-mi-di-zo / pi-ri-to-jo OVIS:f 40[ [•~•]-ro , da-nu-wo OVIS:f 100[ po-ri-wo , / su-ki-ri-ta-jo , wo-we-u CAP:m 180 ja-ru , / pa-ta-ti-jo , do-e-ro , CAP:f 230 a-du-po-to , / qi-ko-we-e , do-e-ro , CAP:f 90 qa-di-ja , / po-ku-te-ro , da-mo , 'do-e-ro' CAP:f 70 da-[•~• / ]po-ku-ta CAP:f 130 ra-wa-ni , / po-ku-ta , ra-ri-di-jo OVIS:m 190 o-mi-ri-so , / ta-so , do-e-ro OVIS:m 50 [•~•]-so / a-pi-me-de-o , po-ku-ta 'ra-ri-di-jo' OVIS:f 140 ku-jo-[ / ]ta-so , // do-e-ro OVIS:f 100 a-*56-da-ro / ka-ta-mi-jo , do-e-ro OVIS:x[ a-ra-ko , / ra-ri-di-jo , do-e-ro OVIS:m 100[ vac. vac. vac." : "𐀔𐀖𐀇𐀿 / 𐀠𐀪𐀵𐀍 𐂆 40[ [•~•]𐀫 , 𐀅𐀝𐀺 𐂆 100[ 𐀡𐀪𐀺 , / 𐀱𐀑𐀪𐀲𐀍 , 𐀺𐀸𐀄 𐂉 180 𐀊𐀬 , / 𐀞𐀲𐀴𐀍 , 𐀈𐀁𐀫 , 𐂈 230 𐀀𐀉𐀡𐀵 , / 𐀥𐀒𐀸𐀁 , 𐀈𐀁𐀫 , 𐂈 90 𐀣𐀇𐀊 , / 𐀡𐀓𐀳𐀫 , 𐀅𐀗 , '𐀈𐀁𐀫' 𐂈 70 𐀅[•~• / ]𐀡𐀓𐀲 𐂈 130 𐀨𐀷𐀛 , / 𐀡𐀓𐀲 , 𐀨𐀪𐀇𐀍 𐂇 190 𐀃𐀖𐀪𐀰 , / 𐀲𐀰 , 𐀈𐀁𐀫 𐂇 50 [•~•]𐀰 / 𐀀𐀠𐀕𐀆𐀃 , 𐀡𐀓𐀲 '𐀨𐀪𐀇𐀍' 𐂆 140 𐀓𐀍[ / ]𐀲𐀰 , // 𐀈𐀁𐀫 𐂆 100 𐀀𐁖𐀅𐀫 / 𐀏𐀲𐀖𐀍 , 𐀈𐀁𐀫 𐀥[ 𐀀𐀨𐀒 , / 𐀨𐀪𐀇𐀍 , 𐀈𐀁𐀫 𐂇 100[ vac. vac. vac." 349 | 350 | # Test LB.A.46 351 | # Scenario: Test that '●' and [•~•] are correctly printed in annotated scenarios'. 352 | # Requirements mapping: 353 | # LB.61: Tokenise each instance of '[•~•]'. Represent this text as is in the annotated output, and represent as two wildcards (i.e. '%%') in regularized output. 354 | # LB.66: Tokenise each instance of '●'. Represent this text as is in the annotated output, but do not include in the regularized output. 355 | 356 | "[•~•] [ wi-tu-ri-jo , / a-mo-te-re [" : "[•~•] [ 𐀹𐀶𐀪𐀍 , / 𐀀𐀗𐀳𐀩 [" 357 | "sup. mut. ]vest.[ di-pa AES *214VAS+DI 30[ qe-ro2 'AES' *255 ● 16 ku-ru-su-*56 ● *207VAS 1 pi-ri-je ● ZE 1 [•~•] 'me-no-no[' inf. mut." : "sup. mut. ]vest.[ 𐀇𐀞 𐂚 𐃭+𐀇 30[ 𐀤𐁊 '𐂚' 𐃙 ● 16 𐀓𐀬𐀱𐁖 ● 𐃦 1 𐀠𐀪𐀋 ● 𐀽 1 [•~•] '𐀕𐀜𐀜[' inf. mut." 358 | 359 | # Test LB.A.47 360 | # Scenario: Test that '•~•~' is correctly printed in annotated scenarios. 361 | # Requirements mapping: 362 | # LB.62: Tokenise each instance of '•~•~'. Represent this text as is in the annotated output, and represent as two wildcards (i.e. '%%') in regularized output. 363 | 364 | # ADD TC 365 | 366 | # Test LB.A.48 367 | # Scenario: Test that '[•~•~]' is correctly printed in annotated scenarios. 368 | # Requirements mapping: 369 | # LB.63: Tokenise each instance of '[•~•~]'. Represent this text as is in the annotated output, and represent as two wildcards (i.e. '%%') in regularized output. 370 | 371 | "][•~•~]*34-so , 'da-*22-to' OVIS:m 50 [ ]do-ti , ti-ri-to OVIS:m 50 [" : "][•~•~]𐁓𐀰 , '𐀅𐁒𐀵' 𐂇 50 [ ]𐀈𐀴 , 𐀴𐀪𐀵 𐂇 50 [" 372 | 373 | # Test LB.A.49 374 | # Scenario: Test that '•~•~•' is correctly printed in annotated scenarios. 375 | # Requirements mapping: 376 | # LB.64: Tokenise each instance of '•~•~•'. Represent this text as is in the annotated output, and represent as three wildcards (i.e. '%%%') in regularized output. 377 | 378 | # ADD TC 379 | 380 | # Test LB.A.50 381 | # Scenario: Test that '[•~•~•]' is correctly printed in annotated scenarios. 382 | # Requirements mapping: 383 | # LB.65: Tokenise each instance of '[•~•~•]'. Represent this text as is in the annotated output, and represent as three wildcards (i.e. '%%%') in regularized output. 384 | 385 | "] vest. [ [•~•~•]-ra-de / ne-wo-jo OLE 4[ ] vac. [": "] vest. [ [•~•~•]𐀨𐀆 / 𐀚𐀺𐀍 𐂕 4[ ] vac. [" 386 | 387 | # Test LB.A.51 388 | # Scenario: Test that '•~•~•~•' is correctly printed in annotated scenarios. 389 | # Requirements mapping: 390 | # LB.66: Tokenise each instance of '•~•~•~•'. Represent this text as is in the annotated output, and represent as four wildcards (i.e. '%%%%') in regularized output. 
391 | 392 | # ADD TC 393 | 394 | # Test LB.A.52 395 | # Scenario: Test that '[•~•~•~•]' is correctly printed in annotated scenarios. 396 | # Requirements mapping: 397 | # LB.67: Tokenise each instance of '[•~•~•~•]'. Represent this text as is in the annotated output, and represent as four wildcards (i.e. '%%%%') in regularized output. 398 | 399 | # ADD TC 400 | 401 | # Test LB.A.53 402 | # Scenario: Test that checkmarks (i.e. 'X') are correctly printed in annotated scenarios. 403 | # Requirements mapping: 404 | # LB.69: Tokenise each instance of 'X'. Represent this text as is in the annotated output, but do not include in the regularized output. 405 | 406 | "fragmentum A sup. mut. ] X MUL 1 ]--u-ra MUL 1 X ]-na MUL 1 tu-ka-na X MUL 1 ]-ma MUL 1 te-qa-ja MUL 1 ]-ja MUL 1-[ ]-ja-mu-ta MUL 1-[ ]--ta2-no-[ inf. mut." : "fragmentum A sup. mut. ] X 𐂁 1 ]𐀄𐀨 𐂁 1 X ]𐀙 𐂁 1 𐀶𐀏𐀙 X 𐂁 1 ]𐀔 𐂁 1 𐀳𐀣𐀊 𐂁 1 ]𐀊 𐂁 1[ ]𐀊𐀘𐀲 𐂁 1[ ]𐁋𐀜[ inf. mut." 407 | 408 | # Test LB.A.54 409 | # Scenario: Test that '|' is correctly printed in annotated scenarios. 410 | # Requirements mapping: 411 | # LB.70: Tokenise each instance of '|'. Represent this sign as is in the annotated output, but do not include in the regularized output. 412 | 413 | "α ]a3-wo-re-u-|si|-si β do-ke γ [•]-ja-wo-ne" : "α ]𐁁𐀺𐀩𐀄|𐀯|𐀯 β 𐀈𐀐 γ [•]𐀊𐀺𐀚" 414 | 415 | # Test LB.A.55 416 | # Scenario: Test that both '' and '' are not printed in either annotated or regularized scenarios. 417 | # Requirements mapping: 418 | # LB.71: Ignore each instance of ''. Do not represent this string in either the annotated or the regularized output. 419 | # LB.72: Ignore each instance of ''. Do not represent this string in either the annotated or the regularized output. 420 | 421 | "fragmentum A fragmentum B sup. mut. sup. mut. ]--to-[ ]-da-*22-to HORD [ ] 'da-*22-to' HORD 2 da-]-*22-to HORD-[ ]--ro 'da-*22-to' HORD 2 inf. mut. ]--ri 'da-*22-to' HORD 2 ] vac. inf. mut." : "fragmentum A fragmentum B sup. mut. sup. mut. ]𐀵[ ]𐀅𐁒𐀵 𐂏 [ ] '𐀅𐁒𐀵' 𐂏 2 𐀅]𐁒𐀵 𐂏[ ]𐀫 '𐀅𐁒𐀵' 𐂏 2 inf. mut. ]𐀪 '𐀅𐁒𐀵' 𐂏 2 ] vac. inf. mut." 422 | 423 | # Test LB.A.56 424 | # Scenario: Test that the space is removed after the '+' sign, and the correct sign is printed in the regularized scenario. 425 | # Requirements mapping: 426 | # LB.1-b: If a blank space appears before/after '+', then remove both those spaces in both the annotated and regularized outputs. 427 | 428 | "]r\u0323o\u0323 , / da-mo GRA [ ]8 OLIV+ A 12" : "]𐀫 , / 𐀅𐀗 𐂎 [ ]8 𐂐+𐀀 12" 429 | 430 | # Test LB.A.57 431 | # Scenario: Test that spaces are removed after 'TELA' and before either a '1', '2', '3', '4' or 'x', and the correct sign/s are printed in annotated scenarios. 432 | # Requirements mapping: 433 | # LB.1-d: If a blank space appears after 'TELA' and before either a '1', '2', '3', '4' or 'x', then remove that space in both the annotated and regularized outputs. 434 | 435 | "]\u0323a\u0323-ra-ka-te-ja / tu-na-no TELA 1\u0323 1 [" : "]𐀀𐀨𐀏𐀳𐀊 / 𐀶𐀙𐀜 𐂧¹ 1 [" 436 | "] * 161 TELA 2 [" : "] 𐂩 𐂧² [" 437 | "nu-wa-i-ja , / 'pa-we-a' * 161 TELA 3 30\u27e6 \u27e7" : "𐀝𐀷𐀂𐀊 , / '𐀞𐀸𐀀' 𐂩 𐂧³ 30⟦ ⟧" 438 | "] TELA 4 + PU 1[" : "] 𐂧⁴+𐀢 1[" 439 | "]ti-jo\u2e24 \u2e25 / to-mi-ka TELA x 30" : "]𐀴𐀍⸤ ⸥ / 𐀵𐀖𐀏 𐂧ˣ 30" 440 | ']TELA 10 ⟦ ⟧ *158 1' : ']𐂧 10 ⟦ ⟧ 𐂦 1' 441 | "to-sa TELA 40 o TELA 1 6[" : "𐀵𐀭 𐂧 40 𐀃 𐂧¹ 6[" 442 | 443 | # NEW SCENARIO 444 | # Test LB.A.61 445 | # NEW REQUIREMENT: Tokenise each instance of 'sin.'. Represent this text as is in the annotated output, but do not include in the regularized output. 
446 | 447 | "ku-ro-ro2 AROM 13 T 5 KA±PO 4 *157 28 LANA 5 me-po 6 S 1 V 4 ko-ri-jo-da-na AROM 21 i-re-we[ ] T 2 v. ta-we-si-jo-jo , ke-ro-si-ja VIR 20[ a-pi-qo-o , ke-ro-si-ja VIR 17 [ a-pi-o-to , ke-ro-si-ja VIR 18 o-to-wo-o , ke-ro-si-ja VIR 13 lat. sin. ka-ma-e-we VIR 10" : "𐀓𐀫𐁊 𐂑 13 𐄼 5 𐂓 4 𐂥 28 𐂝 5 𐀕𐀡 6 𐄽 1 𐄾 4 𐀒𐀪𐀍𐀅𐀙 𐂑 21 𐀂𐀩𐀸[ ] 𐄼 2 v. 𐀲𐀸𐀯𐀍𐀍 , 𐀐𐀫𐀯𐀊 𐂀 20[ 𐀀𐀠𐀦𐀃 , 𐀐𐀫𐀯𐀊 𐂀 17 [ 𐀀𐀠𐀃𐀵 , 𐀐𐀫𐀯𐀊 𐂀 18 𐀃𐀵𐀺𐀃 , 𐀐𐀫𐀯𐀊 𐂀 13 lat. sin. 𐀏𐀔𐀁𐀸 𐂀 10" --------------------------------------------------------------------------------