├── tests ├── __init__.py ├── expected │ ├── luwian_unicode.yaml │ ├── luwian_tokenize_transliteration.yaml │ ├── linear_b_transliteration.yaml │ ├── hittite_unicode.yaml │ ├── linear_a_transliteration.yaml │ ├── hittite_tokenize_transliteration.yaml │ ├── linear_a_tokenize_transliteration.yaml │ ├── linear_a_unicode_regularized.yaml │ ├── linear_a_unicode.yaml │ ├── arabic_unicode.yaml │ ├── linear_b_tokenize_transliteration.yaml │ ├── errors.txt │ └── linear_b_unicode.yaml ├── test_data.py ├── test_script.py ├── test_arabic.py ├── test_luwian.py ├── data.py ├── test_hittite.py ├── test_linear_b.py ├── test_linear_a.py └── test_main.py ├── potnia ├── data │ ├── akkadian.yaml │ ├── arabic.yaml │ ├── potnia.bib │ ├── hittite.yaml │ ├── luwian.yaml │ ├── linear_a.yaml │ └── linear_b.yaml ├── scripts │ ├── __init__.py │ ├── luwian.py │ ├── akkadian.py │ ├── linear_b.py │ ├── hittite.py │ ├── arabic.py │ └── linear_a.py ├── __init__.py ├── enums.py ├── data.py ├── main.py └── script.py ├── docs ├── contributing.rst ├── _static │ └── img │ │ ├── Csign_har.png │ │ ├── Csign_me3.png │ │ ├── LBsign-qa.png │ │ ├── erc-logo.jpg │ │ ├── mappings.png │ │ ├── PotniaLogo.png │ │ ├── potnia-gui.png │ │ ├── potnia-banner.jpg │ │ ├── potnia-example.png │ │ ├── syllabograms.png │ │ ├── Csign_har_large.png │ │ ├── Csign_me3_large.png │ │ ├── LBsign_qa_large.png │ │ ├── LBsign_qa_large2.png │ │ └── downstream-example.png ├── credits.rst ├── quickstart.rst ├── Makefile ├── fonts.rst ├── index.rst ├── make.bat ├── api.rst ├── conf.py ├── additions.rst └── linear_b.md ├── .coveragerc ├── mkdocs.sh ├── example.py ├── paper.sh ├── .github └── workflows │ ├── testing.yml │ ├── joss-draft-pdf.yml │ ├── publish.yml │ └── docs.yml ├── pyproject.toml ├── CONTRIBUTING.rst ├── .gitignore ├── CODE_OF_CONDUCT.md ├── README.rst ├── LICENSE ├── paper.md └── paper.bib /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /potnia/data/akkadian.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /potnia/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.rst -------------------------------------------------------------------------------- /tests/expected/luwian_unicode.yaml: -------------------------------------------------------------------------------- 1 | há : 𔓟 -------------------------------------------------------------------------------- /tests/expected/luwian_tokenize_transliteration.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = potnia 3 | 4 | [report] 5 | precision = 2 6 | 7 | -------------------------------------------------------------------------------- /mkdocs.sh: -------------------------------------------------------------------------------- 1 | sphinx-build -b html docs docshtml -E -a 2 | echo docshtml/index.html 3 | 
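# Usage (assumed invocation, inferred from the two commands above): run `sh mkdocs.sh` from the repository root; # sphinx-build writes the rendered HTML to docshtml/ and the echo above prints the index page path to open in a browser.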
-------------------------------------------------------------------------------- /docs/_static/img/Csign_har.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/Csign_har.png -------------------------------------------------------------------------------- /docs/_static/img/Csign_me3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/Csign_me3.png -------------------------------------------------------------------------------- /docs/_static/img/LBsign-qa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/LBsign-qa.png -------------------------------------------------------------------------------- /docs/_static/img/erc-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/erc-logo.jpg -------------------------------------------------------------------------------- /docs/_static/img/mappings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/mappings.png -------------------------------------------------------------------------------- /docs/_static/img/PotniaLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/PotniaLogo.png -------------------------------------------------------------------------------- /docs/_static/img/potnia-gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/potnia-gui.png -------------------------------------------------------------------------------- /docs/_static/img/potnia-banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/potnia-banner.jpg -------------------------------------------------------------------------------- /docs/_static/img/potnia-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/potnia-example.png -------------------------------------------------------------------------------- /docs/_static/img/syllabograms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/syllabograms.png -------------------------------------------------------------------------------- /tests/expected/linear_b_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "𐀀𐀪𐀵𐀍" : "a-ri-to-jo" 2 | "𐀀𐀪𐀵𐀍 𐀀𐀪𐀵𐀍" : "a-ri-to-jo a-ri-to-jo" -------------------------------------------------------------------------------- /docs/_static/img/Csign_har_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/Csign_har_large.png -------------------------------------------------------------------------------- /docs/_static/img/Csign_me3_large.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/Csign_me3_large.png -------------------------------------------------------------------------------- /docs/_static/img/LBsign_qa_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/LBsign_qa_large.png -------------------------------------------------------------------------------- /docs/_static/img/LBsign_qa_large2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/LBsign_qa_large2.png -------------------------------------------------------------------------------- /docs/_static/img/downstream-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AncientNLP/potnia/HEAD/docs/_static/img/downstream-example.png -------------------------------------------------------------------------------- /docs/credits.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Credits 3 | ======================= 4 | 5 | .. include:: ../README.rst 6 | :start-after: start-credits 7 | :end-before: end-credits 8 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Quickstart 3 | ======================= 4 | 5 | .. include:: ../README.rst 6 | :start-after: start-quickstart 7 | :end-before: end-quickstart 8 | -------------------------------------------------------------------------------- /potnia/scripts/luwian.py: -------------------------------------------------------------------------------- 1 | # from dataclasses import dataclass 2 | # from ..hittite import Hittite 3 | 4 | # @dataclass 5 | # class Luwian(Hittite): 6 | # config:str = "luwian.yaml" 7 | 8 | 9 | # luwian = Luwian() -------------------------------------------------------------------------------- /potnia/scripts/akkadian.py: -------------------------------------------------------------------------------- 1 | # from dataclasses import dataclass 2 | # from ..script import Script 3 | 4 | 5 | # @dataclass 6 | # class Akkadian(Script): 7 | # config:str = "akkadian" 8 | 9 | 10 | # akkadian = Akkadian() -------------------------------------------------------------------------------- /tests/expected/hittite_unicode.yaml: -------------------------------------------------------------------------------- 1 | "ḫe]-en-ku-un šu-me-eš ma-ni-ia-aḫ-ḫi-eš-ke-et-tén" : "𒄭]𒂗𒆪𒌦 𒋗𒈨𒐁 𒈠𒉌𒅀𒄴𒄭𒐁𒆠𒀉𒁷" 2 | 3 | "a-ši KÁ-aš ku-iš pa-it nu DINGIR (MEŠ) 2 ḪUR.SAG (MEŠ)" : "𒀀𒅆 𒆍𒀸 𒆪𒅖 𒉺𒀉 𒉡 𒀭 𒌍 2 𒄯𒊕 𒌍" -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | from potnia.data import read_data_yaml_cached 2 | 3 | def test_read_data_yaml_cached(): 4 | result = read_data_yaml_cached("does-not-exists.yaml") 5 | assert isinstance(result, dict) 6 | assert len(result) == 0 -------------------------------------------------------------------------------- /tests/test_script.py: -------------------------------------------------------------------------------- 1 | from potnia.script import Script 2 | 3 | 4 | 5 | 
def test_tokenize_unicode(): 6 | script = Script(config=dict(dummy="dummy")) 7 | 8 | result = script.tokenize_unicode("text") 9 | assert result == ["t", "e", "x", "t"] 10 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from potnia import linear_b 2 | text = "po-ti-ni-ja" 3 | 4 | tokens = linear_b.tokenize_transliteration(text) 5 | print(tokens) # Output: ['po', 'ti', 'ni', 'ja'] 6 | 7 | unicode_text = linear_b(text) 8 | print(unicode_text) # Output: 𐀡𐀴𐀛𐀊 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/expected/linear_a_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "𐙒 𐙿 𐙇" : "*180 A339 *100" # https://sigla.phis.me/document/PH%2012a/ 2 | "𐀏𐀇 𐙒 𐙿 𐙇" : "ka-di *180 A339 *100" 3 | "𐀏𐀇 𐙍 3 𐀓𐀫 𐙍 78 𐛿 17" : "ka-di *131a 3 ku-ro *131a 78 A594 17" # https://sigla.phis.me/document/ZA%2015b/. Uses *131a instead of AB131/VINa 4 | -------------------------------------------------------------------------------- /tests/expected/hittite_tokenize_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "ḫe]‑en‑ku‑un šu‑me‑eš ma‑ni‑ia‑aḫ‑ḫi‑eš‑ke‑et‑tén" : 2 | - "ḫe" 3 | - "]" 4 | - "en" 5 | - "ku" 6 | - "un" 7 | - " " 8 | - "šu" 9 | - "me" 10 | - "eš" 11 | - " " 12 | - "ma" 13 | - "ni" 14 | - "ia" 15 | - "aḫ" 16 | - "ḫi" 17 | - "eš" 18 | - "ke" 19 | - "et" 20 | - "tén" 21 | -------------------------------------------------------------------------------- /tests/test_arabic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from potnia import arabic 3 | from .data import expected 4 | 5 | 6 | @pytest.mark.parametrize("test_input,expected", expected("arabic_unicode")) 7 | def test_arabic_unicode(test_input, expected): 8 | result = arabic(test_input) 9 | assert result == expected, f"Expected: arabic('{test_input}') to produce '{expected}' but got '{result}'" 10 | 11 | 12 | -------------------------------------------------------------------------------- /potnia/__init__.py: -------------------------------------------------------------------------------- 1 | from .script import Script 2 | from .scripts.linear_a import linear_a, LinearA 3 | from .scripts.linear_b import linear_b, LinearB 4 | from .scripts.hittite import hittite, Hittite 5 | from .scripts.arabic import arabic, Arabic 6 | 7 | 8 | # Luwian is currently a work in progress 9 | # from .luwian import luwian 10 | 11 | # Akkadian is currently a work in progress 12 | # from .akkadian import akkadian 13 | -------------------------------------------------------------------------------- /potnia/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class BibliographyFormat(str, Enum): 5 | plaintext = "plaintext" 6 | html = "html" 7 | latex = "latex" 8 | markdown = "markdown" 9 | 10 | def __str__(self): 11 | return self.value 12 | 13 | 14 | class BibliographyStyle(str, Enum): 15 | plain = "plain" 16 | unsrt = "unsrt" 17 | alpha = "alpha" 18 | unsrtalpha = "unsrtalpha" 19 | 20 | def __str__(self): 21 | return self.value 22 | -------------------------------------------------------------------------------- /tests/test_luwian.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | # from potnia 
import luwian 3 | from .data import expected 4 | 5 | 6 | # @pytest.mark.parametrize("test_input,expected", expected("luwian_unicode")) 7 | # def test_luwian_unicode(test_input, expected): 8 | # assert luwian(test_input) == expected 9 | 10 | 11 | # @pytest.mark.parametrize("test_input,expected", expected("luwian_tokenize_transliteration")) 12 | # def test_tokenize_transliteration_luwian(test_input, expected): 13 | # assert luwian.tokenize_transliteration(test_input) == expected 14 | -------------------------------------------------------------------------------- /tests/data.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from pathlib import Path 3 | 4 | EXPECTED_DIR = Path(__file__).parent / "expected" 5 | 6 | def read_expected(filename: str) -> dict[str, str]: 7 | path = EXPECTED_DIR / filename 8 | if not path.suffix: 9 | path = path.with_suffix(".yaml") 10 | 11 | if not path.exists(): 12 | return dict() 13 | 14 | with open(path, encoding='utf8') as f: 15 | result = yaml.safe_load(f) 16 | return result or dict() 17 | 18 | 19 | def expected(filename: str) -> list[tuple[str, str]]: 20 | return list(read_expected(filename).items()) -------------------------------------------------------------------------------- /potnia/data/arabic.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | b : ب 3 | t : ت 4 | ṯ : ث 5 | g : ج 6 | j : ج 7 | ǧ : ج 8 | ḥ : ح 9 | ḫ : خ 10 | d : د 11 | ḏ : ذ 12 | r : ر 13 | z : ز 14 | s : س 15 | š : ش 16 | ṣ : ص 17 | ḍ : ض 18 | ṭ : ط 19 | ẓ : ظ 20 | ʿ : ع 21 | ġ : غ 22 | f : ف 23 | q : ق 24 | k : ك 25 | l : ل 26 | m : م 27 | n : ن 28 | h : ه 29 | w : و 30 | y : ي 31 | ỳ : ى 32 | ā : ا 33 | ī : ي 34 | ū : و 35 | ʾ : ء 36 | 37 | a: َ # Fatha (short 'a' sound) 38 | i: ِ # Kasra (short 'i' sound) 39 | u: ُ # Damma (short 'u' sound) -------------------------------------------------------------------------------- /tests/expected/linear_a_tokenize_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "]ta-pi ]ki[ ]a-ra[ ]a-su-mi-*118[ a-pa-[?][ ]mi-ki-sa-ne[": 2 | - "]" 3 | - "ta" 4 | - "pi" 5 | - " " 6 | - "]" 7 | - "ki" 8 | - "[" 9 | - " " 10 | - "]" 11 | - "a" 12 | - "ra" 13 | - "[" 14 | - " " 15 | - "]" 16 | - "a" 17 | - "su" 18 | - "mi" 19 | - "*118" 20 | - "[" 21 | - " " 22 | - "a" 23 | - "pa" 24 | - "[?]" 25 | - "[" 26 | - " " 27 | - "]" 28 | - "mi" 29 | - "ki" 30 | - "sa" 31 | - "ne" 32 | - "[" 33 | 34 | "pi[?]": 35 | - "pi" 36 | - "[?]" 37 | 38 | "[?]pi": 39 | - "[?]" 40 | - "pi" -------------------------------------------------------------------------------- /paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Instructions here: https://joss.readthedocs.io/en/latest/submitting.html#docker 4 | docker run --rm -it \ 5 | -v $PWD:/data \ 6 | -u $(id -u):$(id -g) \ 7 | --env JOURNAL=joss \ 8 | openjournals/inara:latest \ 9 | -o pdf \ 10 | paper.md 11 | 12 | 13 | echo Generating preprint 14 | docker run --rm -it \ 15 | -v $PWD:/data \ 16 | -u $(id -u):$(id -g) \ 17 | --env JOURNAL=joss \ 18 | openjournals/inara:latest \ 19 | -o preprint \ 20 | paper.md 21 | 22 | # TODO replace "docs/_static/img/" in paths in paper.preprint.tex with root directory 23 | cat paper.preprint.tex | sed "s/docs\/_static\/img\///g" > tmp 24 | mv tmp paper.preprint.tex -------------------------------------------------------------------------------- /docs/Makefile:
-------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/fonts.rst: -------------------------------------------------------------------------------- 1 | Fonts 2 | ====================== 3 | 4 | If the glyphs for Linear A, Linear B, or other ancient scripts do not display correctly in your terminal, code editor, web browser, or Jupyter notebook, it is likely due to missing fonts. Install the following Noto Sans fonts from Google Fonts for proper rendering: 5 | 6 | - `Noto Sans Linear A `_ 7 | - `Noto Sans Linear B `_ 8 | 9 | Ensure your editor, terminal, or application is set to use these fonts. If you are browsing the repository online, **Firefox** typically offers better glyph support compared to **Google Chrome** or **Microsoft Edge**. 10 | -------------------------------------------------------------------------------- /potnia/data.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from pathlib import Path 3 | from functools import cache 4 | 5 | DATA_DIR = Path(__file__).parent / "data" 6 | 7 | @cache 8 | def read_data_yaml_cached(filename: str) -> dict[str, str]: 9 | path = DATA_DIR / filename 10 | if not path.suffix: 11 | path = path.with_suffix(".yaml") 12 | 13 | if not path.exists(): 14 | return dict() 15 | 16 | with open(path, encoding='utf8') as f: 17 | result = yaml.safe_load(f) 18 | return result or dict() 19 | 20 | 21 | def read_data(*filenames) -> dict[str, str]: 22 | result = dict() 23 | for filename in filenames: 24 | result.update(read_data_yaml_cached(filename)) 25 | 26 | return result 27 | -------------------------------------------------------------------------------- /tests/test_hittite.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from potnia import hittite 3 | from .data import expected 4 | 5 | 6 | @pytest.mark.parametrize("test_input,expected", expected("hittite_unicode")) 7 | def test_hittite_unicode(test_input, expected): 8 | result = hittite(test_input) 9 | assert result == expected, f"Expected: hittite('{test_input}') to produce '{expected}' but got '{result}'" 10 | 11 | 12 | @pytest.mark.parametrize("test_input,expected", expected("hittite_tokenize_transliteration")) 13 | def test_tokenize_transliteration_hittite(test_input, expected): 14 | result = hittite.tokenize_transliteration(test_input) 15 | assert result == expected, f"Expected: hittite.tokenize_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 16 | -------------------------------------------------------------------------------- /tests/expected/linear_a_unicode_regularized.yaml: -------------------------------------------------------------------------------- 1 | 
# Test that syllabograms and logograms are correctly converted to Unicode, that spaces work as word separators, and that hyphens act to join syllables within a word 2 | "ka-di AB131/VINa 3 ku-ro AB131/VINa 78 A594 17" : "𐀏𐀇 𐙍 3 𐀓𐀫 𐙍 78 𐛿 17" # https://sigla.phis.me/document/ZA%2015b/ 3 | 4 | # Test that ']' and '[' are correctly converted to wildcard characters ('%') 5 | # Test that '[?]' is correctly converted to a wildcard character ('%') 6 | "]pa-ri-de ]a-si-*118 ]ku-ka-[?][" : "%𐀞𐀪𐀆 %𐀀𐀯𐙈 %𐀓𐀏%%" # https://sigla.phis.me/document/KH%2099/index-7.html 7 | 8 | # Test that '[unclassified]' is correctly converted to a wildcard character ('%') 9 | "[unclassified] *180 A339 *100" : "% 𐙒 𐙿 𐙇" # https://sigla.phis.me/document/PH%2012a/ -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: testing 2 | 3 | on: [push] 4 | jobs: 5 | build: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version: ["3.10", "3.11", "3.12"] 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Install poetry 14 | run: pipx install poetry 15 | - name: Initialise Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | cache: "poetry" 20 | - name: Install dependencies for Python ${{ matrix.python-version }} 21 | run: | 22 | poetry env use "${{ matrix.python-version }}" 23 | poetry install 24 | - name: Testing 25 | run: | 26 | poetry env info 27 | poetry run pytest -v -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. potnia documentation master file, created by 2 | sphinx-quickstart on Mon Jul 22 12:22:08 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to potnia's documentation! 7 | ================================== 8 | 9 | .. image:: https://raw.githubusercontent.com/AncientNLP/potnia/main/docs/_static/img/PotniaLogo.png 10 | 11 | .. include:: ../README.rst 12 | :start-after: start-summary 13 | :end-before: end-summary 14 | 15 | ..
toctree:: 16 | :maxdepth: 2 17 | :caption: Contents: 18 | 19 | quickstart 20 | fonts 21 | linear_b 22 | additions 23 | api 24 | contributing 25 | credits 26 | 27 | Indices and tables 28 | ================== 29 | 30 | * :ref:`genindex` 31 | * :ref:`modindex` 32 | * :ref:`search` -------------------------------------------------------------------------------- /tests/expected/linear_a_unicode.yaml: -------------------------------------------------------------------------------- 1 | # Test that syllabograms and logograms are correctly converted to Unicode, that spaces work as word separators, and that hyphens act to join syllables within a word 2 | "ka-di AB131/VINa 3 ku-ro AB131/VINa 78 A594 17" : "𐀏𐀇 𐙍 3 𐀓𐀫 𐙍 78 𐛿 17" # https://sigla.phis.me/document/ZA%2015b/ 3 | 4 | # Test that ']' and '[' are correctly tokenized as individual tokens 5 | # Test that '[?]' is correctly converted to a wildcard character ('%') 6 | "]pa-ri-de ]a-si-*118 ]ku-ka-[?][" : "]𐀞𐀪𐀆 ]𐀀𐀯𐙈 ]𐀓𐀏[?][" # https://sigla.phis.me/document/KH%2099/index-7.html 7 | 8 | # Test that '[unclassified]' is correctly tokenized as a single token 9 | "[unclassified] *180 A339 *100" : "[unclassified] 𐙒 𐙿 𐙇" # https://sigla.phis.me/document/PH%2012a/ 10 | 11 | "ka-di[unclassified] *180 A339 *100" : "𐀏𐀇[unclassified] 𐙒 𐙿 𐙇" -------------------------------------------------------------------------------- /.github/workflows/joss-draft-pdf.yml: -------------------------------------------------------------------------------- 1 | name: joss-draft-pdf 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - paper 8 | 9 | jobs: 10 | paper: 11 | runs-on: ubuntu-latest 12 | name: Paper Draft 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v4 16 | - name: Build draft PDF 17 | uses: openjournals/openjournals-draft-action@master 18 | with: 19 | journal: joss 20 | # This should be the path to the paper within your repo. 21 | paper-path: paper.md 22 | - name: Upload 23 | uses: actions/upload-artifact@v4 24 | with: 25 | name: paper 26 | # This is the output path where Pandoc will write the compiled 27 | # PDF. Note, this should be the same directory as the input 28 | # paper.md 29 | path: paper.pdf -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /potnia/data/potnia.bib: -------------------------------------------------------------------------------- 1 | @article{potnia, 2 | author = {Emily Tour and Kabir Manandhar Shrestha and Robert Turnbull}, 3 | title = {{Potnia: A Python library for the conversion of transliterated ancient texts to Unicode}}, 4 | year = {2025}, 5 | journal = {Journal of Open Source Software}, 6 | publisher = {The Open Journal}, 7 | volume = {10}, 8 | number = {108}, 9 | pages = {7725}, 10 | doi = {10.21105/joss.07725}, 11 | url = {https://doi.org/10.21105/joss.07725} 12 | } 13 | @misc{potnia_release, 14 | author = {Emily Tour and Kabir Manandhar Shrestha and Robert Turnbull}, 15 | title = {{Potnia: A Python library for the conversion of transliterated ancient texts to Unicode}}, 16 | year = {2025}, 17 | url = {https://doi.org/10.26188/28721354.v1}, 18 | note = {Version 0.4.0, Apache License 2.0}, 19 | doi = {10.26188/28721354.v1} 20 | } 21 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | API Reference 3 | ======================= 4 | 5 | 6 | Abstract Base Class: `Script` 7 | ============================= 8 | 9 | .. autoclass:: potnia.script.Script 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | 14 | 15 | Scripts Available 16 | ================== 17 | 18 | Linear A 19 | -------- 20 | 21 | .. autoclass:: potnia.scripts.linear_a.LinearA 22 | :members: 23 | :undoc-members: 24 | :inherited-members: 25 | 26 | 27 | Linear B 28 | -------- 29 | 30 | .. autoclass:: potnia.scripts.linear_b.LinearB 31 | :members: 32 | :undoc-members: 33 | :inherited-members: 34 | 35 | 36 | Arabic 37 | ------- 38 | 39 | .. autoclass:: potnia.scripts.arabic.Arabic 40 | :members: 41 | :undoc-members: 42 | :inherited-members: 43 | 44 | 45 | Hittite 46 | ------- 47 | 48 | .. 
autoclass:: potnia.scripts.hittite.Hittite 49 | :members: 50 | :undoc-members: 51 | :inherited-members: 52 | 53 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | on: 3 | push: 4 | tags: 5 | - 'v*.*.*' 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ["3.11"] 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Install poetry 16 | run: pipx install poetry 17 | - name: Initialise Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | cache: "poetry" 22 | - name: Install dependencies for Python ${{ matrix.python-version }} 23 | run: | 24 | poetry env use "${{ matrix.python-version }}" 25 | poetry install 26 | - name: Build library 27 | run: poetry build 28 | - name: Publish library 29 | env: 30 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 31 | run: | 32 | poetry config pypi-token.pypi $PYPI_TOKEN 33 | poetry publish 34 | -------------------------------------------------------------------------------- /tests/expected/arabic_unicode.yaml: -------------------------------------------------------------------------------- 1 | "al-kitāb" : "الكِتاب" 2 | "al-salām" : "السَلام" 3 | naʿam: "نَعَم" # Yes 4 | lā: "لا" # No 5 | ṣabāḥu al-ḫayr: "صَباحُ الخَير" # Good morning 6 | masāʾu al-ḫayr: "مَساءُ الخَير" # Good evening 7 | ṣadīq: "صَديق" # Friend (male) 8 | kayfa ḥāluka: "كَيفَ حالُكَ" # How are you (to a male) 9 | kayfa ḥāluki: "كَيفَ حالُكِ" # How are you (to a female) 10 | marḥaban: "مَرحَبًا" # Hello 11 | šukran: "شُكرًا" # Thank you 12 | ʿafwan: "عَفوًا" # You're welcome 13 | ʿindī suʾālun: "عِندي سؤالٌ" # I have a question 14 | mā ismuka: "ما اسمُكَ" # What is your name (to a male) 15 | mā ismuki: "ما اسمُكِ" # What is your name (to a female) 16 | ʾanā min: "أنا مِن" # I am from 17 | ʾayna al-ḥammāmu: "أينَ الحَمّامُ" # Where is the bathroom 18 | hal tatakallamu al-inglīziyya: "هَل تَتَكَلَّمُ الإنجليزِيَّ" # Do you speak English (to a male) 19 | hal tatakallamīna al-inglīziyya: "هَل تَتَكَلَّمينَ الإنجليزِيَّ" # Do you speak English (to a female) 20 | mā al-ʿamalu: "ما العَمَلُ" # What is the work 21 | anā atakallamu al-ʿarabiyya: "أنا أتَكَلَّمُ العَرَبِيَّ" # I speak Arabic 22 | ṣadīqatun: "صَديقَةٌ" # Friend (female) 23 | 24 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'potnia' 10 | copyright = '2024, Emily Tour, Kabir Manandhar Shrestha, Robert Turnbull' 11 | author = 'Emily Tour, Kabir Manandhar Shrestha, Robert Turnbull' 12 | release = '0.1.1' 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = [ 18 | "sphinx_rtd_theme", 19 | "myst_parser", 20 | "sphinx.ext.mathjax", 21 | "sphinx.ext.githubpages", 22 | "sphinx.ext.autodoc", 23 | "sphinx.ext.coverage", 24 | "sphinx.ext.napoleon", 25 | "sphinx_copybutton", 26 | "sphinx.ext.graphviz", 27 | ] 28 | 29 | templates_path = ['_templates'] 30 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 31 | 32 | 33 | 34 | # -- Options for HTML output ------------------------------------------------- 35 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 36 | 37 | html_theme = 'sphinx_rtd_theme' 38 | html_static_path = ['_static'] 39 | -------------------------------------------------------------------------------- /tests/test_linear_b.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from potnia import linear_b 3 | from .data import expected 4 | 5 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_unicode")) 6 | def test_linear_b_unicode(test_input, expected): 7 | result = linear_b(test_input) 8 | assert result == expected, f"Expected: linear_b('{test_input}') to produce '{expected}' but got '{result}'" 9 | 10 | 11 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_unicode_regularized")) 12 | def test_linear_b_unicode_regularized(test_input, expected): 13 | result = linear_b(test_input, regularize=True) 14 | assert result == expected, f"Expected: linear_b('{test_input}', regularize=True) to produce '{expected}' but got '{result}'" 15 | 16 | 17 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_transliteration")) 18 | def test_linear_b_transliteration(test_input, expected): 19 | result = linear_b.to_transliteration(test_input) 20 | assert result == expected, f"Expected: linear_b.to_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 21 | 22 | 23 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_tokenize_transliteration")) 24 | def test_tokenize_transliteration_linear_b(test_input, expected): 25 | result = linear_b.tokenize_transliteration(test_input) 26 | assert result == expected, f"Expected: linear_b.tokenize_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 27 | -------------------------------------------------------------------------------- /tests/test_linear_a.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from potnia import linear_a 3 | from .data import expected 4 | 5 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_unicode")) 6 | def test_linear_a_unicode(test_input, expected): 7 | result = linear_a(test_input) 8 | assert result == expected, f"Expected: linear_a('{test_input}') to produce '{expected}' but got '{result}'" 9 | 
10 | 11 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_transliteration")) 12 | def test_linear_a_transliteration(test_input, expected): 13 | result = linear_a.to_transliteration(test_input) 14 | assert result == expected, f"Expected: linear_a.to_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 15 | 16 | 17 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_unicode_regularized")) 18 | def test_linear_a_unicode_regularized(test_input, expected): 19 | result = linear_a(test_input, regularize=True) 20 | assert result == expected, f"Expected: linear_a('{test_input}', regularize=True) to produce '{expected}' but got '{result}'" 21 | 22 | 23 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_tokenize_transliteration")) 24 | def test_tokenize_transliteration_linear_a(test_input, expected): 25 | result = linear_a.tokenize_transliteration(test_input) 26 | assert result == expected, f"Expected: linear_a.tokenize_transliteration('{test_input}') to produce '{expected}' but got '{result}'" 27 | 28 | 29 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "potnia" 3 | version = "0.4.1" 4 | description = "Potnia is an open-source Python library designed to convert Romanized transliterations of ancient texts into Unicode representations of their respective native scripts." 5 | authors = ["Emily Tour ", "Kabir Manandhar Shrestha", "Robert Turnbull"] 6 | license = "Apache-2.0" 7 | readme = "README.rst" 8 | repository = "https://github.com/AncientNLP/potnia/" 9 | documentation = "https://AncientNLP.github.io/potnia" 10 | homepage = "https://github.com/AncientNLP/potnia/" 11 | keywords = ["linear a", "linear b", "sumerian", "akkadian", "cuneiform", "unicode"] 12 | # For classifiers see https://pypi.org/classifiers/ 13 | classifiers = [ 14 | "License :: OSI Approved :: Apache Software License", 15 | "Intended Audience :: Science/Research", 16 | "Topic :: Software Development :: Libraries :: Python Modules", 17 | ] 18 | 19 | [tool.poetry.dependencies] 20 | python = ">=3.10,<3.13" 21 | pyyaml = "^6.0.1" 22 | typer = "^0.12.5" 23 | guigaga = ">=0.0.5" 24 | numpy = "<2" 25 | pybtex = ">=0.24.0" 26 | pybtexnbib = ">=0.1.1" 27 | setuptools = "^75.8.0" 28 | 29 | 30 | [tool.poetry.group.dev.dependencies] 31 | pytest = "^7.4.4" 32 | coverage = ">=7.4.3" 33 | Sphinx = ">=5.0.0" 34 | sphinx-rtd-theme = ">=1.0.0" 35 | sphinx-autobuild = ">=2021.3.14" 36 | sphinx-copybutton = ">=0.4.0" 37 | myst-parser = "^3.0.1" 38 | 39 | 40 | [build-system] 41 | requires = ["poetry-core"] 42 | build-backend = "poetry.core.masonry.api" 43 | 44 | [tool.pytest.ini_options] 45 | filterwarnings = ["ignore::DeprecationWarning"] 46 | 47 | [tool.poetry.scripts] 48 | potnia = "potnia.main:app" -------------------------------------------------------------------------------- /potnia/scripts/linear_b.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | from ..script import Script 4 | 5 | @dataclass 6 | class LinearB(Script): 7 | """ 8 | Class for handling text transliteration and unicode conversion for Linear B.
9 | 10 | To use the singleton instance, import like so: 11 | ``from potnia import linear_b`` 12 | 13 | Designed especially for texts from DĀMOS (Database of Mycenaean at Oslo): https://damos.hf.uio.no/ 14 | and LiBER (Linear B Electronic Resources): https://liber.cnr.it/ 15 | 16 | Attributes: 17 | config (str): Path to the configuration file or configuration data in string format. 18 | By default, it uses the 'linear_b.yaml' file in the 'data' directory. 19 | """ 20 | config:str = "linear_b" 21 | 22 | def regularize(self, text: str) -> str: 23 | """ 24 | Applies regularization rules to a given string. 25 | 26 | Args: 27 | text (str): Text string to be regularized. 28 | 29 | Returns: 30 | str: Regularized text string. 31 | """ 32 | text = super().regularize(text) 33 | 34 | # Ensure there are informative characters left in the text 35 | informative_chars = set(list(re.sub(r'[%\s]', "", text))) 36 | if len(informative_chars) == 0: 37 | return "" 38 | 39 | return text 40 | 41 | def tokenize_unicode(self, text:str) -> list[str]: 42 | """ 43 | Tokenizes a unicode string by splitting and joining words with dashes. 44 | 45 | Args: 46 | text (str): Input text in unicode format. 47 | 48 | Returns: 49 | list[str]: List of tokenized strings. 50 | """ 51 | words = ['-'.join(word) for word in text.split()] 52 | text = ' '.join(words) 53 | return list(text) 54 | 55 | 56 | linear_b = LinearB() -------------------------------------------------------------------------------- /tests/expected/linear_b_tokenize_transliteration.yaml: -------------------------------------------------------------------------------- 1 | "]ra-ma-na , / e-ne-ra MUL[": [']', 'ra', 'ma', 'na', ' ', ',', ' ', '/', ' ', 'e', 'ne', 'ra', ' ', 'MUL', '['] 2 | "] ko-wo / m\u0323e\u0323[-zo] 1 ko-wo / me-wi-jo 2 [": [']', ' ', 'ko', 'wo', ' ', '/', ' ', 'me', '[', 'zo', ']', ' ', '1', ' ', 'ko', 'wo', ' ', '/', ' ', 'me', 'wi', 'jo', ' ', '2', ' ', '['] 3 | "]wa VIR 1 MUL 2 'ko-wa 1' ko-wo 1": [']', 'wa', ' ', 'VIR', ' ', '1', ' ', 'MUL', ' ', '2', ' ', "'", 'ko', 'wa', ' ', '1', "'", ' ', 'ko', 'wo', ' ', '1'] 4 | "]qa-ra / re-me-to * 168 + SE 28": [']', 'qa', 'ra', ' ', '/', ' ', 're', 'me', 'to', ' ', '*168+SE', ' ', '28'] 5 | "da-pu₂-ri-to-jo , / po-ti-ni-ja 'me-ri' * 209 VAS 1": ['da', 'pu₂', 'ri', 'to', 'jo', ' ', ',', ' ', '/', ' ', 'po', 'ti', 'ni', 'ja', ' ', "'", 'me', 'ri', "'", ' ', '*209VAS', ' ', '1'] 6 | "po-*34-wi-do ⟦TUN⟧ BIG[": ['po', '*34', 'wi', 'do', ' ', '⟦', 'TUN', '⟧', ' ', 'BIG', '['] 7 | "inf . mut .": ['inf.', ' ', 'mut.'] 8 | "] vacat [": [']', ' ', 'vacat', ' ', '['] 9 | "] vest ., / su-ri-mo , u-ta-jo-jo , o OVIS m 85[\u00a0] vac .": [']', ' ', 'vest.', ',', ' ', '/', ' ', 'su', 'ri', 'mo', ' ', ',', ' ', 'u', 'ta', 'jo', 'jo', ' ', ',', ' ', 'o', ' ', 'OVISm', ' ', '85', '[', ' ', ']', ' ', 'vac.'] 10 | "su-ma-no / ti-ri-to [ vestigia? ] vacat": ['su', 'ma', 'no', ' ', '/', ' ', 'ti', 'ri', 'to', ' ', '[', ' ', 'vestigia', '?', ' ', ']', ' ', 'vacat'] 11 | "v.": ["v."] 12 | "l . s .": ['l.', ' ', 's.'] 13 | "l\u0323a\u0323t\u0323 . i\u0323n\u0323f\u0323 .": ['lat.', ' ', 'inf.'] 14 | "l .
i .": ["l."," ","i."] 15 | "] Graffito [": [']', ' ', 'Graffito', ' ', '['] 16 | "CAP f 130 SUS 17 SUS f 41 BOS m 2 BOS f 4": ['CAPf', ' ', '130', ' ', 'SUS', ' ', '17', ' ', 'SUSf', ' ', '41', ' ', 'BOSm', ' ', '2', ' ', 'BOSf', ' ', '4'] 17 | "i[-qi-ja?": ['i', '[', 'qi', 'ja', '?'] 18 | "]2 TELA 2 + PU 90": ["]","2"," ","TELA2+PU"," ", "90"] 19 | "]TELA 1 1 LANA 3": ["]", "TELA1", " ", "1", " ", "LANA"," ","3"] -------------------------------------------------------------------------------- /potnia/scripts/hittite.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from ..script import Script 3 | 4 | 5 | @dataclass 6 | class Hittite(Script): 7 | """ 8 | Class for handling text transliteration and unicode conversion to Hittite. 9 | 10 | To use the singleton instance, import like so: 11 | ``from potnia import hittite`` 12 | 13 | Designed especially for texts from the Catalog der Texte der Hethiter (CTH): https://www.hethport.uni-wuerzburg.de/CTH/index.php 14 | 15 | Attributes: 16 | config (str): Path to the configuration file or configuration data in string format. 17 | By default, it uses the 'hittite.yaml file in the 'data' directory. 18 | """ 19 | config:str = "hittite" 20 | 21 | def tokenize_transliteration(self, input_string:str) -> list[str]: 22 | """ 23 | Tokenizes transliterated text according to specific patterns. 24 | 25 | Args: 26 | text (str): Input text in transliterated format. 27 | 28 | Returns: 29 | list[str]: List of tokens 30 | """ 31 | tokens = [] 32 | token = "" 33 | i = 0 34 | 35 | while i < len(input_string): 36 | char = input_string[i] 37 | 38 | # Handle characters ']', '[', and ' ' 39 | if char in '[] ': 40 | if token: 41 | tokens.append(token) 42 | token = "" 43 | tokens.append(char) 44 | # Handle other characters 45 | elif char in ['-','‑']: 46 | if token: 47 | tokens.append(token) 48 | token = "" 49 | else: 50 | token += char 51 | i += 1 52 | 53 | # Add the last token if it exists 54 | if token: 55 | tokens.append(token) 56 | 57 | return tokens 58 | 59 | 60 | 61 | 62 | hittite = Hittite() -------------------------------------------------------------------------------- /tests/expected/errors.txt: -------------------------------------------------------------------------------- 1 | # ISSUE 1 2 | 3 | linear_b.tokenize_transliteration("]-o-pe-ro *209VAS 'ME<±RI>' 5 [") 4 | # We want to see MERI as a single token 5 | [']', 'o', 'pe', 'ro', ' ', '*209VAS', ' ', "'", 'ME', '<', '±RI', '>', "'", ' ', '5', ' ', '['] 6 | 7 | linear_b("]-o-pe-ro *209VAS 'ME<±RI>' 5 [", regularize=True) 8 | '%𐀃𐀟𐀫 𐃨 𐀕±RI 5 %' 9 | 10 | # ISSUE 2 11 | 12 | linear_b("]-i-to , / da-nwa ME±RI *209VAS+A 16 *172 8", regularize=True) 13 | '%𐀂𐀵 𐀅𐁅 𐂙 𐃨+𐀀 16 𐂴 8' but should get "%𐀂𐀵 𐀅𐁅 𐂙 𐃨+𐀀 16 𐂹 8" 14 | 15 | # ISSUE 3 16 | 17 | linear_b.tokenize_transliteration("pa-si-te-o-i / me-ri *209VAS 1 da-pu2-ri-to-jo , / po-ti-ni-ja 'me-ri' *209VAS 1") 18 | ['pa', 'si', 'te', 'o', 'i', ' ', '/', ' ', 'me', 'ri', ' ', '*209VAS', ' ', '1', ' ', 'da', 'pu2', 'ri', 'to', 'jo', ' ', ',', ' ', '/', ' ', 'po', 'ti', 'ni', 'ja', ' ', "'", 'me', 'ri', "'", ' ', '*209VAS', ' ', '1'] 19 | 20 | linear_b("pa-si-te-o-i / me-ri *209VAS 1 da-pu2-ri-to-jo , / po-ti-ni-ja 'me-ri' *209VAS 1", regular 21 | ize=True) 22 | '𐀞𐀯𐀳𐀃𐀂 𐀕𐀪 𐃨 1 𐀅pu2𐀪𐀵𐀍 𐀡𐀴𐀛𐀊 𐀕𐀪 𐃨 1' pu2 is not mapped? 
23 | 24 | # ISSUE 4 25 | 26 | linear_b("a-ka--[ ]--jo-jo , me-no-[ da-pu2-ri-[-to-jo ]-po-ti-ni-ja ri *166+WE 22-[", regularize=Tr 27 | ue) 28 | '𐀀𐀏% %𐀍𐀍 𐀕𐀜% 𐀅pu2𐀪%𐀵𐀍 %𐀡𐀴𐀛𐀊 𐀪 𐂮+𐀸 22%' Again pu2 not mapped? 29 | 30 | # ISSUE 5 31 | linear_b("a-ka--[ ]--jo-jo , me-no-[ da-pu2-ri-[-to-jo ]-po-ti-ni-ja ri *166+WE 22-[", regularize=Tr 32 | ue) 33 | '𐀀𐀏% %𐀍𐀍 𐀕𐀜% 𐀅pu2𐀪%𐀵𐀍 %𐀡𐀴𐀛𐀊 𐀪 𐂮+𐀸 22%' 34 | 35 | # ISSUE 6 36 | linear_b("] e-ko-so OVIS:m 100 LANA [ ]-da-ro , / X LANA [ lat. inf. ]-a3 [", regularize=True) 37 | '% 𐀁𐀒𐀰 𐂇 100 𐂝 % %𐀅𐀫 𐂝 % %a3 %' is a3 mapping broken? 38 | 39 | # ISSUE 7 40 | linear_b("fragmentum A fragmentum B vacat [ sup. mut. e-me-si-jo-jo-[ ] 3-[ pa-na-so GRA 100-[ ]-vac.-[ ta-ra-qo GRA [ inf. mut. ta-u-pa-du-we GRA-[ a-ro-ja-[ pu-na-so-[ inf. mut.", regularize=True) 41 | '𐀀 % 𐀁𐀕𐀯𐀍𐀍% % 3% 𐀞𐀙𐀰 𐂎 100% %% 𐀲𐀨𐀦 𐂎 % 𐀲𐀄𐀞𐀉𐀸 𐂎% 𐀀𐀫𐀊% 𐀢𐀙𐀰%' but should be "% % 𐀁𐀕𐀯𐀍𐀍% % 3% 𐀞𐀙𐀡 𐂎 100% %% 𐀲𐀨𐀦 𐂎 % 𐀲𐀄𐀞𐀉𐀸 𐂎% 𐀀𐀫𐀊% 𐀢𐀙𐀰%" 42 | 43 | # ISSUE 8 44 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | on: 4 | push: 5 | branches: main 6 | 7 | # Allows you to run this workflow manually from the Actions tab 8 | workflow_dispatch: 9 | 10 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 11 | permissions: 12 | contents: read 13 | pages: write 14 | id-token: write 15 | 16 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 17 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 18 | concurrency: 19 | group: "pages" 20 | cancel-in-progress: false 21 | 22 | jobs: 23 | build: 24 | environment: 25 | name: github-pages 26 | url: ${{ steps.deployment.outputs.page_url }} 27 | runs-on: ubuntu-latest 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | python-version: ["3.10"] 32 | steps: 33 | - uses: actions/checkout@v3 34 | - name: Install poetry 35 | run: pipx install poetry 36 | - name: Install dependencies for Python ${{ matrix.python-version }} 37 | uses: actions/setup-python@v3 38 | with: 39 | python-version: ${{ matrix.python-version }} 40 | cache: 'poetry' 41 | - run: | 42 | poetry env use "${{ matrix.python-version }}" 43 | poetry install 44 | - name: Docs 45 | run: | 46 | poetry run sphinx-build -b html docs gh-pages 47 | - name: Coverage 48 | run: | 49 | poetry run coverage run -m pytest 50 | echo "COVERAGE=$(poetry run coverage report --precision 2 | grep TOTAL | tr -s ' ' | cut -f 4 -d " ")" >> $GITHUB_ENV 51 | poetry run coverage html --directory gh-pages/coverage 52 | - name: Setup Pages 53 | uses: actions/configure-pages@v3 54 | - name: Upload artifact 55 | uses: actions/upload-pages-artifact@v3 56 | with: 57 | path: 'gh-pages' 58 | - name: Deploy to GitHub Pages 59 | id: deployment 60 | uses: actions/deploy-pages@v4 61 | - name: Create Coverage Badge 62 | uses: schneegans/dynamic-badges-action@v1.1.0 63 | with: 64 | auth: ${{ secrets.GIST_SECRET }} 65 | gistID: e640f26fb59e39e3051de8fbf020de62 66 | filename: coverage-badge.json 67 | label: coverage 68 | message: ${{ env.COVERAGE }} 69 | color: green 70 | -------------------------------------------------------------------------------- /potnia/scripts/arabic.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | from ..script import Script 4 | 5 | @dataclass 6 | class 
Arabic(Script): 7 | """ 8 | Class for handling text transliteration and unicode conversion to Arabic. 9 | 10 | To use the singleton instance, import like so: 11 | ``from potnia import arabic`` 12 | 13 | Uses the DIN 31635 standard for Arabic transliteration. 14 | 15 | If you need the Tim Buckwalter transliteration system, then use the PyArabic library. 16 | 17 | Attributes: 18 | config (str): Path to the configuration file or configuration data in string format. 19 | By default, it uses the 'arabic.yaml' file in the 'data' directory. 20 | """ 21 | config:str = "arabic" 22 | 23 | def to_unicode(self, text:str, regularize:bool=False) -> str: 24 | """ 25 | Converts transliterated text to unicode format. 26 | 27 | Args: 28 | text (str): Input text in transliterated format. 29 | regularize (bool, optional): Whether to apply regularization. Defaults to False. 30 | 31 | Returns: 32 | str: Text converted to unicode format, optionally regularized. 33 | """ 34 | # if word ends with 'atun' then make it damataan with taa marbuta 35 | text = re.sub(r'(\w\w)atun\b', r'\1'+'َ\u0629\u064C', text) 36 | # if word has uʾ then make it a hamza on top of waw 37 | text = re.sub(r'uʾ', '\u0624', text) 38 | # if word ends with 'un' then make it damataan 39 | text = re.sub(r'(\w\w)un\b', r'\1'+'\u064C', text) 40 | # if word ends with 'in' then make it kasrataan 41 | text = re.sub(r'(\w\w)in\b', r'\1'+'\u064D', text) 42 | # if word ends with 'an' then make it fatatan 43 | text = re.sub(r'(\w\w)an\b', r'\1'+'\u064Bا', text) 44 | # if word starts with 'i' or 'a' then make it an alif with hamza 45 | text = re.sub(r'\b[i]', 'إ', text) 46 | text = re.sub(r'-[i]', "-إ", text) 47 | text = re.sub(r'\b[a]', 'أ', text) 48 | text = re.sub(r'-[a]', "-أ", text) 49 | 50 | text = re.sub(r'\bʾa', 'أ', text) 51 | 52 | # definite article 53 | text = re.sub(r'أl-', "ال", text) 54 | 55 | text = super().to_unicode(text, regularize) 56 | 57 | # fix the word 'اسم' if it is written as 'إسم' 58 | text = re.sub(r"إسم", "اسم", text) 59 | 60 | arabic_consonants_with_shadda = [ 61 | 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 62 | 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 63 | 'ه', 'و', 'ي' 64 | ] 65 | for consonant in arabic_consonants_with_shadda: 66 | text = re.sub(f'{consonant}{consonant}', f'{consonant}\u0651', text) 67 | 68 | return text 69 | 70 | 71 | arabic = Arabic() -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from typer.testing import CliRunner 4 | from potnia.main import app 5 | from unittest.mock import patch 6 | from .data import expected 7 | 8 | runner = CliRunner() 9 | 10 | 11 | @pytest.mark.parametrize("test_input,expected", expected("linear_a_unicode")) 12 | def test_linear_a_main(test_input, expected): 13 | result = runner.invoke(app, ["linear-a", test_input]) 14 | assert expected in result.stdout 15 | 16 | 17 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_unicode")) 18 | def test_linear_b_main(test_input, expected): 19 | result = runner.invoke(app, ["linear-b", test_input]) 20 | assert expected in result.stdout 21 | 22 | 23 | @pytest.mark.parametrize("test_input,expected", expected("linear_b_unicode_regularized")) 24 | def test_linear_b_main_regularized(test_input, expected): 25 | result = runner.invoke(app, ["linear-b", test_input, "--regularize"]) 26 | assert expected in result.stdout 27 | 28 | 29 |
@pytest.mark.parametrize("test_input,expected", expected("hittite_unicode")) 30 | def test_hittite_main(test_input, expected): 31 | result = runner.invoke(app, ["hittite", test_input]) 32 | assert expected in result.stdout 33 | 34 | 35 | # @pytest.mark.parametrize("test_input,expected", expected("luwian_unicode")) 36 | # def test_luwian_main(test_input, expected): 37 | # result = runner.invoke(app, ["luwian", test_input]) 38 | # assert expected in result.stdout 39 | 40 | 41 | @pytest.mark.parametrize("test_input,expected", expected("arabic_unicode")) 42 | def test_arabic_main(test_input, expected): 43 | result = runner.invoke(app, ["arabic", test_input]) 44 | assert expected in result.stdout 45 | 46 | 47 | def test_bibtex(): 48 | result = runner.invoke(app, ["bibtex"]) 49 | assert "Journal of Open Source Software" in result.stdout 50 | assert "10.21105/joss.07725" in result.stdout 51 | assert "2025" in result.stdout 52 | 53 | 54 | def test_bibliography(): 55 | result = runner.invoke(app, ["bibliography"]) 56 | assert "Emily Tour" in result.stdout 57 | assert "Kabir" in result.stdout 58 | assert "Turnbull" in result.stdout 59 | assert "2025" in result.stdout 60 | 61 | 62 | def test_gui_launch(): 63 | # Mock the GUIGAGA class and its launch method to avoid actually launching the GUI during the test 64 | with patch("guigaga.guigaga.GUIGAGA") as MockGUIGAGA: 65 | # Create a mock instance for the GUIGAGA class 66 | mock_gui = MockGUIGAGA.return_value 67 | mock_gui.launch.return_value = None 68 | 69 | # Run the CLI command 70 | result = runner.invoke(app, ["gui"]) 71 | 72 | # Assert that the command ran successfully (exit code 0) 73 | assert result.exit_code == 0 74 | 75 | # Assert that the GUIGAGA instance was created and launch was called 76 | MockGUIGAGA.assert_called_once() 77 | mock_gui.launch.assert_called_once() -------------------------------------------------------------------------------- /docs/additions.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | Adding New Scripts to Potnia 3 | ============================ 4 | 5 | Potnia allows for the easy integration of new ancient scripts by using a single YAML file per script. This file will contain the mappings for syllabograms, logograms (if applicable), transliteration rules, and regularization patterns. Below are the steps for adding a new script, along with examples. 6 | 7 | Steps to Add a New Script 8 | ---------------------------- 9 | 10 | 1. **Create a Single YAML Mapping and Rules File**: Define the mappings for syllabograms, logograms (if applicable), and the rules for transliteration and regularization. Here's an example for Linear B: 11 | 12 | .. code-block:: yaml 13 | 14 | mappings: 15 | a: 𐀀 16 | e: 𐀁 17 | i: 𐀂 18 | # logograms 19 | VIR: 𐂀 # man 20 | MUL: 𐂁 # woman 21 | transliteration: 22 | - ['ro2', '𐁊'] 23 | regularization: 24 | - ['\\[•~\\]', ''] # Remove uncertain readings 25 | - ['\\bqs\\b', '%'] # Handle missing elements 26 | 27 | 2. **Add the New Script Class**: Create a `Script` class that points to the new YAML file (usually in the `scripts` directory). For example: 28 | 29 | .. code-block:: python 30 | 31 | from dataclasses import dataclass 32 | from ..script import Script 33 | 34 | @dataclass 35 | class NewScript(Script): 36 | config: str = "new_script" # Refers to the YAML file name 37 | 38 | new_script = NewScript() 39 | 40 | 3. **Add to __init__.py**: Add the new script to the ``__init__.py`` file. For example: 41 | 42 | ..
code-block:: python 43 | 44 | from .scripts.new_script import new_script, NewScript 45 | 46 | 4. **Write Test Cases**: Add test cases to ensure that the new script's transliteration and Unicode mapping work as expected. Example: 47 | 48 | .. code-block:: yaml 49 | 50 | test_newscript_unicode.yaml: 51 | "a-e-i": "𐀀𐀁𐀂" 52 | "VIR MUL": "𐂀𐂁" 53 | 54 | Then, write a test function to check the output of the new script: 55 | 56 | .. code-block:: python 57 | 58 | @pytest.mark.parametrize("test_input,expected", expected("test_newscript_unicode")) 59 | def test_test_newscript_unicode(test_input, expected): 60 | result = new_script(test_input) 61 | assert result == expected, f"Expected: new_script('{test_input}') to produce '{expected}' but got '{result}'" 62 | 63 | 64 | 5. **Usage Example**: Once the new script is added, it can be used as follows: 65 | 66 | .. code-block:: python 67 | 68 | from potnia import new_script 69 | 70 | # Convert transliterated text to Unicode 71 | new_script("a-e-i") 72 | 73 | # Regularize text 74 | new_script("a-[•~]", regularize=True) 75 | 76 | This approach centralizes all configuration for a given script into a single YAML file, simplifying the process of adding new scripts while maintaining Potnia's flexible and modular design. -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Contributing 3 | ======================= 4 | 5 | These practices are subject to change based on the decisions of the team. 6 | 7 | - Use clear and explicit variable names. The variable names are typically more verbose than those in fastai. 8 | - Python code should be formatted using black with the settings in pyproject.toml. The maximum line length is 120 characters. 9 | - Contributions should be committed to a new branch and will be merged with main only after tests and documentation are complete. 10 | 11 | Installation 12 | ================== 13 | 14 | To install Potnia for development, run the following command: 15 | 16 | .. code-block:: bash 17 | 18 | git clone https://github.com/AncientNLP/potnia.git 19 | cd potnia 20 | 21 | Make sure that poetry is installed on your system. If not, see the `instructions `_. 22 | 23 | Then install the dependencies using poetry: 24 | 25 | .. code-block:: bash 26 | 27 | poetry install 28 | 29 | 30 | Testing 31 | ================== 32 | 33 | - All tests must be passing before merging with the ``main`` branch. 34 | - Tests are automatically included in the CI/CD pipeline using Github actions. 35 | 36 | Git Commits 37 | =========== 38 | 39 | We use the `git3moji `_ standard for expressive git commit messages. 40 | Use one of the following five short emojis at the start of your git commit messages: 41 | 42 | - ``:zap:`` ⚡️ – Features and primary concerns 43 | - ``:bug:`` 🐛 – Bugs and fixes 44 | - ``:tv:`` 📺 – CI, tooling, and configuration 45 | - ``:cop:`` 👮 – Tests and linting 46 | - ``:abc:`` 🔤 – Documentation 47 | 48 | As far as possible, please keep your git commits granular and focussed on one thing at a time. 49 | Please cite the number of a Github issue if it relates to your commit.
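For example, a granular documentation-only commit might read ``:abc: fix typo in fonts guide`` (a hypothetical message, shown only to illustrate the format).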
50 | 51 | Documentation 52 | ================== 53 | 54 | - Docstrings for Python functions should use the Google docstring convention (https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) 55 | - Documentation is generated using Sphinx and automatically deployed as part of the CI/CD pipeline. 56 | - Docs should be written in reStructuredText. 57 | 58 | Files need to start with a heading for the section. The convention used here is to use the equals sign above and below the heading:: 59 | 60 |     =============== 61 |     Section Heading 62 |     =============== 63 | 64 | Subsections also use an equals sign but just below the heading:: 65 | 66 |     Subsection Heading 67 |     ================== 68 | 69 | Subsubsections have a single dash below the heading:: 70 | 71 |     Subsubsection Heading 72 |     --------------------- 73 | 74 | Try not to have any other sections within this, but if it is necessary, use tildes below the heading:: 75 | 76 |     Further Subsection Headings 77 |     ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 78 | 79 | Other information for using reStructuredText in Sphinx can be found here: https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-primer and https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html. 80 | 81 | Code of Conduct 82 | ================== 83 | 84 | We follow the `Contributor Covenant Code of Conduct `_ 85 | for all community contributions. -------------------------------------------------------------------------------- /potnia/scripts/linear_a.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | from ..script import Script 4 | 5 | @dataclass 6 | class LinearA(Script): 7 |     """ 8 |     Class for handling text transliteration and unicode conversion for Linear A. 9 | 10 |     To use the singleton instance, import like so: 11 |     ``from potnia import linear_a`` 12 | 13 |     Attributes: 14 |         config (str): Path to the configuration file or configuration data in string format. 15 |             By default, it uses the 'linear_a.yaml' file in the 'data' directory. 16 |     """ 17 |     config:str = "linear_a.yaml" 18 | 19 |     def tokenize_transliteration(self, input_string: str) -> list[str]: 20 |         """ 21 |         Tokenizes transliterated text according to specific patterns. 22 | 23 |         Args: 24 |             input_string (str): Input text in transliterated format. 25 | 26 |         Returns: 27 |             list[str]: List of tokens 28 |         """ 29 |         tokens = [] 30 |         token = "" 31 |         i = 0 32 | 33 |         while i < len(input_string): 34 |             char = input_string[i] 35 | 36 |             # Check for special sequences like "[?]" and "[unclassified]" 37 |             if char == '[': 38 |                 if input_string[i:i + 3] == '[?]': 39 |                     if token: 40 |                         tokens.append(token) 41 |                     tokens.append("[?]") 42 |                     token = "" 43 |                     i += 3 # Skip past "[?]" 44 |                     continue 45 |                 elif input_string[i:i + 14] == '[unclassified]': 46 |                     if token: 47 |                         tokens.append(token) 48 |                     tokens.append("[unclassified]") 49 |                     token = "" 50 |                     i += 14 # Skip past "[unclassified]" 51 |                     continue 52 | 53 |             # Handle characters ']', '[', and ' ' 54 |             if char in '[] ': 55 |                 if token: 56 |                     tokens.append(token) 57 |                     token = "" 58 |                 tokens.append(char) 59 |             # Handle hyphens, which separate signs but are not kept as tokens 60 |             elif char == '-': 61 |                 if token: 62 |                     tokens.append(token) 63 |                     token = "" 64 |             # Accumulate any other character into the current token 65 |             else: 66 |                 token += char 67 |             i += 1 68 | 69 |         # Add the last token if it exists 70 |         if token: 71 |             tokens.append(token) 72 | 73 |         return tokens 74 | 75 |     def tokenize_unicode(self, text:str) -> list[str]: 76 |         """ 77 |         Tokenizes a unicode string by splitting and joining words with dashes. 
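        For example (illustrative): two adjacent Linear A signs "𐘇𐘈" are tokenized as ["𐘇", "-", "𐘈"].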
77 | 78 | Args: 79 | text (str): Input text in unicode format. 80 | 81 | Returns: 82 | list[str]: List of tokenized strings. 83 | """ 84 | def is_aegean(char): 85 | return "\U00010000" <= char <= "\U0001007F" or "\U00010600" <= char <= "\U0001077F" 86 | 87 | # Insert hyphens between consecutive Linear B characters 88 | modified_text = "" 89 | prev_was_aegean = False 90 | 91 | for char in text: 92 | if is_aegean(char): 93 | if prev_was_aegean: 94 | modified_text += "-" # Add hyphen if previous character was also Linear B 95 | modified_text += char 96 | prev_was_aegean = True 97 | else: 98 | modified_text += char 99 | prev_was_aegean = False # Reset flag on encountering a non-Linear B character 100 | 101 | return list(modified_text) 102 | 103 | 104 | linear_a = LinearA() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | docshtml/ 163 | paper.pdf 164 | jats/ 165 | paper.preprint.tex 166 | paper.html -------------------------------------------------------------------------------- /potnia/main.py: -------------------------------------------------------------------------------- 1 | import typer 2 | from pybtex import PybtexEngine 3 | from potnia import linear_a as linear_a_script 4 | from potnia import linear_b as linear_b_script 5 | from potnia import hittite as hittite_script 6 | from potnia import arabic as arabic_script 7 | # from potnia import luwian as luwian_script 8 | # from potnia import akkadian as akkadian_script 9 | from rich.console import Console 10 | 11 | from .enums import BibliographyStyle, BibliographyFormat 12 | from .data import DATA_DIR 13 | 14 | BIBTEX_PATH = DATA_DIR / "potnia.bib" 15 | 16 | app = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]}) 17 | 18 | TEXT_ARGUMENT = typer.Argument(help="The transliterated text to be converted to Unicode.") 19 | REGULARIZATION_DEFAULT = typer.Option(False, help="Whether or not to regularize the output.") 20 | 21 | 22 | @app.command() 23 | def linear_a(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 24 | """ Converts a Linear A text to Unicode. """ 25 | if isinstance(text, list): 26 | text = " ".join(text) 27 | print(linear_a_script(text, regularize=regularize)) 28 | 29 | 30 | @app.command() 31 | def linear_b(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 32 | """ Converts a Linear B text to Unicode. """ 33 | if isinstance(text, list): 34 | text = " ".join(text) 35 | print(linear_b_script(text, regularize=regularize)) 36 | 37 | 38 | @app.command() 39 | def hittite(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 40 | """ Converts a Hittite text to Unicode. """ 41 | if isinstance(text, list): 42 | text = " ".join(text) 43 | print(hittite_script(text, regularize=regularize)) 44 | 45 | 46 | # @app.command() 47 | # def luwian(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 48 | # """ Converts a Luwian text to Unicode. 
""" 49 | # if isinstance(text, list): 50 | # text = " ".join(text) 51 | # print(luwian_script(text, regularize=regularize)) 52 | 53 | 54 | # @app.command() 55 | # def akkadian(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 56 | # """ Converts a Akkadian text to Unicode. """ 57 | # if isinstance(text, list): 58 | # text = " ".join(text) 59 | # print(akkadian_script(text, regularize=regularize)) 60 | 61 | 62 | @app.command() 63 | def arabic(text: list[str]=TEXT_ARGUMENT, regularize:bool=REGULARIZATION_DEFAULT): 64 | """ Converts a Arabic text to Unicode. """ 65 | if isinstance(text, list): 66 | text = " ".join(text) 67 | print(arabic_script(text, regularize=regularize)) 68 | 69 | 70 | @app.command() 71 | def bibtex(): 72 | """ Prints the BibTeX entry for this software package. """ 73 | bibtex_str = BIBTEX_PATH.read_text() 74 | print(bibtex_str) 75 | 76 | 77 | @app.command() 78 | def bibliography( 79 | style:BibliographyStyle="plain", 80 | output:BibliographyFormat="plaintext", 81 | ): 82 | """ Displays the bibliography. """ 83 | engine = PybtexEngine() 84 | bibliography_string = engine.format_from_files( 85 | bib_files_or_filenames=[BIBTEX_PATH], 86 | style=str(style), 87 | output_backend=str(output), 88 | ) 89 | print(bibliography_string) 90 | 91 | 92 | @app.callback() 93 | def potnia(): 94 | """ 95 | Potnia Logo 96 | 97 | Potnia is an open-source Python library designed to convert Romanized transliterations of ancient texts into Unicode representations of ther respective native scripts. 98 | """ 99 | 100 | 101 | @app.command() 102 | def gui(ctx: typer.Context, share:bool=False): 103 | """ Launches the Potnia GUI. """ 104 | import gradio as gr 105 | from guigaga.guigaga import GUIGAGA 106 | theme = gr.themes.Soft( 107 | primary_hue="rose", 108 | secondary_hue="pink", 109 | text_size="lg", 110 | ) 111 | gui = GUIGAGA( 112 | typer.main.get_group(app), 113 | click_context=ctx, 114 | theme=theme, 115 | allow_file_download=False, 116 | ) 117 | gui.launch(launch_kwargs={"share": share}) 118 | 119 | 120 | @app.command() 121 | def bibtex(): 122 | """ Prints the BibTeX entry for this software package. """ 123 | bibtex_str = BIBTEX_PATH.read_text() 124 | print(bibtex_str) 125 | -------------------------------------------------------------------------------- /docs/linear_b.md: -------------------------------------------------------------------------------- 1 | # Linear B Conversion Rules 2 | 3 | This document outlines the rules used in the conversion process for Linear B texts. The process involves tokenization, regularization, and handling of special patterns to prepare the text for further analysis. 4 | 5 | ## 1. Tokenization Rules 6 | 7 | Tokenization is the process of breaking down the text into individual elements or tokens. For Linear B, this involves handling various special cases and patterns. 8 | 9 | ### a) Space Normalization 10 | 11 | Replaces non-breaking spaces and certain diacritic marks with regular spaces and empty strings respectively, cleaning up the text for uniform processing. 12 | 13 | - Replace non-breaking spaces (`\u00a0`) with regular spaces. 14 | - Remove combining dot below (`\u0323`) to simplify character representation. 
15 | 16 | ### b) Special Pattern Handling 17 | 18 | | Pattern | Regex | Description | 19 | |---------|-------|-------------| 20 | | Combine terms with 'm' or 'f' | `r'\b({})\s([mf])\b'.format('\|'.join(['BOS', 'SUS', 'OVIS', 'CAP', 'EQU']))` to `r'\1\2'`| Combines terms like 'BOS', 'SUS', 'OVIS', 'CAP', 'EQU' with following 'm' or 'f' to form a single token, facilitating cleaner tokenization. | 21 | | Add hyphen after ']' | `r'\](?=[^\s])'` to ` r']-'` | Adds a hyphen right after ']' when it is followed by a non-space character, maintaining syntax integrity in tokenization. | 22 | | Add hyphen before '[' | `r'(?<=[^\s])\['` to `r'-['` | Inserts a hyphen right before '[' when it is preceded by a non-space character, ensuring consistent formatting for special handling. | 23 | | TELA Number Combination | `r"TELA\s+(\d+)"` to `r'TELA\1'` | Combines the term "TELA" with following numbers without spaces. | 24 | | Combine '*' with numeral | `r'\* (\d+)'` to `r'*\1'` | Directly attaches '*' to the following numeral without a space, aiding in recognizing these combinations as distinct tokens. | 25 | | Combine '+' with ideograms | `r'\+ ([^\s]+)'` and `r'([^\s]) \+'` | Merges '+' with adjacent ideograms without space, preserving semantic units in tokenization. | 26 | | Attach 'VAS' | `r'([^\s]+) VAS'` to `r'\1VAS'` | Attaches 'VAS' directly to the preceding term without space, ensuring that it is processed as a single token. | 27 | | Handle abbreviations | `*[(rf'\b{term}\s?\.', term + '.') for term in ['vac', 'vest', 'l', 's', 'lat', 'inf', 'mut', 'sup', 'i']]` | Ensures common abbreviations (like 'vac', 'inf', etc.) are correctly punctuated with a period if missing, standardizing text format. | 28 | 29 | Iterates over each pattern-replacement pair, applying them sequentially to the text to ensure all intended formatting and corrections are made. 30 | 31 | ### c) Space Handling : 32 | 33 | Uses a placeholder character to temporarily replace spaces, facilitating token splitting based on special characters and preserved spaces. 34 | 35 | ```python 36 | space_placeholder = "\uE000" # Placeholder for spaces 37 | text = re.sub(r' ', space_placeholder, text) 38 | ``` 39 | 40 | ### d) Tokenization with Space Placeholder 41 | 42 | Splits the text based on special characters and the space placeholder, ensuring that meaningful elements like brackets, commas, and quotation marks are preserved as separate tokens. 43 | 44 | ```python 45 | special_chars_pattern = r'(\[|\]|\,|\'|\u27e6|\u27e7|-|\?|\u2e24|\u2e25|' + re.escape(space_placeholder) + ')' 46 | tokens = re.split(special_chars_pattern, text) 47 | ``` 48 | 49 | ### e) Final Tokenization 50 | 51 | Replace the placeholder with actual spaces and filter empty tokens: 52 | 53 | ```python 54 | tokenized = [tok if tok != space_placeholder else " " for tok in tokens if tok and tok != "-"] 55 | ``` 56 | 57 | These rules form the core of the Linear B conversion process, handling various special cases in the transliteration, tokenization, and regularization of the text. The process aims to preserve important linguistic features while standardizing the format for further processing or analysis. This standardization is crucial for consistent treatment of texts across different sources and editions. 58 | 59 | ## 2. Regularization Rules: 60 | 61 | This list of regular expressions identifies various patterns in the text that should be tokenised as is in the previous step, but then either removed or handled as a special case during subsequent regularization. 
62 | 63 | - `lat., l., inf., i., sup., s., dex., mut, verso, v., v.→, v.↓, Graffito, vacat, vac., deest, α, β, γ, supra sigillum, reliqua pars sine regulis`: Various annotations related to epigraphic features of the document, which should be removed at this step. 64 | - `fragmentum, qs, vestigia, vest.` and `][•`: Various annotations or specific punctuation denoting undetermined text parts, which should be handled as wildcards at this step (i.e. converted to `%`). 65 | - `/,'?⸤⸥<>`: Specific punctuation and bracket types, which should be removed at this step. 66 | - `⟦.*?⟧`: Matches text within these special double brackets, which indicate text erasures. Both punctuation and included text should be removed at this step. 67 | 68 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 |   and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 |   overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 |   advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 |   address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 |   professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement by email. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series 85 | of actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or 92 | permanent ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within 112 | the community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.0, available at 118 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 119 | 120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 121 | enforcement ladder](https://github.com/mozilla/diversity). 122 | 123 | [homepage]: https://www.contributor-covenant.org 124 | 125 | For answers to common questions about this code of conduct, see the FAQ at 126 | https://www.contributor-covenant.org/faq. Translations are available at 127 | https://www.contributor-covenant.org/translations. 
-------------------------------------------------------------------------------- /potnia/script.py: -------------------------------------------------------------------------------- 1 | import re 2 | from functools import reduce 3 | from pathlib import Path 4 | from dataclasses import dataclass 5 | 6 | from .data import read_data 7 | 8 | @dataclass 9 | class Script(): 10 | """ 11 | The abstract base class for handling text transliteration and unicode conversion. 12 | 13 | Attributes: 14 | config (str): Path to the configuration file or configuration data in YAML format. 15 | """ 16 | config:str 17 | 18 | def __post_init__(self): 19 | """Initializes configuration and sets up mappings, patterns, and regularization rules.""" 20 | 21 | if isinstance(self.config, (Path,str)): 22 | self.config = read_data(self.config) 23 | assert self.config, f"Configuration not found" 24 | 25 | self.transliteration_to_unicode_dict = self.config.get('mappings', {}) 26 | self.unicode_to_transliteration_dict = {} 27 | for k, v in self.transliteration_to_unicode_dict.items(): 28 | if v not in self.unicode_to_transliteration_dict: 29 | self.unicode_to_transliteration_dict[v] = k 30 | 31 | # Load patterns to ignore 32 | patterns_to_ignore = self.config.get('patterns_to_ignore', []) 33 | self.regex_to_ignore = [re.compile(pattern) for pattern in patterns_to_ignore] 34 | 35 | # Load regularization rules 36 | self.regularization_regex = [ 37 | (re.compile(re.sub(r'\\\\', r'\\', pattern)), replacement) 38 | for pattern, replacement in self.config.get('regularization', []) 39 | ] 40 | 41 | # Load transliteration rules 42 | self.transliteration_patterns = [ 43 | (re.compile(pattern),replacement) 44 | for pattern, replacement in self.config.get('tokenization', []) 45 | ] 46 | self.complex_symbols = self.config.get('complex_symbols', {}) 47 | self.special_chars_pattern = re.compile(self.config.get('special_chars_pattern', '')) 48 | self.restore_patterns = [ 49 | (re.compile(pattern),replacement) 50 | for pattern, replacement in self.config.get('restore_patterns', []) 51 | ] 52 | 53 | # Reverse the complex_symbols dictionary 54 | self.reversed_symbols = {v: k for k, v in self.complex_symbols.items()} 55 | 56 | def tokenize_unicode(self, text:str) -> list[str]: 57 | """ 58 | Tokenizes unicode text according to specific patterns. 59 | 60 | By default, it tokenizes each character as a separate token. 61 | This method can be overridden in subclasses to provide more complex tokenization. 62 | 63 | Args: 64 | text (str): Input text in unicode format. 65 | 66 | Returns: 67 | list[str]: List of tokens 68 | """ 69 | return list(text) 70 | 71 | def tokenize_transliteration(self, text:str) -> list[str]: 72 | """ 73 | Tokenizes transliterated text according to specific patterns. 74 | 75 | Args: 76 | text (str): Input text in transliterated format. 
77 | 78 | Returns: 79 | list[str]: List of tokens 80 | """ 81 | # Replace complex symbols with placeholders 82 | for symbol, placeholder in self.complex_symbols.items(): 83 | text = text.replace(symbol, placeholder) 84 | 85 | # Apply each pattern replacement in order 86 | for pattern, replacement in self.transliteration_patterns: 87 | text = pattern.sub(replacement, text) 88 | 89 | # Handle space replacement with a placeholder 90 | space_placeholder = "\uE000" # Placeholder for spaces 91 | text = text.replace(" ", space_placeholder) 92 | 93 | # Tokenize using the special characters pattern 94 | tokens = self.special_chars_pattern.split(text) 95 | 96 | # Apply processing to each token and filter out empty tokens 97 | tokenized = [ 98 | " " if tok == space_placeholder else 99 | reduce(lambda t, p: p[0].sub(p[1], t), self.restore_patterns, tok) 100 | for tok in tokens if tok and tok != "-" 101 | ] 102 | 103 | # Restore complex symbols using the reversed dictionary 104 | for placeholder, symbol in self.reversed_symbols.items(): 105 | tokenized = [tok.replace(placeholder, symbol) for tok in tokenized] 106 | 107 | return tokenized if tokenized else [""] 108 | 109 | def to_transliteration(self, text:str) -> str: 110 | """ 111 | Converts unicode text to transliteration format. 112 | 113 | NB. This function may not work as expected for all scripts/languages 114 | because there may not be a one-to-one mapping between unicode and transliteration. 115 | 116 | Args: 117 | text (str): Input text in unicode format. 118 | 119 | Returns: 120 | str: Transliterated text. 121 | """ 122 | tokens = self.tokenize_unicode(text) 123 | return "".join( 124 | [ 125 | self.unicode_to_transliteration_dict.get(token, token) 126 | for token in tokens 127 | ] 128 | ) 129 | 130 | def to_unicode(self, text:str, regularize:bool=False) -> str: 131 | """ 132 | Converts transliterated text to unicode format. 133 | 134 | Args: 135 | text (str): Input text in transliterated format. 136 | regularize (bool, optional): Whether to apply regularization. Defaults to False. 137 | 138 | Returns: 139 | str: Text converted to unicode format, optionally regularized. 140 | """ 141 | tokens = self.tokenize_transliteration(text) 142 | result = "".join([self.transliteration_to_unicode_dict.get(token, token) for token in tokens]) 143 | if regularize: 144 | result = self.regularize(result) 145 | return result 146 | 147 | def __call__(self, text:str, regularize:bool=False) -> str: 148 | """ 149 | Allows the class instance to be called as a function for unicode conversion. 150 | 151 | Args: 152 | text (str): Input text in transliterated format. 153 | regularize (bool, optional): Whether to apply regularization. Defaults to False. 154 | 155 | Returns: 156 | str: Text converted to unicode format, optionally regularized. 157 | """ 158 | return self.to_unicode(text, regularize=regularize) 159 | 160 | def regularize(self, string: str) -> str: 161 | """ 162 | Applies regularization rules to a given string. 163 | 164 | Args: 165 | string (str): Text string to be regularized. 166 | 167 | Returns: 168 | str: Regularized text string. 
169 | """ 170 | for pattern, replacement in self.regularization_regex: 171 | string = pattern.sub(replacement, string) 172 | 173 | for regex in self.regex_to_ignore: 174 | string = regex.sub("", string) 175 | string = re.sub(r'\s+', ' ', string) 176 | string = re.sub('mut','',string) 177 | return string.strip() 178 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================================================ 2 | Potnia 3 | ================================================================ 4 | 5 | .. image:: https://raw.githubusercontent.com/AncientNLP/potnia/main/docs/_static/img/PotniaLogo.png 6 | 7 | .. start-summary 8 | 9 | |pypi badge| |testing badge| |coverage badge| |docs badge| |git3moji badge| |black badge| |JOSS badge| 10 | 11 | .. |pypi badge| image:: https://img.shields.io/pypi/v/potnia 12 | :target: https://pypi.org/project/potnia/ 13 | 14 | .. |testing badge| image:: https://github.com/AncientNLP/potnia/actions/workflows/testing.yml/badge.svg 15 | :target: https://github.com/AncientNLP/potnia/actions 16 | 17 | .. |coverage badge| image:: https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/rbturnbull/e640f26fb59e39e3051de8fbf020de62/raw/coverage.json 18 | :target: https://ancientnlp.github.io/potnia/coverage/ 19 | 20 | .. |docs badge| image:: https://github.com/AncientNLP/potnia/actions/workflows/docs.yml/badge.svg 21 | :target: https://ancientnlp.github.io/potnia 22 | 23 | .. |black badge| image:: https://img.shields.io/badge/code%20style-black-000000.svg 24 | :target: https://github.com/psf/black 25 | 26 | .. |git3moji badge| image:: https://img.shields.io/badge/git3moji-%E2%9A%A1%EF%B8%8F%F0%9F%90%9B%F0%9F%93%BA%F0%9F%91%AE%F0%9F%94%A4-fffad8.svg 27 | :target: https://robinpokorny.github.io/git3moji/ 28 | 29 | .. |JOSS badge| image:: https://joss.theoj.org/papers/7641150c49e996a21fa0f4dc3aadb258/status.svg 30 | :target: https://joss.theoj.org/papers/7641150c49e996a21fa0f4dc3aadb258 31 | 32 | 33 | 34 | 35 | 36 | Potnia is an open-source Python library designed to convert Romanized transliterations of ancient texts into Unicode representations of ther respective native scripts. 37 | 38 | Currently, the scripts supported by Potnia are: 39 | 40 | - Linear A 41 | - Linear B 42 | - Hittite cuneiform 43 | - Arabic 44 | 45 | Functionality for Luwian hieroglyphs, Sumero-Akkadian cuneiform, Lydian and Etruscan is in development. 46 | 47 | Contributions are welcome! Please see the `CONTRIBUTING.rst `_ file for more information. 48 | 49 | .. end-summary 50 | 51 | 52 | .. start-quickstart 53 | 54 | Installation 55 | ==================== 56 | 57 | To install Potnia, run the following command: 58 | 59 | .. code-block:: bash 60 | 61 | pip install potnia 62 | 63 | To install the latest version from the repository, you can use this command: 64 | 65 | .. code-block:: bash 66 | 67 | pip install git+https://github.com/AncientNLP/potnia.git 68 | 69 | You can also install Potnia by cloning the repository and installing using poetry. 70 | This will install all the dependencies required for Potnia from the with the version numbers pinned in the ``poetry.lock`` file. 71 | Make sure that poetry is installed on your system. If not, see the `instructions `_. 72 | Then follow these steps: 73 | 74 | .. 
code-block:: bash 75 | 76 | git clone https://github.com/AncientNLP/potnia.git 77 | cd potnia 78 | poetry install 79 | 80 | You can test that Potnia is working by running ``pytest``. 81 | 82 | .. note:: 83 | 84 | For proper display of ancient script glyphs, please refer to the `Fonts `_ section. 85 | 86 | Usage 87 | ==================== 88 | 89 | To convert transliterated Linear B to Linear B Unicode, use the following code: 90 | 91 | .. code-block:: python 92 | 93 | >>> from potnia import linear_b 94 | >>> linear_b("a-ri-to-jo") 95 | '𐀀𐀪𐀵𐀍' 96 | 97 | 98 | If you wish to regularize the text to remove additional annotations present in the `LiBER `_ 99 | and `DĀMOS `_ transliteration, use the following code: 100 | 101 | .. code-block:: python 102 | 103 | >>> linear_b("e-ke-qe ]-o-na-to , ke-ke-me-na⌞ ⌟ko-to-na GRA qs ] vac.", regularize=True) 104 | '𐀁𐀐𐀤 %𐀃𐀙𐀵 𐀐𐀐𐀕𐀙 𐀒𐀵𐀙 𐂎 %' 105 | 106 | Note that uncertain/missing signs or sections of text are presently being replaced with a wildcard '%' character. 107 | 108 | To tokenize transliterated Linear B texts without converting it to Unicode, use the following code: 109 | 110 | .. code-block:: python 111 | 112 | >>> linear_b.tokenize_transliteration("]wa VIR 1 MUL 2 'ko-wa 1' ko-wo 1") 113 | [']', 'wa', ' ', 'VIR', ' ', '1', ' ', 'MUL', ' ', '2', ' ', "'", 'ko', 'wa', ' ', '1', "'", ' ', 'ko', 'wo', ' ', '1'] 114 | 115 | Command Line Interface (CLI) 116 | ============================ 117 | 118 | Potnia also provides a command line interface (CLI). 119 | 120 | To convert transliterated Linear B to Unicode, use the following command: 121 | 122 | .. code-block:: bash 123 | 124 | potnia linear-b "a-ri-to-jo" 125 | 126 | To regularize the text, use the following command: 127 | 128 | .. code-block:: bash 129 | 130 | potnia linear-b "e-ke-qe ]-o-na-to , ke-ke-me-na⌞ ⌟ko-to-na GRA qs ] vac." --regularize 131 | 132 | To see the full set of commands available in the CLI, use the following command: 133 | 134 | .. code-block:: bash 135 | 136 | potnia --help 137 | 138 | Graphical User Interface (GUI) 139 | ============================== 140 | 141 | .. image:: https://raw.githubusercontent.com/AncientNLP/potnia/main/docs/_static/img/potnia-gui.png 142 | 143 | Potnia also provides a graphical user interface (GUI). To start it, run: 144 | 145 | .. code-block:: bash 146 | 147 | potnia gui 148 | 149 | This will show a link in the terminal that you can click on to open the GUI in your browser. 150 | 151 | 152 | .. end-quickstart 153 | 154 | Credits 155 | ==================== 156 | 157 | .. start-credits 158 | 159 | Potnia is developed by: 160 | 161 | - Emily Tour (University of Melbourne) 162 | - `Kabir Manandhar Shrestha `_ (Melbourne Data Analytics Platform, University of Melbourne) 163 | - `Dr Robert Turnbull `_ (Melbourne Data Analytics Platform, University of Melbourne) 164 | 165 | To cite Potnia, use this reference: 166 | 167 | Tour, Emily, Kabir Manandhar Shrestha, and Robert Turnbull. 'Potnia: A Python Library for the Conversion of Transliterated Ancient Texts to Unicode.' *Journal of Open Source Software* 10, no. 108 (2025): 7725. `doi:10.21105/joss.07725 `_ 168 | 169 | You can also use the following BibTeX entries: 170 | 171 | .. 
code-block:: bibtex 172 | 173 | @article{potnia, 174 | author = {Emily Tour and Kabir Manandhar Shrestha and Robert Turnbull}, 175 | title = {{Potnia: A Python library for the conversion of transliterated ancient texts to Unicode}}, 176 | year = {2025}, 177 | journal = {Journal of Open Source Software}, 178 | publisher = {The Open Journal}, 179 | volume = {10}, 180 | number = {108}, 181 | pages = {7725}, 182 | doi = {10.21105/joss.07725}, 183 | url = {https://doi.org/10.21105/joss.07725} 184 | } 185 | 186 | @misc{potnia_release, 187 | author = {Emily Tour and Kabir Manandhar Shrestha and Robert Turnbull}, 188 | title = {{Potnia: A Python library for the conversion of transliterated ancient texts to Unicode}}, 189 | year = {2025}, 190 | url = {https://doi.org/10.26188/28721354.v1}, 191 | note = {Version 0.4.0, Apache License 2.0}, 192 | doi = {10.26188/28721354.v1} 193 | } 194 | 195 | We acknowledge support from Wytamma Wirth, Brent Davis, Kim Doyle, Man-Hua (Kate) Chu, Anhui (Ellie) Situ, Ekaterina Vylomova, Chris Guest and Stavroula (Stephie) Nikoloudis. This research was supported by The University of Melbourne’s Research Computing Services. Robert Turnbull completed part of this work through the BICROSS project, which has received funding from the European Research Council (ERC) under the European Union’s Horizon Europe research and innovation programme (grant agreement no. 101043730 – BICROSS – ERC-2021-COG). 196 | 197 | .. image:: https://raw.githubusercontent.com/AncientNLP/potnia/main/docs/_static/img/erc-logo.jpg 198 | :alt: ERC logo 199 | :align: center 200 | :width: 200px 201 | 202 | .. end-credits 203 | -------------------------------------------------------------------------------- /potnia/data/hittite.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | # V syllabograms 3 | "a": 𒀀 4 | "e": 𒂊 5 | "i": 𒄿 6 | "u": 𒌋 7 | "ú": 𒌑 8 | "ia": 𒅀 9 | 10 | # CV syllabograms 11 | "ba": 𒁀 12 | "be": 𒁁 13 | "bi": 𒁉 14 | "bu": 𒁍 15 | 16 | "pa": 𒉺 17 | "pé": 𒁉 18 | "pí": 𒁉 19 | "pu": 𒁍 20 | 21 | "da": 𒁕 22 | "de": 𒁲 23 | "di": 𒁲 24 | "du": 𒁺 25 | 26 | "ta": 𒋫 27 | "te": 𒋼 28 | "ti": 𒋾 29 | "tu": 𒌅 30 | 31 | "ga": 𒂵 32 | "ge": 𒄀 33 | "gi": 𒄀 34 | "gu": 𒄖 35 | 36 | "ka": 𒅗 37 | "ke": 𒆠 38 | "ki": 𒆠 39 | "ku": 𒆪 40 | 41 | "ḫa": 𒄩 42 | "ḫe": 𒄭 43 | "ḫé": 𒃶 44 | "ḫi": 𒄭 45 | "ḫu": 𒄷 46 | 47 | "la": 𒆷 48 | "le": 𒇷 49 | "li": 𒇷 50 | "lu": 𒇻 51 | 52 | "ma": 𒈠 53 | "me": 𒈨 54 | "mé": 𒈪 55 | "mi": 𒈪 56 | "mu": 𒈬 57 | 58 | "na": 𒈾 59 | "ne": 𒉈 60 | "né": 𒉌 61 | "ni": 𒉌 62 | "nu": 𒉡 63 | 64 | "ra": 𒊏 65 | "re": 𒊑 66 | "ri": 𒊑 67 | "ru": 𒊒 68 | 69 | "ša": 𒊭 70 | "še": 𒊺 71 | "ši": 𒅆 72 | "šu": 𒋗 73 | "šú": 𒋙 74 | 75 | "wa": 𒉿 76 | "wi5": 𒃾 77 | 78 | "ya": 𒅀 79 | 80 | "za": 𒍝 81 | "ze": 𒍣 82 | "zé": 𒍢 83 | "zi": 𒍣 84 | "zu": 𒍪 85 | 86 | # VC syllabograms 87 | "ab": 𒀊 88 | "ap": 𒀊 89 | "eb": 𒅁 90 | "ep": 𒅁 91 | "ib": 𒅁 92 | "ip": 𒅁 93 | "ub": 𒌒 94 | "up": 𒌒 95 | 96 | "ad": 𒀜 97 | "at": 𒀜 98 | "ed": 𒀉 99 | "et": 𒀉 100 | "id": 𒀉 101 | "it": 𒀉 102 | "ud": 𒌓 103 | "ut": 𒌓 104 | 105 | "ag": 𒀝 106 | "ak": 𒀝 107 | "eg": 𒅅 108 | "ek": 𒅅 109 | "ig": 𒅅 110 | "ik": 𒅅 111 | "ug": 𒊌 112 | "uk": 𒊌 113 | 114 | "aḫ": 𒄴 115 | "eḫ": 𒄴 116 | "iḫ": 𒄴 117 | "uḫ": 𒄴 118 | 119 | "al": 𒀠 120 | "el": 𒂖 121 | "il": 𒅋 122 | "ul": 𒌌 # Check 123 | 124 | "am": 𒄠 125 | "em": 𒅎 126 | "im": 𒅎 127 | "um": 𒌝 128 | 129 | "an": 𒀭 130 | "en": 𒂗 131 | "in": 𒅔 132 | "un": 𒌦 133 | 134 | "ar": 𒅈 135 | "er": 𒅕 136 | "ir": 𒅕 137 | "ur": 𒌨 138 | "úr": 𒌫 139 | 140 | "aš": 𒀸 141 | "eš": 𒌍 142 | "eš": 𒐁 
143 | "iš": 𒅖 144 | "uš": 𒍑 145 | 146 | "az": 𒊍 147 | "ez": 𒄑 148 | "iz": 𒄑 149 | "uz": 𒊻 150 | 151 | # CVC syllabograms 152 | "ḫal": 𒄬 153 | "ḫab": 𒆸 154 | "ḫap": 𒆸 155 | "ḫaš": 𒋻 156 | "ḫad": 𒉺 157 | "ḫat": 𒉺 158 | "PA": 𒉺 # sceptre 159 | "ḫul": "??" 160 | "ḪUL": "??" # evil # Find 161 | "ḫub": 𒄽 162 | "ḫup": 𒄽 163 | "ḫar": 𒄯 164 | "ḪAR": 𒄯 # ring 165 | "ḫur": 𒄯 166 | "ḪUR": 𒄯 # thick 167 | "MUR": 𒄯 # lung 168 | "gal": 𒃲 169 | "GAL": 𒃲 # great 170 | "kal": 𒆗 171 | "gal9": 𒆗 172 | "kam": 𒄰 173 | "gám": 𒄰 174 | "TU7": 𒄰 # soup 175 | "kán": 𒃷 176 | "gán": 𒃷 177 | "GÁN": 𒃷 # field 178 | "kab": 𒆏 179 | "kap": 𒆏 180 | "gáb": 𒆏 181 | "gáp": 𒆏 182 | "KAB": 𒆏 # left 183 | "kar": "??" # find 184 | "KAR": "??" # find # But actually find 185 | "kàr": 𒃼 186 | "gàr": 𒃼 187 | "kaš": 𒁉 188 | "gáš": 𒁉 189 | "KAŠ": 𒁉 # beer 190 | "kad": 𒃰 191 | "gad": 𒃰 192 | "gat": 𒃰 193 | "GAD": 𒃰 # linen 194 | "gaz": 𒄤 195 | "GAZ": 𒄤 # kill 196 | "kib": "??" # Find 197 | "kir": 𒄫 198 | "gir": 𒄫 199 | "kiš": 𒆧 200 | "KIŠ": 𒆧 # world 201 | "kid": 𒃰 202 | "t9": 𒃰 203 | "kal": 𒆗 204 | "KAL": 𒆗 # strong 205 | "kul": 𒆰 206 | "KUL": 𒆰 # offspring 207 | "kúl": 𒄢 208 | "gul": 𒄢 209 | "GUL": 𒄢 # break 210 | "kum": 𒄣 211 | "gum": 𒄣 212 | "kur": 𒆳 213 | "KUR": 𒆳 # land 214 | "kùr": 𒄥 215 | "gur": 𒄥 216 | "lal": 𒇲 217 | "LAL": 𒇲 # bind 218 | "lam": 𒇴 219 | "lig": 𒌨 220 | "lik": 𒌨 221 | "liš": 𒇺 222 | "LIŠ": 𒇺 # spoon 223 | "luḫ": 𒈛 224 | "LUḪ": 𒈛 # minister 225 | "lum": 𒈝 226 | "maḫ": 𒈤 227 | "MAḪ": 𒈤 # great 228 | "man": "??" # Find 229 | "mar": 𒈥 230 | "maš": 𒈦 231 | "MAŠ": 𒈦 # half 232 | "meš": "𒌍" # check 233 | "mil": 𒅖 234 | "mel": 𒅖 235 | "miš": 𒈩 236 | "mur": 𒄯 237 | "mut": 𒅜 238 | "MUD": 𒅜 # blood 239 | "nam": 𒉆 240 | "NAM": 𒉆 # district 241 | "nab": 𒀮 242 | "nap": 𒀮 243 | "nir": 𒉪 244 | "niš": "??" # Find 245 | "pal": 𒁄 246 | "bal": 𒁄 247 | "pár": 𒈦 248 | "bar": 𒈦 249 | "paš": "??" # Find 250 | "pád": 𒁁 251 | "pát": 𒁁 252 | "píd": 𒁁 253 | "pít": 𒁁 254 | "píl": 𒉋 255 | "bíl": 𒉋 256 | "GIBIL": 𒉋 # new 257 | "pir": "??" # Find 258 | "piš": 𒄫 259 | "biš": 𒄫 260 | "pùš": 𒄫 261 | "pur": "??" # Find 262 | "bur": "??" # Find 263 | "rad": 𒋥 264 | "rat": 𒋥 265 | "riš": 𒊕 266 | "šaḫ": 𒋚 267 | "ŠUBUR": 𒋚 # pig 268 | "šag": 𒊕 269 | "šak": 𒊕 270 | "SAG": 𒊕 # head 271 | "šal": 𒊩 272 | "MUNUS": 𒊩 # woman 273 | "šam": 𒌑 274 | "šàm": "??" # Find 275 | "šab": "??" # Find 276 | "šap": "??" # Find 277 | "šar": 𒊬 278 | "SAR": 𒊬 # plant 279 | "šìp": "??" # Find 280 | "šir": 𒋓 281 | "ŠIR": 𒋓 # testicles 282 | "šum": 𒋳 283 | "šur": 𒋩 284 | "taḫ": 𒈭 285 | "daḫ": 𒈭 286 | "túḫ": 𒈭 287 | "tág": 𒁖 288 | "ták": 𒁖 289 | "dag": 𒁖 290 | "dak": 𒁖 291 | "tal": 𒊑 292 | "dal": 𒊑 293 | "tám": 𒁮 294 | "dam": 𒁮 295 | "DAM": 𒁮 # wife 296 | "tan": 𒆗 297 | "dan": 𒆗 298 | "tab": 𒋰 299 | "tap": 𒋰 300 | "dáb": 𒋰 301 | "dáp": 𒋰 302 | "TAB": 𒋰 # 2 303 | "tar": 𒋻 304 | "táš": 𒁹 305 | "dáš": 𒁹 306 | "tiš": 𒁹 307 | "diš": 𒁹 308 | "tàš": 𒀾 309 | "tin": 𒁷 310 | "tén": 𒁷 311 | "tim": 𒁴 312 | "dim": 𒁴 313 | "dir": 𒊑 314 | "DIR": 𒊑 # red 315 | "tir": 𒌁 316 | "ter": 𒌁 317 | "TIR": 𒌁 # forest 318 | "tíš": "??" 
# Find 319 | "túl": 𒇥 320 | "tum": 𒌈 321 | "dum": 𒌈 322 | "tub": 𒁾 323 | "dub": 𒁾 324 | "dup": 𒁾 325 | "DUB": 𒁾 # clay tablet 326 | "túr": 𒄙 327 | "dur": 𒄙 328 | "DUR": 𒄙 # strip 329 | "zul": 𒂄 330 | "zum": 𒍮 331 | 332 | # Logograms 333 | "DIŠ": 𒁹 # (ᵐ) # male personal names 334 | "DIDLI": 𒀸 # (suffixed) # plural or collective 335 | "DIDLI ḪI.A": 𒀸𒄭𒀀 # (suffixed) # plural 336 | "DINGIR": 𒀭 # (ᴰ) # "deity" 337 | "DUG": 𒂁 # "vessel" 338 | "É": 𒂍 # "house" 339 | "GAD": 𒃰 # "linen, cloth" 340 | "GI": 𒄀 # "tube; reed" 341 | "GIŠ": 𒄑 # "wood" 342 | "GUD": 𒄞 # "bovid" 343 | "ḪUR.SAG": 𒄯𒊕 # "mountain" 344 | "ÍD": 𒀀𒇉 # "river" 345 | "IM": 𒅎 # "clay" 346 | "ITU": 𒌚 # "month" 347 | "KÁ": 𒆍 348 | "KU6": 𒄩 # "fish" 349 | "KUR": 𒆳 # "land" 350 | "KUŠ": 𒋢 # "hide, fur" 351 | "LÚ": 𒇽 # "man" 352 | "MEŠ": 𒌍 # (suffixed) # plural 353 | "MEŠ ḪI.A": 𒌍𒄭𒀀 # (suffixed) # plural 354 | "MUL": 𒀯 # "star" 355 | "MUNUS": 𒊩 # (ᶠ) # "woman" # female personal name 356 | "MUŠ": 𒈲 # "serpent" 357 | "MUŠEN": 𒄷 # (suffixed) # "bird" 358 | "NA₄": 𒉌𒌓 # "stone" 359 | "NINDA": 𒃻 # "bread" 360 | "PÚ": 𒇥 # "source" 361 | "SAR": 𒊬 # (suffixed) # "plant" 362 | "SI": 𒋛 # "horn" 363 | "SÍG": 𒋠 # "wool" 364 | "TU7": 𒄰 # "soup" 365 | "TÚG": 𒌆 # "garment" 366 | "Ú": 𒌑 # "plant" 367 | "URU": 𒌷 # "city" 368 | "URUDU": 𒍐 # "copper" 369 | "UZU": 𒍜 # "meat" 370 | 371 | # Determinatives 372 | 373 | "(DIŠ)": 𒁹 # (ᵐ) # male personal names 374 | "(DIDLI)": 𒀸 # (suffixed) # plural or collective 375 | "(DIDLI ḪI.A)": 𒀸𒄭𒀀 # (suffixed) # plural 376 | "(DINGIR)": 𒀭 # (ᴰ) # "deity" 377 | "(DUG)": 𒂁 # "vessel" 378 | "(É)": 𒂍 # "house" 379 | "(GAD)": 𒃰 # "linen, cloth" 380 | "(GI)": 𒄀 # "tube; reed" 381 | "(GIŠ)": 𒄑 # "wood" 382 | "(GUD)": 𒄞 # "bovid" 383 | "(ḪI.A)": 𒄭𒀀 # (suffixed) # plural 384 | "(ḪUR.SAG)": 𒄯𒊕 # "mountain" 385 | "(ÍD)": 𒀀𒇉 # "river" 386 | "(IM)": 𒅎 # "clay" 387 | "(ITU)": 𒌚 # "month" 388 | "(KAM)": 𒄰 # (suffixed) # numerals 389 | "(KI)": 𒆠 # (suffixed) # in 0.6% of toponyms[5] 390 | "(KU6)": 𒄩 # "fish" 391 | "(KUR)": 𒆳 # "land" 392 | "(KUŠ)": 𒋢 # "hide, fur" 393 | "(LÚ)": 𒇽 # "man" 394 | "(MEŠ)": 𒌍 # (suffixed) # plural 395 | "(MEŠ ḪI.A)": 𒌍𒄭𒀀 # (suffixed) # plural 396 | "(MUL)": 𒀯 # "star" 397 | "(MUNUS)": 𒊩 # (ᶠ) # "woman" # female personal name 398 | "(MUŠ)": 𒈲 # "serpent" 399 | "(MUŠEN)": 𒄷 # (suffixed) # "bird" 400 | "(NA₄)": 𒉌𒌓 # "stone" 401 | "(NINDA)": 𒃻 # "bread" 402 | "(PÚ)": 𒇥 # "source" 403 | "(SAR)": 𒊬 # (suffixed) # "plant" 404 | "(SI)": 𒋛 # "horn" 405 | "(SÍG)": 𒋠 # "wool" 406 | "(TU7)": 𒄰 # "soup" 407 | "(TÚG)": 𒌆 # "garment" 408 | "(Ú)": 𒌑 # "plant" 409 | "(URU)": 𒌷 # "city" 410 | "(URUDU)": 𒍐 # "copper" 411 | "(UZU)": 𒍜 # "meat" -------------------------------------------------------------------------------- /potnia/data/luwian.yaml: -------------------------------------------------------------------------------- 1 | # WORK IN PROGRESS 2 | 3 | mappings: 4 | a: 𔗷 5 | á: 𔐓 6 | aₓ : 𔗨 # uncertain sound value 7 | i: 𔓯 8 | í: 𔕐 9 | u: 𔑻 10 | 11 | ha: 𔓷 12 | ha: 𔔁 # uncertain sound value 13 | há: 𔓟 14 | haₓ: 𔕡 15 | hi: 𔗒 16 | hí: 𔕘 17 | hu: 𔕙 18 | hú: 𔖈 19 | 20 | hwa: 𔘰 21 | hwi: 𔘰 22 | hwiₓ: 𔓎 23 | 24 | ka: 𔗧 25 | ká: 𔐾 26 | ki: 𔗳 27 | ki₄: 𔔓 28 | kiₓ: 𔔓 29 | ku: 𔗜 30 | kwa: 𔕰 31 | kwi: 𔕰 32 | 33 | la: 𔓊 34 | la: 𔗲 # issue with duplicate; to verify 35 | laₓ: 𔗽 36 | li: 𔔹 37 | li: 𔗲 # issue with duplicate; to verify 38 | lí: 𔒖 39 | lì: 𔕇 40 | lu: 𔗲 41 | 42 | ma: 𔒅 43 | má: 𔖘 44 | mà: 𔕖 45 | maₓ: 𔕖, 𔘅 # split into two 46 | mi: 𔖻 47 | mí: 𔗘 48 | mì: 𔖷 49 | mu: 𔑿, 𔖛, 𔑾, 𔒀 # split into multiple? 
unclear 50 | 51 | na: 𔐤 52 | ná: 𔕵 53 | ni: 𔗐 54 | ní: 𔓵 55 | nì: 𔐽 56 | niₓ: 𔗴 57 | nu: 𔒴 58 | nú: 𔖿 59 | 60 | pa: 𔕸, 𔔁 ? 61 | pá: 𔘅 62 | paₓ: 𔓐 63 | pi: 𔑉 64 | pu: 𔕯 65 | pú: 𔗣 66 | ra: 𔖱 67 | ri: 𔖱 68 | ru: 𔗑 69 | rú: 𔑳, 𔑵 70 | ur: 𔖙 71 | 72 | sa: 𔗔 73 | sá: 𔗦 74 | sà: 𔑷 75 | sa₄: 𔗆 76 | sa₅: 𔕮 77 | sa₆: 𔔀 78 | sa₇: 𔕣 79 | sa₈: 𔖭 80 | si: 𔓉 81 | sí ?: 𔗾 82 | su: 𔖢 83 | sú: 𔒂 84 | sù: 𔗵 85 | us: 𔗚 86 | 87 | ta: 𔑰 88 | tá: 𔐞 89 | tà: 𔐬 90 | ta₄: 𔕦 91 | ta₅: 𔓇 92 | ta₆: 𔑛 93 | taₓ: 𔐭 94 | ti: 𔑣 95 | tí: 𔘟 96 | tì ?: 𔕦 97 | ti₄ ?: 𔓇 98 | tu: 𔑡, 𔑢 99 | tú: 𔕬 100 | tù: 𔕭 101 | tu₄: 𔔈 102 | 103 | wa: 𔗬 104 | wá: 𔓁 105 | wà: 𔓀 106 | wa₄: 𔓬 107 | wa₅: 𔓩 108 | wa₆: 𔓤 109 | wa₇: 𔕁 110 | wa₉: 𔔻 111 | wi: 𔒻 112 | wi: 𔗬 113 | wí: 𔓁 114 | wì: 𔓀 115 | wi₄: 𔓬 116 | wi₅: 𔓩 117 | wi₆: 𔓤 118 | wi₇: 𔕁 119 | wi₉: 𔔻 120 | 121 | ia: 𔓱 122 | iá: 𔕑 123 | ià: 𔖬 124 | 125 | za: 𔖪, 𔖩 126 | zá: 𔕹 127 | zà: 𔕼 128 | za₄: 𔒈 129 | zaₓ: 𔕽 130 | zi: 𔖩 131 | zí: 𔕠 132 | zì: 𔕻 133 | zi₄: 𔒚 134 | zu ?: 𔗥, 𔕀 135 | zú: 𔗵 136 | 137 | a+ra: 𔗸 138 | a+ri: 𔗸 139 | a+tá: 𔐷 140 | ara: 𔒟 141 | ara: 𔒠 142 | ari: 𔒟 143 | # ari: 𔒠 144 | hara: 𔕆 145 | hari: 𔕆 146 | hur: 𔗹 147 | 148 | i+ra: 𔓰 149 | # i+ri: 𔓰 150 | # kar: 𔕢 151 | "la+ra+a": 𔓍 152 | pari: 𔐎 153 | ra+a: 𔗸 154 | ri+i: 𔓰 155 | sara: 𔕕 156 | sari: 𔕕 157 | tal: 𔖞 158 | tana: 𔗢 159 | tapa: 𔒋 160 | tár: 𔖤 161 | taraₓ: 𔖤 162 | tariₓ: 𔖤 163 | tara: 𔖹 164 | tari: 𔖹 165 | zuwa: 𔕀 166 | 167 | IUDEX+ra: 𔖤 168 | IUDEX+ri: 𔖤 169 | 170 | 171 | ADORARE: 𔐅 172 | AEDIFICARE: 𔔘, 𔒐 173 | AEDIFICIUM : 𔔖 174 | AEDIFICIUM.PONERE : 𔔘, 𔒐 175 | #AEDIFICIUM+MINUS : VASTUS) : 𔔗 176 | ALA : 𔑗 177 | AMPLECTI : 𔐈, 𔗱 178 | ANIMAL : 𔗈 179 | ANNUS : 𔕺 180 | ANNUS+ANNUS : 𔖁 181 | AQUA : 𔓳, 𔓴 182 | AQUILA : 𔒟 183 | ARGENTUM : 𔔣, 𔔤, 𔔦 184 | ARHA : 𔓸, 𔓹 185 | ASCIA : 𔔼 186 | ASINUS : 𔑯, 𔒍 187 | ASINUS₂ : 𔑱 188 | AUDIRE : 𔑒, 𔓅 189 | AURIGA : 𔕄 190 | AURIS+TU+MI : 𔑒, 𔓅 191 | AVIS : 𔒚 192 | AVIS₂ : 𔒞 193 | AVIS₃ : 𔒜 194 | AVIS₄ : 𔒟 195 | AVIS₅ : 𔒝 196 | AVISₓ : 𔒡 197 | AVUS : 𔕳 198 | BESTIA : 𔑫 199 | BIBERE : 𔐇 200 | BONUS : 𔕧 (2nd mil.), 𔓀 201 | BONUS₂ : 𔖢 202 | BOS : 𔑺 203 | BOS₂ : 𔑼 204 | BOS+MI : 𔑾 205 | BOS.MI : 𔒀 206 | BOS₂.MI : 𔒁 207 | BRACCHIUM : 𔐡 208 | CAELUM : 𔓑 209 | CANIS : 𔑬 210 | CANIS₂ : 𔑭 211 | CAPERE : 𔐫 212 | CAPERE+SCALPRUM : 𔕲 213 | CAPERE₂ : 𔐮, 𔒣 214 | CAPERE₂.CAPERE₂ : 𔐭 215 | CAPRA : 𔑶 216 | CAPRA₂ : 𔑸 217 | CAPRA2A : 𔑹 218 | CAPUT : 𔐉 219 | CAPUT+SCALPRUM : 𔐊 220 | CASTRUM : 𔔉, 𔔊, 𔔋 221 | CENTUM : 𔗃, 𔕂, 𔕔 222 | CERVUS : 𔑳 223 | CERVUS₂ : 𔑴 224 | CERVUS₃ : 𔑵 225 | CONTRACTUS : 𔖅 226 | COR : 𔖂 227 | CORNU : 𔒂 228 | CORNU+CAPUT : 𔙀 229 | CRUS : 𔑛 230 | CRUS₂ : 𔑝 231 | CRUS.CRUS : 𔑟, 𔑠 232 | CRUS+FLUMEN : 𔑜 233 | CRUX : 𔕛 234 | CUBITUM : 𔔕 235 | CULTER : 𔕿 236 | CUM : 𔑀 237 | CURRUS : 𔕃 238 | DARE : 𔑈 239 | DARE.DARE : 𔑊 240 | DECEM : 𔗁 241 | DELERE : 𔔚 242 | DEUS : 𔖖 243 | DEUS.DOMUS : 𔔛 244 | (DEUS)VIA+TERRA : 𔓧 245 | DIES : 𔖓, 𔖔, 𔖕 246 | DOMINA : 𔐏 247 | DOMINUS : 𔖺 248 | DOMUS : 𔔙 249 | DOMUS+MINUS : 𔔚 250 | DOMUS+SCALA : 𔔞, 𔔟 251 | DOMUS+X : 𔔝 252 | EDERE : 𔐆 253 | EGO : 𔐀, 𔘞 ? 254 | EGO₂ : 𔐁 255 | ENSIS : 𔐻 256 | EQUUS : 𔑮 257 | EUNUCHUS : 𔘑, 𔘐 258 | EUNUCHUS₂ : ?? 259 | EXERCITUS : 𔔰 260 | FALX ? : 𔘝 261 | FEMINA : 𔑘, 𔗌 262 | FILIA : 𔐱 263 | FILIUS : 𔐰 264 | FILIUS.NEPOS : 𔕒 265 | FINES : 𔓸 266 | FINES+ha : 𔓹 267 | FLAMMAE ? 
: 𔘔, 𔗅, 𔘖 268 | FLUMEN : 𔓳, 𔓴 269 | FONS : 𔓶 270 | FORTIS : 𔐝 271 | FRATER : 𔐰 272 | FRATER₂ : 𔔷 273 | FRONS : 𔐚, 𔒉 274 | FULGUR : 𔓣 275 | FUSUS : 𔕗 276 | GAZELLA : 𔑶 277 | GENUFLECTERE : 𔑞 278 | GRYLLUS : 𔒑 279 | HÁ+LI : 𔓠 280 | HALA : 𔕈 281 | HALI : 𔕈 282 | HALPA : 𔑞 283 | HANA : 𔘮 284 | HASTARIUS : 𔓈 285 | HATTI : 𔓟 286 | HATTI+li : 𔓠 287 | HEROS : 𔐕 288 | HORDEUM : 𔓎, 𔗻, 𔗼 289 | HORREUM ? : 𔔡, 𔔢 290 | HUR : 𔗹 291 | HWI : 𔘰 292 | IANUS : 𔒯 293 | INFANS : 𔐰 294 | INFRA : 𔐾, 𔐿 295 | IRA : 𔐘 296 | IŠUWA(URBS) : 𔔃 297 | IUDEX : 𔖣 298 | IUDEX.LA : 𔔸 299 | IUSTITIA : 𔖣 300 | IUSTITIA.LA : 𔔸 301 | LA+LA : 𔓋 302 | LAPIS : 𔔮 303 | LAPIS+SCALPRUM : 𔔭 304 | LECTUS : 𔕓 305 | LEO : 𔑪 306 | LEO₂ : 𔑫 307 | LEO+MONS.TU+LEO : 𔓭 308 | LEPUS : 𔒋 309 | LEPUS₂ : 𔒌 310 | LIₓ : 𔒗 311 | LIBARE : 𔐜 312 | LIBATIO : 𔒤 313 | LIGARE : 𔐠 314 | LIGNUM : 𔖰, 𔓄 315 | LINGERE : 𔒈 316 | LINGUA : 𔓊 317 | LINGUA+CLAVUS : 𔓌 318 | LIS : 𔐘 319 | LITUUS : 𔖫 320 | LITUUS+Á/LITUUS+á : 𔐔 321 | LITUUS+na : 𔐥 322 | LITUUS+u : 𔒊 323 | LOCUS : 𔓤, 𔕝 324 | LOQUI : 𔐖 325 | LUNA : 𔓜 326 | MAₓ : 𔒃 327 | MAGNUS : 𔖙 328 | MAGNUS.DOMINA : 𔐐 329 | MAGNUS.DOMUS : 𔔜 330 | MAGNUS.FILIA : 𔐴 331 | MAGNUS.REX : 𔐒 332 | MALLEUS : 𔔻 333 | MALUS : 𔖟 334 | MALUS₂ : 𔖠 335 | MANDARE : 𔑊 336 | MANUS : 𔑁, 𔑂, 𔑂 337 | MANUS.CULTER : 𔐺 338 | MANUS+CULTER : 𔐻 339 | MANUS+MINUS ? (LONGUS) : 𔑄, 𔑍 340 | MATER : 𔑘, 𔗌 341 | MENSA : 𔕊 342 | MENSA₂ : 𔕋 343 | MÍ.REGIO : 𔔇 344 | MILLE : 𔗄 345 | MINISTRARE ? : 𔓐 346 | MINUS : 𔖮 347 | MONS : 𔓬 348 | MONS₂ : 𔐃 349 | MONS.SARPA : 𔕍, 𔕎 350 | MORI : 𔖯 351 | MURUS ? : 𔔎 352 | NEG : 𔕴 353 | NEG₂ : 𔕵 354 | NEG₃ : 𔕶 355 | NEPOS : 𔕒 356 | OCCIDENS : 𔖬 357 | OCULUS : 𔐙 358 | OMNIS(+MI) : 𔖝 359 | OMNIS₂ : 𔗣 360 | ORIENS : 𔓛 361 | OVIS : 𔒇 362 | OVIS₂ : 𔘺 363 | PANIS : 𔓐 364 | PANIS.SCUTELLA : 𔗛 365 | PASTOR : 𔗫 366 | PES : 𔑣 367 | PES₂ : 𔑦 368 | PES₂.PES : 𔑩 369 | PES₂.PES₂ : 𔑨 370 | PES.SCALA.ROTAE : 𔑤, 𔑥, 𔑧 371 | PINCERNA : 𔖆, 𔖍, 𔖎, 𔖏, 𔘻 372 | PISCIS : 𔒥 373 | PITHOS : 𔕾 374 | PITHOS.SCUTELLA/PITHOS : 𔕺 375 | POCULUM : 𔖇 376 | PODIUM : 𔔪 377 | PONERE : 𔑇 378 | PORTA : 𔔏, 𔔐 379 | PORTA₂ : 𔔑 380 | POST : 𔐣 381 | PRAE : 𔐍, 𔐎 382 | PROPHETA ? : 𔙀 383 | PUGNUS : 𔐨, 𔐪, 𔐯 384 | PUGNUS+PUGNUS : 𔐠 385 | PUGNUS+URBS : 𔐹 386 | PUGNUS+X : 𔐩 387 | PURUS : 𔕩, 𔕪 388 | REGIO : 𔔆 389 | REL : 𔕰 390 | REX : 𔐑 391 | REX.FILIA : 𔐳 392 | REX.FILIUS : 𔐲 393 | REX.INFANS : 𔐲 394 | ROTA : 𔕈 395 | SACERDOS : 𔖐 396 | SACERDOS₂ : 𔖥 397 | SARA : 𔕕 398 | SARI : 𔕕 399 | SARMA : 𔑙, 𔑚 400 | SARMA₂ : 𔑙, 𔑚 401 | SARPA : 𔕋 402 | SCALPRUM : 𔔯 403 | SCRIBA : 𔕭 404 | SCUTELLA : 𔗆 405 | SCUTUM : 𔔳 406 | SERVUS : 𔖷 407 | SIGILLUM : 𔕮 408 | SOL : 𔓚, 𔘈, 𔘊 409 | SOL₂ : 𔓙 410 | SOL₂.MENSA : 𔕌 411 | SOL₂.THRONUS : 𔕌 412 | SOLIUM : 𔕐 413 | SPHINX : 𔒒 414 | STATUA : 𔐌 415 | STELE : 𔔭 416 | SUB : 𔐾, 𔐿 417 | SUPER : 𔔱 (earlier variant), 𔑏 418 | TÁ (?) : 𔐞 419 | TAL (?) : 𔖞 420 | TALA (?) : 𔖞 421 | TANA (?) : 𔗢 422 | TELIPINU : 𔒲 423 | TERRA : 𔓤, 𔕝 424 | TEŠUB : 𔕥 425 | THRONUS : 𔕊 426 | THRONUS : 𔕋 427 | THRONUS₂ : 𔕏 428 | TONITRUS : 𔓢 429 | TURRIS ? 
: 𔔍 430 | UNGULA : 𔒗 431 | UNUS : 𔖭 432 | UR : 𔖙 433 | URBS : 𔔂 434 | URBS+li : 𔔅 435 | URBS-li : 𔔅 436 | URBS-RA+li : 𔔄 437 | URBS-RI?+li : 𔔄 438 | URBS+RA-li : 𔔄 439 | URBS+RI?-li : 𔔄 440 | URCEUS : 𔖆, 𔖍, 𔖎, 𔖏, 𔘻 441 | US : 𔗚 442 | # VACUUS : : 𔔗 443 | 444 | VAS : 𔖂 445 | VASTUS : 𔔗 446 | VIA : 𔓾, 𔑕, 𔓿 447 | VIA+TERRA.SCALPRUM : 𔓥 448 | VIA+TERRA+SCALPRUM : 𔓦 449 | VINUM : 𔒻 450 | VIR : 𔕟 (earlier variant), 𔕠 451 | VIR₂ : 𔖶 (word separator) 452 | VIR₂.MINUS : 𔖯 453 | VITA : 𔖡 454 | VITELLUS : 𔒃 455 | VITIS : 𔒻 456 | 457 | 2 : 𔖳 458 | 3 : 𔖸 459 | 4 : 𔖻 460 | 5 : 𔖼 461 | 8 : 𔖽 462 | 9 : 𔖿 463 | 12 : 𔘍 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /potnia/data/linear_a.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | ############################################################ 3 | ###### Syllabograms Common to Linear A and Linear B ######## 4 | ############################################################ 5 | a: 𐀀 6 | e: 𐀁 7 | i: 𐀂 8 | o: 𐀃 9 | u: 𐀄 10 | da: 𐀅 11 | de: 𐀆 12 | di: 𐀇 13 | do: 𐀈 14 | du: 𐀉 15 | ja: 𐀊 16 | je: 𐀋 17 | jo: 𐀍 18 | ju: 𐀎 19 | ka: 𐀏 20 | ke: 𐀐 21 | ki: 𐀑 22 | ko: 𐀒 23 | ku: 𐀓 24 | ma: 𐀔 25 | me: 𐀕 26 | mi: 𐀖 27 | mo: 𐀗 28 | mu: 𐀘 29 | na: 𐀙 30 | ne: 𐀚 31 | ni: 𐀛 32 | "no": 𐀜 33 | nu: 𐀝 34 | nwa: 𐁅 35 | pa: 𐀞 36 | pe: 𐀟 37 | pi: 𐀠 38 | po: 𐀡 39 | pu: 𐀢 40 | pu₂: 𐁆 41 | pu2: 𐁆 42 | qa: 𐀣 43 | qe: 𐀤 44 | qi: 𐀥 #𐘏? 45 | qif: 𐘐 46 | qo: 𐀦 47 | ra: 𐀨 48 | ra₂: 𐁈 49 | ra2: 𐁈 50 | re: 𐀩 51 | ri: 𐀪 52 | ro: 𐀫 53 | ru: 𐀬 54 | sa: 𐀭 55 | se: 𐀮 56 | si: 𐀯 57 | so: 𐀰 58 | su: 𐀱 59 | ta: 𐀲 60 | ta₂: 𐁋 61 | ta2: 𐁋 62 | te: 𐀳 63 | ti: 𐀴 64 | to: 𐀵 65 | tu: 𐀶 66 | wa: 𐀷 67 | we: 𐀸 68 | wi: 𐀹 69 | wo: 𐀺 70 | za: 𐀼 71 | ze: 𐀽 72 | zo: 𐀿 73 | 74 | "*22": 𐁒 75 | "*22f": 𐘓 76 | "*34": 𐁓 77 | "*47": 𐁔 78 | "*49": 𐁕 79 | "*56": 𐘰 80 | "*72": "?" #FIND # 𐘽 ? 81 | "*79": 𐙀 82 | "*82": 𐙃 83 | "*86": 𐁜 84 | "*118": 𐙈 85 | 86 | ############################################################ 87 | ###### Syllabograms Unique to Linear A ######## 88 | ############################################################ 89 | "*100": 𐙇 90 | "*123": 𐙌 91 | "*131a": 𐙍 92 | "*131b": 𐙎 93 | "*164a": 𐙐 94 | "*180": 𐙒 95 | "*188": 𐙓 96 | A301: 𐙕 97 | A305: 𐙙 98 | A309a: 𐙝 99 | A309b: 𐙞 100 | A309c: 𐙟 101 | A310: 𐙠 102 | A312: 𐙢 103 | A314: 𐙦 104 | A320: 𐙬 105 | A321: 𐙭 106 | A322: 𐙮 107 | A323: 𐙯 108 | A324: 𐙰 109 | A325: 𐙱 110 | A327: 𐙳 111 | A328: 𐙴 112 | A329: 𐙵 113 | A331: 𐙷 114 | A333: 𐙹 115 | A340: 𐚀 116 | A342: 𐚂 117 | A345: 𐚅 118 | A349: 𐚉 119 | A350: 𐚊 120 | A352: 𐚌 121 | A353: 𐚍 122 | A354: 𐚎 123 | A355: 𐚏 124 | A356: 𐚐 125 | A357: 𐚑 126 | A358: 𐚒 127 | A359: 𐚓 128 | A360: 𐚔 129 | A361: 𐚕 130 | A362: 𐚖 131 | A363: 𐚗 132 | A364: 𐚘 133 | 134 | ############################################################ 135 | ##### Logograms Common Between Linear A and Linear B ####### 136 | ############################################################ 137 | 138 | # LOGOGRAMS attested in Linear A and Linear B (SigLA codes) 139 | AB01: 𐘀 140 | AB02: 𐘁 141 | AB03: 𐘂 142 | AB04: 𐘃 143 | AB05: 𐘄 144 | AB06: 𐘅 145 | AB07: 𐘆 146 | AB08: 𐘇 147 | AB09: 𐘈 148 | AB10: 𐘉 149 | AB11: 𐘊 150 | AB13: 𐘋 151 | AB16: 𐘌 152 | AB17: 𐘍 153 | AB20: 𐘎 154 | "AB21/OVIS": 𐘏 #OVIS 155 | "AB21/OVISf": 𐘐 #OVIS 156 | "AB21/OVISm": 𐘑 #OVIS 157 | "AB22/CAP": 𐘒 #CAP 158 | "AB22/CAPf": 𐘓 #CAP 159 | "AB22/CAPm": 𐘔 #CAP 160 | "AB23/BOS": 𐘕 #BOS 161 | "AB23/BOSm": 𐘖 #BOS 162 | AB24: 𐘗 163 | AB26: 𐘘 164 | AB27: 𐘙 165 | AB28: 𐘚 166 | AB28B: 𐘛 167 | AB29: 𐘜 168 | "AB30/FIC": 𐘝 #FIC 169 | AB31: 𐘞 170 | AB34: 𐘟 171 | AB37: 𐘠 172 | AB38: 𐘡 173 | AB39: 𐘢 174 | AB40: 𐘣 175 | AB41: 𐘤 176 | AB44: 𐘥 177 | AB45: 𐘦 178 | AB46: 𐘧 179 | AB47: 𐘨 180 | AB48: 𐘩 181 | AB49: 𐘪 182 | AB50: 𐘫 183 | AB51: 𐘬 184 | AB53: 𐘭 185 | "AB54/TELA": 𐘮 #TELA 186 | AB55: 𐘯 187 | AB56: 𐘰 188 | AB57: 𐘱 189 | AB58: 𐘲 190 | AB59: 𐘳 191 | AB60: 𐘴 192 | AB61: 𐘵 193 | AB65: 𐘶 194 | AB66: 𐘷 195 | AB67: 𐘸 196 | AB69: 𐘹 197 | AB70: 𐘺 198 | AB73: 𐘻 199 | AB74: 𐘼 200 | AB76: 𐘽 201 | AB77: 𐘾 202 | AB78: 𐘿 203 | AB79: 𐙀 204 | AB80: 𐙁 205 | AB81: 𐙂 206 | AB82: 𐙃 207 | "AB85/SUS": 𐙄 208 | AB86: 𐙅 209 | AB87: 𐙆 210 
| AB100/VIR: 𐙇 211 | AB118: 𐙈 212 | "AB120:/GRA": 𐙉 213 | "AB120/GRAb": 𐙊 214 | "AB122/OLIV": 𐙋 215 | AB123: 𐙌 #AROM 216 | AB131/VINa: 𐙍 217 | AB131/VINb: 𐙎 218 | AB131/VINc: 𐙏 219 | AB164a: 𐙐 220 | AB164b: 𐙐 221 | AB164c: 𐙐 222 | AB164d: 𐙐 223 | AB171: 𐙑 224 | AB180: 𐙒 225 | AB188: 𐙓 226 | AB191: 𐙔 227 | AB302/OLE: 𐙖 228 | 229 | # LOGOGRAMS only attested in Linear A 230 | A302: 𐙖 231 | A303: 𐙗 232 | A304: 𐙘 233 | A306: 𐙚 234 | A308: 𐙜 235 | A315: 𐙧 236 | A316: 𐙨 237 | A317: 𐙩 238 | A332: 𐙸 239 | A334: 𐙺 240 | A335: 𐙻 241 | A336: 𐙼 242 | A338: 𐙾 243 | A339: 𐙿 244 | A343: 𐚃 245 | A344: 𐚄 246 | A347: 𐚇 247 | A365: 𐚙 248 | A366: 𐚚 249 | A367: 𐚛 250 | A368: 𐚜 251 | A369: 𐚝 252 | A370: 𐚞 253 | A371: 𐚟 254 | ## Vases 255 | A400VAS: 𐚠 256 | A401VAS: 𐚡 257 | A402VAS: 𐚢 258 | A403VAS: 𐚣 259 | A404VAS: 𐚤 260 | A405VAS: 𐚥 261 | A406VAS: 𐚦 262 | A407VAS: 𐚧 263 | A408VAS: 𐚨 264 | A409VAS: 𐚩 265 | A410VAS: 𐚪 266 | A411VAS: 𐚫 267 | A411VASa: 𐚫 268 | A411VASb: 𐚫 269 | A411VASc: 𐚫 270 | A412VAS: 𐚬 271 | A413VAS: 𐚭 272 | A414VAS: 𐚮 273 | A415VAS: 𐚯 274 | A416VAS: 𐚰 275 | A417VAS: 𐚱 276 | A418VAS: 𐚲 277 | 278 | # TRANSACTIONAL SIGNS 279 | A307: 𐙛 280 | A318: 𐙪 281 | A319: 𐙫 282 | A326: 𐙲 283 | A346: 𐚆 284 | 285 | # FRACTIONS 286 | A701: 𐝀 287 | # A: 𐝀 288 | A702: 𐝁 289 | # B: 𐝁 290 | A703: 𐝂 291 | # D: 𐝂 292 | A704: 𐝃 293 | # E: 𐝃 294 | A705: 𐝄 295 | # F: 𐝄 296 | A706: 𐝅 297 | # H: 𐝅 298 | A707: 𐝆 299 | # J: 𐝆 300 | A708: 𐝇 301 | # K: 𐝇 302 | A709: 𐝈 303 | # L: 𐝈 304 | A7092: 𐝉 305 | # L2: 𐝉 306 | A7093: 𐝊 307 | # L3: 𐝊 308 | A7094: 𐝋 309 | # L4: 𐝋 310 | A7096: 𐝌 311 | # L6: 𐝌 312 | A710: 𐝍 313 | # W: 𐝍 314 | A711: 𐝎 315 | # X: 𐝎 316 | A712: 𐝏 317 | # Y: 𐝏 318 | A713: 𐝐 319 | OMEGA: 𐝐 # CHECK 320 | A714: 𐝑 321 | # ABB: 𐝑 322 | A715: 𐝒 323 | # BB: 𐝒 324 | A717: 𐝓 325 | # DD: 𐝓 326 | A726: 𐝔 327 | # EYYY: 𐝔 328 | A732: 𐝕 329 | #bJE: 𐝕 330 | 331 | # LIGATURES 332 | A311: 𐙡 333 | A313a: 𐙣 334 | A313b: 𐙤 335 | A313c: 𐙥 336 | A330: 𐙶 337 | A337: 𐙽 338 | A341: 𐚁 339 | A348: 𐚈 340 | A351: 𐚋 341 | 342 | # COMPOUND SIGNS 343 | A501: 𐚳 344 | A502: 𐚴 345 | A503: 𐚵 346 | A504: 𐚶 347 | A505: 𐚷 348 | A506: 𐚸 349 | A508: 𐚹 350 | A509: 𐚺 351 | A510: 𐚻 352 | A511: 𐚼 353 | A512: 𐚽 354 | A513: 𐚾 355 | A515: 𐚿 356 | A516: 𐛀 357 | A520: 𐛁 358 | A521: 𐛂 359 | A523: 𐛃 360 | A524: 𐛄 361 | A525: 𐛅 362 | A526: 𐛆 363 | A527: 𐛇 364 | A528: 𐛈 365 | A529: 𐛉 366 | A530: 𐛊 367 | A531: 𐛋 368 | A532: 𐛌 369 | A534: 𐛍 370 | A535: 𐛎 371 | A536: 𐛏 372 | A537: 𐛐 373 | A538: 𐛑 374 | A539: 𐛒 375 | A540: 𐛓 376 | A541: 𐛔 377 | A542: 𐛕 378 | A545: 𐛖 379 | A547: 𐛗 380 | A548: 𐛘 381 | A549: 𐛙 382 | A550: 𐛚 383 | A551: 𐛛 384 | A552: 𐛜 385 | A553: 𐛝 386 | A554: 𐛞 387 | A555: 𐛟 388 | A556: 𐛠 389 | A557: 𐛡 390 | A559: 𐛢 391 | A563: 𐛣 392 | A564: 𐛤 393 | A565: 𐛥 394 | A566: 𐛦 395 | A568: 𐛧 396 | A569: 𐛨 397 | A570: 𐛩 398 | A571: 𐛪 399 | A572: 𐛫 400 | A573: 𐛬 401 | A574: 𐛭 402 | A575: 𐛮 403 | A576: 𐛯 404 | A577: 𐛰 405 | A578: 𐛱 406 | A579: 𐛲 407 | A580: 𐛳 408 | A581: 𐛴 409 | A582: 𐛵 410 | A583: 𐛶 411 | A584: 𐛷 412 | A585: 𐛸 413 | A586: 𐛹 414 | A587: 𐛺 415 | A588: 𐛻 416 | A589: 𐛼 417 | A591: 𐛽 418 | A592: 𐛾 419 | A594: 𐛿 420 | A595: 𐜀 421 | A596: 𐜁 422 | A598: 𐜂 423 | A600: 𐜃 424 | A601: 𐜄 425 | A602: 𐜅 426 | A603: 𐜆 427 | A604: 𐜇 428 | A606: 𐜈 429 | A608: 𐜉 430 | A609: 𐜊 431 | A610: 𐜋 432 | A611: 𐜌 433 | A612: 𐜍 434 | A613: 𐜎 435 | A614: 𐜏 436 | A615: 𐜐 437 | A616: 𐜑 438 | A617: 𐜒 439 | A618: 𐜓 440 | A619: 𐜔 441 | A620: 𐜕 442 | A621: 𐜖 443 | A622: 𐜗 444 | A623: 𐜘 445 | A624: 𐜙 446 | A626: 𐜚 447 | A627: 𐜛 448 | A628: 𐜜 449 | A629: 𐜝 450 | A634: 𐜞 451 | A637: 𐜟 
452 | A638: 𐜠 453 | A640: 𐜡 454 | A642: 𐜢 455 | A643: 𐜣 456 | A644: 𐜤 457 | A645: 𐜥 458 | A646: 𐜦 459 | ## Vases 460 | A648VAS: 𐜧 461 | A649VAS: 𐜨 462 | A651VAS: 𐜩 463 | A652VAS: 𐜪 464 | A653VAS: 𐜫 465 | A654VAS: 𐜬 466 | A655VAS: 𐜭 467 | A656VAS: 𐜮 468 | A657VAS: 𐜯 469 | A658VAS: 𐜰 470 | A659VAS: 𐜱 471 | A660VAS: 𐜲 472 | A661VAS: 𐜳 473 | A662VAS: 𐜴 474 | A663VAS: 𐜵 475 | A664VAS: 𐜶 476 | 477 | # ADDITIONAL SIGNS 478 | A800: 𐝠 479 | A801: 𐝡 480 | A802: 𐝢 481 | A803: 𐝣 482 | A804: 𐝤 483 | A805: 𐝥 484 | A806: 𐝦 485 | A807: 𐝧 486 | 487 | patterns_to_ignore: 488 | - "vacat\\s*\\.?" 489 | - "lat\\s*\\." 490 | - "inf\\s*\\." 491 | - "i\\s*\\." 492 | - "mut\\s*\\." 493 | - "sup\\s*\\." 494 | - "vac\\s*\\." 495 | - "v\\s*\\." 496 | - "vestigia" 497 | - "l\\s*\\." 498 | - "s\\s*\\." 499 | - "Graffito" 500 | - "[\\/\\,\\'\\?]" 501 | - "⟦.*?⟧" 502 | - "deest" 503 | - "[⸤⸥]" 504 | - "[\\u231e\\u231f]" # Ignore characters ⌞ and ⌟ 505 | 506 | 507 | regularization: 508 | - ['\\[\\?\\]', '%'] 509 | - ['\\[unclassified\\]', '%'] # Maps `[unclassified]` to `%` 510 | - ['[?]','%'] 511 | - ['\\|', ''] 512 | - [':', ''] 513 | - ['r\\.', ''] 514 | - ['\\[•~\\]', ''] 515 | - ['⌜', ''] 516 | - ['⌝', ''] 517 | - ['mutila', ''] 518 | - ['\\[?•~•~•~•\\]?', '%%%%'] 519 | - ['\\[?•~•~•\\]?', '%%%'] 520 | - ['\\[?•~•~\\]?', '%%'] 521 | - ['\\[?•~•\\]?', '%%'] # Corrected this line 522 | - ['\\<|\\>', ''] 523 | - ['\\[ \\]', '[ ]'] 524 | - ['ro2', '𐁊'] 525 | - ['vestigia', '%'] 526 | - ['\\bqs\\b', '%'] 527 | - ['vest\\s*\\.', '%'] 528 | - ['\\[•\\]', '%'] 529 | - ['supra sigillum|CMS \\w+\\d+[A-Z]* \\d+', ''] 530 | - ['reliqua pars sine regulis', ''] 531 | - ['[αβγ]', ''] 532 | - ['v\\.→', ''] 533 | - ['v\\.↓', ''] 534 | - ['v\\.', ''] 535 | - ['\\b(vacat|sup. mut.|inf. mut.|deest|X|fragmentum A|fragmentum B|graffito|angustum|prior pars sine regulis|fragmentum C|fragmentum D|fragmentum separatum|α|β|γ|δ|sigillum|)\\b', ''] # Corrected this line 536 | - ['\\b(x|m|f)\\b', ''] 537 | - ['[\\[\\]]', '%'] 538 | - ['=[^ ]*', ''] 539 | - ['•', '%'] 540 | - ['●', ''] 541 | - ['dex.', ''] 542 | - ['sin.', ''] 543 | 544 | tokenization: 545 | - ['\u00a0', ' '] # Replace non-breaking space with regular space 546 | - ['\u0323', ''] # Remove specific character (e.g., dot below) 547 | - ['', ''] # Remove HTML closing emphasis tag 548 | - ['', ''] # Remove HTML opening emphasis tag 549 | - ['\|([^|]+)\|', '|\1|'] # Special handling to ensure pipes are treated as separate tokens 550 | - ['ME<±RI>', 'ME±RI'] # Handle specific compound tokens like 'ME<±RI>' 551 | - ['--', '-'] # Normalize the text by replacing double dashes with a single dash 552 | - ['\b(EQU|SUS|OVIS|BOS|CAP)\s+(x|m|f)\b', '\1\2'] # Combine animal ideograms followed by 'x', 'm', or 'f' without space 553 | - ['⌜', ' ⌜ '] # Explicit tokenization for half brackets 554 | - ['⌝', ' ⌝ '] 555 | - ['mutila', ' mutila '] # Handle 'mutila' 556 | - ['fragmentum A', 'fragmentum_A'] # Preprocess 'fragmentum A' and 'fragmentum B' to ensure they are not split 557 | - ['fragmentum B', 'fragmentum_B'] 558 | - ['\b(BOS|SUS|OVIS|CAP|EQU)\s([mf])\b', '\1\2'] # Combine terms with 'm' or 'f' 559 | - ['\](?=[^\s])', ']-'] # Pre-process ']' and '[' for special handling 560 | - ['(?<=[^\s])\[', '-['] 561 | - ['TELA\s+(?=[1234x]\b)', 'TELA'] # Handle specific cases 562 | - ['TELA\s+(\d+)', 'TELA \1'] # Handle other numbers with space 563 | - ['\* (\d+)', '*\1'] # Combine '*' with the following numeral 564 | - ['\+ ([^\s]+)', '+\1'] # Combine '+' with surrounding ideograms 565 | - ['([^\s]) \+', 
'\1+'] # Ensure '+' is properly attached 566 | - ['([^\s]+) VAS', '\1VAS'] # Attach 'VAS' properly 567 | - ['\b(vac|vest|l|s|lat|inf|mut|sup|i)\s?\.', '\1.'] # Ignore or modify specific patterns 568 | - ['\b(supra sigillum|reliqua pars sine regulis|vacat)\b', '\1'] # Explicit tokenization 569 | 570 | complex_symbols: 571 | 'TELA-[;1+TE': 'PLACEHOLDER_TELA1' 572 | 'TELA;1+TE': 'PLACEHOLDER_TELA2' 573 | 'TELA-[;1]-+TE': 'PLACEHOLDER_TELA3' 574 | 'OVIS]-:m': 'PLACEHOLDER_OVIS' 575 | 576 | special_chars_pattern: "(\\[|\\]|\\,|\\'|\\u27e6|\\u27e7|-|\\?|<|>|⌞|⌟|⸤|⸥|\\||\ue000)" 577 | 578 | restore_patterns: 579 | - ['fragmentum_A', 'fragmentum A'] 580 | - ['fragmentum_B', 'fragmentum B'] 581 | - ['ME±RI', 'ME<±RI>'] -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Potnia: A Python library for the conversion of transliterated ancient texts to Unicode' 3 | tags: 4 | - Python 5 | - Unicode 6 | - ancient texts 7 | - ancient scripts 8 | - ancient languages 9 | - transliteration 10 | - machine learning 11 | authors: 12 | - name: Emily Tour 13 | orcid: 0000-0001-5212-1427 14 | equal-contrib: true # (This is how you can denote equal contributions between multiple authors) 15 | corresponding: true 16 | affiliation: "1" 17 | - name: Kabir Manandhar Shrestha 18 | orcid: 0009-0001-2059-1683 19 | equal-contrib: true 20 | affiliation: 2 21 | - name: Robert Turnbull 22 | orcid: 0000-0003-1274-6750 23 | corresponding: false 24 | equal-contrib: true 25 | affiliation: 2 26 | affiliations: 27 | - name: University of Melbourne, Australia 28 | index: 1 29 | - name: Melbourne Data Analytics Platform, University of Melbourne, Australia 30 | index: 2 31 | date: 23 September 2024 32 | bibliography: paper.bib 33 | --- 34 | 35 | # Summary 36 | 37 | Potnia is an open-source Python library designed to convert Romanized transliterations of ancient texts into their respective Unicode representations. Significant progress has been made in the digitization of ancient language corpora. However, many of these datasets are solely presented in transliterated form, even though the necessary Unicode blocks exist to render them using their native script. This restriction to using transliterated datasets for certain ancient scripts has the potential to limit the precision of linguistic analysis via machine learning. 38 | 39 | Potnia bridges this gap by providing a flexible framework for converting transliterations into Unicode. By enabling tokenization and processing in the original script, Potnia can optimize tasks such as textual restoration and machine learning-based analysis. The library currently supports: 40 | 41 | - Linear A 42 | - Linear B 43 | - Hittite cuneiform 44 | - Arabic 45 | 46 | While Linear B has the most comprehensive test cases and is the most robust, the tool can also be used effectively for the other scripts. The architectural flexibility of Potnia makes it easy to accommodate additional scripts, offering significant value to both computational linguistics and digital humanities by enabling researchers to work with ancient texts in their native scripts. 
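As a brief illustration of the intended usage, the following sketch converts a Romanized Linear B transliteration into Unicode. It is a minimal sketch, assuming the per-script callable interface suggested by the `potnia.scripts` modules and the `__call__` method described under Implementation below; the expected output is taken from the project's own test data rather than verified here.

```python
from potnia import linear_b  # assumed export of the Linear B Script instance

# 'a-ri-to-jo' should map to '𐀀𐀪𐀵𐀍' (see tests/expected/linear_b_unicode.yaml)
print(linear_b("a-ri-to-jo"))
```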
47 | 48 | # Statement of Need 49 | 50 | While machine learning has increasingly been applied to the study of ancient texts [@sommerschieldMachineLearningAncient2023], much of this progress has involved working with transliterated texts, rather than native script formats [@luoNeuralDeciphermentMinimumcost2019; @papavassileiouGenerativeModelMycenaean2023; @fetayaRestorationFragmentaryBabylonian2020; @peronocacciafocoNewApproachDecipherment2021]. Although Unicode standards exist for many ancient scripts, transliterated texts remain prevalent due to historical digitization practices. 51 | 52 | Transliteration is the process of converting text from its original script into a different script according to a systematic set of rules. It allows those who can understand the secondary script to comprehend the orthography and the approximate pronunciation of the original text. Before the gradual introduction of the relevant Unicode blocks from the 1990s onwards [@Hossain2024], transliteration was also usually necessary for representing non-Latin scripts on Western computational systems, which were largely confined to letters of the Latin alphabet and a small number of special characters. 53 | 54 | Transliteration has an important place in aiding new learners of an ancient script to understand the pronunciation and orthography of the underlying language it represents (particularly for non-alphabetic scripts, where beginners need to grasp a vast repertoire of unfamiliar signs). However, it is well recognized that this process can only provide an approximate, and often unsatisfactory or disputed, representation of the original text [@weinbergTransliterationDocumentation1974; @odishoTransliteratingEnglishArabic1992; @martinetProjectTransliterationClassical1953]. In particular, a lack of standardized approaches to transliteration can introduce considerable ambiguity and noise into the dataset in a variety of ways, including: 55 | 56 | - the use of various notation systems, with different transliterations representing the exact same sign in distinct ways (e.g. where ![Cuneiform sign 'mè'](docs/_static/img/Csign_me3_large.png){height="11pt"} in Akkadian cuneiform can be represented as either 'mè' or 'me\textsubscript{3}'); 57 | - changing opinions on particular sign values over time, introducing possible differences between older and newer transliterations (e.g. ![Linear B sign 'qa'](docs/_static/img/LBsign_qa_large2.png){height="11pt"} in the Linear B script changing from the previously suggested value of 'pa\textsubscript{2}' to 'qa') [@chadwickDocumentsMycenaeanGreek1973, pp. 389, 391]; 58 | - and the way in which transliteration obscures polyvalency in scripts, where a single sign can represent multiple different values (e.g. ![Cuneiform sign 'ḫar'](docs/_static/img/Csign_har_large.png){height="11pt"} in Hittite cuneiform can represent three different syllables, transliterated as 'ḫar', 'ḫur' and 'mur', as well as acting as a logogram for three different words, 'ring', 'thick' and 'lung'). 59 | 60 | For language modelling tasks, we therefore suggest that representations of texts in their native form are preferable to achieve the most accurate results. A number of digitized corpora for well-resourced and widely studied ancient languages are now available in Unicode representations of their native script, including a corpus of ancient Greek [@canonicalgreek], classical Hebrew [@sefaria_project], Syriac [@digital_syriac] and Arabic [@openiti].
However, many other online text corpora remain restricted to Romanized transliterations (despite the availability of relevant Unicode standards), presumably due to considerations around ease, system limitations and accessibility, e.g. Linear B [@auroraDAMOSDatabaseMycenaean2015], Ugaritic [@prosser2019ras] and Sumero-Akkadian cuneiform [@cdli2024]. For this latter group of scripts, existing tools capable of converting transliterated ancient texts to the corresponding Unicode appear limited to a handful of single-script utilities, such as the various implementations of 'Cuneify' [@tinney2019cuneify] that handle Sumero-Akkadian cuneiform, the PHP script 'UnicodeConverter' for Egyptian hieroglyphs [@ilintomich2021unicodeconverter], or the 'Anatolian Hieroglyphics (Luwian) generation' tool for Luwian hieroglyphs [@senior2023anatoliangenerator], the latter available only through a basic online graphical user interface. While the PyArabic package [@Zerrouki2023] is able to convert Arabic text to and from the popular Timothy Buckwalter transliteration system, Potnia provides complementary functionality for the DIN 31635 transliteration system [@DIN31635], which is widely used in academic literature. 61 | 62 | In addition, such transliterations of ancient texts are often heavily annotated, with special characters used to denote a range of features including uncertain readings, missing or damaged elements, erasures, non-textual marks, and annotations by modern transliterators pertaining to structural or physical elements of the document. If not removed or handled appropriately, these have the potential to introduce further noise into language models. 63 | 64 | These are the primary gaps we have aimed to address through the development of Potnia. The library's focus on ancient scripts and its extensible architecture make it a valuable asset for researchers working with digitized ancient corpora. It is also equipped to provide specific handling of these elements, with tailored tokenization and regularization rules pertaining to both script-specific and corpus-specific conventions. Potnia therefore enables a key pre-processing step in the language modelling pipeline, with the resulting Unicode outputs supporting more accurate and nuanced computational analysis of these texts in downstream modelling tasks. 65 | 66 | # Implementation 67 | 68 | Potnia is implemented in Python with an extensible architecture centered around the `Script` class, which converts transliterated texts into Unicode representations. It is designed to handle the complexities of ancient scripts through a flexible and customizable framework. 69 | 70 | ## Key Features 71 | 72 | 1. **YAML-Based Mapping and Rule Specification:** 73 | Each script in Potnia (e.g. Linear A, Linear B, Arabic, Hittite cuneiform) is configured via a single YAML file that contains syllabograms, logograms, and rules for transliteration and regularization. This unified structure simplifies updates, scales easily for new scripts, and eliminates the need for hardcoded source files (fig. \ref{fig:mappings}). 74 | 75 | ![Example of YAML mapping specification.\label{fig:mappings}](docs/_static/img/mappings.png){ width=50% } 76 | 77 | 2. **Tokenization:** The `tokenize_transliteration` method applies complex symbol replacements and regular expressions to transliterated text based on the rules specified in the YAML file.
This tokenization process ensures that the text is split accurately into its meaningful components, handling special symbols and spacing using placeholders, and preparing the text for Unicode conversion. 78 | 79 | 3. **Transliteration to Unicode:** Potnia uses the `__call__` method to convert the transliterated text to its Unicode representation (fig. \ref{fig:potnia-example}). 80 | 81 | ![Example of using Potnia.\label{fig:potnia-example}](docs/_static/img/potnia-example.png){ width=80% } 82 | 83 | 4. **Regularization of Text:** The `regularize` method applies a series of regular expression rules to clean and normalize the Unicode output. It removes unnecessary tags, ignores patterns specified in the YAML file (e.g. annotations or uncertain characters), and retains only the essential characters, leaving the output refined and ready for downstream tasks. 84 | 85 | 5. **Comprehensive Testing:** Pytest fixtures allow us to define test cases as lines in YAML files, which let us concisely add over 360 test examples covering a broad range of edge cases. The tests achieve 100% code coverage. 86 | 87 | 6. **Versatile Interface Options:** Users can interact with Potnia as a Python library, through the command-line interface (CLI), or through the graphical user interface (GUI) (fig. \ref{fig:potnia-gui}). 88 | 89 | ![Example of using the Potnia GUI.\label{fig:potnia-gui}](docs/_static/img/potnia-gui.png){ width=80% } 90 | 91 | 92 | # Research Application 93 | 94 | Potnia’s design and functionality address the following challenges in the analysis of ancient texts: 95 | 96 | 1. **Extensibility:** Potnia is designed to be highly extensible, allowing researchers to integrate new scripts by defining script-specific rules for tokenization and conversion. This flexibility makes the library suitable for a wide range of ancient scripts whose digitized corpora are not yet represented in Unicode, providing a valuable tool for researchers across various fields of ancient studies. 97 | 98 | 2. **Integration with Research Workflows:** Researchers can easily incorporate Potnia into their existing workflows. For example, Potnia could be used to preprocess a corpus of Linear B texts before feeding them into a machine learning model for further analysis. 99 | 100 | As part of a broader initiative to develop language models for ancient language research, Potnia serves as a foundational component by converting Romanized transliterations of Linear B texts into Unicode datasets for computational analysis. These datasets enable the development of language-specific models supporting tasks such as text generation, restoration and vector embedding analysis. The library's modular design facilitates its application to additional ancient scripts, contributing to broader research initiatives in computational philology. 101 | 102 | # Availability 103 | 104 | Potnia is open-source software released under the Apache 2.0 license. It is available through PyPI [https://pypi.org/project/potnia/](https://pypi.org/project/potnia/) and GitHub [https://github.com/AncientNLP/potnia](https://github.com/AncientNLP/potnia). We welcome contributions from the community and adhere to the Contributor Covenant Code of Conduct. Documentation is available at [https://ancientnlp.github.io/potnia/](https://ancientnlp.github.io/potnia/).
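To make the preprocessing pipeline concrete, the sketch below chains the tokenization, conversion and regularization steps described under Key Features. The method names are taken from that feature list, but the exact signatures and import path are assumptions rather than confirmed API; the sample input and its expected annotated output come from the project's test data.

```python
from potnia import linear_b  # assumed export, as in the earlier example

line = "]qa-ra / re-me-to * 168 + SE 28"  # annotated transliteration of a tablet line

# Split into sign-level tokens according to the YAML tokenization rules
tokens = linear_b.tokenize_transliteration(line)

# Convert to Unicode via __call__, e.g. ']𐀣𐀨 / 𐀩𐀕𐀵 𐂰+𐀮 28'
# (see tests/expected/linear_b_unicode.yaml)
unicode_text = linear_b(line)

# Strip editorial annotations from the Unicode output for downstream modelling
clean_text = linear_b.regularize(unicode_text)
```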
105 | 106 | # Acknowledgements 107 | 108 | We acknowledge support from Wytamma Wirth, Brent Davis, Kim Doyle, Man-Hua (Kate) Chu, Anhui (Ellie) Situ, Ekaterina Vylomova, Chris Guest and Stavroula (Stephie) Nikoloudis. This research was supported by The University of Melbourne’s Research Computing Services. Robert Turnbull completed part of this work through the BICROSS project, which has received funding from the European Research Council (ERC) under the European Union’s Horizon Europe research and innovation programme (grant agreement no. 101043730 – BICROSS – ERC-2021-COG). 109 | 110 | # References 111 | -------------------------------------------------------------------------------- /potnia/data/linear_b.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | ############################################################ 3 | ###### Syllabograms Common to Linear A and Linear B ######## 4 | ############################################################ 5 | # LB SOUND VALUES KNOWN 6 | a: 𐀀 7 | e: 𐀁 8 | i: 𐀂 9 | o: 𐀃 10 | u: 𐀄 11 | da: 𐀅 12 | de: 𐀆 13 | di: 𐀇 14 | do: 𐀈 15 | du: 𐀉 16 | ja: 𐀊 17 | je: 𐀋 18 | jo: 𐀍 19 | ju: 𐀎 20 | ka: 𐀏 21 | ke: 𐀐 22 | ki: 𐀑 23 | ko: 𐀒 24 | ku: 𐀓 25 | ma: 𐀔 26 | me: 𐀕 27 | mi: 𐀖 28 | mo: 𐀗 29 | mu: 𐀘 30 | na: 𐀙 31 | ne: 𐀚 32 | ni: 𐀛 33 | "no": 𐀜 34 | nu: 𐀝 35 | nwa: 𐁅 36 | pa: 𐀞 37 | pe: 𐀟 38 | pi: 𐀠 39 | po: 𐀡 40 | pu: 𐀢 41 | pu₂: 𐁆 42 | pu2: 𐁆 43 | qa: 𐀣 44 | qe: 𐀤 45 | qi: 𐀥 46 | qo: 𐀦 47 | ra: 𐀨 48 | ra₂: 𐁈 49 | ra2: 𐁈 50 | re: 𐀩 51 | ri: 𐀪 52 | ro: 𐀫 53 | ru: 𐀬 54 | sa: 𐀭 55 | se: 𐀮 56 | si: 𐀯 57 | so: 𐀰 58 | su: 𐀱 59 | ta: 𐀲 60 | ta₂: 𐁋 61 | ta2: 𐁋 62 | te: 𐀳 63 | ti: 𐀴 64 | to: 𐀵 65 | tu: 𐀶 66 | wa: 𐀷 67 | we: 𐀸 68 | wi: 𐀹 69 | wo: 𐀺 70 | za: 𐀼 71 | ze: 𐀽 72 | zo: 𐀿 73 | 74 | # LB SOUND VALUES UNKNOWN 75 | "*22": 𐁒 76 | "*34": 𐁓 77 | "*47": 𐁔 78 | "*49": 𐁕 79 | "*86": 𐁜 80 | 81 | ############################################################ 82 | ###### Syllabograms Unique to Linear B ######## 83 | ############################################################ 84 | 85 | ## Sound values known 86 | a₂: 𐁀 87 | a2: 𐁀 88 | a₃: 𐁁 89 | a3: 𐁁 90 | au: 𐁂 91 | dwe: 𐁃 92 | dwo: 𐁄 93 | pte: 𐁇 94 | ra₃: 𐁉 95 | ra3: 𐁉 96 | ro₂: 𐁊 97 | ro2: 𐁊 98 | twe: 𐁌 99 | two: 𐁍 100 | 101 | ## Sound values unknown 102 | "*18": 𐁐 103 | "*19": 𐁑 104 | # *35 missing unicode sign? 
105 | "*63": 𐁗 106 | "*83": 𐁛 107 | "*89": 𐁝 108 | 109 | # Doubtful sound values 110 | "*56": 𐁖 # potential sound value 'pa₃' 111 | "*64": 𐁘 # potential sound value 'swi' 112 | "*65": 𐀎 # potential sound value 'ju' 113 | "*79": 𐁙 # potential sound value 'zu' 114 | "*82": 𐁚 # potential sound value 'swa' 115 | 116 | ############################################################ 117 | ##### Logograms ####### 118 | ############################################################ 119 | # PEOPLE AND ANIMALS 120 | VIR: 𐂀 # man 121 | MUL: 𐂁 # woman 122 | CERV: 𐂂 # deer 123 | EQU: 𐂃 # horse 124 | EQUx: 𐂃 # horse 125 | EQU:x: 𐂃 # horse 126 | EQUf: 𐂄 # female horse 127 | EQU:f: 𐂄 # female horse 128 | EQUm: 𐂅 # male horse 129 | EQU:m: 𐂅 # male horse 130 | OVIS: 𐀥 # sheep 131 | OVISx: 𐀥 # sheep 132 | OVIS:x: 𐀥 # sheep 133 | OVISf: 𐂆 # female sheep 134 | OVIS:f: 𐂆 # female sheep 135 | OVISm: 𐂇 # male sheep 136 | OVIS:m: 𐂇 # male sheep 137 | CAP: 𐁒 # goat 138 | CAPx: 𐁒 # goat 139 | CAP:x: 𐁒 # goat 140 | CAPf: 𐂈 # female goat 141 | CAP:f: 𐂈 # female goat 142 | CAPm: 𐂉 # male goat 143 | CAP:m: 𐂉 # male goat 144 | SUS: 𐁂 # pig 145 | SUSx: 𐁂 # pig 146 | SUS:x: 𐁂 # pig 147 | SUSf: 𐂊 # female pig 148 | SUS:f: 𐂊 # female pig 149 | SUSm: 𐂋 # male pig 150 | SUS:m: 𐂋 # male pig 151 | BOS: 𐀘 # cattle 152 | BOSx: 𐀘 # cattle 153 | BOS:x: 𐀘 # cattle 154 | BOSf: 𐂌 # female cattle 155 | BOS:f: 𐂌 # female cattle 156 | BOSm: 𐂍 # male cattle 157 | BOS:m: 𐂍 # male cattle 158 | 159 | # DRY COMMODITIES 160 | GRA: 𐂎 # wheat or barley 161 | HORD: 𐂏 # barley 162 | OLIV: 𐂐 # olive 163 | AROM: 𐂑 # spice 164 | CYP: 𐂒 # cyperus 165 | PYC: 𐂒 # cyperus 166 | KA+PO: 𐂓 # fruit? 167 | KA±PO: 𐂓 # fruit? 168 | KA+NA+KO: 𐂔 # saffron 169 | KA±NA±KO: 𐂔 # saffron 170 | KANAKO: 𐂔 # saffron 171 | CROC: 𐁉 # saffron 172 | FAR: 𐀎 # flour 173 | 174 | # LIQUID COMMODITIES 175 | OLE: 𐂕 # olive oil 176 | VIN: 𐂖 # wine 177 | "*132": 𐂗 178 | A+RE+PA: 𐂘 # ointment 179 | A±RE±PA: 𐂘 # ointment 180 | ME+RI: 𐂙 # honey 181 | ME±RI: 𐂙 # honey 182 | ME<±RI>: 𐂙 # honey 183 | 184 | # METALS 185 | AES: 𐂚 # bronze 186 | AUR: 𐂛 # gold 187 | "*142": 𐂜 188 | 189 | # OTHER MATERIALS AND ITEMS 190 | LANA: 𐂝 # wool 191 | "*146": 𐂞 # linen garment? 192 | "*146;2": 𐂞² # linen garment? 193 | "*146+PE": 𐂞+𐀟 # linen garment? 194 | "*150": 𐂟 195 | CORN: 𐂠 # horn (wild goat) 196 | "*152": 𐂡 # oxhinde 197 | "*153": 𐂢 # sheepskin 198 | "*154": 𐂣 # hide? 199 | TU+RO₂: 𐂤 # cheese 200 | TU±RO2 : 𐂤 # cheese 201 | "*157": 𐂥 202 | "*158": 𐂦 203 | TELA: 𐂧 # cloth 204 | "*160": 𐂨 205 | "*161": 𐂩 206 | TUN: 𐂪 # breastplate 207 | ARM: 𐂫 # armour 208 | "*164": 𐂬 209 | "*165": 𐂭 210 | "*166": 𐂮 211 | "*167": 𐂯 # ingot 212 | "*168": 𐂰 213 | "*169": 𐂱 214 | "*170": 𐂲 215 | "*171": 𐂳 216 | "*172": 𐂴 # honeycomb? 217 | LUNA: 𐂵 # month's ration? 218 | "*174": 𐂶 219 | ARB: 𐂷 # tree 220 | "*177": 𐂸 221 | "*178": 𐂹 222 | "*179": 𐂺 223 | "*180": 𐂻 # parchment? 224 | "*181": 𐂼 225 | "*182": 𐂽 226 | "*183": 𐂾 227 | "*184": 𐂿 228 | "*185": 𐃀 229 | "*189": 𐃁 230 | "*190": 𐃂 231 | GAL: 𐃃 # helmet 232 | "*220": 𐃄 # footstool 233 | ALV: 𐃅 # bathtub 234 | HAS: 𐃆 # spear 235 | SAG: 𐃇 # arrow 236 | "*232": 𐃈 237 | PUG: 𐃉 # dagger 238 | "*234": 𐃊 239 | GUP: 𐃋 # dagger 240 | BIG: 𐃌 # chariot 241 | CUR: 𐃍 # chariot 242 | CAPS: 𐃎 # chariot frame 243 | ROTA: 𐃏 # wheel 244 | "*245": 𐃐 # chariot part? 245 | "*246": 𐃑 # chariot part? 246 | DI+PTE: 𐃒 # parchment? 247 | DI±PTE: 𐃒 # parchment? 
248 | "*248": 𐃓 249 | "*249": 𐃔 250 | "*251": 𐃕 251 | "*252": 𐃖 252 | "*253": 𐃗 253 | JAC: 𐃘 # javelin or dart 254 | "*255": 𐃙 255 | "*256": 𐃚 # bow? 256 | "*257": 𐃛 257 | "*258": 𐃜 258 | "*259": 𐃝 259 | 260 | # VESSELS 261 | "*155VAS": 𐃞 # basket? 262 | "*200VAS": 𐃟 263 | "*201VAS": 𐃠 # tripod 264 | "*202VAS": 𐃡 265 | "*203VAS": 𐃢 266 | "*204VAS": 𐃣 267 | "*205VAS": 𐃤 268 | "*206VAS": 𐃥 # hydria 269 | "*207VAS": 𐃦 270 | "*208VAS": 𐃧 # patera 271 | "*208aVAS": 𐃧 # patera 272 | "*208bVAS": 𐃧 # patera 273 | "*209VAS": 𐃨 # amphora 274 | "*210VAS": 𐃩 275 | "*211VAS": 𐃪 276 | "*212VAS": 𐃫 277 | "*213VAS": 𐃬 # lanx 278 | "*214VAS": 𐃭 279 | "*215VAS": 𐃮 # kylix 280 | "*216VAS": 𐃯 281 | "*217VAS": 𐃰 282 | "*218VAS": 𐃱 283 | "*219VAS": 𐃲 284 | "*221VAS": 𐃳 285 | "*222VAS": 𐃴 286 | "*226VAS": 𐃵 # washing ware shet 287 | "*227VAS": 𐃶 # rhyton 288 | "*228VAS": 𐃷 # ligula 289 | "*229VAS": 𐃸 # ladle 290 | "*250VAS": 𐃹 291 | "*305VAS": 𐃺 292 | 293 | # METRIC SYMBOLS 294 | Z: 𐄿 # volume measure 295 | V: 𐄾 # volume measure 296 | T: 𐄼 # dry measure 297 | S: 𐄽 # liquid measure 298 | Q: 𐄻 # weight measure 299 | P: 𐄺 # weight measure 300 | N: 𐄹 # weight measure 301 | M: 𐄸 # weight measure 302 | L: 𐄷 # talent 303 | 304 | # COMPOSITE IDEOGRAMS WITHOUT DESIGNATED UNICODE SIGNS 305 | OVIS+TA: 𐀥+𐀲 # stabled sheep 306 | SUS+KA: 𐁂+𐀏 # wild boar 307 | SUS+SI: 𐁂+𐀯 # fattened pigs 308 | BOS+SI: 𐀘+𐀯 # fattened cattle 309 | CAP+E: 𐁒+𐀁 # kid (goat) 310 | EQU+QE: 𐂃+𐀤 311 | GRA+Q: 𐂎+𐄻 312 | GRA+PE: 𐂎+𐀟 313 | OLIV+A: 𐂐+𐀀 # wild olive 314 | OLIV+TI: 𐂐+𐀴 # domestic olive 315 | AROM+CYP: 𐂑+𐂒 # cyperus 316 | AROM+PYC: 𐂑+𐂒 # cyperus 317 | AROM+KO: 𐂑+𐀒 # coriander 318 | CYP+KU: 𐂒+𐀓 # cyperus 319 | CYP+O: 𐂒+𐀡 # cyperus (variant) 320 | CYP+PA: 𐂒+𐀞 # cyperus (variant) 321 | CYP+QA: 𐂒+𐀣 # cyperus (variant) 322 | PYC+KU: 𐂒+𐀓 # cyperus 323 | PYC+O: 𐂒+𐀡 # cyperus (variant) 324 | PYC+PA: 𐂒+𐀞 # cyperus (variant) 325 | PYC+QA: 𐂒+𐀣 # cyperus (variant) 326 | OLE+A: 𐂕+𐀀 # wild oil? 327 | OLE+O: 𐂕+𐀃 328 | OLE+PA: 𐂕+𐀞 # sage-scented oil? 329 | OLE+RA: 𐂕+𐀨 330 | OLE+SI: 𐂕+𐀯 331 | OLE+WE: 𐂕+𐀸 # oil suitable for annointing 332 | ROTA+TE: 𐃏+𐀳 # wheel with border or flange 333 | TUN+KI: 𐂪+𐀑 334 | TUN+QE: 𐂪+𐀤 # type of corselet 335 | TUN+RI: 𐂪+𐀪 336 | TELA+KU: 𐂧+𐀓 337 | TELA+PA: 𐂧+𐀞 # pharwos cloth 338 | TELA+PU: 𐂧+𐀢 339 | TELA+TE: 𐂧+𐀳 340 | "*166+WE": 𐂮+𐀸 341 | "*167+PE": 𐂯+𐀟 342 | "*168+SE": 𐂰+𐀮 343 | "*172+KE": 𐂴+𐀐 344 | "*172+KE+RO2": 𐂴+𐀐+𐂤 345 | "*180+DI": 𐂻+𐀇 346 | "*155VAS+DI": 𐃞+𐀇 347 | "*155VAS+NI": 𐃞+𐀛 348 | "*202VAS+DI": 𐃡+𐀇 349 | "*209VAS+A": 𐃨+𐀀 # amphora (plural) 350 | "*210VAS+KA": 𐃩+𐀏 # stirrup jar (plural) 351 | "*211VAS+PO": 𐃪+𐂓 352 | "*212VAS+U": 𐃫+𐀄 353 | "*213VAS+U": 𐃬+𐀄 354 | "*214VAS+U": 𐃭+𐀄 355 | "*214VAS+DI": 𐃭+𐀇 356 | "AUR+*213VAS": 𐂛+𐃬 357 | 358 | # SYLLABOGRAMS AS IDEOGRAMS 359 | A: 𐀀 360 | E: 𐀁 # ginger grass? # kid (goat)? 361 | I: 𐀂 362 | O: 𐀃 363 | U: 𐀄 364 | DA: 𐀅 # male steward 365 | DE: 𐀆 # bundle (used to measure ginger grass) 366 | DI: 𐀇 367 | DO: 𐀈 368 | DU: 𐀉 369 | JA: 𐀊 370 | JE: 𐀋 371 | JO: 𐀍 372 | JU: 𐀎 373 | KA: 𐀏 374 | KE: 𐀐 375 | KI: 𐀑 376 | KO: 𐀒 # coriander # piglet? 
377 | KU: 𐀓 # cumin 378 | MA: 𐀔 # fennel 379 | ME: 𐀕 380 | MI: 𐀖 # mint 381 | MO: 𐀗 # single 382 | MU: 𐀘 383 | NA: 𐀙 384 | NE: 𐀚 385 | NI: 𐀛 # figs 386 | NO: 𐀜 387 | NU: 𐀝 388 | PA: 𐀞 389 | PE: 𐀟 390 | PI: 𐀠 391 | PO: 𐀡 392 | PU: 𐀢 393 | QA: 𐀣 394 | QE: 𐀤 395 | QI: 𐀥 396 | QO: 𐀦 397 | RA: 𐀨 398 | RE: 𐀩 399 | RI: 𐀪 # flax 400 | RO: 𐀫 401 | RU: 𐀬 402 | SA: 𐀭 # sesame 403 | SE: 𐀮 404 | SI: 𐀯 405 | SO: 𐀰 406 | SU: 𐀱 407 | TA: 𐀲 # female steward 408 | TE: 𐀳 409 | TI: 𐀴 410 | TO: 𐀵 411 | TU: 𐀶 412 | WA: 𐀷 413 | WE: 𐀸 # yearling (animal) 414 | WI: 𐀹 415 | WO: 𐀺 416 | ZA: 𐀼 417 | ZE: 𐀽 # pair 418 | ZO: 𐀿 419 | A₂: 𐁀 420 | A2: 𐁀 421 | A₃: 𐁁 422 | A3: 𐁁 423 | AU: 𐁂 424 | DWE: 𐁃 425 | DWO: 𐁄 426 | NWA: 𐁅 427 | PU₂: 𐁆 428 | PU2: 𐁆 429 | PTE: 𐁇 430 | RA₂: 𐁈 431 | RA2: 𐁈 432 | RA₃: 𐁉 433 | RA3: 𐁉 434 | RO₂: 𐁊 435 | RO2: 𐁊 436 | TA₂: 𐁋 437 | TA2: 𐁋 438 | TWE: 𐁌 439 | TWO: 𐁍 440 | 441 | # TELA CASES 442 | TELAx: 𐂧ˣ 443 | TELA;x: 𐂧ˣ 444 | TELA1: 𐂧¹ 445 | TELA;1: 𐂧¹ 446 | TELA2: 𐂧² 447 | TELA;2: 𐂧² 448 | TELA3: 𐂧³ 449 | TELA;3: 𐂧³ 450 | TELA4: 𐂧⁴ 451 | TELA;4: 𐂧⁴ 452 | TELAx+KU: 𐂧ˣ+𐀓 453 | TELA;x+KU: 𐂧ˣ+𐀓 454 | TELA1+KU: 𐂧¹+𐀓 455 | TELA;1+KU: 𐂧¹+𐀓 456 | TELA2+KU: 𐂧²+𐀓 457 | TELA;2+KU: 𐂧²+𐀓 458 | TELA3+KU: 𐂧³+𐀓 459 | TELA;3+KU: 𐂧³+𐀓 460 | TELA4+KU: 𐂧⁴+𐀓 461 | TELA;4+KU: 𐂧⁴+𐀓 462 | TELAx+PA: 𐂧ˣ+𐀞 463 | TELA;x+PA: 𐂧ˣ+𐀞 464 | TELA1+PA: 𐂧¹+𐀞 465 | TELA;1+PA: 𐂧¹+𐀞 466 | TELA2+PA: 𐂧²+𐀞 467 | TELA;2+PA: 𐂧²+𐀞 468 | TELA3+PA: 𐂧³+𐀞 469 | TELA;3+PA: 𐂧³+𐀞 470 | TELA4+PA: 𐂧⁴+𐀞 471 | TELA;4+PA: 𐂧⁴+𐀞 472 | TELAx+PO: 𐂧ˣ+𐀡 473 | TELA;x+PO: 𐂧ˣ+𐀡 474 | TELA1+PO: 𐂧¹+𐀡 475 | TELA;1+PO: 𐂧¹+𐀡 476 | TELA2+PO: 𐂧²+𐀡 477 | TELA;2+PO: 𐂧²+𐀡 478 | TELA3+PO: 𐂧³+𐀡 479 | TELA;3+PO: 𐂧³+𐀡 480 | TELA4+PO: 𐂧⁴+𐀡 481 | TELA;4+PO: 𐂧⁴+𐀡 482 | TELAx+PU: 𐂧ˣ+𐀢 483 | TELA;x+PU: 𐂧ˣ+𐀢 484 | TELA1+PU: 𐂧¹+𐀢 485 | TELA;1+PU: 𐂧¹+𐀢 486 | TELA2+PU: 𐂧²+𐀢 487 | TELA;2+PU: 𐂧²+𐀢 488 | TELA3+PU: 𐂧³+𐀢 489 | TELA;3+PU: 𐂧³+𐀢 490 | TELA4+PU: 𐂧⁴+𐀢 491 | TELA;4+PU: 𐂧⁴+𐀢 492 | TELAx+TE: 𐂧ˣ+𐀳 493 | TELA;x+TE: 𐂧ˣ+𐀳 494 | TELA1+TE: 𐂧¹+𐀳 495 | TELA;1+TE: 𐂧¹+𐀳 496 | TELA2+TE: 𐂧²+𐀳 497 | TELA;2+TE: 𐂧²+𐀳 498 | TELA3+TE: 𐂧³+𐀳 499 | TELA;3+TE: 𐂧³+𐀳 500 | TELA4+TE: 𐂧⁴+𐀳 501 | TELA;4+TE: 𐂧⁴+𐀳 502 | TELAx+ZO: 𐂧ˣ+𐀿 503 | TELA;x+ZO: 𐂧ˣ+𐀿 504 | TELA1+ZO: 𐂧¹+𐀿 505 | TELA;1+ZO: 𐂧¹+𐀿 506 | TELA2+ZO: 𐂧²+𐀿 507 | TELA;2+ZO: 𐂧²+𐀿 508 | TELA3+ZO: 𐂧³+𐀿 509 | TELA;3+ZO: 𐂧³+𐀿 510 | TELA4+ZO: 𐂧⁴+𐀿 511 | TELA;4+ZO: 𐂧⁴+𐀿 512 | 513 | #EXCEPTION SCENARIOS 514 | "TELA;1+": 𐂧¹+ 515 | "TELA[;1+TE": 𐂧¹%+𐀳 516 | "TELA-[;1+TE": 𐂧¹%+𐀳 517 | "TELA[;1]+TE": 𐂧¹%+𐀳 518 | "TELA-[;1]-+TE": 𐂧¹%+𐀳 519 | "TELA[;1": 𐂧¹% 520 | "TELA;6+": 𐂧⁶+ 521 | "TELA;4+": 𐂧⁴+ 522 | "TELA;2+": 𐂧²+ 523 | "TELA;+": 𐂧+ 524 | "TELA;" : 𐂧 525 | "*164;1": 𐂬¹ 526 | "+DI": +𐀇 527 | "+WE": +𐀸 528 | "+TE": +𐀳 529 | "+TA": +𐀲 530 | "OLE+": 𐂕+ 531 | "OVIS[:m": 𐂇 532 | "OVIS]:m": 𐂇 533 | "OVIS]-:m": 𐂇 534 | "OVIS[:f": 𐂆 535 | "OVIS]:f": 𐂆 536 | "OVIS:": 𐀥 537 | "ME<±RI>": 𐂙 538 | 539 | 540 | patterns_to_ignore: 541 | - "vacat\\s*\\.?" 542 | - "lat\\s*\\." 543 | - "inf\\s*\\." 544 | - "i\\s*\\." 545 | - "mut\\s*\\." 546 | - "sup\\s*\\." 547 | - "vac\\s*\\." 548 | - "v\\s*\\." 549 | - "vestigia" 550 | - "l\\s*\\." 551 | - "s\\s*\\." 
552 | - "Graffito" 553 | - "[\\/\\,\\'\\?]" 554 | - "⟦.*?⟧" 555 | - "deest" 556 | - "[⸤⸥]" 557 | - "[\\u231e\\u231f]" # Ignore characters ⌞ and ⌟ 558 | 559 | 560 | regularization: 561 | - ['\\|', ''] 562 | - [':', ''] 563 | - ['r\\.', ''] 564 | - ['\\[•~\\]', ''] 565 | - ['⌜', ''] 566 | - ['⌝', ''] 567 | - ['mutila', ''] 568 | - ['dt',''] 569 | - ['\\[?•~•~•~•\\]?', '%%%%'] 570 | - ['\\[?•~•~•\\]?', '%%%'] 571 | - ['\\[?•~•~\\]?', '%%'] 572 | - ['\\[?•~•\\]?', '%%'] # Corrected this line 573 | - ['\\<|\\>', ''] 574 | - ['\\[ \\]', '[ ]'] 575 | - ['ro2', '𐁊'] 576 | - ['vestigia', '%'] 577 | - ['\\bqs\\b', '%'] 578 | - ['vest\\s*\\.', '%'] 579 | - ['\\[•\\]', '%'] 580 | - ['supra sigillum|CMS \\w+\\d+[A-Z]* \\d+', ''] 581 | - ['reliqua pars sine regulis', ''] 582 | - ['[αβγ]', ''] 583 | - ['v\\.→', ''] 584 | - ['v\\.↓', ''] 585 | - ['v\\.', ''] 586 | - ['\\b(vacat|sup. mut.|inf. mut.|deest|X|fragmentum A|fragmentum B|graffito|angustum|prior pars sine regulis|fragmentum C|fragmentum D|fragmentum separatum|α|β|γ|δ|sigillum|)\\b', ''] # Corrected this line 587 | - ['\\b(x|m|f)\\b', ''] 588 | - ['[\\[\\]]', '%'] 589 | - ['=[^ ]*', ''] 590 | - ['•', '%'] 591 | - ['●', ''] 592 | - ['dex.', ''] 593 | - ['sin.', ''] 594 | - [' p',''] 595 | 596 | tokenization: 597 | 598 | - ['\u00a0', ' '] # Replace non-breaking space with regular space 599 | - ['\u0323', ''] # Remove specific character (e.g., dot below) 600 | - ['', ''] # Remove HTML closing emphasis tag 601 | - ['', ''] # Remove HTML opening emphasis tag 602 | - ['\|([^|]+)\|', '|\1|'] # Special handling to ensure pipes are treated as separate tokens 603 | - ['ME<±RI>', 'ME±RI'] # Handle specific compound tokens like 'ME<±RI>' 604 | - ['--', '-'] # Normalize the text by replacing double dashes with a single dash 605 | - ['\b(EQU|SUS|OVIS|BOS|CAP)\s+(x|m|f)\b', '\1\2'] # Combine animal ideograms followed by 'x', 'm', or 'f' without space 606 | - ['⌜', ' ⌜ '] # Explicit tokenization for half brackets 607 | - ['⌝', ' ⌝ '] 608 | - ['mutila', ' mutila '] # Handle 'mutila' 609 | - ['fragmentum A', 'fragmentum_A'] # Preprocess 'fragmentum A' and 'fragmentum B' to ensure they are not split 610 | - ['fragmentum B', 'fragmentum_B'] 611 | - ['\b(BOS|SUS|OVIS|CAP|EQU)\s([mf])\b', '\1\2'] # Combine terms with 'm' or 'f' 612 | - ['\](?=[^\s])', ']-'] # Pre-process ']' and '[' for special handling 613 | - ['(?<=[^\s])\[', '-['] 614 | - ['TELA\s+(?=[1234x]\b)', 'TELA'] # Handle specific cases 615 | - ['TELA\s+(\d+)', 'TELA \1'] # Handle other numbers with space 616 | - ['\* (\d+)', '*\1'] # Combine '*' with the following numeral 617 | - ['\+ ([^\s]+)', '+\1'] # Combine '+' with surrounding ideograms 618 | - ['([^\s]) \+', '\1+'] # Ensure '+' is properly attached 619 | - ['([^\s]+) VAS', '\1VAS'] # Attach 'VAS' properly 620 | - ['\b(vac|vest|l|s|lat|inf|mut|sup|i)\s?\.', '\1.'] # Ignore or modify specific patterns 621 | - ['\b(supra sigillum|reliqua pars sine regulis|vacat)\b', '\1'] # Explicit tokenization 622 | 623 | complex_symbols: 624 | 'TELA-[;1+TE': 'PLACEHOLDER_TELA1' 625 | 'TELA;1+TE': 'PLACEHOLDER_TELA2' 626 | 'TELA-[;1]-+TE': 'PLACEHOLDER_TELA3' 627 | 'OVIS]-:m': 'PLACEHOLDER_OVIS' 628 | 629 | special_chars_pattern: "(\\[|\\]|\\,|\\'|\\u27e6|\\u27e7|-|\\?|<|>|⌞|⌟|⸤|⸥|\\||\ue000)" 630 | 631 | restore_patterns: 632 | - ['fragmentum_A', 'fragmentum A'] 633 | - ['fragmentum_B', 'fragmentum B'] 634 | - ['ME±RI', 'ME<±RI>'] -------------------------------------------------------------------------------- /paper.bib: 
-------------------------------------------------------------------------------- 1 | @article{sommerschieldMachineLearningAncient2023, 2 | title = {Machine learning for ancient languages: {A} survey.}, 3 | volume = {49}, 4 | issn = {0891-2017, 1530-9312}, 5 | shorttitle = {Machine {Learning} for {Ancient} {Languages}}, 6 | url = {https://direct.mit.edu/coli/article/doi/10.1162/coli_a_00481/116160/Machine-Learning-for-Ancient-Languages-A-Survey}, 7 | doi = {10.1162/coli_a_00481}, 8 | abstract = {Abstract 9 | Ancient languages preserve the cultures and histories of the past. However, their study is fraught with difficulties, and experts must tackle a range of challenging text-based tasks, from deciphering lost languages to restoring damaged inscriptions, to determining the authorship of works of literature. Technological aids have long supported the study of ancient texts, but in recent years advances in artificial intelligence and machine learning have enabled analyses on a scale and in a detail that are reshaping the field of humanities, similarly to how microscopes and telescopes have contributed to the realm of science. This article aims to provide a comprehensive survey of published research using machine learning for the study of ancient texts written in any language, script, and medium, spanning over three and a half millennia of civilizations around the ancient world. To analyze the relevant literature, we introduce a taxonomy of tasks inspired by the steps involved in the study of ancient documents: digitization, restoration, attribution, linguistic analysis, textual criticism, translation, and decipherment. This work offers three major contributions: first, mapping the interdisciplinary field carved out by the synergy between the humanities and machine learning; second, highlighting how active collaboration between specialists from both fields is key to producing impactful and compelling scholarship; third, highlighting promising directions for future work in this field. 
Thus, this work promotes and supports the continued collaborative impetus between the humanities and machine learning.}, 10 | language = {en}, 11 | number = {3}, 12 | urldate = {2023-10-14}, 13 | journal = {Computational Linguistics}, 14 | author = {Sommerschield, Thea and Assael, Yannis and Pavlopoulos, John and Stefanak, Vanessa and Senior, Andrew and Dyer, Chris and Bodel, John and Prag, Jonathan and Androutsopoulos, Ion and Freitas, Nando De}, 15 | year = {2023}, 16 | pages = {1--45}, 17 | file = {Sommerschield-etal_2023_AncientLanguageML-Review.pdf:C\:\\Users\\esmto\\Zotero\\storage\\4K6FRP75\\Sommerschield-etal_2023_AncientLanguageML-Review.pdf:application/pdf}, 18 | } 19 | 20 | @article{Terras_melissa, 21 | author = {Terras, Melissa and Robertson, Paul}, 22 | year = {2005}, 23 | month = {03}, 24 | pages = {}, 25 | title = {Image and Interpretation: Using Artificial Intelligence to Read Ancient Roman Texts}, 26 | volume = {7}, 27 | journal = {Human IT: tidskrift för studier av IT ur ett humanvetenskapligt perspektiv} 28 | } 29 | 30 | @inproceedings{papavassileiouDatasetMycenaeanLinear2020, 31 | title = {A Dataset of {{Mycenaean Linear B}} Sequences}, 32 | booktitle = {Proceedings of the 12th {{Conference}} on {{Language Resources}} and {{Evaluation}} ({{LREC}} 2020)}, 33 | author = {Papavassileiou, Katerina and Owens, Gareth and Kosmopoulos, Dimitrios}, 34 | year = {2020}, 35 | pages = {2552--2561}, 36 | publisher = {European Language Resources Association}, 37 | abstract = {We present a dataset of Mycenaean Linear B sequences gathered from the Mycenaean inscriptions written in the 13th and 14th century B.C. (c. 1400-1200 B.C.). The dataset contains sequences of Mycenaean words and ideograms according to the rules of the Mycenaean Greek language in the Late Bronze Age. Our ultimate goal is to contribute to the study, reading and understanding of ancient scripts and languages. Focusing on sequences, we seek to exploit the structure of the entire language, not just the Mycenaean vocabulary, to analyse sequential patterns. We present an initial experiment on estimating the missing symbols in damaged inscriptions using the dataset.}, 38 | file = {C:\Users\esmto\Zotero\storage\H4APPFAV\Papavassileiou-etal_2020_LinBSequences.pdf} 39 | } 40 | 41 | @ARTICLE{Hossain2024, 42 | author={Hossain, Anushah}, 43 | journal={IEEE Annals of the History of Computing}, 44 | title={{Text Standards for the “Rest of World”: The Making of the Unicode Standard and the OpenType Format}}, 45 | year={2024}, 46 | volume={46}, 47 | number={1}, 48 | pages={20--33}, 49 | keywords={Standards;Computers;Encoding;Writing;History;Keyboards;Visualization;History of Computing;Unicode Standard;OpenType font format;Indic scripts;text stack}, 50 | doi={10.1109/MAHC.2024.3351948} 51 | } 52 | 53 | @article{digital_syriac, 54 | url = {https://doi.org/10.1515/zac-2020-0018}, 55 | title = {{The Digital Syriac Corpus: A Digital Repository for Syriac Texts}}, 56 | author = {James E. 
Walters}, 57 | pages = {109--122}, 58 | volume = {24}, 59 | number = {1}, 60 | journal = {Zeitschrift für Antikes Christentum / Journal of Ancient Christianity}, 61 | doi = {10.1515/zac-2020-0018}, 62 | year = {2020}, 63 | lastchecked = {2024-09-26} 64 | } 65 | 66 | @dataset{openiti, 67 | author = {Nigst, Lorenz and 68 | Romanov, Maxim and 69 | Savant, Sarah Bowen and 70 | Seydi, Masoumeh and 71 | Verkinderen, Peter}, 72 | title = {{OpenITI: a Machine-Readable Corpus of Islamicate 73 | Texts}}, 74 | month = oct, 75 | year = 2023, 76 | publisher = {Zenodo}, 77 | version = {2023.1.8}, 78 | doi = {10.5281/zenodo.10021513}, 79 | url = {https://doi.org/10.5281/zenodo.10021513} 80 | } 81 | 82 | @misc{sefaria_project, 83 | author = {{Sefaria}}, 84 | title = {Sefaria: A Living Library of Jewish Texts Online}, 85 | howpublished = {\url{https://www.sefaria.org}}, 86 | year = {2024}, 87 | } 88 | 89 | 90 | @software{canonicalgreek, 91 | author = {Lisa Cerrato and 92 | Bridget Almas and 93 | TDBuck and 94 | ahanhardt and 95 | srdee and 96 | Alison Babeu and 97 | Thibault Clérice and 98 | Scott Fleischman and 99 | gregorycrane and 100 | Matthew Munson and 101 | Aurélien Berra and 102 | Adiel Mittmann and 103 | Chiara Palladino and 104 | KATEBHN and 105 | Eric Sowell and 106 | Joel Kalvesmaki and 107 | Stephen Scott and 108 | Jeroen Hellingman and 109 | Andrei and 110 | Chris Drymon}, 111 | title = {Canonical Greek Literature}, 112 | month = jul, 113 | year = 2021, 114 | publisher = {Zenodo}, 115 | version = {0.0.2867}, 116 | doi = {10.5281/zenodo.5090923}, 117 | url = {https://doi.org/10.5281/zenodo.5090923} 118 | } 119 | 120 | 121 | @inproceedings{luoNeuralDeciphermentMinimumcost2019, 122 | address = {Florence, Italy}, 123 | title = {Neural decipherment via minimum-cost flow: {From} {Ugaritic} to {Linear} {B}}, 124 | shorttitle = {Neural {Decipherment} via {Minimum}-{Cost} {Flow}}, 125 | url = {https://aclanthology.org/P19-1303}, 126 | doi = {10.18653/v1/P19-1303}, 127 | urldate = {2023-10-26}, 128 | booktitle = {Proceedings of the 57th {Annual} {Meeting} of the {Association} for {Computational} {Linguistics}}, 129 | publisher = {Association for Computational Linguistics}, 130 | author = {Luo, Jiaming and Cao, Yuan and Barzilay, Regina}, 131 | year = {2019}, 132 | pages = {3146--3155}, 133 | file = {Luo-etal_2021_UndecipheredUndersegmentedScripts.pdf:C\:\\Users\\esmto\\Zotero\\storage\\XCM7RBPS\\Luo-etal_2021_UndecipheredUndersegmentedScripts.pdf:application/pdf}, 134 | } 135 | 136 | 137 | @article{papavassileiouGenerativeModelMycenaean2023, 138 | title = {A generative model for the {Mycenaean} {Linear} {B} script and its application in infilling text from ancient tablets}, 139 | volume = {16}, 140 | issn = {1556-4673, 1556-4711}, 141 | url = {https://dl.acm.org/doi/10.1145/3593431}, 142 | doi = {10.1145/3593431}, 143 | language = {en}, 144 | number = {3}, 145 | urldate = {2023-10-14}, 146 | journal = {Journal on Computing and Cultural Heritage}, 147 | author = {Papavassileiou, Katerina and Kosmopoulos, Dimitrios I. 
and Owens, Gareth}, 148 | year = {2023}, 149 | pages = {1--25}, 150 | file = {Papavassileiou-etal_2023_LinBGenerativeModel.pdf:C\:\\Users\\esmto\\Zotero\\storage\\2GFRQVBA\\Papavassileiou-etal_2023_LinBGenerativeModel.pdf:application/pdf}, 151 | } 152 | 153 | 154 | @article{fetayaRestorationFragmentaryBabylonian2020, 155 | title = {Restoration of fragmentary {Babylonian} texts using recurrent neural networks}, 156 | volume = {117}, 157 | issn = {0027-8424, 1091-6490}, 158 | url = {https://pnas.org/doi/full/10.1073/pnas.2003794117}, 159 | doi = {10.1073/pnas.2003794117}, 160 | language = {en}, 161 | number = {37}, 162 | urldate = {2023-10-14}, 163 | journal = {Proceedings of the National Academy of Sciences}, 164 | author = {Fetaya, Ethan and Lifshitz, Yonatan and Aaron, Elad and Gordin, Shai}, 165 | month = sep, 166 | year = {2020}, 167 | pages = {22743--22751}, 168 | file = {Fetaya_2020_BabylonianRNN.pdf:C\:\\Users\\esmto\\Zotero\\storage\\D74YCB4F\\Fetaya_2020_BabylonianRNN.pdf:application/pdf}, 169 | } 170 | 171 | 172 | @inproceedings{peronocacciafocoNewApproachDecipherment2021, 173 | address = {Brest}, 174 | series = {Grapholinguistics and its applications}, 175 | title = {A {New} {Approach} to the {Decipherment} of {Linear} {A}, {Stage} 2 - {Cryptanalysis} and {Language} {Deciphering}: {A} "{Brute} {Force} {Attack}" on an {Undeciphered} {Writing} {System}}, 176 | shorttitle = {A {New} {Approach} to the {Decipherment} of {Linear} {A}, {Stage} 2 - {Cryptanalysis} and {Language} {Deciphering}}, 177 | doi = {10.36824/2020-graf-cacc}, 178 | booktitle = {Grapholinguistics in the 21st {Century} 2020. {Proceedings}}, 179 | publisher = {Fluxus Editions}, 180 | author = {Perono Cacciafoco, Francesco and Loh, Colin Jia Sheng}, 181 | year = {2021}, 182 | pages = {927--943}, 183 | file = {PeronoCacciafoco-Loh_2021_LADeciphermentBruteForce.pdf:C\:\\Users\\esmto\\Zotero\\storage\\5FYYFHK2\\PeronoCacciafoco-Loh_2021_LADeciphermentBruteForce.pdf:application/pdf}, 184 | } 185 | 186 | 187 | @article{auroraDAMOSDatabaseMycenaean2015, 188 | title = {D{ĀMOS} ({Database} of {Mycenaean} at {Oslo}). {Annotating} a fragmentarily attested language}, 189 | volume = {198}, 190 | issn = {18770428}, 191 | url = {https://linkinghub.elsevier.com/retrieve/pii/S187704281504416X}, 192 | doi = {10.1016/j.sbspro.2015.07.415}, 193 | language = {en}, 194 | urldate = {2024-08-25}, 195 | journal = {Procedia - Social and Behavioral Sciences}, 196 | author = {Aurora, Federico}, 197 | month = jul, 198 | year = {2015}, 199 | pages = {21--31}, 200 | } 201 | 202 | 203 | @article{weinbergTransliterationDocumentation1974, 204 | title = {Transliteration in documentation}, 205 | volume = {30}, 206 | issn = {0022-0418}, 207 | url = {https://doi.org/10.1108/eb026567}, 208 | doi = {10.1108/eb026567}, 209 | abstract = {The validity of transliteration in documentation is questioned in light of the resulting loss of precise information. The process is examined from the linguist's, cataloguer's, and user's points of view. The pros and cons of phonetic transcription vs. scientific transliteration are discussed. Specific problems of several non‐Roman alphabets are touched upon. The author advocates development of non‐Latin print chains for computers used for documentation work. Where the cost of this is prohibitive, scientific transliteration is imperative for the purposes of international documentation. 
For library purposes, maintenance of separate catalogues for each script is recommended.}, 210 | number = {1}, 211 | urldate = {2024-09-23}, 212 | journal = {Journal of Documentation}, 213 | author = {Weinberg, Bella}, 214 | year = {1974}, 215 | note = {Publisher: MCB UP Ltd}, 216 | keywords = {Potnia}, 217 | pages = {18--31}, 218 | file = {Weinberg_1974_Transliteration.pdf:C\:\\Users\\esmto\\OneDrive\\01 Education\\02 University of Melbourne\\03 Additional projects\\Weinberg_1974_Transliteration.pdf:application/pdf}, 219 | } 220 | 221 | @article{odishoTransliteratingEnglishArabic1992, 222 | title = {Transliterating {English} in {Arabic}}, 223 | issn = {0170026X}, 224 | url = {http://www.jstor.org/stable/43525603}, 225 | number = {24}, 226 | urldate = {2024-09-23}, 227 | journal = {Zeitschrift für Arabische Linguistik}, 228 | author = {Odisho, Edward Y.}, 229 | year = {1992}, 230 | note = {Publisher: Harrassowitz Verlag}, 231 | keywords = {Potnia}, 232 | pages = {21--34}, 233 | file = {Odisho_1992_Transliteration-ArabicEnglish.pdf:C\:\\Users\\esmto\\OneDrive\\01 Education\\02 University of Melbourne\\03 Additional projects\\Odisho_1992_Transliteration-ArabicEnglish.pdf:application/pdf}, 234 | } 235 | 236 | @article{martinetProjectTransliterationClassical1953, 237 | title = {A {Project} of {Transliteration} of {Classical} {Greek}}, 238 | volume = {9}, 239 | issn = {0043-7956, 2373-5112}, 240 | url = {http://www.tandfonline.com/doi/full/10.1080/00437956.1953.11659466}, 241 | doi = {10.1080/00437956.1953.11659466}, 242 | language = {en}, 243 | number = {2}, 244 | urldate = {2024-09-23}, 245 | journal = {WORD}, 246 | author = {Martinet, André}, 247 | year = {1953}, 248 | keywords = {Potnia}, 249 | pages = {152--161}, 250 | file = {Martinet_1953_Transliteration-ClassicalGreek.pdf:C\:\\Users\\esmto\\OneDrive\\01 Education\\02 University of Melbourne\\03 Additional projects\\Martinet_1953_Transliteration-ClassicalGreek.pdf:application/pdf}, 251 | } 252 | 253 | @book{chadwickDocumentsMycenaeanGreek1973, 254 | address = {Cambridge}, 255 | edition = {2}, 256 | title = {Documents in {Mycenaean} {Greek}}, 257 | publisher = {Cambridge University Press}, 258 | author = {Chadwick, J.}, 259 | year = {1973}, 260 | } 261 | 262 | @misc{tinney2019cuneify, 263 | author = {Steve Tinney}, 264 | title = {Cuneify}, 265 | year = {2019}, 266 | howpublished = {{Oracc: The Open Richly Annotated Cuneiform Corpus}}, 267 | url = {http://oracc.museum.upenn.edu/doc/tools/cuneify/} 268 | } 269 | 270 | @misc{ilintomich2021unicodeconverter, 271 | author = {Alexander Ilin-Tomich}, 272 | title = {UnicodeConverter}, 273 | year = {2019}, 274 | howpublished = {\url{https://github.com/ailintom/UnicodeConverter/}}, 275 | } 276 | 277 | @misc{senior2023anatoliangenerator, 278 | author = {Andrew Senior}, 279 | title = {Anatolian Hieroglyphics (Luwian) generation}, 280 | year = {2023}, 281 | howpublished = {\url{https://andrewsenior.com/luwian/}}, 282 | } 283 | 284 | @misc{prosser2019ras, 285 | author = {Prosser, Miller C. 
and Pardee, Dennis G.}, 286 | title = {{The Ras Shamra Tablet Inventory}}, 287 | year = {2019}, 288 | howpublished = {\url{https://onlinepublications.uchicago.edu/RSTI/}}, 289 | note = {Online Publication Service of the University of Chicago} 290 | } 291 | 292 | @misc{cdli2024, 293 | author = {{CDLI contributors}}, 294 | title = {{Cuneiform Digital Library Initiative}}, 295 | year = {2024}, 296 | howpublished = {\url{https://cdli.mpiwg-berlin.mpg.de/}}, 297 | note = {Cuneiform Digital Library Initiative, September 29, 2024} 298 | } 299 | 300 | @article{Zerrouki2023, 301 | title = {{PyArabic: A Python package for Arabic text}}, 302 | author = {Taha Zerrouki}, 303 | year = 2023, 304 | journal = {Journal of Open Source Software}, 305 | publisher = {The Open Journal}, 306 | volume = 8, 307 | number = 84, 308 | pages = 4886, 309 | doi = {10.21105/joss.04886}, 310 | url = {https://doi.org/10.21105/joss.04886} 311 | } 312 | 313 | @misc{DIN31635, 314 | author = {{Deutsches Institut für Normung}}, 315 | title = {{DIN 31635: Transliteration of the Arabic alphabet}}, 316 | year = {2011}, 317 | howpublished = {Standard published by Deutsches Institut für Normung}, 318 | address = {Berlin}, 319 | } 320 | -------------------------------------------------------------------------------- /tests/expected/linear_b_unicode.yaml: -------------------------------------------------------------------------------- 1 | # Test LB.A.1: 2 | # Scenario: Test that blank spaces act as word separators within text in annotated scenarios (except in specific scenarios, e.g. between a domesticated animal ideogram and a sex indicator). 3 | # Requirements mapping: 4 | # LB.1-x: Tokenise each blank space (including Unicode '\u00a0') and use to distinguish individual words. Represent as is in both annotated and regularized output. 5 | # This should occur in all but the specified exception scenarios (see LB.1-a–d). 6 | # LB.1-e: If a blank space appears after a domesticated animal ideogram (i.e. EQU, SUS, OVIS, BOS or CAP) and before either a lowercase 'm', 'f' or 'x', then remove the space in both the annotated and regularized outputs. 7 | 8 | "CAP f 130 SUS 17 SUS f 41 BOS m 2 BOS f 4": "𐂈 130 𐁂 17 𐂊 41 𐂍 2 𐂌 4" 9 | "]SUS x 4 KO 80[" : "]𐁂 4 𐀒 80[" 10 | 11 | # Test LB.A.2: 12 | # Scenario: Test that hyphens act as sign separators within a word in annotated scenarios. 13 | # Requirements mapping: 14 | # LB.2: Tokenise each instance of '-' and use to recognise whole words in encoding, but do not represent this symbol in either the annotated or the regularized output. 15 | 16 | "a-ri-to-jo" : "𐀀𐀪𐀵𐀍" 17 | 18 | # Test LB.A.3 19 | # Scenario: Test that blank spaces after * and before and after '+' are removed in annotated scenarios. 20 | # Requirements mapping: 21 | # LB.1-a: If a blank space appears after '*', then remove that space in both the annotated and regularized outputs. 22 | # LB.1-b: If a blank space appears before/after '+', then remove both those spaces in both the annotated and regularized outputs. 23 | 24 | "]qa-ra / re-me-to * 168 + SE 28" : "]𐀣𐀨 / 𐀩𐀕𐀵 𐂰+𐀮 28" 25 | 26 | # Test LB.A.4 27 | # Scenario: Test that '--' is treated the same as '-' in annotated scenarios. 28 | # Requirements mapping: 29 | # LB.3: Tokenise each instance of '--' as '-' and use to recognise whole words in encoding, but do not represent this symbol in either the annotated or the regularized output.
30 | 31 | "a-ka--[ ]--jo-jo , me-no-[ da-pu2-ri-[-to-jo ]-po-ti-ni-ja ri *166+WE 22-[" : "𐀀𐀏[ ]𐀍𐀍 , 𐀕𐀜[ 𐀅𐁆𐀪[𐀵𐀍 ]𐀡𐀴𐀛𐀊 𐀪 𐂮+𐀸 22[" # https://liber.cnr.it/tablet/view/124 32 | 33 | # Test LB.A.5 34 | # Scenario: Test that ']', '[', ',' and '/' are correctly printed in annotated scenarios. 35 | # Requirements mapping: 36 | # LB.4: Tokenise each instance of '/'. Represent this symbol as is in the annotated output, but do not include in the regularized output. 37 | # LB.5: Tokenise each instance of '//'. Represent this symbol as is in the annotated output, but do not include in the regularized output. 38 | # LB.6: Tokenise each instance of ','. Represent this symbol as is in the annotated output, but do not include in the regularized output. 39 | # LB.9: Tokenise each instance of '['. Represent this symbol as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 40 | # LB.10: Tokenise each instance of ']'. Represent this symbol as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 41 | 42 | "wo-de-wi-jo-jo , / me-no[ // ]ri-jo-de , ko-no , MA 3 ko-ri[ ]2 pa-de-i , ko-no MA 2 KO T 1[ [ ] pa-si-te-o-i , pa-sa-ja , ko-no , [ ] a-mi-ni-so-de , MA 2 KO T 4" : "𐀺𐀆𐀹𐀍𐀍 , / 𐀕𐀜[ // ]𐀪𐀍𐀆 , 𐀒𐀜 , 𐀔 3 𐀒𐀪[ ]2 𐀞𐀆𐀂 , 𐀒𐀜 𐀔 2 𐀒 𐄼 1[ [ ] 𐀞𐀯𐀳𐀃𐀂 , 𐀞𐀭𐀊 , 𐀒𐀜 , [ ] 𐀀𐀖𐀛𐀰𐀆 , 𐀔 2 𐀒 𐄼 4" 43 | 44 | # Test LB.A.6 45 | # Scenario: Test that ':' is correctly printed in annotated scenarios. 46 | # Requirements mapping: 47 | # LB.7: Tokenise each instance of ':'. Represent this symbol as is in the annotated output, but do not include in the regularized output. 48 | 49 | "a-ta-ti-nu : si-wa-[" : "𐀀𐀲𐀴𐀝 : 𐀯𐀷[" 50 | 51 | # Test LB.A.7 52 | # Scenario: Test that single quotation marks are correctly printed in annotated scenarios. 53 | # Requirements mapping: 54 | # LB.8: Tokenise each instance of '''. Represent this symbol as is in the annotated output, but do not include in regularized output. 55 | 56 | "]wa VIR 1 MUL 2 'ko-wa 1' ko-wo 1" : "]𐀷 𐂀 1 𐂁 2 '𐀒𐀷 1' 𐀒𐀺 1" 57 | 58 | # Test LB.A.8 59 | # Scenario: Test that '\u0323' is not printed in annotated scenarios 60 | # Requirements mapping: 61 | # LB.11: Ignore each instance of 'X̣' (or '\u0323') in tokenisation. Do not represent this symbol in either the annotated or the regularized output. 62 | 63 | "] ko-wo / m\u0323e\u0323[-zo] 1 ko-wo / me-wi-jo 2 [" : "] 𐀒𐀺 / 𐀕[𐀿] 1 𐀒𐀺 / 𐀕𐀹𐀍 2 [" 64 | 65 | # Test LB.A.9 66 | # Scenario: Test that '?' is correctly printed in annoted scenarios. 67 | # Requirements mapping: 68 | # LB.12: Tokenise each instance of '?'. Represent this symbol as is in the annotated output, but do not include in the regularized output. 69 | 70 | "i[-qi-ja?": "𐀂[𐀥𐀊?" 71 | 72 | # Test LB.A.10 73 | # Scenario: Test that \u27e6 and \u27e7 are correctly printed as Scott brackets (i.e. '⟦' and '⟧') in annotated scenarios. 74 | # Requirements mapping: 75 | # LB.13: Tokenise each instance of Scott brackets (i.e. '⟦' and '⟧', or '\u27e6' and '\u27e7'). Represent these symbols as is in the annotated output. Do not include these symbols, or any other text that they contain, in the reguarised ouput. 76 | 77 | "po-*34-wi-do \u27e6TUN\u27e7 BIG[" : "𐀡𐁓𐀹𐀈 ⟦𐂪⟧ 𐃌[" 78 | 79 | # Test LB.A.11 80 | # Scenario: Test that '<' and '>' are correctly printed in annotated scenarios. 81 | # Requirements mapping: 82 | # LB.14: Tokenise each instance of angle brackets (i.e. '<' and '>'). Represent this text as is in the annotated output. Do not include these symbols, or any other text that they contain, in the reguarised ouput. 
83 | 84 | "] 69 OVIS:f 30 [ ]-e-ke-me-de , / tu-ni-ja , pa OVIS:m 1" : "] <𐂇> 69 𐂆 30 [ ]𐀁𐀐𐀕𐀆 , / 𐀶𐀛𐀊 , 𐀞 𐂇 1" # https://liber.cnr.it/tablet/view/3172?wl=12765 85 | 86 | # Test LB.A.12 87 | # Scenario: Test that lower half brackets (i.e. '⸤' and '⸥', or '\u2e24' and '\u2e25') are correctly printed in annotated scenarios. 88 | # Requirements mapping: 89 | # LB.15: Tokenise each instance of lower half brackets (i.e. '⸤' and '⸥', or '\u2e24' and '\u2e25'). Represent these symbols as is in the annotated output. Do not include these symbols in the reguarised ouput. 90 | 91 | "du-to\u2e24 \u2e25 / r\u0323u\u0323-ki-to" : "𐀉𐀵⸤ ⸥ / 𐀬𐀑𐀵" 92 | "e-ke-qe ]-o-na-to , ke-ke-me-na⌞ ⌟ko-to-na GRA qs ] vac.": "𐀁𐀐𐀤 ]𐀃𐀙𐀵 , 𐀐𐀐𐀕𐀙⌞ ⌟𐀒𐀵𐀙 𐂎 qs ] vac." 93 | 94 | # Test LB.A.13 95 | # Scenario: Test that upper half brackets (i.e. '⌜' and '⌝') are correctly printed in annotated scenarios. 96 | # Requirements mapping 97 | # LB.16: Tokenise each instance of upper half brackets (i.e. ' ⌜' and '⌝'). Represent these symbols as is in the annotated output. Do not include these symbols in the reguarised ouput. 98 | 99 | "]2 OLIV T 2 ] OLIV T 1 to]-ko-do-mo HORD[ ]Z 3 VIR 20[ pi-ri-e-te-re HORD[ ]Z 3 VIR 5 pa-te-ko-to⌜ ⌝HORD[ ]V 2 [ vacat qa-ra2-te , o[-pi-me-]ne[ ]OLIV 6 pa-ka , o-pi-me-ne , [ OLIV qs pa-te-ko-to , o-pi-me-ne[ ]HORD 1 [ pi-ri-e-te-si , o-pi-me-ne[ ]HORD 1 T 4[ to-ko-do-mo , o-pi-me-ne[ ]HORD 7[ ]5 vac." : "]2 𐂐 𐄼 2 ] 𐂐 𐄼 1 𐀵]𐀒𐀈𐀗 𐂏[ ]𐄿 3 𐂀 20[ 𐀠𐀪𐀁𐀳𐀩 𐂏[ ]𐄿 3 𐂀 5 𐀞𐀳𐀒𐀵 ⌜ ⌝ 𐂏[ ]𐄾 2 [ vacat 𐀣𐁈𐀳 , 𐀃[𐀠𐀕]𐀚[ ]𐂐 6 𐀞𐀏 , 𐀃𐀠𐀕𐀚 , [ 𐂐 qs 𐀞𐀳𐀒𐀵 , 𐀃𐀠𐀕𐀚[ ]𐂏 1 [ 𐀠𐀪𐀁𐀳𐀯 , 𐀃𐀠𐀕𐀚[ ]𐂏 1 𐄼 4[ 𐀵𐀒𐀈𐀗 , 𐀃𐀠𐀕𐀚[ ]𐂏 7[ ]5 vac." 100 | 101 | # Test LB.A.14 102 | # Scenario: Test that \u2082 is correctly handled as a subscript '2' in annotated scenarios. 103 | # Requirements mapping: 104 | # LB.17: Tokenise '\u2082' together with immediately preceding transliterated sign (as long as no hyphen '-' is between them). Confirm that it is treated correctly as a subscript '2', and expected Unicode sign is printed, as per mapping. 105 | 106 | "da-pu\u2082-ri-to-jo , / po-ti-ni-ja 'me-ri' * 209 VAS 1" : "𐀅𐁆𐀪𐀵𐀍 , / 𐀡𐀴𐀛𐀊 '𐀕𐀪' 𐃨 1" 107 | 108 | # Test LB.A.15 109 | # Scenario: Test that '\u2083' is correctly handled as a subscript '2' in annotated scenarios 110 | # Requirements mapping: 111 | # LB.18: Tokenise '\u2083' together with immediately preceding transliterated sign (as long as no hyphen '-' is between them). Confirm that it is treated correctly as a subscript '3', and expected Unicode sign is printed, as per mapping. 112 | 113 | "pu-ri / a\u2083-zo-ro-qe , po-da-ko-qe BOS m ZE 1[" : "𐀢𐀪 / 𐁁𐀿𐀫𐀤 , 𐀡𐀅𐀒𐀤 𐂍 𐀽 1[" 114 | 115 | # Test LB.A.16 116 | # Scenario: Test that 'mutila' is correctly printed in annotated scenarios. 117 | # Requirements mapping: 118 | # LB 19: Tokenise each instance of 'mutila'. Represent this text as is in the annotated output, but do not include in the regularized output. 119 | 120 | "] GRA[ qs mutila" : "] 𐂎[ qs mutila " 121 | 122 | # Test LB.A.17 123 | # Scenario: Test that 'mut' is correctly printed in annotated scenarios. 124 | # Requirements mapping: 125 | # LB.20: Tokenise each instance of 'mut'. Represent this text as is in the annotated output, but do not include in the regularized output. 126 | 127 | "sup. mut. ]vacat [ ]A 5 A [ ]vest.[ inf. mut" : "sup. mut. ]vacat [ ]𐀀 5 𐀀 [ ]vest.[ inf. mut" 128 | 129 | # Test LB.A.18 130 | # Scenario: Test that 'sup. mut.', 'inf. mut.' and 'vac.' are correctly printed in annotated scenarios. 131 | # Requirements mapping: 132 | # LB.21: Tokenise each instance of 'sup.' and 'mut.'. 
Represent this text as is in the annotated output, but do not include in the regularized output. 133 | # LB.22: Tokenise each instance of 'inf.' and 'mut.'. Represent this text as is in the annotated output, but do not include in the regularized output. 134 | # LB.23: Tokenise each instance of 'vac.'. Represent this text as is in the annotated output, but do not include in the regularized output. 135 | 136 | "sup. mut. ] wo[ ] vac. [ inf. mut." : "sup. mut. ] 𐀺[ ] vac. [ inf. mut." 137 | 138 | # Test LB.A.19 139 | # Scenario: Test that 'vacat' is correctly printed in annotated scenarios. 140 | # Requirements mapping: 141 | # LB.24: Tokenise each instance of 'vacat'. Represent this text as is in the annotated output, but do not include in the regularized output. 142 | 143 | "] vacat [" : "] vacat [" 144 | "] vacat v. ] 1" : "] vacat v. ] 1" 145 | 146 | # Test LB.A.20 147 | # Scenario: Test that 'vest.' is correctly printed in annotated scenarios, and that Unicode '\u00a0' is treated as a blank space. 148 | # Requirements mapping: 149 | # LB.1-x: Tokenise each blank space (including Unicode '\u00a0') and use to distinguish individual words. Represent as is in both annotated and regularized output. 150 | # This should occur in all but the specified exception scenarios (see LB.1-a–d). 151 | # LB.25: Tokenise each instance of 'vest.'. Represent this text as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 152 | 153 | "] vest ., / su-ri-mo , u-ta-jo-jo , o OVIS m 85[\u00a0] vac ." : "] vest., / 𐀱𐀪𐀗 , 𐀄𐀲𐀍𐀍 , 𐀃 𐂇 85[ ] vac." 154 | 155 | # Test LB.A.21 156 | # Scenario: Test that 'vestigia' is correctly printed in annotated scenarios. 157 | # Requirements mapping: 158 | # LB.26: Tokenise each instance of 'vestigia'. Represent this text as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 159 | 160 | "pa-ro , we-u-da-ne-we re-u-ko , a-ko-ro-we-e BOS+SI 2 re[-u-]ko , ma-ra-pi , pe-ko , a-ko-ro-we BOS+SI 1 OVIS:m? ]3 CAP:m 3 WE 3 CAP:m 3 ]vestigia[ ]2 [ ]BOS:x 3 ⟦ ⟧ ] vest. [ ] vest. [ re-u-ko[ ]ma-ra[-pi ]pe-ko , a-ko-ro-we[ OVIS:m 1 CAP:m 1 WE[ ] SUS:x[ ] vacat [ inf. mut." : "𐀞𐀫 , 𐀸𐀄𐀅𐀚𐀸 𐀩𐀄𐀒 , 𐀀𐀒𐀫𐀸𐀁 𐀘+𐀯 2 𐀩[𐀄]𐀒 , 𐀔𐀨𐀠 , 𐀟𐀒 , 𐀀𐀒𐀫𐀸 𐀘+𐀯 1 𐂇? ]3 𐂉 3 𐀸 3 𐂉 3 ]vestigia[ ]2 [ ]𐀘 3 ⟦ ⟧ ] vest. [ ] vest. [ 𐀩𐀄𐀒[ ]𐀔𐀨[𐀠 ]𐀟𐀒 , 𐀀𐀒𐀫𐀸[ 𐂇 1 𐂉 1 𐀸[ ] 𐁂[ ] vacat [ inf. mut." 161 | 162 | # Test LB.A.22 163 | # Scenario: Test that 'vestigia?' is correctly printed in annotated scenarios. 164 | # Requirements mapping: 165 | # LB.27: Tokenise each instance of 'vestigia?'. Represent this text as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 166 | 167 | "su-ma-no / ti-ri-to [ vestigia? ] vacat" : "𐀱𐀔𐀜 / 𐀴𐀪𐀵 [ vestigia? ] vacat" 168 | 169 | # Test LB.A.23 170 | # Scenario: Test that 'qs' (i.e. 'quantum sufficit') is correctly printed in annotated scenarios. 171 | # Requirements mapping: 172 | # LB.28: Tokenise each instance of 'qs'. Represent this text as is in the annotated output, but represent as wildcard (i.e. '%') in regularized output. 173 | 174 | "]-ke-ke-me-na-[ , ko-]-to-na GRA qs ] vac." : "]𐀐𐀐𐀕𐀙[ , 𐀒]𐀵𐀙 𐂎 qs ] vac." 175 | 176 | # Test LB.A.24 177 | # Scenario: Test that 'fragmentum separatum', 'α', 'β', 'γ' and 'δ' are correctly printed in annotated scenarios. 178 | # Requirements mapping: 179 | # LB.29: Tokenise each instance of 'fragmentum separatum'. Represent this text as is in the annotated output, but do not include in the regularized output. 180 | # LB.53: Tokenise each instance of 'α'.
Represent this text as is in the annotated output, but do not include in the regularized output. 181 | # LB.54: Tokenise each instance of 'β'. Represent this text as is in the annotated output, but do not include in the regularized output. 182 | # LB.55: Tokenise each instance of 'γ'. Represent this text as is in the annotated output, but do not include in the regularized output. 183 | # LB.56: Tokenise each instance of 'δ'. Represent this text as is in the annotated output, but do not include in the regularized output. 184 | 185 | "da-we-u[-pi ]a-ko[ da-we-u-pi , a[ da-we-u-pi , ka[ da-we-u-pi , e-[ a3-zo-wo[ da-we[-u-]pi ⌞ ⌟wo[ da-we-u-pi , e-ke[ da-we-u[-pi a-re[ a-zo[ inf. mut. fragmentum separatum α sup. mut. ] OVIS:f X 15 [ fragmentum separatum β ] , ka[ fragmentum separatum γ sup. mut. ]no-wo[ fragmentum separatum δ sup. mut. ]ma-jo-wo-[ inf. mut." : "𐀅𐀸𐀄[𐀠 ]𐀀𐀒[ 𐀅𐀸𐀄𐀠 , 𐀀[ 𐀅𐀸𐀄𐀠 , 𐀏[ 𐀅𐀸𐀄𐀠 , 𐀁[ 𐁁𐀿𐀺[ 𐀅𐀸[𐀄]𐀠 ⌞ ⌟𐀺[ 𐀅𐀸𐀄𐀠 , 𐀁𐀐[ 𐀅𐀸𐀄[𐀠 𐀀𐀩[ 𐀀𐀿[ inf. mut. fragmentum separatum α sup. mut. ] 𐂆 X 15 [ fragmentum separatum β ] , 𐀏[ fragmentum separatum γ sup. mut. ]𐀜𐀺[ fragmentum separatum δ sup. mut. ]𐀔𐀍𐀺[ inf. mut." 186 | 187 | # Test LB.A.25 188 | # Scenario: Test that 'fragmentum A' and 'fragmentum B' are correctly printed in annotated scenarios. 189 | # Requirements mapping: 190 | # LB.30: Tokenise each instance of 'fragmentum A'. Represent this text as is in the annotated output, but do not include in the regularized output. 191 | # LB.31: Tokenise each instance of 'fragmentum B'. Represent this text as is in the annotated output, but do not include in the regularized output. 192 | 193 | "fragmentum A fragmentum B vacat [ sup. mut. e-me-si-jo-jo-[ ] 3-[ pa-na-so GRA 100-[ ]-vac.-[ ta-ra-qo GRA [ inf. mut. ta-u-pa-du-we GRA-[ a-ro-ja-[ pu-na-so-[ inf. mut." : "fragmentum A fragmentum B vacat [ sup. mut. 𐀁𐀕𐀯𐀍𐀍[ ] 3[ 𐀞𐀙𐀰 𐂎 100[ ]vac.[ 𐀲𐀨𐀦 𐂎 [ inf. mut. 𐀲𐀄𐀞𐀉𐀸 𐂎[ 𐀀𐀫𐀊[ 𐀢𐀙𐀰[ inf. mut." 194 | 195 | # Test LB.A.26 196 | # Scenario: Test that 'fragmentum C' and 'fragmentum D' are correctly printed in annotated scenarios. 197 | # Requirements mapping: 198 | # LB.32: Tokenise each instance of 'fragmentum C'. Represent this text as is in the annotated output, but do not include in the regularized output. 199 | # LB.33: Tokenise each instance of 'fragmentum D'. Represent this text as is in the annotated output, but do not include in the regularized output. 200 | 201 | "fragmentum A fragmentum B sup. mut. sup. mut. ]-na 1 i-[ ]so-i-[ ko-wa 1[ ]ku-mi-[•]-du 1[ inf. mut. vac. [ vac. [ fragmentum C fragmentum D sup. mut. sup. mut. ]di-mi[ ]vac. ]*56-za[ ]vac. inf. mut. inf. mut." : "fragmentum A fragmentum B sup. mut. sup. mut. ]𐀙 1 𐀂[ ]𐀰𐀂[ 𐀒𐀷 1[ ]𐀓𐀖[•]𐀉 1[ inf. mut. vac. [ vac. [ fragmentum C fragmentum D sup. mut. sup. mut. ]𐀇𐀖[ ]vac. ]𐁖𐀼[ ]vac. inf. mut. inf. mut." 202 | 203 | # Test LB.A.27 204 | # Scenario: Test that 'deest' (or its abbreviation 'dt') is correctly printed in annotated scenarios. 205 | # Requirements mapping: 206 | # LB.34: Tokenise each instance of 'deest' or 'dt'. Represent this text as is in the annotated output, but do not include in the regularized output. 207 | 208 | "sup. mut. ]-deest-[ inf. mut." : "sup. mut. ]deest[ inf. mut." 209 | "]GRA 37 T 6[ ] vac. [ ]⌞deest⌟ vac. ⌞dt⌟ [" : "]𐂎 37 𐄼 6[ ] vac. [ ]⌞deest⌟ vac. ⌞dt⌟ [" 210 | 211 | # Test LB.A.28 212 | # Scenario: Test that 'prior pars sine regulis' and '•' are correctly printed in annotated scenarios. 213 | # Requirements mapping: 214 | # LB.35: Tokenise each instance of 'prior pars sine regulis'.
Represent this text as is in the annotated output, but do not include in the regularized output. 215 | # LB.57: Tokenise each instance of '•'. Represent this text as is in the annotated output, and represent as wildcard (i.e. '%') in regularized output. 216 | 217 | "]-ke-ra2-u-na , e-ra[ ]• po-se-da-o-ne⌞ ⌟re-ko-no 6 [ *146 18[ ] LANA 2 M 2[ A±RE±PA V 4[ ]• 1 OVIS:m 1 OVIS:f 1 CAP:f[ qs SUS+KA 2 SUS:f 4[ ]• 1 FAR T 1 V [ qs VIN 5 TELA [ ] 1 TELA+PA 1 vac. vac. vac. [ ]3[ ]-we-e-a2[ inf. mut. v. prior pars sine regulis ]e-ke-me-de , do[ ]du-ru-wo-qo deest vac. vac. vac. vac. vac. vac." : "]𐀐𐁈𐀄𐀙 , 𐀁𐀨[ ]• 𐀡𐀮𐀅𐀃𐀚⌞ ⌟𐀩𐀒𐀜 6 [ 𐂞 18[ ] 𐂝 2 𐄸 2[ 𐂘 𐄾 4[ ]• 1 𐂇 1 𐂆 1 𐂈[ qs 𐁂+𐀏 2 𐂊 4[ ]• 1 𐀎 𐄼 1 𐄾 [ qs 𐂖 5 𐂧 [ ] 1 𐂧+𐀞 1 vac. vac. vac. [ ]3[ ]𐀸𐀁𐁀[ inf. mut. v. prior pars sine regulis ]𐀁𐀐𐀕𐀆 , 𐀈[ ]𐀉𐀬𐀺𐀦 deest vac. vac. vac. vac. vac. vac." 218 | 219 | # Test LB.A.29 220 | # Scenario: Test that 'reliqua pars sine regulis' is correctly printed in annotated scenarios. 221 | # Requirements mapping: 222 | # LB.36: Tokenise each instance of 'reliqua pars sine regulis'. Represent this text as is in the annotated output, but do not include in the regularized output. 223 | 224 | "sup. mut. ]-vest.-[ ]-na-ro GRA 5 ]--do-we-i , ma-so-qe GRA 8 ] vac. ] GRA 402 OLIV+A 52 reliqua pars sine regulis" : "sup. mut. ]vest.[ ]𐀙𐀫 𐂎 5 ]𐀈𐀸𐀂 , 𐀔𐀰𐀤 𐂎 8 ] vac. ] 𐂎 402 𐂐+𐀀 52 reliqua pars sine regulis" 225 | 226 | # Test LB.A.30 227 | # Scenario: Test that 'angustum' and '[•~]' are correctly printed in annotated scenarios. 228 | # Requirements mapping: 229 | # LB.37: Tokenise each instance of 'angustum'. Represent this text as is in the annotated output, but do not include in the regularized output. 230 | # LB.59: Tokenise each instance of '[•~]'. Represent this text as is in the annotated output, and represent as a single wildcard (i.e. '%') in regularized output. 231 | 232 | "a[ ]te VIR[ 1 ]ke-ro-si-ja , a[ ] VIR 1 ke-ro-]si-ja , [•~]me-ka-[•] VIR 1 a-[ ke-ro-]si-ja , o-pa-[ ]vac.[ VIR 1 vac.[ ] vac. vac. [ ] vac. v. ta-we-si-jo-jo , ke-ro-si-ja , te-wa[ VIR 1 ta-]we-si-jo-jo , ke-ro-si-ja , tu-ru-we-u VIR 1 ] angustum ta-]we-si-jo-jo , ke-ro-si VIR 20 a-pi-qo-ta-o , ke-ro-si-ja VIR 17 a-pi-o-to , ke-ro-si-ja VIR [1]8⌟ o-to-wo[-o ke-]ro-si-ja VIR [1]4 angustum [ ] [ ] ka-ma-e[-we] VIR 10" : "𐀀[ ]𐀳 𐂀[ 1 ]𐀐𐀫𐀯𐀊 , 𐀀[ ] 𐂀 1 𐀐𐀫]𐀯𐀊 , [•~]𐀕𐀏[•] 𐂀 1 𐀀[ 𐀐𐀫]𐀯𐀊 , 𐀃𐀞[ ]vac.[ 𐂀 1 vac.[ ] vac. vac. [ ] vac. v. 𐀲𐀸𐀯𐀍𐀍 , 𐀐𐀫𐀯𐀊 , 𐀳𐀷[ 𐂀 1 𐀲]𐀸𐀯𐀍𐀍 , 𐀐𐀫𐀯𐀊 , 𐀶𐀬𐀸𐀄 𐂀 1 ] angustum 𐀲]𐀸𐀯𐀍𐀍 , 𐀐𐀫𐀯 𐂀 20 𐀀𐀠𐀦𐀲𐀃 , 𐀐𐀫𐀯𐀊 𐂀 17 𐀀𐀠𐀃𐀵 , 𐀐𐀫𐀯𐀊 𐂀 [1]8⌟ 𐀃𐀵𐀺[𐀃 𐀐]𐀫𐀯𐀊 𐂀 [1]4 angustum [ ] [ ] 𐀏𐀔𐀁[𐀸] 𐂀 10" 233 | 234 | # Test LB.A.31 235 | # Scenario: Test that 'graffito' is correctly printed in annotated scenarios. 236 | # Requirements mapping: 237 | # LB.38: Tokenise each instance of 'graffito'. Represent this text as is in the annotated output, but do not include in the regularized output. 238 | 239 | "]e-ke , e-u-da-i-ta OVIS:f 39[ ]ki-u-ro , / su-ki-ri-ta-pi o ki OVIS 15 [ v. graffito lat. inf." : "]𐀁𐀐 , 𐀁𐀄𐀅𐀂𐀲 𐂆 39[ ]𐀑𐀄𐀫 , / 𐀱𐀑𐀪𐀲𐀠 𐀃 𐀑 𐀥 15 [ v. graffito lat. inf." 240 | 241 | # Test LB.A.32 242 | # Scenario: Check that 'Graffito' is correctly printed in annotated scenarios. 243 | # Requirements mapping: 244 | # LB.39: Tokenise each instance of 'Graffito'. Represent this text as is in the annotated output, but do not include in the regularized output. 245 | 246 | "] Graffito [": "] Graffito [" 247 | 248 | # Test LB.A.33 249 | # Scenario: Test that 'r.' and 'r.p' are correctly printed in annotated scenarios. 250 | # Requirements mapping: 251 | # LB.40: Tokenise each instance of 'r.'
or 'r.p'. Represent this text as is in the annotated output, but do not include in the regularized output. 252 | 253 | "lat. sup. ] KE [ r. ]VIN 1 S 2[ ]1 ko-ta V[" : "lat. sup. ] 𐀐 [ r. ]𐂖 1 𐄽 2[ ]1 𐀒𐀲 𐄾[" 254 | "v. ]i-je-re-ja TELA+TE[ qs ka-]ra-wi-po-ro TELA+TE[ qs lat. dex. ] ⟦WE 30⟧ r.p vacat vestigia po-se-da-o-ne [ po-de-da-o-ne" : "v. ]𐀂𐀋𐀩𐀊 𐂧+𐀳[ qs 𐀏]𐀨𐀹𐀡𐀫 𐂧+𐀳[ qs lat. dex. ] ⟦𐀸 30⟧ r.p vacat vestigia 𐀡𐀮𐀅𐀃𐀚 [ 𐀡𐀆𐀅𐀃𐀚" 255 | 256 | # Test LB.A.34 257 | # Scenario: Test that 'v.' and 'v.p' are correctly printed in annotated scenarios. 258 | # Requirements mapping: 259 | # LB.41: Tokenise each instance of 'v.' or 'v.p'. Represent this text as is in the annotated output, but do not include in the regularized output. 260 | 261 | "to-re : : : : [ v. di-we si-po-ro ti-mi-to-qo [" : "𐀵𐀩 : : : : [ v. 𐀇𐀸 𐀯𐀡𐀫 𐀴𐀖𐀵𐀦 [" 262 | "ARM 1 me-zo-a2 O 22 me-u-jo-a2 O 12 KO O 4 PA 2 v.p to-mi-re-[ ]wa-[ ]-re-[ ]e-ko-si o-to-pe-da-ko-we-de-[•]-ke[" : "𐂫 1 𐀕𐀿𐁀 𐀃 22 𐀕𐀄𐀍𐁀 𐀃 12 𐀒 𐀃 4 𐀞 2 v.p 𐀵𐀖𐀩[ ]𐀷[ ]𐀩[ ]𐀁𐀒𐀯 𐀃𐀵𐀟𐀅𐀒𐀸𐀆[•]𐀐[" 263 | 264 | # Test LB.A.35 265 | # Scenario: Check that 'v.↓' is correctly printed in annotated scenarios. 266 | # Requirements mapping: 267 | # LB.42: Abbreviated form of verso; indicates the reverse side of the tablet, when inscribed. The arrow indicates the direction in which the record is rotated to reach the verso. 268 | 269 | "qe-te-o TELA;2-[ po-po TELA;2 4 [ v.↓ ⟦a-mi-si-ja TELA;1 12⟧ [" : "𐀤𐀳𐀃 𐂧²[ 𐀡𐀡 𐂧² 4 [ v.↓ ⟦𐀀𐀖𐀯𐀊 𐂧¹ 12⟧ [" 270 | 271 | # Test LB.A.36 272 | # Scenario: Check that 'v.→' is correctly printed in annotated scenarios. 273 | # Requirements mapping: 274 | # LB.43: Tokenise each instance of 'v.→'. Represent this text as is in the annotated output, but do not include in the regularized output. 275 | 276 | "ne-wo , za-we-[ v.→ ] a-ro-we a-nu-to" : "𐀚𐀺 , 𐀼𐀸[ v.→ ] 𐀀𐀫𐀸 𐀀𐀝𐀵" 277 | 278 | # Test LB.A.37 279 | # Scenario: Test that blank spaces are removed before the full stop for 'l .' and 's .', and that the resulting 'l.' and 's.' are correctly printed in annotated scenarios. 280 | # Requirements mapping: 281 | # LB.1-c: If a space appears before a '.' in an annotation (e.g. 'lat .'), then remove that space in both the annotated and regularized outputs. 282 | # LB.44: Tokenise each instance of 'l.'. Represent this text as is in the annotated output, but do not include in the regularized output. 283 | # LB.46: Tokenise each instance of 's.'. Represent this text as is in the annotated output, but do not include in the regularized output. 284 | 285 | "l . s . ]\u27e6 vest . \u27e7[": "l. s. ]⟦ vest. ⟧[" 286 | 287 | # Test LB.A.38 288 | # Scenario: Test that blank spaces are removed before the full stop for 'l .' and 'i .', and that the resulting 'l.' and 'i.' are correctly printed in annotated scenarios. 289 | # Requirements mapping: 290 | # LB.1-c: If a space appears before a '.' in an annotation (e.g. 'lat .'), then remove that space in both the annotated and regularized outputs. 291 | # LB.44: Tokenise each instance of 'l.'. Represent this text as is in the annotated output, but do not include in the regularized output. 292 | # LB.48: Tokenise each instance of 'i.'. Represent this text as is in the annotated output, but do not include in the regularized output. 293 | 294 | "l . i . LANA 250[": "l. i. 𐂝 250[" 295 | 296 | # Test LB.A.39 297 | # Scenario: Check that 'lat.' and 'inf.' are correctly printed in annotated scenarios. 298 | # Requirements mapping: 299 | # LB.45: Tokenise each instance of 'lat.'. Represent this text as is in the annotated output, but do not include in the regularized output.
300 | # LB.49: Tokenise each instance of 'inf.'. Represent this text as is in the annotated output, but do not include in the regularized output. 301 | 302 | "l\u0323a\u0323t\u0323 . i\u0323n\u0323f\u0323 .": "lat. inf." 303 | 304 | # Test LB.A.40 305 | # Scenario: Test that 'lat.' and 'sup.' are correctly printed in annotated scenarios. 306 | # Requirements mapping: 307 | # LB.45: Tokenise each instance of 'lat.'. Represent this text as is in the annotated output, but do not include in the regularized output. 308 | # LB.47: Tokenise each instance of 'sup.'. Represent this text as is in the annotated output, but do not include in the regularized output. 309 | 310 | "] TELA;4+⟦ZO⟧ 1 [ ]LANA M 1[ v.↓ ]-a ra[ lat. sup.]-so-ma [" : "] 𐂧⁴+⟦𐀿⟧ 1 [ ]𐂝 𐄸 1[ v.↓ ]𐀀 𐀨[ lat. sup.]𐀰𐀔 [" 311 | 312 | # Test LB.A.41 313 | # Scenario: Test that 'dex.' is correctly printed in annotated scenarios. 314 | # Requirements mapping: 315 | # LB.50: Tokenise each instance of 'dex.'. Represent this text as is in the annotated output, but do not include in the regularized output. 316 | 317 | "wo-di-je-ja , de-mi-ni-ja 1 ma-no , a-re-ka-sa-da-ra-ka 2 ri-su-ra , qo-ta-qe 2 e-ri-tu-pi-na , te-o-do-ra-'qe' 2 o-to-wo-wi-je tu-ka-te-qe 2 a-ne-a2 , tu-ka-te-qe 2 pi-ro-wo-na ki-ra-qe 2 pu-ka-ro ke-ti-de-qe 2 ]-ri-mo-qe 2 ]ma-ta-qe 2 ]*82 1 ]-qe 2 ] vac. inf. mut. lat. dex. ] , i-ri-[• ]1 ke-ra-so , ki-ra-qe 2" : "𐀺𐀇𐀋𐀊 , 𐀆𐀖𐀛𐀊 1 𐀔𐀜 , 𐀀𐀩𐀏𐀭𐀅𐀨𐀏 2 𐀪𐀱𐀨 , 𐀦𐀲𐀤 2 𐀁𐀪𐀶𐀠𐀙 , 𐀳𐀃𐀈𐀨'𐀤' 2 𐀃𐀵𐀺𐀹𐀋 𐀶𐀏𐀳𐀤 2 𐀀𐀚𐁀 , 𐀶𐀏𐀳𐀤 2 𐀠𐀫𐀺𐀙 𐀑𐀨𐀤 2 𐀢𐀏𐀫 𐀐𐀴𐀆𐀤 2 ]𐀪𐀗𐀤 2 ]𐀔𐀲𐀤 2 ]𐁚 1 ]𐀤 2 ] vac. inf. mut. lat. dex. ] , 𐀂𐀪[• ]1 𐀐𐀨𐀰 , 𐀑𐀨𐀤 2" 318 | 319 | # Test LB.A.42 320 | # Scenario: Test that 'sigillum' is correctly printed in annotated scenarios. 321 | # Requirements mapping: 322 | # LB.51: Tokenise each instance of 'sigillum'. Represent this text as is in the annotated output, but do not include in the regularized output. 323 | 324 | "α sigillum β qe-ti-ja γ vac." : "α sigillum β 𐀤𐀴𐀊 γ vac." 325 | 326 | # Test LB.A.43 327 | # Scenario: Test that 'supra sigillum' is correctly printed in annotated scenarios. 328 | # Requirements mapping: 329 | # LB.52: Tokenise each instance of 'supra sigillum'. Represent this text as is in the annotated output, but do not include in the regularized output. 330 | # If a notation about the seal type is also included (e.g. '=A', indicating that the seal is of type 'A', according to the publication of the Thebes sealings (Olivier et al. 1982)), then also represent this in the annotated output, but do not include in the regularized output. 331 | 332 | "α JAC supra sigillum β o-pa γ pa-ta-ja" : "α 𐃘 supra sigillum β 𐀃𐀞 γ 𐀞𐀲𐀊" 333 | "α OVIS:m supra sigillum=R β vac. γ vac." : "α 𐂇 supra sigillum=R β vac. γ vac." 334 | "α CAP:m supra sigillum=Z=1 β vac. γ ]vac." : "α 𐂉 supra sigillum=Z=1 β vac. γ ]vac." 335 | 336 | # Test LB.A.44 337 | # Scenario: Test that '[•]' (or '[\u2022]') is correctly printed in annotated scenarios. 338 | # Requirements mapping: 339 | # LB.58: Tokenise each instance of '[•]' or '[\u2022]'. Represent this symbol as is in the annotated output, and represent as a single wildcard (i.e. '%') in regularized output. 340 | 341 | "]po-[\u2022] , / [ OVIS m ] 40 o OVIS m 20" : "]𐀡[•] , / [ 𐂇 ] 40 𐀃 𐂇 20" 342 | 343 | # Test LB.A.45 344 | # Scenario: Test that '•~•' is correctly printed in annotated scenarios. 345 | # Requirements mapping: 346 | # LB.60: Tokenise each instance of '•~•'. Represent this text as is in the annotated output, and represent as two wildcards (i.e. '%%') in regularized output.
347 | 348 | "ma-mi-di-zo / pi-ri-to-jo OVIS:f 40[ [•~•]-ro , da-nu-wo OVIS:f 100[ po-ri-wo , / su-ki-ri-ta-jo , wo-we-u CAP:m 180 ja-ru , / pa-ta-ti-jo , do-e-ro , CAP:f 230 a-du-po-to , / qi-ko-we-e , do-e-ro , CAP:f 90 qa-di-ja , / po-ku-te-ro , da-mo , 'do-e-ro' CAP:f 70 da-[•~• / ]po-ku-ta CAP:f 130 ra-wa-ni , / po-ku-ta , ra-ri-di-jo OVIS:m 190 o-mi-ri-so , / ta-so , do-e-ro OVIS:m 50 [•~•]-so / a-pi-me-de-o , po-ku-ta 'ra-ri-di-jo' OVIS:f 140 ku-jo-[ / ]ta-so , // do-e-ro OVIS:f 100 a-*56-da-ro / ka-ta-mi-jo , do-e-ro OVIS:x[ a-ra-ko , / ra-ri-di-jo , do-e-ro OVIS:m 100[ vac. vac. vac." : "𐀔𐀖𐀇𐀿 / 𐀠𐀪𐀵𐀍 𐂆 40[ [•~•]𐀫 , 𐀅𐀝𐀺 𐂆 100[ 𐀡𐀪𐀺 , / 𐀱𐀑𐀪𐀲𐀍 , 𐀺𐀸𐀄 𐂉 180 𐀊𐀬 , / 𐀞𐀲𐀴𐀍 , 𐀈𐀁𐀫 , 𐂈 230 𐀀𐀉𐀡𐀵 , / 𐀥𐀒𐀸𐀁 , 𐀈𐀁𐀫 , 𐂈 90 𐀣𐀇𐀊 , / 𐀡𐀓𐀳𐀫 , 𐀅𐀗 , '𐀈𐀁𐀫' 𐂈 70 𐀅[•~• / ]𐀡𐀓𐀲 𐂈 130 𐀨𐀷𐀛 , / 𐀡𐀓𐀲 , 𐀨𐀪𐀇𐀍 𐂇 190 𐀃𐀖𐀪𐀰 , / 𐀲𐀰 , 𐀈𐀁𐀫 𐂇 50 [•~•]𐀰 / 𐀀𐀠𐀕𐀆𐀃 , 𐀡𐀓𐀲 '𐀨𐀪𐀇𐀍' 𐂆 140 𐀓𐀍[ / ]𐀲𐀰 , // 𐀈𐀁𐀫 𐂆 100 𐀀𐁖𐀅𐀫 / 𐀏𐀲𐀖𐀍 , 𐀈𐀁𐀫 𐀥[ 𐀀𐀨𐀒 , / 𐀨𐀪𐀇𐀍 , 𐀈𐀁𐀫 𐂇 100[ vac. vac. vac." 349 | 350 | # Test LB.A.46 351 | # Scenario: Test that '●' and [•~•] are correctly printed in annotated scenarios'. 352 | # Requirements mapping: 353 | # LB.61: Tokenise each instance of '[•~•]'. Represent this text as is in the annotated output, and represent as two wildcards (i.e. '%%') in regularized output. 354 | # LB.66: Tokenise each instance of '●'. Represent this text as is in the annotated output, but do not include in the regularized output. 355 | 356 | "[•~•] [ wi-tu-ri-jo , / a-mo-te-re [" : "[•~•] [ 𐀹𐀶𐀪𐀍 , / 𐀀𐀗𐀳𐀩 [" 357 | "sup. mut. ]vest.[ di-pa AES *214VAS+DI 30[ qe-ro2 'AES' *255 ● 16 ku-ru-su-*56 ● *207VAS 1 pi-ri-je ● ZE 1 [•~•] 'me-no-no[' inf. mut." : "sup. mut. ]vest.[ 𐀇𐀞 𐂚 𐃭+𐀇 30[ 𐀤𐁊 '𐂚' 𐃙 ● 16 𐀓𐀬𐀱𐁖 ● 𐃦 1 𐀠𐀪𐀋 ● 𐀽 1 [•~•] '𐀕𐀜𐀜[' inf. mut." 358 | 359 | # Test LB.A.47 360 | # Scenario: Test that '•~•~' is correctly printed in annotated scenarios. 361 | # Requirements mapping: 362 | # LB.62: Tokenise each instance of '•~•~'. Represent this text as is in the annotated output, and represent as two wildcards (i.e. '%%') in regularized output. 363 | 364 | # ADD TC 365 | 366 | # Test LB.A.48 367 | # Scenario: Test that '[•~•~]' is correctly printed in annotated scenarios. 368 | # Requirements mapping: 369 | # LB.63: Tokenise each instance of '[•~•~]'. Represent this text as is in the annotated output, and represent as two wildcards (i.e. '%%') in regularized output. 370 | 371 | "][•~•~]*34-so , 'da-*22-to' OVIS:m 50 [ ]do-ti , ti-ri-to OVIS:m 50 [" : "][•~•~]𐁓𐀰 , '𐀅𐁒𐀵' 𐂇 50 [ ]𐀈𐀴 , 𐀴𐀪𐀵 𐂇 50 [" 372 | 373 | # Test LB.A.49 374 | # Scenario: Test that '•~•~•' is correctly printed in annotated scenarios. 375 | # Requirements mapping: 376 | # LB.64: Tokenise each instance of '•~•~•'. Represent this text as is in the annotated output, and represent as three wildcards (i.e. '%%%') in regularized output. 377 | 378 | # ADD TC 379 | 380 | # Test LB.A.50 381 | # Scenario: Test that '[•~•~•]' is correctly printed in annotated scenarios. 382 | # Requirements mapping: 383 | # LB.65: Tokenise each instance of '[•~•~•]'. Represent this text as is in the annotated output, and represent as three wildcards (i.e. '%%%') in regularized output. 384 | 385 | "] vest. [ [•~•~•]-ra-de / ne-wo-jo OLE 4[ ] vac. [": "] vest. [ [•~•~•]𐀨𐀆 / 𐀚𐀺𐀍 𐂕 4[ ] vac. [" 386 | 387 | # Test LB.A.51 388 | # Scenario: Test that '•~•~•~•' is correctly printed in annotated scenarios. 389 | # Requirements mapping: 390 | # LB.66: Tokenise each instance of '•~•~•~•'. Represent this text as is in the annotated output, and represent as four wildcards (i.e. '%%%%') in regularized output. 
391 | 392 | # ADD TC 393 | 394 | # Test LB.A.52 395 | # Scenario: Test that '[•~•~•~•]' is correctly printed in annotated scenarios. 396 | # Requirements mapping: 397 | # LB.67: Tokenise each instance of '[•~•~•~•]'. Represent this text as is in the annotated output, and represent as four wildcards (i.e. '%%%%') in regularized output. 398 | 399 | # ADD TC 400 | 401 | # Test LB.A.53 402 | # Scenario: Test that checkmarks (i.e. 'X') are correctly printed in annotated scenarios. 403 | # Requirements mapping: 404 | # LB.69: Tokenise each instance of 'X'. Represent this text as is in the annotated output, but do not include in the regularized output. 405 | 406 | "fragmentum A sup. mut. ] X MUL 1 ]--u-ra MUL 1 X ]-na MUL 1 tu-ka-na X MUL 1 ]-ma MUL 1 te-qa-ja MUL 1 ]-ja MUL 1-[ ]-ja-mu-ta MUL 1-[ ]--ta2-no-[ inf. mut." : "fragmentum A sup. mut. ] X 𐂁 1 ]𐀄𐀨 𐂁 1 X ]𐀙 𐂁 1 𐀶𐀏𐀙 X 𐂁 1 ]𐀔 𐂁 1 𐀳𐀣𐀊 𐂁 1 ]𐀊 𐂁 1[ ]𐀊𐀘𐀲 𐂁 1[ ]𐁋𐀜[ inf. mut." 407 | 408 | # Test LB.A.54 409 | # Scenario: Test that '|' is correctly printed in annotated scenarios. 410 | # Requirements mapping: 411 | # LB.70: Tokenise each instance of '|'. Represent this sign as is in the annotated output, but do not include in the regularized output. 412 | 413 | "α ]a3-wo-re-u-|si|-si β do-ke γ [•]-ja-wo-ne" : "α ]𐁁𐀺𐀩𐀄|𐀯|𐀯 β 𐀈𐀐 γ [•]𐀊𐀺𐀚" 414 | 415 | # Test LB.A.55 416 | # Scenario: Test that both '' and '' are not printed in either annotated or regularized scenarios. 417 | # Requirements mapping: 418 | # LB.71: Ignore each instance of ''. Do not represent this string in either the annotated or the regularized output. 419 | # LB.72: Ignore each instance of ''. Do not represent this string in either the annotated or the regularized output. 420 | 421 | "fragmentum A fragmentum B sup. mut. sup. mut. ]--to-[ ]-da-*22-to HORD [ ] 'da-*22-to' HORD 2 da-]-*22-to HORD-[ ]--ro 'da-*22-to' HORD 2 inf. mut. ]--ri 'da-*22-to' HORD 2 ] vac. inf. mut." : "fragmentum A fragmentum B sup. mut. sup. mut. ]𐀵[ ]𐀅𐁒𐀵 𐂏 [ ] '𐀅𐁒𐀵' 𐂏 2 𐀅]𐁒𐀵 𐂏[ ]𐀫 '𐀅𐁒𐀵' 𐂏 2 inf. mut. ]𐀪 '𐀅𐁒𐀵' 𐂏 2 ] vac. inf. mut." 422 | 423 | # Test LB.A.56 424 | # Scenario: Test that the space is removed after the '+' sign, and the correct sign is printed in the regularized scenario. 425 | # Requirements mapping: 426 | # LB.1-b: If a blank space appears before/after '+', then remove both those spaces in both the annotated and regularized outputs. 427 | 428 | "]r\u0323o\u0323 , / da-mo GRA [ ]8 OLIV+ A 12" : "]𐀫 , / 𐀅𐀗 𐂎 [ ]8 𐂐+𐀀 12" 429 | 430 | # Test LB.A.57 431 | # Scenario: Test that spaces are removed after 'TELA' and before either a '1', '2', '3', '4' or 'x', and the correct sign/s are printed in annotated scenarios. 432 | # Requirements mapping: 433 | # LB.1-d: If a blank space appears after 'TELA' and before either a '1', '2', '3', '4' or 'x', then remove that space in both the annotated and regularized outputs. 434 | 435 | "]\u0323a\u0323-ra-ka-te-ja / tu-na-no TELA 1\u0323 1 [" : "]𐀀𐀨𐀏𐀳𐀊 / 𐀶𐀙𐀜 𐂧¹ 1 [" 436 | "] * 161 TELA 2 [" : "] 𐂩 𐂧² [" 437 | "nu-wa-i-ja , / 'pa-we-a' * 161 TELA 3 30\u27e6 \u27e7" : "𐀝𐀷𐀂𐀊 , / '𐀞𐀸𐀀' 𐂩 𐂧³ 30⟦ ⟧" 438 | "] TELA 4 + PU 1[" : "] 𐂧⁴+𐀢 1[" 439 | "]ti-jo\u2e24 \u2e25 / to-mi-ka TELA x 30" : "]𐀴𐀍⸤ ⸥ / 𐀵𐀖𐀏 𐂧ˣ 30" 440 | ']TELA 10 ⟦ ⟧ *158 1' : ']𐂧 10 ⟦ ⟧ 𐂦 1' 441 | "to-sa TELA 40 o TELA 1 6[" : "𐀵𐀭 𐂧 40 𐀃 𐂧¹ 6[" 442 | 443 | # NEW SCENARIO 444 | # Test LB.A.61 445 | # NEW REQUIREMENT: Tokenise each instance of 'sin.'. Represent this text as is in the annotated output, but do not include in the regularized output. 
446 | 447 | "ku-ro-ro2 AROM 13 T 5 KA±PO 4 *157 28 LANA 5 me-po 6 S 1 V 4 ko-ri-jo-da-na AROM 21 i-re-we[ ] T 2 v. ta-we-si-jo-jo , ke-ro-si-ja VIR 20[ a-pi-qo-o , ke-ro-si-ja VIR 17 [ a-pi-o-to , ke-ro-si-ja VIR 18 o-to-wo-o , ke-ro-si-ja VIR 13 lat. sin. ka-ma-e-we VIR 10" : "𐀓𐀫𐁊 𐂑 13 𐄼 5 𐂓 4 𐂥 28 𐂝 5 𐀕𐀡 6 𐄽 1 𐄾 4 𐀒𐀪𐀍𐀅𐀙 𐂑 21 𐀂𐀩𐀸[ ] 𐄼 2 v. 𐀲𐀸𐀯𐀍𐀍 , 𐀐𐀫𐀯𐀊 𐂀 20[ 𐀀𐀠𐀦𐀃 , 𐀐𐀫𐀯𐀊 𐂀 17 [ 𐀀𐀠𐀃𐀵 , 𐀐𐀫𐀯𐀊 𐂀 18 𐀃𐀵𐀺𐀃 , 𐀐𐀫𐀯𐀊 𐂀 13 lat. sin. 𐀏𐀔𐀁𐀸 𐂀 10" --------------------------------------------------------------------------------