├── setup.cfg
├── tests
    ├── mk_accented_latin.txt
    ├── bg.txt
    ├── sr_latinica.txt
    ├── bg_accented.txt
    ├── by.txt
    ├── sr.txt
    ├── ua.txt
    ├── mk.txt
    ├── mn.txt
    ├── ru.txt
    ├── me.txt
    ├── tj.txt
    ├── mk_accented.txt
    └── bg_windows1251.txt
├── .gitignore
├── cyrtranslit
    ├── mapping
    │   ├── me.py
    │   ├── sr.py
    │   ├── tj.py
    │   ├── ua.py
    │   ├── by.py
    │   ├── ru.py
    │   ├── el.py
    │   ├── mn.py
    │   ├── bg.py
    │   ├── __init__.py
    │   └── mk.py
    ├── cyrtranslit.py
    └── __init__.py
├── LICENSE
├── .github
    └── workflows
    │   └── test.yml
├── setup.py
├── README.md
└── tests.py


/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | # Underscore spelling: setuptools deprecates dash-separated keys such as "description-file".
3 | description_file = README.md


--------------------------------------------------------------------------------
/tests/mk_accented_latin.txt:
--------------------------------------------------------------------------------
1 | ì e tuka
2 | nè sme tamu
3 | 


--------------------------------------------------------------------------------
/tests/bg.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгДдЕеЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя


--------------------------------------------------------------------------------
/tests/sr_latinica.txt:
--------------------------------------------------------------------------------
1 | Dobar dan
2 | Kako si?
3 | Ovo je test fajl.
4 | 


--------------------------------------------------------------------------------
/tests/bg_accented.txt:
--------------------------------------------------------------------------------
1 | ѝ е тук
2 | и аз съм тук
3 | ѝ каза нещо
4 | Ѝ дойде
5 | 


--------------------------------------------------------------------------------
/tests/by.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгДдЕеЁёЖжЗзІіЙйКкЛлМмНнОоПпРрСсТтУуЎўФфХхЦцЧчШшЫыЬьЭэЮюЯя


--------------------------------------------------------------------------------
/tests/sr.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгДдЂђЕеЖжЗзИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШш
2 | 


--------------------------------------------------------------------------------
/tests/ua.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгҐґДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЮюЯяь


--------------------------------------------------------------------------------
/tests/mk.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгДдЃѓЕеЖжЗзЅѕИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЌќУуФфХхЦцЧчЏџШш
2 | 


--------------------------------------------------------------------------------
/tests/mn.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгДдЂђЕеЖжЗзЗ́з́ИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШшС́с́


--------------------------------------------------------------------------------
/tests/ru.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя
2 | 


--------------------------------------------------------------------------------
/tests/me.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгДдЂђЕеЖжЗзЗ́з́ИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШшС́с́
2 | 


--------------------------------------------------------------------------------
/tests/tj.txt:
--------------------------------------------------------------------------------
1 | АаБбВвГгҒғДдЕеЁёЖжЗзИиӢӣЙйКкЛлМмНнОоПпРрСсТтУуӮӯФфХхҲҳЧчҶҷШшъЭэЮюЯя
2 | 


--------------------------------------------------------------------------------
/tests/mk_accented.txt:
--------------------------------------------------------------------------------
1 | ѝ је тука
2 | нѐ сме таму
3 | сѐ е добро
4 | Ѐдна работа
5 | Ѝ си дошла
6 | 


--------------------------------------------------------------------------------
/tests/bg_windows1251.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/opendatakosovo/cyrillic-transliteration/HEAD/tests/bg_windows1251.txt


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .pypirc
 2 | _build
 3 | MANIFEST
 4 | .idea
 5 | # Byte-compiled / optimized / DLL files
 6 | __pycache__/
 7 | *.py[cod]
 8 | 
 9 | # C extensions
10 | *.so
11 | 
12 | # Distribution / packaging
13 | .Python
14 | env/
15 | venv/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | 
31 | # PyInstaller
32 | #  Usually these files are written by a python script from a template
33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 | 
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *,cover
50 | tests/output.txt
51 | 
52 | # Translations
53 | *.mo
54 | *.pot
55 | 
56 | # Django stuff:
57 | *.log
58 | 
59 | # Sphinx documentation
60 | docs/_build/
61 | 
62 | # PyBuilder
63 | target/
64 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/me.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Montenegrin (me) transliteration mappings.
 4 | 
 5 | ISO 3166-1 country code: me
 6 | 
 7 | Montenegrin Latin is based on Serbo-Croatian Latin, with the addition of the two letters Ś and Ź,
 8 | to replace the digraphs SJ and ZJ. These parallel the two letters of the Montenegrin Cyrillic
 9 | alphabet not found in Serbian, С́ and З́. These, respectively, could also be represented in the
10 | original alphabets as šj and žj, and шj and жj.
11 | 
12 | Source: https://en.wikipedia.org/wiki/Montenegrin_alphabet#Latin_alphabet
13 | Also see: http://news.bbc.co.uk/2/hi/8520466.stm
14 | """
15 | 
16 | import copy
17 | from .sr import SR_CYR_TO_LAT_DICT
18 | 
19 | ME_CYR_TO_LAT_DICT = copy.deepcopy(SR_CYR_TO_LAT_DICT)
20 | ME_CYR_TO_LAT_DICT.update({
21 |     u'С́': u'Ś', u'с́': u'ś',  # Montenegrin
22 |     u'З́': u'Ź', u'з́': u'ź'  # Montenegrin
23 | })
24 | 
25 | # This dictionary is to transliterate from Montenegrin Latin to Cyrillic.
26 | ME_LAT_TO_CYR_DICT = {y: x for x, y in iter(ME_CYR_TO_LAT_DICT.items())}
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Open Data Kosovo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 |   pull_request:
 7 |     branches: [ master ]
 8 | 
 9 | jobs:
10 |   test:
11 |     runs-on: ubuntu-latest
12 |     strategy:
13 |       matrix:
14 |         python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
15 | 
16 |     steps:
17 |     - uses: actions/checkout@v4
18 | 
19 |     - name: Set up Python ${{ matrix.python-version }}
20 |       uses: actions/setup-python@v5
21 |       with:
22 |         python-version: ${{ matrix.python-version }}
23 | 
24 |     - name: Install dependencies
25 |       run: |
26 |         python -m pip install --upgrade pip
27 |         pip install pytest pytest-cov
28 |         pip install -e .
29 | 
30 |     - name: Run tests with coverage
31 |       run: |
32 |         pytest tests.py --cov=cyrtranslit --cov-report=xml --cov-report=term-missing
33 | 
34 |     - name: Generate coverage report
35 |       uses: irongut/CodeCoverageSummary@v1.3.0
36 |       with:
37 |         filename: coverage.xml
38 |         badge: true
39 |         format: markdown
40 |         output: both
41 | 
42 |     - name: Add coverage to job summary
43 |       run: |
44 |         cat code-coverage-results.md >> $GITHUB_STEP_SUMMARY
45 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/sr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Serbian (sr/rs) transliteration mappings.
 4 | 
 5 | ISO 639-1 language code: sr
 6 | ISO 3166-1 country code alias: rs
 7 | """
 8 | 
 9 | # Serbian Cyrillic -> Latin, one upper/lower-case pair per row.
10 | SR_CYR_TO_LAT_DICT = {
11 |     u'А': u'A', u'а': u'a',
12 |     u'Б': u'B', u'б': u'b',
13 |     u'В': u'V', u'в': u'v',
14 |     u'Г': u'G', u'г': u'g',
15 |     u'Д': u'D', u'д': u'd',
16 |     u'Ђ': u'Đ', u'ђ': u'đ',
17 |     u'Е': u'E', u'е': u'e',
18 |     u'Ж': u'Ž', u'ж': u'ž',
19 |     u'З': u'Z', u'з': u'z',
20 |     u'И': u'I', u'и': u'i',
21 |     u'Ј': u'J', u'ј': u'j',
22 |     u'К': u'K', u'к': u'k',
23 |     u'Л': u'L', u'л': u'l',
24 |     u'Љ': u'Lj', u'љ': u'lj',  # one Cyrillic letter -> two Latin letters
25 |     u'М': u'M', u'м': u'm',
26 |     u'Н': u'N', u'н': u'n',
27 |     u'Њ': u'Nj', u'њ': u'nj',  # one Cyrillic letter -> two Latin letters
28 |     u'О': u'O', u'о': u'o',
29 |     u'П': u'P', u'п': u'p',
30 |     u'Р': u'R', u'р': u'r',
31 |     u'С': u'S', u'с': u's',
32 |     u'Т': u'T', u'т': u't',
33 |     u'Ћ': u'Ć', u'ћ': u'ć',
34 |     u'У': u'U', u'у': u'u',
35 |     u'Ф': u'F', u'ф': u'f',
36 |     u'Х': u'H', u'х': u'h',
37 |     u'Ц': u'C', u'ц': u'c',
38 |     u'Ч': u'Č', u'ч': u'č',
39 |     u'Џ': u'Dž', u'џ': u'dž',  # one Cyrillic letter -> two Latin letters
40 |     u'Ш': u'Š', u'ш': u'š',
41 | }
42 | 
43 | # This dictionary is to transliterate from Serbian Latin to Cyrillic.
44 | # Let's build it by simply swapping keys and values of previous dictionary.
45 | SR_LAT_TO_CYR_DICT = {y: x for x, y in iter(SR_CYR_TO_LAT_DICT.items())}
46 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/tj.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Tajik (tj) transliteration mappings.
 4 | 
 5 | ISO 3166-1 country code: tj (the ISO 639-1 language code for Tajik is tg)
 6 | 
 7 | Transliteration follows ISO 9 (1995).
 8 | https://en.wikipedia.org/wiki/Tajik_alphabet#Cyrillic
 9 | """
10 | 
11 | import copy
12 | from .ru import RU_CYR_TO_LAT_DICT
13 | 
14 | # Transliterate from Tajik cyrillic to latin
15 | TJ_CYR_TO_LAT_DICT = copy.deepcopy(RU_CYR_TO_LAT_DICT)
16 | # Change Mapping according to ISO 9 (1995)
17 | TJ_CYR_TO_LAT_DICT[u"Э"] = u"È"
18 | TJ_CYR_TO_LAT_DICT[u"э"] = u"è"
19 | TJ_CYR_TO_LAT_DICT[u"ъ"] = u"’"
20 | TJ_CYR_TO_LAT_DICT[u"Х"] = u"H"
21 | TJ_CYR_TO_LAT_DICT[u"х"] = u"h"
22 | TJ_CYR_TO_LAT_DICT[u"Ч"] = u"Č"
23 | TJ_CYR_TO_LAT_DICT[u"ч"] = u"č"
24 | TJ_CYR_TO_LAT_DICT[u"Ж"] = u"Ž"
25 | TJ_CYR_TO_LAT_DICT[u"ж"] = u"ž"
26 | TJ_CYR_TO_LAT_DICT[u"Ё"] = u"Ë"
27 | TJ_CYR_TO_LAT_DICT[u"ё"] = u"ë"
28 | TJ_CYR_TO_LAT_DICT[u"Ш"] = u"Š"
29 | TJ_CYR_TO_LAT_DICT[u"ш"] = u"š"
30 | TJ_CYR_TO_LAT_DICT[u"Ю"] = u"Û"
31 | TJ_CYR_TO_LAT_DICT[u"ю"] = u"û"
32 | TJ_CYR_TO_LAT_DICT[u"Я"] = u"Â"
33 | TJ_CYR_TO_LAT_DICT[u"я"] = u"â"
34 | # delete letters not used
35 | del TJ_CYR_TO_LAT_DICT[u"Ц"]
36 | del TJ_CYR_TO_LAT_DICT[u"ц"]
37 | del TJ_CYR_TO_LAT_DICT[u"Щ"]
38 | del TJ_CYR_TO_LAT_DICT[u"щ"]
39 | del TJ_CYR_TO_LAT_DICT[u"Ы"]
40 | del TJ_CYR_TO_LAT_DICT[u"ы"]
41 | 
42 | # update the dict for the additional letters in the tajik cyrillic alphabet ( Ғ, Ӣ, Қ, Ӯ, Ҳ, Ҷ )
43 | # see https://en.wikipedia.org/wiki/Tajik_alphabet#Cyrillic
44 | TJ_CYR_TO_LAT_DICT.update({
45 |     u"Ғ": u"Ǧ", u"ғ": u"ǧ",
46 |     u"Ӣ": u"Ī", u"ӣ": u"ī",
47 |     u"Қ": u"Q", u"қ": u"q",
48 |     u"Ӯ": u"Ū", u"ӯ": u"ū",
49 |     u"Ҳ": u"Ḩ", u"ҳ": u"ḩ",
50 |     u"Ҷ": u"Ç", u"ҷ": u"ç"
51 | })
52 | 
53 | # transliterate from latin tajik to cyrillic
54 | TJ_LAT_TO_CYR_DICT = {y: x for x, y in iter(TJ_CYR_TO_LAT_DICT.items())}
55 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/ua.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Ukrainian (ua) transliteration mappings.
 4 | 
 5 | ISO 3166-1 country code: ua (the ISO 639-1 language code for Ukrainian is uk)
 6 | 
 7 | Transliteration follows Scientific Ukrainian transliteration system.
 8 | """
 9 | 
10 | import copy
11 | from .ru import RU_CYR_TO_LAT_DICT
12 | 
13 | # Transliterate from Ukrainian
14 | UA_CYR_TO_LAT_DICT = copy.deepcopy(RU_CYR_TO_LAT_DICT)
15 | # Change mapping to match with Scientific Ukrainian
16 | UA_CYR_TO_LAT_DICT[u"Г"] = u"H"
17 | UA_CYR_TO_LAT_DICT[u"г"] = u"h"
18 | UA_CYR_TO_LAT_DICT[u"Ж"] = u"Ž"
19 | UA_CYR_TO_LAT_DICT[u"ж"] = u"ž"
20 | UA_CYR_TO_LAT_DICT[u"И"] = u"Y"
21 | UA_CYR_TO_LAT_DICT[u"и"] = u"y"
22 | UA_CYR_TO_LAT_DICT[u"Х"] = u"X"
23 | UA_CYR_TO_LAT_DICT[u"х"] = u"x"
24 | UA_CYR_TO_LAT_DICT[u"Ц"] = u"C"
25 | UA_CYR_TO_LAT_DICT[u"ц"] = u"c"
26 | UA_CYR_TO_LAT_DICT[u"Ч"] = u"Č"
27 | UA_CYR_TO_LAT_DICT[u"ч"] = u"č"
28 | UA_CYR_TO_LAT_DICT[u"Ш"] = u"Š"
29 | UA_CYR_TO_LAT_DICT[u"ш"] = u"š"
30 | UA_CYR_TO_LAT_DICT[u"Щ"] = u"Šč"
31 | UA_CYR_TO_LAT_DICT[u"щ"] = u"šč"
32 | UA_CYR_TO_LAT_DICT[u"Ю"] = u"Ju"
33 | UA_CYR_TO_LAT_DICT[u"ю"] = u"ju"
34 | UA_CYR_TO_LAT_DICT[u"Я"] = u"Ja"
35 | UA_CYR_TO_LAT_DICT[u"я"] = u"ja"
36 | # Delete unused letters
37 | del UA_CYR_TO_LAT_DICT[u"Ё"]
38 | del UA_CYR_TO_LAT_DICT[u"ё"]
39 | del UA_CYR_TO_LAT_DICT[u"Ъ"]
40 | del UA_CYR_TO_LAT_DICT[u"ъ"]
41 | del UA_CYR_TO_LAT_DICT[u"Ы"]
42 | del UA_CYR_TO_LAT_DICT[u"ы"]
43 | del UA_CYR_TO_LAT_DICT[u"Э"]
44 | del UA_CYR_TO_LAT_DICT[u"э"]
45 | 
46 | # Update for Ukrainian letters
47 | UA_CYR_TO_LAT_DICT.update({
48 |     u"Ґ": u"G", u"ґ": u"g",
49 |     u"Є": u"Je", u"є": u"je",
50 |     u"І": u"I", u"і": u"i",
51 |     u"Ї": u"Ji", u"ї": u"ji"
52 | })
53 | 
54 | # Latin to Cyrillic
55 | UA_LAT_TO_CYR_DICT = {y: x for x, y in iter(UA_CYR_TO_LAT_DICT.items())}
56 | UA_LAT_TO_CYR_DICT.update({
57 |     u"JE": u"Є", u"jE": u"є",
58 |     u"JI": u"Ї", u"jI": u"ї"
59 | })
60 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/by.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Belarusian (by) transliteration mappings.
 4 | 
 5 | ISO 3166-1 country code: by (the ISO 639-1 language code for Belarusian is be)
 6 | 
 7 | Transliteration follows ISO 9:1995 and BGN/PCGN romanization standards.
 8 | https://en.wikipedia.org/wiki/Belarusian_alphabet
 9 | https://en.wikipedia.org/wiki/Romanization_of_Belarusian
10 | """
11 | 
12 | import copy
13 | from .ru import RU_CYR_TO_LAT_DICT
14 | 
15 | # Transliterate from Belarusian (based on ISO 9:1995 and BGN/PCGN)
16 | BY_CYR_TO_LAT_DICT = copy.deepcopy(RU_CYR_TO_LAT_DICT)
17 | # Change mapping to match Belarusian scientific transliteration
18 | BY_CYR_TO_LAT_DICT[u"Г"] = u"H"
19 | BY_CYR_TO_LAT_DICT[u"г"] = u"h"
20 | BY_CYR_TO_LAT_DICT[u"Ё"] = u"Ë"
21 | BY_CYR_TO_LAT_DICT[u"ё"] = u"ë"
22 | BY_CYR_TO_LAT_DICT[u"Ж"] = u"Ž"
23 | BY_CYR_TO_LAT_DICT[u"ж"] = u"ž"
24 | BY_CYR_TO_LAT_DICT[u"Х"] = u"X"
25 | BY_CYR_TO_LAT_DICT[u"х"] = u"x"
26 | BY_CYR_TO_LAT_DICT[u"Ц"] = u"C"
27 | BY_CYR_TO_LAT_DICT[u"ц"] = u"c"
28 | BY_CYR_TO_LAT_DICT[u"Ч"] = u"Č"
29 | BY_CYR_TO_LAT_DICT[u"ч"] = u"č"
30 | BY_CYR_TO_LAT_DICT[u"Ш"] = u"Š"
31 | BY_CYR_TO_LAT_DICT[u"ш"] = u"š"
32 | BY_CYR_TO_LAT_DICT[u"Ы"] = u"Y"
33 | BY_CYR_TO_LAT_DICT[u"ы"] = u"y"
34 | BY_CYR_TO_LAT_DICT[u"Ь"] = u"'"
35 | BY_CYR_TO_LAT_DICT[u"ь"] = u"'"
36 | BY_CYR_TO_LAT_DICT[u"Э"] = u"Ė"
37 | BY_CYR_TO_LAT_DICT[u"э"] = u"ė"
38 | BY_CYR_TO_LAT_DICT[u"Ю"] = u"Ju"
39 | BY_CYR_TO_LAT_DICT[u"ю"] = u"ju"
40 | BY_CYR_TO_LAT_DICT[u"Я"] = u"Ja"
41 | BY_CYR_TO_LAT_DICT[u"я"] = u"ja"
42 | # Delete letters not used in Belarusian
43 | del BY_CYR_TO_LAT_DICT[u"Щ"]
44 | del BY_CYR_TO_LAT_DICT[u"щ"]
45 | del BY_CYR_TO_LAT_DICT[u"Ъ"]
46 | del BY_CYR_TO_LAT_DICT[u"ъ"]
47 | # Update for Belarusian-specific letters
48 | BY_CYR_TO_LAT_DICT.update({
49 |     u"І": u"I", u"і": u"i",
50 |     u"Ў": u"Ŭ", u"ў": u"ŭ"
51 | })
52 | 
53 | # Latin to Cyrillic
54 | BY_LAT_TO_CYR_DICT = {y: x for x, y in iter(BY_CYR_TO_LAT_DICT.items())}
55 | BY_LAT_TO_CYR_DICT.update({
56 |     u"JU": u"Ю", u"Ju": u"Ю", u"ju": u"ю",
57 |     u"JA": u"Я", u"Ja": u"Я", u"ja": u"я",
58 |     u"''": u"Ьь"  # Two apostrophes for Ьь
59 | })
60 | # Single apostrophe defaults to lowercase ь
61 | BY_LAT_TO_CYR_DICT[u"'"] = u"ь"
62 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from setuptools import setup
 3 | 
 4 | # use this to read the contents of the README.md file
 5 | from pathlib import Path
 6 | 
 7 | setup(
 8 |   name='cyrtranslit',
 9 |   packages=['cyrtranslit', 'cyrtranslit.mapping'],
10 |   version='1.2.0',
11 |   description='Bi-directional Cyrillic transliteration. Transliterate Cyrillic script to Latin script and vice versa. Supports transliteration for Belarusian, Bulgarian, Greek, Montenegrin, Macedonian, Mongolian, Russian, Serbian, Tajik, and Ukrainian.',
12 |   long_description=(Path(__file__).parent / "README.md").read_text(),
13 |   long_description_content_type='text/markdown',
14 |   author='Georges Labrèche, Open Data Kosovo',
15 |   author_email='georges@tanagraspace.com',
16 |   url='https://github.com/opendatakosovo/cyrillic-transliteration',
17 |   download_url='https://github.com/opendatakosovo/cyrillic-transliteration/archive/v1.2.0.tar.gz',
18 |   license='MIT',
19 |   keywords=['cyrillic', 'latin', 'transliteration', 'transliterate', 'cyrtranslit', 'belarusian', 'bulgarian', 'greek', 'montenegrin', 'macedonian', 'mongolian', 'russian', 'serbian', 'tajik', 'ukrainian'],
20 |   classifiers=['Development Status :: 5 - Production/Stable',
21 |                'Intended Audience :: Developers',
22 |                'License :: OSI Approved :: MIT License',
23 |                'Programming Language :: Python',
24 |                'Programming Language :: Python :: 2.7',
25 |                'Programming Language :: Python :: 3',
26 |                'Programming Language :: Python :: 3.1',
27 |                'Programming Language :: Python :: 3.2',
28 |                'Programming Language :: Python :: 3.3',
29 |                'Programming Language :: Python :: 3.4',
30 |                'Programming Language :: Python :: 3.5',
31 |                'Programming Language :: Python :: 3.6',
32 |                'Programming Language :: Python :: 3.7',
33 |                'Programming Language :: Python :: 3.8',
34 |                'Programming Language :: Python :: 3.9',
35 |                'Programming Language :: Python :: 3.10',
36 |                'Programming Language :: Python :: 3.11',
37 |                'Programming Language :: Python :: 3.12',
38 |                'Programming Language :: Python :: 3.13',
39 |                'Programming Language :: Python :: 3.14'],
40 |   entry_points={
41 |       "console_scripts": [
42 |           "cyrtranslit=cyrtranslit.cyrtranslit:main",
43 |       ]
44 |   }
45 | )


--------------------------------------------------------------------------------
/cyrtranslit/mapping/ru.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Russian (ru) transliteration mappings.
 4 | 
 5 | ISO 639-1 language code: ru
 6 | 
 7 | Transliteration follows GOST 7.79-2000 System B.
 8 | """
 9 | 
10 | # Russian Cyrillic -> Latin (GOST 7.79-2000 System B), one upper/lower-case pair per row.
11 | RU_CYR_TO_LAT_DICT = {
12 |     u"А": u"A", u"а": u"a",
13 |     u"Б": u"B", u"б": u"b",
14 |     u"В": u"V", u"в": u"v",
15 |     u"Г": u"G", u"г": u"g",
16 |     u"Д": u"D", u"д": u"d",
17 |     u"Е": u"E", u"е": u"e",
18 |     u"Ё": u"YO", u"ё": u"yo",
19 |     u"Ж": u"ZH", u"ж": u"zh",
20 |     u"З": u"Z", u"з": u"z",
21 |     u"И": u"I", u"и": u"i",
22 |     u"Й": u"J", u"й": u"j",
23 |     u"К": u"K", u"к": u"k",
24 |     u"Л": u"L", u"л": u"l",
25 |     u"М": u"M", u"м": u"m",
26 |     u"Н": u"N", u"н": u"n",
27 |     u"О": u"O", u"о": u"o",
28 |     u"П": u"P", u"п": u"p",
29 |     u"Р": u"R", u"р": u"r",
30 |     u"С": u"S", u"с": u"s",
31 |     u"Т": u"T", u"т": u"t",
32 |     u"У": u"U", u"у": u"u",
33 |     u"Ф": u"F", u"ф": u"f",
34 |     u"Х": u"H", u"х": u"h",
35 |     u"Ц": u"CZ", u"ц": u"cz",
36 |     u"Ч": u"CH", u"ч": u"ch",
37 |     u"Ш": u"SH", u"ш": u"sh",
38 |     u"Щ": u"SHH", u"щ": u"shh",
39 |     u"Ъ": u"''", u"ъ": u"''",  # both cases share one Latin form
40 |     u"Ы": u"Y'", u"ы": u"y'",
41 |     u"Ь": u"'", u"ь": u"'",  # both cases share one Latin form
42 |     u"Э": u"E'", u"э": u"e'",
43 |     u"Ю": u"Yu", u"ю": u"yu",  # mixed case, unlike the all-caps YO/ZH/CH rows above
44 |     u"Я": u"Ya", u"я": u"ya",  # mixed case, unlike the all-caps YO/ZH/CH rows above
45 | }
46 | 
47 | # Russian Latin -> Cyrillic: the inverted table, then extended and partly overridden below.
48 | RU_LAT_TO_CYR_DICT = {y: x for x, y in RU_CYR_TO_LAT_DICT.items()}
49 | RU_LAT_TO_CYR_DICT.update({
50 |     u"''": u"ъ",  # Ъ and ъ share this Latin form; pin the lower-case letter
51 |     u"'": u"ь",  # Ь and ь share this Latin form; pin the lower-case letter
52 |     u"C": u"К", u"c": u"к",
53 |     u"CK": u"К", u"Ck": u"К", u"ck": u"к",
54 |     u"JA": u"ЖА", u"Ja": u"Жа", u"ja": u"жа",
55 |     u"JE": u"ЖЕ", u"Je": u"Же", u"je": u"же",
56 |     u"JI": u"ЖИ", u"Ji": u"Жи", u"ji": u"жи",
57 |     u"JO": u"ЖО", u"Jo": u"Жо", u"jo": u"жо",
58 |     u"JU": u"ЖУ", u"Ju": u"Жу", u"ju": u"жу",
59 |     u"PH": u"Ф", u"Ph": u"Ф", u"ph": u"ф",
60 |     u"TH": u"З", u"Th": u"З", u"th": u"з",
61 |     u"W": u"В", u"w": u"в", u"Q": u"К", u"q": u"к",
62 |     u"WH": u"В", u"Wh": u"В", u"wh": u"в",
63 |     u"Y": u"И", u"y": u"и",
64 |     u"YA": u"Я", u"Ya": u"Я", u"ya": u"я",
65 |     u"YE": u"Е", u"Ye": u"Е", u"ye": u"е",
66 |     u"YI": u"И", u"Yi": u"И", u"yi": u"и",
67 |     u"YO": u"Ё", u"Yo": u"Ё", u"yo": u"ё",
68 |     u"YU": u"Ю", u"Yu": u"Ю", u"yu": u"ю",
69 |     u"Y'": u"ы", u"y'": u"ы",  # NOTE(review): overrides the inverted entry Y' -> Ы with lower-case ы; confirm intended
70 |     u"iy": u"ый", u"ij": u"ый",  # dobriy => добрый
71 | })
72 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/el.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Greek (el) transliteration mappings.
 4 | 
 5 | ISO 639-1 language code: el
 6 | 
 7 | Transliteration follows ELOT 743 / ISO 843.
 8 | https://en.wikipedia.org/wiki/Greek_alphabet
 9 | https://en.wikipedia.org/wiki/Romanization_of_Greek
10 | """
11 | 
12 | # Greek -> Latin (ELOT 743 / ISO 843); a vowel with tonos/dialytika gets the same Latin letter as the plain vowel.
13 | EL_GRE_TO_LAT_DICT = {
14 |     u"Α": u"A", u"α": u"a", u"Ά": u"A", u"ά": u"a",  # alpha (with/without tonos)
15 |     u"Β": u"V", u"β": u"v",
16 |     u"Γ": u"G", u"γ": u"g",
17 |     u"Δ": u"D", u"δ": u"d",
18 |     u"Ε": u"E", u"ε": u"e", u"Έ": u"E", u"έ": u"e",  # epsilon (with/without tonos)
19 |     u"Ζ": u"Z", u"ζ": u"z",
20 |     u"Η": u"H", u"η": u"h", u"Ή": u"H", u"ή": u"h",  # eta (with/without tonos); NOTE(review): ELOT 743 appears to romanise eta as "i" - confirm H is intended
21 |     u"Θ": u"Th", u"θ": u"th",
22 |     u"Ι": u"I", u"ι": u"i", u"Ί": u"I", u"ί": u"i", u"Ϊ": u"I", u"ϊ": u"i",  # iota (with tonos/dialytika)
23 |     u"Κ": u"K", u"κ": u"k",
24 |     u"Λ": u"L", u"λ": u"l",
25 |     u"Μ": u"M", u"μ": u"m",
26 |     u"Ν": u"N", u"ν": u"n",
27 |     u"Ξ": u"X", u"ξ": u"x",
28 |     u"Ο": u"O", u"ο": u"o", u"Ό": u"O", u"ό": u"o",  # omicron (with/without tonos)
29 |     u"Π": u"P", u"π": u"p",
30 |     u"Ρ": u"R", u"ρ": u"r",
31 |     u"Σ": u"S", u"σ": u"s", u"ς": u"s",  # sigma (ς is final sigma)
32 |     u"Τ": u"T", u"τ": u"t",
33 |     u"Υ": u"Y", u"υ": u"y", u"Ύ": u"Y", u"ύ": u"y", u"Ϋ": u"Y", u"ϋ": u"y",  # upsilon (with tonos/dialytika)
34 |     u"Φ": u"F", u"φ": u"f",
35 |     u"Χ": u"Ch", u"χ": u"ch",
36 |     u"Ψ": u"Ps", u"ψ": u"ps",
37 |     u"Ω": u"W", u"ω": u"w", u"Ώ": u"W", u"ώ": u"w",  # omega (with/without tonos); NOTE(review): ELOT 743 appears to romanise omega as "o" - confirm W is intended
38 | }
39 | 
40 | # Latin -> Greek.
41 | # Written out by hand rather than inverted, so that only unaccented letters are produced
42 | # (accented vowels transliterate to the same Latin as unaccented, so we default to unaccented).
43 | EL_LAT_TO_GRE_DICT = {
44 |     u"A": u"Α", u"a": u"α",
45 |     u"V": u"Β", u"v": u"β",
46 |     u"G": u"Γ", u"g": u"γ",
47 |     u"D": u"Δ", u"d": u"δ",
48 |     u"E": u"Ε", u"e": u"ε",
49 |     u"Z": u"Ζ", u"z": u"ζ",
50 |     u"H": u"Η", u"h": u"η",
51 |     u"I": u"Ι", u"i": u"ι",
52 |     u"K": u"Κ", u"k": u"κ",
53 |     u"L": u"Λ", u"l": u"λ",
54 |     u"M": u"Μ", u"m": u"μ",
55 |     u"N": u"Ν", u"n": u"ν",
56 |     u"X": u"Ξ", u"x": u"ξ",
57 |     u"O": u"Ο", u"o": u"ο",
58 |     u"P": u"Π", u"p": u"π",
59 |     u"R": u"Ρ", u"r": u"ρ",
60 |     u"S": u"Σ", u"s": u"σ",  # this table never yields final sigma ς
61 |     u"T": u"Τ", u"t": u"τ",
62 |     u"Y": u"Υ", u"y": u"υ",
63 |     u"F": u"Φ", u"f": u"φ",
64 |     u"W": u"Ω", u"w": u"ω",
65 | }
66 | EL_LAT_TO_GRE_DICT.update({  # two-letter Latin spellings of single Greek letters
67 |     u"TH": u"Θ", u"Th": u"Θ", u"th": u"θ",
68 |     u"CH": u"Χ", u"Ch": u"Χ", u"ch": u"χ",
69 |     u"PS": u"Ψ", u"Ps": u"Ψ", u"ps": u"ψ",
70 | })
71 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/mn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Mongolian (mn) transliteration mappings.
 4 | 
 5 | ISO 639-1 language code: mn
 6 | 
 7 | This version of Mongolian Latin <-> Cyrillic is based on MNS 5217:2012,
 8 | which, as far as I know, is the latest standard. If it is not, let me know at https://github.com/Serbipunk
 9 | 
10 | References:
11 | https://gogo.mn/r/101115
12 | https://en.wikipedia.org/wiki/Mongolian_Cyrillic_alphabet
13 | """
14 | 
15 | # Flat list of alternating Cyrillic and Latin strings; order matters to the tables built from it.
16 | # Format: [Cyrillic_upper, Latin_upper, Cyrillic_lower, Latin_lower, ...]
17 | MN_CYR_LAT_LIST = [
18 |     u"А", u"A", u"а", u"a",
19 |     u"Э", u"E", u"э", u"e",
20 |     u"И", u"I", u"и", u"i",  # 1st of four letters written I/i
21 |     u"О", u"O", u"о", u"o",
22 |     u"У", u"U", u"у", u"u",
23 |     u"Ө", u"Ö", u"ө", u"ö",
24 |     u"Ү", u"Ü", u"ү", u"ü",
25 |     u"Н", u"N", u"н", u"n",
26 |     u"М", u"M", u"м", u"m",
27 |     u"Л", u"L", u"л", u"l",
28 |     u"В", u"V", u"в", u"v",
29 |     u"П", u"P", u"п", u"p",
30 |     u"Ф", u"F", u"ф", u"f",
31 |     u"К", u"K", u"к", u"k",
32 |     u"Х", u"Kh", u"х", u"kh",        # two-letter Latin form
33 |     u"Х", u"KH", u"х", u"kH",        # same letter, second Latin letter upper-case
34 |     u"Г", u"G", u"г", u"g",
35 |     u"С", u"S", u"с", u"s",
36 |     u"Ш", u"Sh", u"ш", u"sh",  # two-letter Latin form
37 |     u"Ш", u"SH", u"ш", u"sH",  # same letter, second Latin letter upper-case
38 |     u"Т", u"T", u"т", u"t",
39 |     u"Д", u"D", u"д", u"d",
40 |     u"Ц", u"Ts", u"ц", u"ts",        # two-letter Latin form
41 |     u"Ц", u"TS", u"ц", u"tS",        # same letter, second Latin letter upper-case
42 |     u"Ч", u"Ch", u"ч", u"ch",        # two-letter Latin form
43 |     u"Ч", u"CH", u"ч", u"cH",        # same letter, second Latin letter upper-case
44 |     u"З", u"Z", u"з", u"z",
45 |     u"Ж", u"J", u"ж", u"j",
46 |     u"Й", u"I", u"й", u"i",  # 2nd of four letters written I/i
47 |     u"Р", u"R", u"р", u"r",
48 |     u"Б", u"B", u"б", u"b",
49 |     u"Е", u"Ye", u"е", u"ye",             # two-letter Latin form
50 |     u"Е", u"YE", u"е", u"yE",             # same letter, second Latin letter upper-case
51 |     u"Ё", u"Yo", u"ё", u"yo",             # two-letter Latin form
52 |     u"Ё", u"YO", u"ё", u"yO",             # same letter, second Latin letter upper-case
53 |     u"Ъ", u"I", u"ъ", u"i",  # 3rd of four letters written I/i
54 |     u"Ы", u"Y", u"ы", u"y",
55 |     u"Ь", u"I", u"ь", u"i",  # 4th of four letters written I/i
56 |     u"Ю", u"Yu", u"ю", u"yu",             # two-letter Latin form
57 |     u"Ю", u"YU", u"ю", u"yU",             # same letter, second Latin letter upper-case
58 |     u"Я", u"Ya", u"я", u"ya",             # two-letter Latin form
59 |     u"Я", u"YA", u"я", u"yA",             # same letter, second Latin letter upper-case
60 | ]
61 | # Building the dictionary with the filter to skip pairs with 2-character Latin letters where the second character is uppercase
62 | MN_CYR_TO_LAT_DICT = {
63 |     c: l for c, l in zip(MN_CYR_LAT_LIST[::2], MN_CYR_LAT_LIST[1::2])
64 |     if not (len(l) == 2 and l[1].isupper())
65 | }
66 | 
67 | # Handle Щ (shcha): This letter is part of Mongolian Cyrillic (inherited from Russian)
68 | # but is rarely used in practice. It's pronounced the same as Ш (/ʃ/), so both
69 | # transliterate to "Sh/sh" in Latin. When going Latin → Cyrillic, "Sh" defaults to Ш.
70 | MN_CYR_TO_LAT_DICT['Щ'] = 'Sh'
71 | MN_CYR_TO_LAT_DICT['щ'] = 'sh'
72 | 
73 | MN_LAT_TO_CYR_DICT = dict([(l, c) for c, l in zip(MN_CYR_LAT_LIST[-2::-2], MN_CYR_LAT_LIST[-1::-2])])
74 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/bg.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Bulgarian (bg) transliteration mappings.
 4 | 
 5 | ISO 639-1 language code: bg
 6 | 
 7 | Supports accented I with grave for stress marking and homograph disambiguation.
 8 | Following ISO 9:1995.
 9 | """
10 | 
11 | import copy
12 | from .ru import RU_CYR_TO_LAT_DICT
13 | 
14 | # Bulgarian Cyrillic -> Latin, derived from a private copy of the Russian table.
15 | BG_CYR_TO_LAT_DICT = copy.deepcopy(RU_CYR_TO_LAT_DICT)
16 | 
17 | # Three Russian letters (both cases) that don't exist in Bulgarian:
18 | del BG_CYR_TO_LAT_DICT[u"Ё"]
19 | del BG_CYR_TO_LAT_DICT[u"ё"]
20 | del BG_CYR_TO_LAT_DICT[u"Ы"]
21 | del BG_CYR_TO_LAT_DICT[u"ы"]
22 | del BG_CYR_TO_LAT_DICT[u"Э"]
23 | del BG_CYR_TO_LAT_DICT[u"э"]
24 | 
25 | # Letters whose Latin form differs from the Russian table
26 | BG_CYR_TO_LAT_DICT[u"Й"] = u"Y"
27 | BG_CYR_TO_LAT_DICT[u"й"] = u"y"
28 | BG_CYR_TO_LAT_DICT[u"Х"] = u"H"
29 | BG_CYR_TO_LAT_DICT[u"х"] = u"h"
30 | BG_CYR_TO_LAT_DICT[u"Ц"] = u"TS"
31 | BG_CYR_TO_LAT_DICT[u"ц"] = u"ts"
32 | BG_CYR_TO_LAT_DICT[u"Щ"] = u"SHT"
33 | BG_CYR_TO_LAT_DICT[u"щ"] = u"sht"
34 | BG_CYR_TO_LAT_DICT[u"Ю"] = u"YU"
35 | BG_CYR_TO_LAT_DICT[u"ю"] = u"yu"
36 | BG_CYR_TO_LAT_DICT[u"Я"] = u"YA"
37 | BG_CYR_TO_LAT_DICT[u"я"] = u"ya"
38 | 
39 | # The following letters use the pre-2012 "Andreichin" system for lettering,
40 | # because in the newest "Ivanov" system "a" and "y" each translate to two Bulgarian
41 | # letters, and choosing between them depends on the word and text context.
42 | # https://en.wikipedia.org/wiki/Romanization_of_Bulgarian
43 | BG_CYR_TO_LAT_DICT[u"Ъ"] = u"Ă"
44 | BG_CYR_TO_LAT_DICT[u"ъ"] = u"ă"
45 | BG_CYR_TO_LAT_DICT[u"Ь"] = u"J"
46 | BG_CYR_TO_LAT_DICT[u"ь"] = u"j"
47 | 
48 | # Transliterate from Latin Bulgarian to Cyrillic.
49 | # Build this BEFORE adding the accented Cyrillic characters, so that "I"/"i" invert to И/и and not to Ѝ/ѝ
50 | BG_LAT_TO_CYR_DICT = {y: x for x, y in iter(BG_CYR_TO_LAT_DICT.items())}
51 | 
52 | # Accented vowels with grave accent (used for stress marking and homograph disambiguation)
53 | # Following ISO 9:1995
54 | # Source: https://en.wikipedia.org/wiki/I_with_grave_(Cyrillic)
55 | # Used to distinguish: ѝ (her) vs и (and)
56 | #
57 | # By default (preserve_accents=False), accented Cyrillic maps to unaccented Latin
58 | BG_CYR_TO_LAT_DICT[u"Ѝ"] = u"I"  # Cyrillic I with grave → I (U+040D)
59 | BG_CYR_TO_LAT_DICT[u"ѝ"] = u"i"  # Cyrillic i with grave → i (U+045D)
60 | 
61 | # Accented map: When preserve_accents=True, these override the standard mappings
62 | BG_CYR_TO_LAT_ACCENTED_DICT = {
63 |     u"Ѝ": u"Ì",  # Cyrillic I with grave → Ì
64 |     u"ѝ": u"ì",  # Cyrillic i with grave → ì
65 | }
66 | 
67 | BG_LAT_TO_CYR_DICT.update({  # multi-letter Latin spellings in all-caps, capitalised and lower-case form
68 |     u"ZH": u"Ж", u"Zh": u"Ж", u"zh": u"ж",
69 |     u"TS": u"Ц", u"Ts": u"Ц", u"ts": u"ц",
70 |     u"CH": u"Ч", u"Ch": u"Ч", u"ch": u"ч",
71 |     u"SH": u"Ш", u"Sh": u"Ш", u"sh": u"ш",
72 |     u"SHT": u"Щ", u"Sht": u"Щ", u"sht": u"щ",
73 |     u"YU": u"Ю", u"Yu": u"Ю", u"yu": u"ю",
74 |     u"YA": u"Я", u"Ya": u"Я", u"ya": u"я",
75 |     # Accented Latin to unaccented Cyrillic (preserve_accents=False)
76 |     u"Ì": u"И", u"ì": u"и",  # Latin I with grave → Cyrillic I
77 | })
78 | 
79 | # Accented map for Latin→Cyrillic: When preserve_accents=True, these override the entries above
80 | BG_LAT_TO_CYR_ACCENTED_DICT = {
81 |     u"Ì": u"Ѝ", u"ì": u"ѝ",  # Latin I with grave → Cyrillic Ѝ
82 | }
83 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Cyrillic transliteration mapping package.
 4 | 
 5 | This package contains transliteration mappings for various Cyrillic and Greek scripts.
 6 | Each language has its own module with specific transliteration dictionaries.
 7 | 
 8 | Supported languages:
 9 | - bg: Bulgarian
10 | - by: Belarusian
11 | - el: Greek
12 | - me: Montenegrin
13 | - mk: Macedonian
14 | - mn: Mongolian
15 | - rs: Serbian (ISO 3166-1 country code alias)
16 | - ru: Russian
17 | - sr: Serbian
18 | - tj: Tajik
19 | - ua: Ukrainian
20 | """
21 | 
22 | # Import all language-specific mappings
23 | from .sr import SR_CYR_TO_LAT_DICT, SR_LAT_TO_CYR_DICT
24 | from .me import ME_CYR_TO_LAT_DICT, ME_LAT_TO_CYR_DICT
25 | from .mk import (
26 |     MK_CYR_TO_LAT_DICT,
27 |     MK_LAT_TO_CYR_DICT,
28 |     MK_CYR_TO_LAT_ACCENTED_DICT,
29 |     MK_LAT_TO_CYR_ACCENTED_DICT
30 | )
31 | from .ru import RU_CYR_TO_LAT_DICT, RU_LAT_TO_CYR_DICT
32 | from .tj import TJ_CYR_TO_LAT_DICT, TJ_LAT_TO_CYR_DICT
33 | from .bg import (
34 |     BG_CYR_TO_LAT_DICT,
35 |     BG_LAT_TO_CYR_DICT,
36 |     BG_CYR_TO_LAT_ACCENTED_DICT,
37 |     BG_LAT_TO_CYR_ACCENTED_DICT
38 | )
39 | from .ua import UA_CYR_TO_LAT_DICT, UA_LAT_TO_CYR_DICT
40 | from .by import BY_CYR_TO_LAT_DICT, BY_LAT_TO_CYR_DICT
41 | from .mn import MN_CYR_TO_LAT_DICT, MN_LAT_TO_CYR_DICT
42 | from .el import EL_GRE_TO_LAT_DICT, EL_LAT_TO_GRE_DICT
43 | 
# Lookup table: language code -> transliteration tables for that language.
# Every entry has 'tolatin' and 'tocyrillic'; languages with accent support
# additionally provide 'tolatin_accented' and 'tocyrillic_accented'.
TRANSLIT_DICT = {}

# Serbian (ISO 639-1 language code)
TRANSLIT_DICT['sr'] = {
    'tolatin': SR_CYR_TO_LAT_DICT,
    'tocyrillic': SR_LAT_TO_CYR_DICT,
}

# Serbian (ISO 3166-1 country code alias) - same tables as 'sr'
TRANSLIT_DICT['rs'] = {
    'tolatin': SR_CYR_TO_LAT_DICT,
    'tocyrillic': SR_LAT_TO_CYR_DICT,
}

# Montenegro
TRANSLIT_DICT['me'] = {
    'tolatin': ME_CYR_TO_LAT_DICT,
    'tocyrillic': ME_LAT_TO_CYR_DICT,
}

# Macedonia (with accented overrides)
TRANSLIT_DICT['mk'] = {
    'tolatin': MK_CYR_TO_LAT_DICT,
    'tocyrillic': MK_LAT_TO_CYR_DICT,
    'tolatin_accented': MK_CYR_TO_LAT_ACCENTED_DICT,
    'tocyrillic_accented': MK_LAT_TO_CYR_ACCENTED_DICT,
}

# Russian
TRANSLIT_DICT['ru'] = {
    'tolatin': RU_CYR_TO_LAT_DICT,
    'tocyrillic': RU_LAT_TO_CYR_DICT,
}

# Tajik
TRANSLIT_DICT['tj'] = {
    'tolatin': TJ_CYR_TO_LAT_DICT,
    'tocyrillic': TJ_LAT_TO_CYR_DICT,
}

# Bulgarian (with accented overrides)
TRANSLIT_DICT['bg'] = {
    'tolatin': BG_CYR_TO_LAT_DICT,
    'tocyrillic': BG_LAT_TO_CYR_DICT,
    'tolatin_accented': BG_CYR_TO_LAT_ACCENTED_DICT,
    'tocyrillic_accented': BG_LAT_TO_CYR_ACCENTED_DICT,
}

# Ukrainian
TRANSLIT_DICT['ua'] = {
    'tolatin': UA_CYR_TO_LAT_DICT,
    'tocyrillic': UA_LAT_TO_CYR_DICT,
}

# Belarusian
TRANSLIT_DICT['by'] = {
    'tolatin': BY_CYR_TO_LAT_DICT,
    'tocyrillic': BY_LAT_TO_CYR_DICT,
}

# Mongolian
TRANSLIT_DICT['mn'] = {
    'tolatin': MN_CYR_TO_LAT_DICT,
    'tocyrillic': MN_LAT_TO_CYR_DICT,
}

# Greek (ISO 639-1 language code). The Greek tables are stored under the same
# 'tolatin'/'tocyrillic' keys as the Cyrillic languages.
TRANSLIT_DICT['el'] = {
    'tolatin': EL_GRE_TO_LAT_DICT,
    'tocyrillic': EL_LAT_TO_GRE_DICT,
}

# Export the main dictionary for backward compatibility
__all__ = ['TRANSLIT_DICT']
98 | 


--------------------------------------------------------------------------------
/cyrtranslit/mapping/mk.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Macedonian (mk) transliteration mappings.
 4 | 
 5 | ISO 639-1 language code: mk
 6 | 
 7 | Supports accented vowels with grave accent used for homograph disambiguation:
 8 | - Ѐ/ѐ (U+0400/U+0450) - IE with grave
 9 | - Ѝ/ѝ (U+040D/U+045D) - I with grave
10 | 
11 | Following ISO 9:1968/1995, adopted by Macedonian Academy of Arts and Sciences (1970).
12 | """
13 | 
14 | import copy
15 | from .sr import SR_CYR_TO_LAT_DICT
16 | 
# Macedonian Cyrillic → Latin table.
# It starts as a deep copy of the Serbian table, so none of the edits below
# modify SR_CYR_TO_LAT_DICT itself.
MK_CYR_TO_LAT_DICT = copy.deepcopy(SR_CYR_TO_LAT_DICT)

# Serbian letters that Macedonian does not use:
# - Dje (Ђ ђ), replaced by Gje (Ѓ ѓ)
# - Tshe (Ћ ћ), replaced by Kje (Ќ ќ)
# pop() without a default raises KeyError for a missing key, just like `del`.
MK_CYR_TO_LAT_DICT.pop(u'Ђ')
MK_CYR_TO_LAT_DICT.pop(u'ђ')
MK_CYR_TO_LAT_DICT.pop(u'Ћ')
MK_CYR_TO_LAT_DICT.pop(u'ћ')

# Letters that Macedonian adds compared with Serbian.
MK_CYR_TO_LAT_DICT.update({
    # Dze (Ѕ ѕ) sits between Ze (З з) and I (И и), looks like the Latin
    # letter S and represents /d͡z/.
    u'Ѕ': u'Dz', u'ѕ': u'dz',
    # Gje (Ѓ ѓ) represents /ɟ/ (voiced palatal stop); in some dialects it
    # represents /d͡ʑ/ instead, like Dje. Written ⟨Ǵ ǵ⟩ in Macedonian Latin.
    u'Ѓ': u'Ǵ', u'ѓ': u'ǵ',
    # Kje (Ќ ќ) represents /c/ (voiceless palatal stop); in some dialects it
    # represents /t͡ɕ/ instead, like Tshe. Written ⟨Ḱ ḱ⟩ in Macedonian Latin.
    u'Ќ': u'Ḱ', u'ќ': u'ḱ',
})

# Latin → Macedonian Cyrillic table: the inverse of the table above.
# It is built BEFORE the accented Cyrillic letters are added below; otherwise
# plain "E"/"I" would be inverted to the accented Cyrillic letters.
MK_LAT_TO_CYR_DICT = dict(
    (latin, cyrillic) for cyrillic, latin in MK_CYR_TO_LAT_DICT.items()
)

# Vowels with grave accent, used to disambiguate homographs in Macedonian.
# Following ISO 9:1968/1995, adopted by the Macedonian Academy of Arts and
# Sciences in 1970.
# Source: https://en.wikipedia.org/wiki/I_with_grave_(Cyrillic)
# Examples:
# - ѝ (her) vs и (and)
# - нѐ (us) vs не (no)
# - сѐ (everything) vs се (short reflexive pronoun)
#
# Default behaviour (preserve_accents=False): the accent is dropped.
MK_CYR_TO_LAT_DICT.update({
    u'Ѐ': u'E',  # U+0400 → plain E
    u'ѐ': u'e',  # U+0450 → plain e
    u'Ѝ': u'I',  # U+040D → plain I
    u'ѝ': u'i',  # U+045D → plain i
})

# Overrides applied on top of the standard table when preserve_accents=True.
MK_CYR_TO_LAT_ACCENTED_DICT = {
    u'Ѐ': u'È', u'ѐ': u'è',
    u'Ѝ': u'Ì', u'ѝ': u'ì',
}

# Accented Latin → unaccented Cyrillic (preserve_accents=False).
MK_LAT_TO_CYR_DICT[u'È'] = u'Е'
MK_LAT_TO_CYR_DICT[u'è'] = u'е'
MK_LAT_TO_CYR_DICT[u'Ì'] = u'И'
MK_LAT_TO_CYR_DICT[u'ì'] = u'и'

# Overrides for Latin → Cyrillic when preserve_accents=True: the grave accent
# is carried over to Cyrillic Ѐ/ѐ and Ѝ/ѝ.
MK_LAT_TO_CYR_ACCENTED_DICT = {
    u'È': u'Ѐ', u'è': u'ѐ',
    u'Ì': u'Ѝ', u'ì': u'ѝ',
}
78 | 


--------------------------------------------------------------------------------
/cyrtranslit/cyrtranslit.py:
--------------------------------------------------------------------------------
  1 | import cyrtranslit
  2 | from cyrtranslit.mapping import TRANSLIT_DICT
  3 | from argparse import ArgumentParser, FileType
  4 | import os
  5 | import sys
  6 | 
  7 | def __is_valid_language_code(parse, arg):
  8 |     ''' Validates inputted two-letter language code.
  9 |     :param parse: The argument parser. Used to display error message.
 10 |     :param arg: The language code argument.
 11 |     '''
 12 |     if arg.lower() not in TRANSLIT_DICT:
 13 |         parser.error("The language code %s is not supported. Support language codes are: %s." % (arg, ", ".join(TRANSLIT_DICT.keys()).upper()))
 14 |     else:
 15 |         return arg
 16 | 
 17 | def main():
 18 |     # Setup argument parser
 19 |     parser = ArgumentParser(description="Transiliterate text in a given file.")
 20 | 
 21 |     # Input file.
 22 |     # Not required.
 23 |     parser.add_argument("-i", dest="input_file", required=False,
 24 |                         help="input file",
 25 |                         default=None)
 26 | 
 27 |     # Output file.
 28 |     # Not required. If not specified, transliteration will appear as console output.
 29 |     parser.add_argument("-o", dest="output_file", required=False,
 30 |                         help="ouput file",
 31 |                         default=None)
 32 | 
 33 |     # Language code for cyrillic text in inputted file.
 34 |     # Required.
 35 |     parser.add_argument("-l", dest="language_code", required=True,
 36 |                         help="two-letter ISO 639-1 language code of cyrillic text",
 37 |                         type=lambda x: __is_valid_language_code(parser, x))
 38 | 
 39 |     # Flag for reverse transliteration, i.e. from latin/roman alphabet to cyrillic.
 40 |     parser.add_argument("-c", dest="to_cyrillic", action='store_true',
 41 |                         help="Parse latin characters to cyrillic (reverse of transliteration)")
 42 | 
 43 |     # Flag to preserve accent marks in transliteration
 44 |     parser.add_argument("-p", "--preserve-accents", dest="preserve_accents", action='store_true',
 45 |                         help="Preserve accent marks (e.g., Macedonian/Bulgarian Ѐ→È, ѝ→ì instead of Ѐ→E, ѝ→i)")
 46 | 
 47 |     # Input file encoding.
 48 |     # Not required. Defaults to utf-8 with fallback to common Cyrillic encodings.
 49 |     parser.add_argument("-e", "--encoding", dest="encoding", required=False,
 50 |                         help="input file encoding (default: utf-8 with automatic fallback to windows-1251, iso-8859-5, koi8-r, cp866)",
 51 |                         default="utf-8")
 52 | 
 53 |     # Parse arguments
 54 |     args = parser.parse_args()
 55 | 
 56 |     # Fetch arguments.
 57 |     lang_code = args.language_code
 58 |     to_cyrillic = args.to_cyrillic
 59 |     preserve_accents = args.preserve_accents
 60 |     encoding = args.encoding
 61 | 
 62 |     # Open input file with proper encoding handling
 63 |     if args.input_file:
 64 |         # Try specified encoding first
 65 |         file_input = None
 66 |         tried_encodings = [encoding]
 67 | 
 68 |         # Helper function to test if an encoding works
 69 |         def try_encoding(filepath, enc):
 70 |             try:
 71 |                 f = open(filepath, 'r', encoding=enc)
 72 |                 # Try to read the file to actually test the encoding
 73 |                 f.read()
 74 |                 # If successful, reopen from the beginning
 75 |                 f.close()
 76 |                 return open(filepath, 'r', encoding=enc)
 77 |             except (UnicodeDecodeError, LookupError):
 78 |                 if f:
 79 |                     f.close()
 80 |                 return None
 81 | 
 82 |         file_input = try_encoding(args.input_file, encoding)
 83 | 
 84 |         if file_input is None:
 85 |             # If specified encoding fails, try common Cyrillic encodings as fallback
 86 |             fallback_encodings = ['windows-1251', 'iso-8859-5', 'koi8-r', 'cp866']
 87 | 
 88 |             for fallback_enc in fallback_encodings:
 89 |                 if fallback_enc == encoding:
 90 |                     continue  # Already tried this one
 91 |                 tried_encodings.append(fallback_enc)
 92 |                 file_input = try_encoding(args.input_file, fallback_enc)
 93 |                 if file_input is not None:
 94 |                     print(f"Warning: Failed to decode with {encoding}, using {fallback_enc} instead.", file=sys.stderr)
 95 |                     break
 96 | 
 97 |             if file_input is None:
 98 |                 print(f"Error: Unable to decode file with any of the attempted encodings: {', '.join(tried_encodings)}", file=sys.stderr)
 99 |                 print(f"Try specifying the correct encoding with -e/--encoding parameter.", file=sys.stderr)
100 |                 print(f"Common Cyrillic encodings: windows-1251, iso-8859-5, koi8-r, cp866", file=sys.stderr)
101 |                 sys.exit(1)
102 |     else:
103 |         file_input = sys.stdin
104 | 
105 |     # Open output file
106 |     if args.output_file:
107 |         file_output = open(args.output_file, 'w', encoding='utf-8')
108 |     else:
109 |         file_output = sys.stdout
110 | 
111 |     # Transliterate and write directly to output line by line
112 |     try:
113 |         for line in file_input:
114 |             if to_cyrillic is True:
115 |                 file_output.write(cyrtranslit.to_cyrillic(line, lang_code=lang_code, preserve_accents=preserve_accents))
116 |             else:
117 |                 file_output.write(cyrtranslit.to_latin(line, lang_code=lang_code, preserve_accents=preserve_accents))
118 |     finally:
119 |         # Close streams if they're not stdin/stdout
120 |         if args.input_file and file_input:
121 |             file_input.close()
122 |         if args.output_file and file_output:
123 |             file_output.close()
124 | 
125 | if __name__ == "__main__":
126 |     main()
127 | 


--------------------------------------------------------------------------------
/cyrtranslit/__init__.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .mapping import TRANSLIT_DICT
  3 | import sys
  4 | 
  5 | def __encode_utf8(_string):
  6 |     if sys.version_info < (3, 0):
  7 |         return _string.encode('utf-8')
  8 |     else:
  9 |         return _string
 10 | 
 11 | def __decode_utf8(_string):
 12 |     if sys.version_info < (3, 0):
 13 |         return _string.decode('utf-8')
 14 |     else:
 15 |         return _string
 16 | 
 17 | def to_latin(string_to_transliterate, lang_code='sr', preserve_accents=False):
 18 |     ''' Transliterate cyrillic string of characters to latin string of characters.
 19 |     :param string_to_transliterate: The cyrillic string to transliterate into latin characters.
 20 |     :param lang_code: Indicates the cyrillic language code we are translating from. Defaults to Serbian (sr).
 21 |     :param preserve_accents: If False (default), uses standard mappings (accented Cyrillic → unaccented Latin, e.g., Ѐ→E, ѝ→i).
 22 |                              If True, merges accented mappings (accented Cyrillic → accented Latin, e.g., Ѐ→È, ѝ→ì).
 23 |     :return: A string of latin characters transliterated from the given cyrillic string.
 24 |     '''
 25 | 
 26 |     # First check if we support the cyrillic alphabet we want to transliterate to latin.
 27 |     if lang_code.lower() not in TRANSLIT_DICT:
 28 |         # If we don't support it, then just return the original string.
 29 |         return string_to_transliterate
 30 | 
 31 |     # If we do support it, check if the implementation is not missing before proceeding.
 32 |     elif not TRANSLIT_DICT[lang_code.lower()]['tolatin']:
 33 |         return string_to_transliterate
 34 | 
 35 |     # Everything checks out, proceed with transliteration.
 36 |     else:
 37 | 
 38 |         # Get the character per character transliteration dictionary
 39 |         transliteration_dict = TRANSLIT_DICT[lang_code.lower()]['tolatin'].copy()
 40 | 
 41 |         # If preserve_accents=True and accented mappings exist, merge them (accented overrides standard)
 42 |         if preserve_accents and 'tolatin_accented' in TRANSLIT_DICT[lang_code.lower()]:
 43 |             transliteration_dict.update(TRANSLIT_DICT[lang_code.lower()]['tolatin_accented'])
 44 | 
 45 |         # Initialize the output latin string variable
 46 |         latinized_str = ''
 47 | 
 48 |         # Transliterate by traversing the input string character by character.
 49 |         string_to_transliterate = __decode_utf8(string_to_transliterate)
 50 | 
 51 | 
 52 |         for c in string_to_transliterate:
 53 | 
 54 |             # If character is in dictionary, it means it's a cyrillic so let's transliterate that character.
 55 |             if c in transliteration_dict:
 56 |                 # Transliterate current character.
 57 |                 latinized_str += transliteration_dict[c]
 58 | 
 59 |             # If character is not in character transliteration dictionary,
 60 |             # it is most likely a number or a special character so just keep it.
 61 |             else:
 62 |                 latinized_str += c
 63 | 
 64 |         # Return the transliterated string.
 65 |         return __encode_utf8(latinized_str)
 66 | 
 67 | 
 68 | def to_cyrillic(string_to_transliterate, lang_code='sr', preserve_accents=False):
 69 |     ''' Transliterate latin string of characters to cyrillic string of characters.
 70 |     :param string_to_transliterate: The latin string to transliterate into cyrillic characters.
 71 |     :param lang_code: Indicates the cyrillic language code we are translating to. Defaults to Serbian (sr).
 72 |     :param preserve_accents: If False (default), uses standard mappings (accented Latin → unaccented Cyrillic, e.g., È→Е, ì→и).
 73 |                              If True, merges accented mappings (accented Latin → accented Cyrillic, e.g., È→Ѐ, ì→ѝ).
 74 |     :return: A string of cyrillic characters transliterated from the given latin string.
 75 |     '''
 76 | 
 77 |     # First check if we support the cyrillic alphabet we want to transliterate to latin.
 78 |     if lang_code.lower() not in TRANSLIT_DICT:
 79 |         # If we don't support it, then just return the original string.
 80 |         return string_to_transliterate
 81 | 
 82 |     # If we do support it, check if the implementation is not missing before proceeding.
 83 |     elif not TRANSLIT_DICT[lang_code.lower()]['tocyrillic']:
 84 |         return string_to_transliterate
 85 | 
 86 |     else:
 87 |         # Get the character per character transliteration dictionary
 88 |         transliteration_dict = TRANSLIT_DICT[lang_code.lower()]['tocyrillic'].copy()
 89 | 
 90 |         # If preserve_accents=True and accented mappings exist, merge them (accented overrides standard)
 91 |         if preserve_accents and 'tocyrillic_accented' in TRANSLIT_DICT[lang_code.lower()]:
 92 |             transliteration_dict.update(TRANSLIT_DICT[lang_code.lower()]['tocyrillic_accented'])
 93 | 
 94 |         # Initialize the output cyrillic string variable
 95 |         cyrillic_str = ''
 96 | 
 97 |         string_to_transliterate = __decode_utf8(string_to_transliterate)
 98 | 
 99 |         # Transliterate by traversing the inputted string character by character.
100 |         length_of_string_to_transliterate = len(string_to_transliterate)
101 |         index = 0
102 | 
103 |         while index < length_of_string_to_transliterate:
104 |             # Grab a character from the string at the current index
105 |             c = string_to_transliterate[index]
106 | 
107 |             # Watch out for Lj and lj. Don't want to interpret Lj/lj as L/l and j.
108 |             # Watch out for Nj and nj. Don't want to interpret Nj/nj as N/n and j.
109 |             # Watch out for Dž and and dž. Don't want to interpret Dž/dž as D/d and j.
110 |             c_plus_1 = u''
111 |             if index != length_of_string_to_transliterate - 1:
112 |                 c_plus_1 = string_to_transliterate[index + 1]
113 | 
114 |             c_plus_2 = u''
115 |             if index + 2 <= length_of_string_to_transliterate - 1:
116 |                 c_plus_2 = string_to_transliterate[index + 2]
117 | 
118 |             if ((c == u'L' or c == u'l') and c_plus_1 == u'j') or \
119 |                ((c == u'N' or c == u'n') and c_plus_1 == u'j') or \
120 |                ((c == u'D' or c == u'd') and c_plus_1 == u'ž') or \
121 |                (lang_code == 'mk' and (c == u'D' or c == u'd') and c_plus_1 == u'z') or \
122 |                (lang_code == 'bg' and (
123 |                    (c in u'Zz' and c_plus_1 in u'Hh') or # Zh, zh
124 |                    (c in u'Tt' and c_plus_1 in u'Ss') or # Ts, ts
125 |                    (c in u'Ss' and c_plus_1 in u'Hh') or # Sh, sh (and also covers Sht, sht)
126 |                    (c in u'Cc' and c_plus_1 in u'Hh') or # Ch, ch
127 |                    (c in u'Yy' and c_plus_1 in u'Uu') or # Yu, yu
128 |                    (c in u'Yy' and c_plus_1 in u'Aa') # Ya, ya
129 |                 )) or \
130 |                (lang_code == 'ru' and (
131 |                     (c in u'Cc' and c_plus_1 in u'HhKkZz') or  # c, ch, ck, cz
132 |                     (c in u'Tt' and c_plus_1 in u'Hh') or  # th
133 |                     (c in u'Ww' and c_plus_1 in u'Hh') or  # wh
134 |                     (c in u'Pp' and c_plus_1 in u'Hh') or  # ph
135 |                     (c in u'Ee' and c_plus_1 == u'\'') or  # e'
136 | 
137 |                     (c == u'i'  and c_plus_1 == u'y' and
138 |                      string_to_transliterate[index + 2:index + 3] not in u'aou') or  # iy[^AaOoUu]
139 |                     (c in u'Jj' and c_plus_1 in u'UuAaEeIiOo') or  # j, ju, ja, je, ji, jo
140 |                     (c in u'Ss' and c_plus_1 in u'HhZz') or  # s, sh, sz
141 |                     (c in u'Yy' and c_plus_1 in u'AaOoUuEeIi\'') or  # y, ya, yo, yu, ye, yi, y'
142 |                     (c in u'Zz' and c_plus_1 in u'Hh') or  # z, zh
143 |                     (c == u'\'' and c_plus_1 == u'\'')  # ''
144 |                )) or \
145 |                (lang_code == 'ua' and (
146 |                     (c in u'Jj' and c_plus_1 in u'eEaAuUiI') or # je, ja, ju
147 |                     (c in u'Šš' and c_plus_1 in u'č')      # šč
148 |                 )) or \
149 |                (lang_code == 'by' and (
150 |                     (c in u'Jj' and c_plus_1 in u'uUaA') or   # ju, ja
151 |                     (c == u'\'' and c_plus_1 == u'\'')         # '' for Ьь
152 |                 )) or \
153 |                (lang_code == "mn" and (
154 |                     (c in u'Kk' and c_plus_1 in u'Hh') or  # Х х
155 |                     (c in u'Ss' and c_plus_1 in u'Hh') or  # Ш ш
156 |                     (c in u'Tt' and c_plus_1 in u'Ss') or  # Ц ц
157 |                     (c in u'Cc' and c_plus_1 in u'Hh') or  # Ч ч
158 |                     (c in u'Yy' and c_plus_1 in u'EeOoUuAa')  # Е Ё Ю Я
159 |                 )) or \
160 |                (lang_code == "el" and (
161 |                     (c in u'Tt' and c_plus_1 in u'Hh') or  # Θ θ - Theta
162 |                     (c in u'Cc' and c_plus_1 in u'Hh') or  # Χ χ - Chi
163 |                     (c in u'Pp' and c_plus_1 in u'Ss')     # Ψ ψ - Psi
164 |                 )):
165 |                 index += 1
166 |                 c += c_plus_1
167 | 
168 |                 # In Bulgarian, the letter "щ" is represented by three latin letters: "sht", 
169 |                 # so we need this logic to support the third latin letter
170 |                 if lang_code == 'bg' and \
171 |                         index + 2 <= length_of_string_to_transliterate - 1 and \
172 |                         (c == 'sh' or c == 'Sh' or c == 'SH') and \
173 |                         string_to_transliterate[index + 1] in u'Tt':
174 |                     index += 1
175 |                     c += string_to_transliterate[index]
176 |                     
177 |                 # Similarly in Russian, the letter "щ" шы represented by "shh".
178 |                 if lang_code == 'ru' and \
179 |                         index + 2 <= length_of_string_to_transliterate - 1 and \
180 |                         (c == u'sh' or c == 'Sh' or c == 'SH') and \
181 |                         string_to_transliterate[index + 1] in u'Hh':  # shh
182 |                     index += 1
183 |                     c += string_to_transliterate[index]
184 | 
185 |                 # In Mongolia the begining of if statement is not the truth
186 |                 #                ((c == u'L' or c == u'l') and c_plus_1 == u'j') or \
187 |                 #                ((c == u'N' or c == u'n') and c_plus_1 == u'j') or \
188 |                 #                ((c == u'D' or c == u'd') and c_plus_1 == u'ž') or \
189 |                 # Sü(nj)idmaa -> Сүнжидмаагаа  not  Сүnjидмаа
190 |                 # I add post-processing , wonder if @georgeslabreche would like to change the old code, thx
191 |                 if lang_code == 'mn' and c in [u'Lj', u'lj', u'Nj', u'nj']:
192 |                     index -= 1
193 |                     c = c[:-1]
194 | 
195 |             # If character is in dictionary, it means it's a cyrillic so let's transliterate that character.
196 |             if c in transliteration_dict:
197 |                 # ay, ey, iy, oy, uy
198 |                 if lang_code == 'ru' and c in u'Yy' and \
199 |                         cyrillic_str and cyrillic_str[-1].lower() in u"аеиоуэя":
200 |                     cyrillic_str += u"й" if c == u'y' else u"Й"
201 |                 else:
202 |                     # Transliterate current character.
203 |                     cyrillic_str += transliteration_dict[c]
204 | 
205 |             # If character is not in character transliteration dictionary,
206 |             # it is most likely a number or a special character so just keep it.
207 |             else:
208 |                 cyrillic_str += c
209 | 
210 |             index += 1
211 | 
212 |         return __encode_utf8(cyrillic_str)
213 | 
214 |       
def supported():
    ''' Lists the language codes that have an entry in TRANSLIT_DICT.
    :return: The language codes as a new list, sorted alphabetically.
    '''
    # Iterating a dict yields its keys, so this sorts the language codes.
    language_codes = sorted(TRANSLIT_DICT)
    return language_codes
220 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17663256.svg)](https://doi.org/10.5281/zenodo.17663256)
  2 | 
  3 | ## What is CyrTranslit?
  4 | 
  5 | A Python package for bi-directional transliteration of Cyrillic script to Latin script and vice versa.
  6 | 
  7 | By default, transliterates for the Serbian language. A language flag can be set in order to transliterate to and from Belarusian, Bulgarian, Greek, Montenegrin, Macedonian, Mongolian, Russian, Serbian, Tajik, and Ukrainian.
  8 | 
  9 | **Note:** Greek is also supported. While Greek uses its own alphabet and is not Cyrillic, it has been included due to user demand and shared transliteration needs.
 10 | 
 11 | ## What is transliteration?
 12 | 
 13 | Transliteration is the conversion of a text from one script to another. For instance, a Latin alphabet transliteration of the Serbian phrase _"Мој ховеркрафт је пун јегуља"_ is _"Moj hoverkraft je pun jegulja"_.
 14 | 
 15 | ## Citation
 16 | 
 17 | A citation would be much appreciated if you use CyrTranslit in a research publication:
 18 | 
 19 | [Georges Labrèche. (2025). CyrTranslit (1.2.0). Zenodo. https://doi.org/10.5281/zenodo.17663256](https://doi.org/10.5281/zenodo.17663256)
 20 | 
 21 | BibTex entry:
 22 | ```bibtex
 23 | @software{georges_labreche_nov2025,
 24 |   author       = {Georges Labrèche},
 25 |   title        = {CyrTranslit},
 26 |   month        = nov,
 27 |   year         = 2025,
 28 |   note         = {{A Python package for bi-directional 
 29 |                    transliteration of Cyrillic script to Latin script
 30 |                    and vice versa. Supports transliteration for Belarusian, 
 31 |                    Bulgarian, Greek, Montenegrin, Macedonian, Mongolian,
 32 |                    Russian, Serbian, Tajik, and Ukrainian.}},
 33 |   publisher    = {Zenodo},
 34 |   version      = {1.2.0},
 35 |   doi          = {10.5281/zenodo.17663256},
 36 |   url          = {https://doi.org/10.5281/zenodo.17663256}
 37 | }
 38 | ```
 39 | 
 40 | 
 41 | ## Advancing research
 42 | 
 43 | CyrTranslit is actively used as a reliable tool to advance research! Here's an incomplete list of publications for research projects that have relied on CyrTranslit:
 44 | 
 45 | ### Text Normalization, Unicode Perturbations & Robustness
 46 | 
 47 | - Cooper, Portia, Blanco, Eduardo, and Surdeanu, Mihai. (2025). "[The Lies Characters Tell: Utilizing Large Language Models to Normalize Adversarial Unicode Perturbations](https://aclanthology.org/2025.findings-acl.969.pdf)," *Findings of the Association for Computational Linguistics: ACL 2025*.
 48 | 
 49 | - Cooper, Portia, Surdeanu, Mihai, and Blanco, Eduardo. (2023). "[Hiding in Plain Sight: Tweets with Hate Speech Masked by Homoglyphs](https://aclanthology.org/2023.findings-emnlp.192.pdf)," *Findings of the Association for Computational Linguistics: EMNLP 2023*.
 50 | 
 51 | 
 52 | ### Low-Resource NLP & Machine Translation
 53 | 
 54 | - Cvetanović, Aleksa and Tadić, Predrag. (2024). "[Synthetic Dataset Creation and Fine-Tuning of Transformer Models for Question Answering in Serbian](https://arxiv.org/pdf/2404.08617)," arXiv:2404.08617.
 55 | 
 56 | - Lakew, Surafel Melaku. (2020). "[Multilingual Neural Machine Translation for Low Resource Languages](https://surafelml.github.io/phd-thesis/)," PhD Thesis, University of Trento.
 57 | 
 58 | - Filo, Denis. (2020). "[Neuronový strojový překlad pro jazykové páry s malým množstvím trénovacích dat: Low-Resource Neural Machine Translation](https://www.fit.vut.cz/study/thesis/23087/.en)," Master's Thesis, Brno University of Technology.
 59 | 
 60 | - Lakew, Surafel Melaku, Erofeeva, Aliia, and Federico, Marcello. (2018). "[Neural Machine Translation into Language Varieties](https://aclanthology.org/W18-6316/)," *Proceedings of the Third Conference on Machine Translation (WMT 2018)*.
 61 | 
 62 | 
 63 | ### Serbian Language NLP (Topic Modeling, Sentiment, Lexicons, QA, Abuse Detection)
 64 | 
 65 | - Medvecki, Darija, Bašaragin, Bojana, Ljajić, Adela, and Milošević, Nikola. (2024). "[Multilingual transformer and BERTopic for short text topic modeling: The case of Serbian](https://doi.org/10.1007/978-3-031-50755-7_16)," *Lecture Notes in Networks and Systems* 872:159-169, Springer.
 66 | 
 67 | - Bogdanović, Miloš, Kocić, Jelena, and Stoimenov, Leonid. (2024). "[SRBerta—A Transformer Language Model for Serbian Cyrillic Legal Texts](https://doi.org/10.3390/info15020074)," *Information* 15(2):74.
 68 | 
 69 | - Košprdić, Miloš, Prodanović, Nikola, Ljajić, Adela, Bašaragin, Bojana, and Milošević, Nikola. (2024). "[From Zero to Hero: Harnessing Transformers for Biomedical Named Entity Recognition in Zero- and Few-shot Contexts](https://doi.org/10.1016/j.artmed.2024.102970)," *Artificial Intelligence in Medicine* 157:102970.
 70 | 
 71 | - Ljajić, Adela, Prodanović, Nikola, Medvecki, Darija, Bašaragin, Bojana, and Mitrović, Jelena. (2022). "[Uncovering the Reasons Behind COVID-19 Vaccine Hesitancy in Serbia: Sentiment-Based Topic Modeling](https://doi.org/10.2196/42261)," *Journal of Medical Internet Research* 24(11):e42261.
 72 | 
 73 | - Ljajić, Adela, Prodanović, Nikola, Medvecki, Darija, Bašaragin, Bojana, and Mitrović, Jelena. (2022). "[Topic Modeling Technique on Covid19 Tweets in Serbian](https://www.researchgate.net/publication/364302202_Topic_Modeling_Technique_on_Covid19_Tweets_in_Serbian)," *Proceedings of the 12th International Conference on Information Society and Technology (ICIST 2022)*.
 74 | 
 75 | - Jokic, Danka, Stanković, Ranka, Krstev, Cvetana, and Šandrih Todorović, Branislava. (2021). "[A Twitter Corpus and Lexicon for Abusive Speech Detection in Serbian](https://drops.dagstuhl.de/opus/volltexte/2021/14549/)," *Proceedings of the 3rd Conference on Language, Data and Knowledge (LDK 2021)*.
 76 | 
 77 | - Batanović, Vuk and Nikolic, Bosko. (2019). "[Using Language Technologies to Automate the UNDP Rapid Integrated Assessment Mechanism in Serbian](https://www.researchgate.net/publication/339615659_Using_Language_Technologies_to_Automate_the_UNDP_Rapid_Integrated_Assessment_Mechanism_in_Serbian)," *Proceedings of the Conference on Language Technologies for All (LT4All)*.
 78 | 
 79 | - Ljajić, Adela and Marovac, Ulfeta. (2018). "[Improving sentiment analysis for twitter data by handling negation rules in the Serbian language](http://www.doiserbia.nb.rs/Article.aspx?ID=1820-02141800013L)," *Computer Science and Information Systems* 16(1):13-33.
 80 | 
 81 | 
 82 | ### NLP Applications for Society, Government, and Political Analysis
 83 | 
 84 | - Paula, Katrin and Scholz, Nele. (2025). "[Where do regimes rally their supporters? The geographical distribution of pro-government mobilization in Russia from February to April 2022](https://www.sciencedirect.com/science/article/pii/S096262982500068X)," *Political Geography* 116:103277.
 85 | 
 86 | 
 87 | ### Engineering, Software Systems, and Backend Development
 88 | 
 89 | - Alyoshin, S.P., Borodina, E.A., Hafiiak, A.M., Zhabran, I.B., and Kikot, A.S. (2019). "[Developing Q-Orca site backend using various Python programming language libraries](https://reposit.nupp.edu.ua/bitstream/PoltNTU/5811/1/ME%26IT_Part%203_P%2048_March%202019_Aleshin_Borodina_Hafiiak_Zhabran_Kikot%20%28pdf.io%29.pdf)," *Modern Engineering and Innovative Technologies* 3(7-3):48-53.
 90 | 
 91 | 
 92 | ### Proceedings, Collections, and Meta-Documents
 93 | 
 94 | - LDK. (2021). "[Complete Volume: Proceedings of the 3rd Conference on Language, Data and Knowledge (LDK 2021)](http://dagstuhl.sunsite.rwth-aachen.de/volltexte/2021/14535/pdf/oasics-vol093-ldk2021-complete.pdf)," *OASIcs* Vol. 93.
 95 | 
 96 | - Brown, J. M. M., Schmidt, Andreas, and Wierzba, Marta (Eds.). (2019). "[Of trees and birds: A Festschrift for Gisbert Fanselow](https://publishup.uni-potsdam.de/opus4-ubp/frontdoor/deliver/index/docId/42654/file/of_trees_and_birds.pdf)," Universitätsverlag Potsdam.
 97 | 
 98 | 
 99 | ### Addresses, Geocoding, and NLP
100 | 
101 | - Mussylmanbay, Meiirgali. (2022). "[Addresses Standardization and Geocoding using Natural Language Processing](https://nur.nu.edu.kz/handle/123456789/6705)," Master's Thesis, Nazarbayev University.
102 | 
103 | 
104 | ## How do I install this?
105 | 
106 | CyrTranslit is [hosted in the Python Package Index (PyPI)](https://pypi.python.org/pypi/cyrtranslit) so it can be installed using pip:
107 | ```
108 | python3 -m pip install cyrtranslit         # latest version
109 | python3 -m pip install cyrtranslit==1.2.0  # specific version
110 | python3 -m pip install "cyrtranslit>=1.2.0"  # minimum version (quoted so the shell does not treat >= as a redirect)
111 | ```
112 | 
113 | ## What languages are supported?
114 | 
115 | CyrTranslit currently supports bi-directional transliteration of Belarusian, Bulgarian, Greek, Montenegrin, Macedonian, Mongolian, Russian, Serbian, Tajik, and Ukrainian.
116 | 
117 | Language codes are a mix of ISO 639-1 language codes (e.g., `bg`, `ru`, `sr`) and ISO 3166-1 country codes (e.g., `by`, `ua`). For Serbian, both `sr` (ISO 639-1 language code) and `rs` (ISO 3166-1 country code) are accepted:
118 | ```python
119 | >>> import cyrtranslit
120 | >>> cyrtranslit.supported()
121 | ['bg', 'by', 'el', 'me', 'mk', 'mn', 'rs', 'ru', 'sr', 'tj', 'ua']
122 | ```
123 | 
124 | ## How do I use this? 
125 | 
126 | CyrTranslit can be used both programmatically and via the command line interface.
127 | 
128 | ### Programmatically
129 | 
130 | #### Belarusian
131 | 
132 | ```python
133 | >>> import cyrtranslit
134 | >>> cyrtranslit.to_latin("Прывітанне, свет!", "by")
135 | "Pryvitanne, svet!"
136 | >>> cyrtranslit.to_cyrillic("Pryvitanne, svet!", "by")
137 | "Прывітанне, свет!"
138 | ```
139 | 
140 | #### Bulgarian
141 | 
142 | ```python
143 | >>> import cyrtranslit
144 | >>> cyrtranslit.to_latin("Съединението прави силата!", "bg")
145 | "Săedinenieto pravi silata!"
146 | >>> cyrtranslit.to_cyrillic("Săedinenieto pravi silata!", "bg")
147 | "Съединението прави силата!"
148 | ```
149 | 
150 | #### Greek
151 | 
152 | ```python
153 | >>> import cyrtranslit
154 | >>> cyrtranslit.to_latin("Το χόβερκραφτ μου είναι γεμάτο χέλια", "el")
155 | "To choverkraft moy einai gemato chelia"
156 | >>> cyrtranslit.to_cyrillic("To choverkraft moy einai gemato chelia", "el")
157 | "Το χόβερκραφτ μου είναι γεμάτο χέλια"
158 | ```
159 | 
160 | #### Montenegrin
161 | 
162 | ```python
163 | >>> import cyrtranslit
164 | >>> cyrtranslit.to_latin("Република", "me")
165 | "Republika"
166 | >>> cyrtranslit.to_cyrillic("Republika", "me")
167 | "Република"
168 | ```
169 | 
170 | #### Macedonian
171 | 
172 | ```python
173 | >>> import cyrtranslit
174 | >>> cyrtranslit.to_latin("Моето летачко возило е полно со јагули", "mk")
175 | "Moeto letačko vozilo e polno so jaguli"
176 | >>> cyrtranslit.to_cyrillic("Moeto letačko vozilo e polno so jaguli", "mk")
177 | "Моето летачко возило е полно со јагули"
178 | ```
179 | 
180 | #### Mongolian
181 | 
182 | ```python
183 | >>> import cyrtranslit
184 | >>> cyrtranslit.to_latin("Амрагаа Сүнжидмаагаа гэсээр ирлээ дээ хө-хө-хө", "mn")
185 | "Amragaa Sünjidmaagaa geseer irlee dee khö-khö-khö"
186 | >>> cyrtranslit.to_cyrillic("Amragaa Sünjidmaagaa geseer irlee dee khö-khö-khö", "mn")
187 | "Амрагаа Сүнжидмаагаа гэсээр ирлээ дээ хө-хө-хө"
188 | ```
189 | 
190 | #### Russian
191 | 
192 | ```python
193 | >>> import cyrtranslit
194 | >>> cyrtranslit.to_latin("Моё судно на воздушной подушке полно угрей", "ru")
195 | "Moyo sudno na vozdushnoj podushke polno ugrej"
196 | >>> cyrtranslit.to_cyrillic("Moyo sudno na vozdushnoj podushke polno ugrej", "ru")
197 | "Моё судно на воздушной подушке полно угрей"
198 | ```
199 | 
200 | #### Serbian
201 | 
202 | ```python
203 | >>> import cyrtranslit
204 | >>> cyrtranslit.to_latin("Мој ховеркрафт је пун јегуља")
205 | "Moj hoverkraft je pun jegulja"
206 | >>> cyrtranslit.to_cyrillic("Moj hoverkraft je pun jegulja")
207 | "Мој ховеркрафт је пун јегуља"
208 | ```
209 | 
210 | #### Tajik
211 | 
212 | ```python
213 | >>> import cyrtranslit
214 | >>> cyrtranslit.to_latin("Ман мактуб навишта истодам", "tj")
215 | "Man maktub navišta istodam"
216 | >>> cyrtranslit.to_cyrillic("Man maktub navišta istodam", "tj")
217 | "Ман мактуб навишта истодам"
218 | ```
219 | 
220 | #### Ukrainian
221 | 
222 | ```python
223 | >>> import cyrtranslit
224 | >>> cyrtranslit.to_latin("Під лежачий камінь вода не тече", "ua")
225 | "Pid ležačyj kamin' voda ne teče"
226 | >>> cyrtranslit.to_cyrillic("Pid ležačyj kamin' voda ne teče", "ua")
227 | "Під лежачий камінь вода не тече"
228 | ```
229 | 
230 | ### Accented Characters (Macedonian & Bulgarian)
231 | 
232 | CyrTranslit supports Cyrillic characters with grave accents used in Macedonian and Bulgarian for homograph disambiguation and stress marking. By default, accents are stripped during transliteration for cleaner output. Use the `preserve_accents` parameter to preserve them.
233 | 
234 | #### Supported Accented Characters
235 | 
236 | **Macedonian:**
237 | - **Ѐ/ѐ** (U+0400/U+0450) - Cyrillic IE with grave
238 |   - **Purpose:** Distinguishes homographs (e.g., нѐ "us" vs не "no", сѐ "everything" vs се "reflexive pronoun")
239 |   - **Standard:** ISO 9:1968/1995, adopted by Macedonian Academy of Arts and Sciences (1970)
240 | 
241 | - **Ѝ/ѝ** (U+040D/U+045D) - Cyrillic I with grave
242 |   - **Purpose:** Distinguishes homographs (e.g., ѝ "her" vs и "and")
243 |   - **Standard:** ISO 9:1968/1995
244 | 
245 | **Bulgarian:**
246 | - **Ѝ/ѝ** (U+040D/U+045D) - Cyrillic I with grave
247 |   - **Purpose:** Stress marking and homograph disambiguation (e.g., ѝ "her" vs и "and")
248 |   - **Standard:** ISO 9:1995
249 | 
250 | **Sources:**
251 | - ISO 9:1995 - Information and documentation — Transliteration of Cyrillic characters into Latin characters
252 | - [Wikipedia: I with grave (Cyrillic)](https://en.wikipedia.org/wiki/I_with_grave_(Cyrillic))
253 | - [Wikipedia: Ye with grave](https://en.wikipedia.org/wiki/Ye_with_grave)
254 | 
255 | #### Usage Examples
256 | 
257 | **Default behavior (accents stripped):**
258 | 
259 | ```python
260 | >>> import cyrtranslit
261 | >>> cyrtranslit.to_latin("ѝ је", "mk")
262 | "i je"
263 | >>> cyrtranslit.to_latin("нѐ сме", "mk")
264 | "ne sme"
265 | >>> cyrtranslit.to_cyrillic("i je", "mk")
266 | "и је"
267 | ```
268 | 
269 | **With accents preserved:**
270 | 
271 | ```python
272 | >>> import cyrtranslit
273 | >>> cyrtranslit.to_latin("ѝ је", "mk", preserve_accents=True)
274 | "ì je"
275 | >>> cyrtranslit.to_latin("нѐ сме", "mk", preserve_accents=True)
276 | "nè sme"
277 | >>> cyrtranslit.to_cyrillic("ì je", "mk", preserve_accents=True)
278 | "ѝ је"
279 | >>> cyrtranslit.to_cyrillic("nè sme", "mk", preserve_accents=True)
280 | "нѐ сме"
281 | ```
282 | 
283 | **Command-line usage:**
284 | 
285 | ```bash
286 | # Default (accents stripped)
287 | $ echo "ѝ је" | cyrtranslit -l mk
288 | i je
289 | 
290 | # Preserve accents
291 | $ echo "ѝ је" | cyrtranslit -l mk --preserve-accents
292 | ì je
293 | ```
294 | 
295 | ## Command Line Interface
296 | 
297 | Sample command line call to transliterate a Russian text file:
298 | ```bash
299 | $ cyrtranslit -l RU -i tests/ru.txt -o tests/output.txt
300 | ```
301 | 
302 | Use the `-c` argument to do the reverse, that is, to read Latin characters and output Cyrillic.
303 | 
304 | Use the -h argument for help.
305 | 
306 | You can also omit the input and output files and use standard input/output:
307 | ```bash
308 | $ echo 'Мој ховеркрафт је пун јегуља' | cyrtranslit -l sr
309 | Moj hoverkraft je pun jegulja
310 | $ echo 'Moj hoverkraft je pun jegulja' | cyrtranslit -l sr -c
311 | Мој ховеркрафт је пун јегуља
312 | ```
313 | 
314 | ### File Encodings
315 | 
316 | By default, input files are expected to be UTF-8. For files with different encodings, use the `-e/--encoding` parameter:
317 | 
318 | ```bash
319 | $ cyrtranslit -l BG -i file.txt -e windows-1251
320 | ```
321 | 
322 | If no encoding is specified and decoding fails with the default UTF-8, then CyrTranslit automatically tries the following common Cyrillic encodings: windows-1251, iso-8859-5, koi8-r, and cp866.
323 | 
324 | Try CyrTranslit by running it directly on the Python command line interface, e.g.:
325 | ```python
326 | >>> import sys
327 | >>> import cyrtranslit.cyrtranslit
328 | >>> sys.argv.extend(['-l', 'UA'])
329 | >>> sys.argv.extend(['-i', 'tests/ua.txt'])
330 | >>> sys.argv.extend(['-o', 'tests/output.txt'])
331 | >>> cyrtranslit.cyrtranslit.main()
332 | >>> exit()
333 | ```
334 | 
335 | 
336 | ## How can I contribute?
337 | 
338 | Include support for other Cyrillic script alphabets. Follow these steps in order to do so:
339 | 
340 | 1. Create a new transliteration mapping file in the **[mapping/](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping/)** directory (using the language code as the filename, e.g., `xx.py`) and reference it in the _**[TRANSLIT\_DICT](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping/__init__.py)**_ dictionary in **mapping/\_\_init\_\_.py**. If the language uses accented characters (like Macedonian and Bulgarian), create separate accented dictionaries (e.g., `XX_CYR_TO_LAT_ACCENTED_DICT`) following the pattern in **[mk.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping/mk.py)** or **[bg.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping/bg.py)**.
341 | 2. Watch out for cases where two consecutive Latin alphabet letters are meant to transliterate into a single Cyrillic script letter. These cases need to be explicitly checked for inside the **to_cyrillic()** function in **[\_\_init\_\_.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/__init__.py)**.
342 | 3. Add test cases inside of **[tests.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/tests.py)**.
343 | 4. Add test CLI input files in the **[tests](https://github.com/opendatakosovo/cyrillic-transliteration/tree/master/tests)** directory.
344 | 5. Update the documentation in the **[README.md](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/README.md)**.
345 | 6. List yourself as one of the contributors.
346 | 
347 | Before tagging a release version and deploying to [PyPI](https://pypi.org/):
348 | 
349 | 1. Update the `version` and `download_url` properties in [setup.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/setup.py).
350 | 2. [Reserve a Zenodo DOI](https://cassgvp.github.io/github-for-collaborative-documentation/docs/tut/6-Zenodo-integration.html) for the release and update this readme's Zenodo badge and [citation instructions](https://github.com/opendatakosovo/cyrillic-transliteration#citation).
351 | 
352 | A big thank you to everyone who contributed:
353 | 
354 | - Bulgarian 🇧🇬: [@Syndamia](https://github.com/Syndamia) and [@Sparkycz](https://github.com/Sparkycz).
355 | - Russian 🇷🇺: [@ratijas](https://github.com/ratijas) and [@rominf](https://github.com/rominf).
356 | - Tajik 🇹🇯: [@diejani](https://github.com/diejani).
357 | - Ukrainian 🇺🇦: [@AnonymousVoice1](https://github.com/AnonymousVoice1).
358 | - Mongolian 🇲🇳: [@Serbipunk](https://github.com/Serbipunk).
359 | - Command Line Interface (CLI): [@ZJaume](https://github.com/ZJaume).


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import unittest
  3 | import cyrtranslit
  4 | 
  5 | # Test fixtures: each alphabet string is paired with the Latin string the tests below expect for it; the remaining strings are expected to pass through unchanged.
  6 | serbian_alphabet_cyrillic = 'АаБбВвГгДдЂђЕеЖжЗзИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШш'
  7 | serbian_alphabet_latin = 'AaBbVvGgDdĐđEeŽžZzIiJjKkLlLjljMmNnNjnjOoPpRrSsTtĆćUuFfHhCcČčDždžŠš'
  8 | 
  9 | montenegrin_alphabet_cyrillic = 'АаБбВвГгДдЂђЕеЖжЗзЗ́з́ИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШшС́с́'
 10 | montenegrin_alphabet_latin = 'AaBbVvGgDdĐđEeŽžZzŹźIiJjKkLlLjljMmNnNjnjOoPpRrSsTtĆćUuFfHhCcČčDždžŠšŚś'
 11 | 
 12 | macedonian_alphabet_cyrillic = 'АаБбВвГгДдЃѓЕеЖжЗзЅѕИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЌќУуФфХхЦцЧчЏџШш'
 13 | macedonian_alphabet_latin = 'AaBbVvGgDdǴǵEeŽžZzDzdzIiJjKkLlLjljMmNnNjnjOoPpRrSsTtḰḱUuFfHhCcČčDždžŠš'
 14 | 
 15 | russian_alphabet_cyrillic = 'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыьЭэЮюЯя'
 16 | russian_alphabet_latin = 'AaBbVvGgDdEeYOyoZHzhZzIiJjKkLlMmNnOoPpRrSsTtUuFfHhCZczCHchSHshSHHshh\'\'\'\'Y\'y\'\'E\'e\'YuyuYaya'
 17 | 
 18 | tajik_alphabet_cyrillic = 'АаБбВвГгҒғДдЕеЁёЖжЗзИиӢӣЙйКкЛлМмНнОоПпРрСсТтУуӮӯФфХхҲҳЧчҶҷШшъЭэЮюЯя'
 19 | tajik_alphabet_latin = 'AaBbVvGgǦǧDdEeËëŽžZzIiĪīJjKkLlMmNnOoPpRrSsTtUuŪūFfHhḨḩČčÇçŠš’ÈèÛûÂâ'
 20 | 
 21 | bulgarian_alphabet_cyrillic = 'АаБбВвГгДдЕеЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя'
 22 | bulgarian_alphabet_latin = 'AaBbVvGgDdEeZHzhZzIiYyKkLlMmNnOoPpRrSsTtUuFfHhTStsCHchSHshSHTshtĂăJjYUyuYAya'
 23 | 
 24 | # Uppercase Ь is deliberately left out and only lowercase ь is included: both transliterate to an apostrophe (').
 25 | ukrainian_alphabet_cyrillic = 'АаБбВвГгҐґДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЮюЯяь'
 26 | ukrainian_alphabet_latin = 'AaBbVvHhGgDdEeJejeŽžZzYyIiJijiJjKkLlMmNnOoPpRrSsTtUuFfXxCcČčŠšŠčščJujuJaja\''
 27 | 
 28 | belarusian_alphabet_cyrillic = 'АаБбВвГгДдЕеЁёЖжЗзІіЙйКкЛлМмНнОоПпРрСсТтУуЎўФфХхЦцЧчШшЫыЬьЭэЮюЯя'
 29 | belarusian_alphabet_latin = 'AaBbVvHhDdEeËëŽžZzIiJjKkLlMmNnOoPpRrSsTtUuŬŭFfXxCcČčŠšYy\'\'ĖėJujuJaja'
 30 | 
 31 | greek_alphabet = 'ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω'
 32 | greek_alphabet_latin = 'AaVvGgDdEeZzHhThthIiKkLlMmNnXxOoPpRrSssTtYyFfChchPspsWw'
 33 | 
 34 | mongolian_alphabet_cyrillic = 'АаЭэИиОоУуӨөҮүНнМмЛлВвПпФфКкХхГгСсШшТтДдЦцЧчЗзЖжРрБбЕеЁёЫыЮюЯя'  # exclude (Й Ъ Ь)<->I  Щ<->Sh
 35 | mongolian_alphabet_latin = 'AaEeIiOoUuÖöÜüNnMmLlVvPpFfKkKhkhGgSsShshTtDdTstsChchZzJjRrBbYeyeYoyoYyYuyuYaya'
 36 | 
 37 | special_chars = '‘’‚“”„†‡‰‹›♠♣♥♦‾←↑→↓™!"#$%&\'()*+,-./ :;<=>?@[\\]^_`{|}~…–—¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×'
 38 | 
 39 | diacritic_chars = 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïðñòóôõöøùúûüý'
 40 | 
 41 | numerical_chars = '1234567890'
 42 | 
 43 | alphabet_chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz'
 44 | 
 45 | mix_characters_some_cyrillic = '!ЉFљñМ мНÈÆнЊњО)*+,оП>пР?р'
 46 | mix_characters_all_latin = '!LjFljñM mNÈÆnNjnjO)*+,oP>pR?r'
 47 | 
 48 | mix_characters_some_cyrillic_no_alpha = '\'Ћ<=>?ћУуФфХхЦцЧчЏ%4џШ12ш♥'
 49 | mix_characters_all_latin_no_alpha = '\'Ć<=>?ćUuFfHhCcČčDž%4džŠ12š♥'
 50 | 
 51 | 
 52 | class TestSerbianTransliterationFromCyrillicToLatin(unittest.TestCase):
 53 | 
 54 |     def test_alphabet_transliteration(self):
 55 |         ''' Transliteration of entire Serbian cyrillic alphabet to latin.
 56 |         '''
 57 |         transliterated_serbian_alphabet = cyrtranslit.to_latin(serbian_alphabet_cyrillic)
 58 | 
 59 |         self.assertEqual(transliterated_serbian_alphabet, serbian_alphabet_latin)
 60 | 
 61 | 
 62 |     def test_special_characters(self):
 63 |         ''' Special characters should remain the same.
 64 |         '''
 65 |         transliterated_special_chars = cyrtranslit.to_latin(special_chars)
 66 | 
 67 |         self.assertEqual(transliterated_special_chars, special_chars)
 68 | 
 69 | 
 70 |     def test_special_diacritic_characters(self):
 71 |         ''' Diacritic characters should remain the same.
 72 |         '''
 73 |         transliterated_diacritic_chars = cyrtranslit.to_latin(diacritic_chars)
 74 | 
 75 |         self.assertEqual(transliterated_diacritic_chars, diacritic_chars)
 76 | 
 77 | 
 78 |     def test_numerical_characters(self):
 79 |         ''' Numerical characters should remain the same.
 80 |         '''
 81 |         transliterated_numerical_chars = cyrtranslit.to_latin(numerical_chars)
 82 | 
 83 |         self.assertEqual(transliterated_numerical_chars, numerical_chars)
 84 | 
 85 | 
 86 |     def test_latin_alphabet_characters(self):
 87 |         ''' Alphabet characters should remain the same.
 88 |         '''
 89 |         transliterated_alphabet_chars = cyrtranslit.to_latin(alphabet_chars)
 90 | 
 91 |         self.assertEqual(transliterated_alphabet_chars, alphabet_chars)
 92 | 
 93 | 
 94 |     def test_mix_characters(self):
 95 |         ''' Serbian cyrillic characters should be transliterated but non serbian cyrillic ones shouldn't.
 96 |         '''
 97 | 
 98 |         transliterated_mix = cyrtranslit.to_latin(mix_characters_some_cyrillic)
 99 | 
100 |         self.assertEqual(transliterated_mix, mix_characters_all_latin)
101 | 
102 | 
103 | class TestSerbianTransliterationFromLatinToCyrillic(unittest.TestCase):
104 | 
105 |     def test_alphabet_transliteration(self):
106 |         ''' Transliteration of entire Serbian cyrillic alphabet to latin.
107 |         '''
108 |         transliterated_serbian_alphabet = cyrtranslit.to_cyrillic(serbian_alphabet_latin)
109 | 
110 |         self.assertEqual(transliterated_serbian_alphabet, serbian_alphabet_cyrillic)
111 | 
112 | 
113 |     def test_special_characters(self):
114 |         ''' Special characters should remain the same.
115 |         '''
116 |         transliterated_special_chars = cyrtranslit.to_cyrillic(special_chars)
117 | 
118 |         self.assertEqual(transliterated_special_chars, special_chars)
119 | 
120 | 
121 |     def test_special_diacritic_characters(self):
122 |         ''' Diacritic characters should remain the same.
123 |         '''
124 |         transliterated_diacritic_chars = cyrtranslit.to_cyrillic(diacritic_chars)
125 | 
126 |         self.assertEqual(transliterated_diacritic_chars, diacritic_chars)
127 | 
128 | 
129 |     def test_numerical_characters(self):
130 |         ''' Numerical characters should remain the same.
131 |         '''
132 |         transliterated_numerical_chars = cyrtranslit.to_cyrillic(numerical_chars)
133 | 
134 |         self.assertEqual(transliterated_numerical_chars, numerical_chars)
135 | 
136 |     def test_mix_characters(self):
137 |         ''' Serbian cyrillic characters should be transliterated but non serbian cyrillic ones shouldn't.
138 |         '''
139 |         transliterated_mix = cyrtranslit.to_cyrillic(mix_characters_all_latin_no_alpha)
140 | 
141 |         self.assertEqual(transliterated_mix, mix_characters_some_cyrillic_no_alpha)
142 | 
143 | 
144 | class TestSerbianCountryCodeAlias(unittest.TestCase):
145 |     ''' Test that 'rs' (ISO 3166-1 country code) works as alias for 'sr' (ISO 639-1 language code).
146 |         Addresses issue #46.
147 |     '''
148 | 
149 |     def test_rs_to_latin(self):
150 |         ''' Test transliteration using 'rs' country code to latin.
151 |         '''
152 |         transliterated = cyrtranslit.to_latin("Мој ховеркрафт је пун јегуља", lang_code='rs')
153 |         self.assertEqual(transliterated, "Moj hoverkraft je pun jegulja")
154 | 
155 |     def test_rs_to_cyrillic(self):
156 |         ''' Test transliteration using 'rs' country code to cyrillic.
157 |         '''
158 |         transliterated = cyrtranslit.to_cyrillic("Moj hoverkraft je pun jegulja", lang_code='rs')
159 |         self.assertEqual(transliterated, "Мој ховеркрафт је пун јегуља")
160 | 
161 |     def test_rs_alphabet_to_latin(self):
162 |         ''' Test full alphabet transliteration with 'rs' code.
163 |         '''
164 |         transliterated = cyrtranslit.to_latin(serbian_alphabet_cyrillic, lang_code='rs')
165 |         self.assertEqual(transliterated, serbian_alphabet_latin)
166 | 
167 | 
168 | class TestMontenegrinTransliteration(unittest.TestCase):
169 |     def test_alphabet_transliteration_cyrillic_to_latin(self):
170 |         ''' Transliteration of entire cyrillic alphabet to latin.
171 |         '''
172 |         transliterated_alphabet = cyrtranslit.to_latin(montenegrin_alphabet_cyrillic, lang_code='me')
173 | 
174 |         # transliterated_alphabet =  u's\u0301' 's\xcc\x81'
175 |         self.assertEqual(transliterated_alphabet, montenegrin_alphabet_latin)
176 | 
177 |     def test_alphabet_transliteration_latin_to_cyrillic(self):
178 |         ''' Transliteration of entire latin alphabet to cyrillic.
179 |         '''
180 |         transliterated_alphabet = cyrtranslit.to_cyrillic(montenegrin_alphabet_latin, lang_code='me')
181 | 
182 |         self.assertEqual(transliterated_alphabet, montenegrin_alphabet_cyrillic)
183 | 
184 | 
185 | class TestMacedonianTransliteration(unittest.TestCase):
186 |     def test_alphabet_transliteration_cyrillic_to_latin(self):
187 |         ''' Transliteration of entire cyrillic alphabet to latin.
188 |         '''
189 |         transliterated_alphabet = cyrtranslit.to_latin(macedonian_alphabet_cyrillic, lang_code='mk')
190 | 
191 |         # transliterated_alphabet =  u's\u0301' 's\xcc\x81'
192 |         self.assertEqual(transliterated_alphabet, macedonian_alphabet_latin)
193 | 
194 |     def test_alphabet_transliteration_latin_to_cyrillic(self):
195 |         ''' Transliteration of entire latin alphabet to cyrillic.
196 |         '''
197 |         transliterated_alphabet = cyrtranslit.to_cyrillic(macedonian_alphabet_latin, lang_code='mk')
198 | 
199 |         self.assertEqual(transliterated_alphabet, macedonian_alphabet_cyrillic)
200 | 
201 | 
202 | class TestRussianTransliteration(unittest.TestCase):
203 |     def test_alphabet_transliteration_cyrillic_to_latin(self):
204 |         ''' Transliteration of entire cyrillic alphabet to latin.
205 |         '''
206 |         transliterated_alphabet = cyrtranslit.to_latin(russian_alphabet_cyrillic, lang_code='ru')
207 | 
208 |         self.assertEqual(transliterated_alphabet, russian_alphabet_latin)
209 | 
210 |     def test_alphabet_transliteration_latin_to_cyrillic(self):
211 |         ''' Transliteration of entire latin alphabet to cyrillic.
212 |         '''
213 |         transliterated_alphabet = cyrtranslit.to_cyrillic(russian_alphabet_latin, lang_code='ru')
214 | 
215 |         self.assertEqual(transliterated_alphabet, russian_alphabet_cyrillic.replace('Ъ', 'ъ').replace('Ь', 'ь').replace('Ы', 'ы'))
216 | 
217 |     def test_h_transliteration(self):
218 |         ''' Cyrillic Х should transliterate to H, not X.
219 |         '''
220 |         self.assertEqual(cyrtranslit.to_latin('Х', lang_code='ru'), 'H')
221 |         self.assertEqual(cyrtranslit.to_latin('х', lang_code='ru'), 'h')
222 |         self.assertEqual(cyrtranslit.to_cyrillic('H', lang_code='ru'), 'Х')
223 |         self.assertEqual(cyrtranslit.to_cyrillic('h', lang_code='ru'), 'х')
224 | 
225 |     def test_ya_capitalization(self):
226 |         ''' Capital Я should transliterate to Ya, not YA.
227 |         '''
228 |         self.assertEqual(cyrtranslit.to_latin('Я', lang_code='ru'), 'Ya')
229 |         self.assertEqual(cyrtranslit.to_latin('я', lang_code='ru'), 'ya')
230 |         self.assertEqual(cyrtranslit.to_latin('Янковский', lang_code='ru'), 'Yankovskij')
231 |         self.assertEqual(cyrtranslit.to_latin('яблоко', lang_code='ru'), 'yabloko')
232 |         self.assertEqual(cyrtranslit.to_cyrillic('Ya', lang_code='ru'), 'Я')
233 |         self.assertEqual(cyrtranslit.to_cyrillic('ya', lang_code='ru'), 'я')
234 | 
235 | class TestTajikTransliteration(unittest.TestCase):
236 |     def test_alphabet_transliteration_cyrillic_to_latin(self):
237 |         ''' Transliterate the entire cyrillic alphabet to latin '''
238 |         transliterated_alphabet = cyrtranslit.to_latin(tajik_alphabet_cyrillic, lang_code='tj')
239 | 
240 |         self.assertEqual(transliterated_alphabet, tajik_alphabet_latin)
241 | 
242 |     def test_alphabet_transliteration_latin_to_cyrillic(self):
243 |         ''' Transliterate the entire latin alphabet to cyrillic '''
244 |         transliterated_alphabet = cyrtranslit.to_cyrillic(tajik_alphabet_latin, lang_code='tj')
245 | 
246 |         self.assertEqual(transliterated_alphabet, tajik_alphabet_cyrillic)
247 |  
248 | class TestUkrainianTransliteration(unittest.TestCase):
249 |     def test_alphabet_transliteration_cyrillic_to_latin(self):
250 |         ''' Transliterate the entire cyrillic alphabet to latin '''
251 |         transliterated_alphabet = cyrtranslit.to_latin(ukrainian_alphabet_cyrillic, lang_code='ua')
252 | 
253 |         self.assertEqual(transliterated_alphabet, ukrainian_alphabet_latin)
254 | 
255 |     def test_alphabet_transliteration_latin_to_cyrillic(self):
256 |         ''' Transliterate the entire latin alphabet to cyrillic '''
257 |         transliterated_alphabet = cyrtranslit.to_cyrillic(ukrainian_alphabet_latin, lang_code='ua')
258 | 
259 |         self.assertEqual(transliterated_alphabet, ukrainian_alphabet_cyrillic)
260 | 
261 | 
262 |     def test_special_diacritic_characters(self):
263 |         ''' Diacritic characters should remain the same.
264 |         '''
265 |         transliterated_diacritic_chars = cyrtranslit.to_latin(diacritic_chars, lang_code='tj')
266 | 
267 |         self.assertEqual(transliterated_diacritic_chars, diacritic_chars)
268 | 
269 | 
270 |     def test_numerical_characters(self):
271 |         ''' Numerical characters should remain the same.
272 |         '''
273 |         transliterated_numerical_chars = cyrtranslit.to_latin(numerical_chars, lang_code='tj')
274 | 
275 |         self.assertEqual(transliterated_numerical_chars, numerical_chars)
276 | 
277 | 
278 | class TestBelarusianTransliteration(unittest.TestCase):
279 |     ''' Test Belarusian transliteration. Addresses issue #47.
280 |     '''
281 | 
282 |     def test_alphabet_transliteration_cyrillic_to_latin(self):
283 |         ''' Transliterate the entire Belarusian cyrillic alphabet to latin.
284 |         '''
285 |         transliterated_alphabet = cyrtranslit.to_latin(belarusian_alphabet_cyrillic, lang_code='by')
286 | 
287 |         self.assertEqual(transliterated_alphabet, belarusian_alphabet_latin)
288 | 
289 |     def test_alphabet_transliteration_latin_to_cyrillic(self):
290 |         ''' Transliterate the entire Belarusian latin alphabet to cyrillic.
291 |         '''
292 |         transliterated_alphabet = cyrtranslit.to_cyrillic(belarusian_alphabet_latin, lang_code='by')
293 | 
294 |         self.assertEqual(transliterated_alphabet, belarusian_alphabet_cyrillic)
295 | 
296 |     def test_phrase_transliteration_to_latin(self):
297 |         ''' Test common Belarusian phrase transliteration.
298 |         '''
299 |         # "Hello, World!" in Belarusian
300 |         cyrillic_text = "Прывітанне, свет!"
301 |         expected_latin = "Pryvitanne, svet!"
302 | 
303 |         transliterated = cyrtranslit.to_latin(cyrillic_text, lang_code='by')
304 |         self.assertEqual(transliterated, expected_latin)
305 | 
306 |     def test_short_u_transliteration(self):
307 |         ''' Test Belarusian unique letter Ў (short U).
308 |         '''
309 |         # Ў is unique to Belarusian
310 |         self.assertEqual(cyrtranslit.to_latin("Ў", lang_code='by'), "Ŭ")
311 |         self.assertEqual(cyrtranslit.to_latin("ў", lang_code='by'), "ŭ")
312 |         self.assertEqual(cyrtranslit.to_cyrillic("Ŭ", lang_code='by'), "Ў")
313 |         self.assertEqual(cyrtranslit.to_cyrillic("ŭ", lang_code='by'), "ў")
314 | 
315 | 
class TestGreekTransliteration(unittest.TestCase):
    ''' Greek <-> Latin transliteration for lang_code 'el'. Addresses issue #40.
    '''

    def test_alphabet_transliteration_to_latin(self):
        ''' The complete Greek alphabet maps onto its Latin counterpart.
        '''
        self.assertEqual(
            cyrtranslit.to_latin(greek_alphabet, lang_code='el'),
            greek_alphabet_latin
        )

    def test_alphabet_transliteration_to_greek(self):
        ''' The complete Latin alphabet maps back onto Greek.
            Final sigma (ς) is not expected to come back: Latin text carries no
            word-ending information, so the comparison uses the regular sigma (σ).
        '''
        # Fold ς into σ so the expectation matches what the reverse direction yields.
        greek_with_regular_sigma_only = greek_alphabet.replace('ς', 'σ')

        back_to_greek = cyrtranslit.to_cyrillic(greek_alphabet_latin, lang_code='el')

        self.assertEqual(back_to_greek, greek_with_regular_sigma_only)

    def test_theta_transliteration(self):
        ''' Theta (Θθ) becomes the digraph Th/th and is restored from it.
        '''
        for greek_letter, latin_digraph in (('Θ', 'Th'), ('θ', 'th')):
            self.assertEqual(cyrtranslit.to_latin(greek_letter, lang_code='el'), latin_digraph)

        for latin_digraph, greek_letter in (('Th', 'Θ'), ('th', 'θ')):
            self.assertEqual(cyrtranslit.to_cyrillic(latin_digraph, lang_code='el'), greek_letter)

    def test_chi_transliteration(self):
        ''' Chi (Χχ) becomes the digraph Ch/ch and is restored from it.
        '''
        for greek_letter, latin_digraph in (('Χ', 'Ch'), ('χ', 'ch')):
            self.assertEqual(cyrtranslit.to_latin(greek_letter, lang_code='el'), latin_digraph)

        for latin_digraph, greek_letter in (('Ch', 'Χ'), ('ch', 'χ')):
            self.assertEqual(cyrtranslit.to_cyrillic(latin_digraph, lang_code='el'), greek_letter)

    def test_psi_transliteration(self):
        ''' Psi (Ψψ) becomes the digraph Ps/ps and is restored from it.
        '''
        for greek_letter, latin_digraph in (('Ψ', 'Ps'), ('ψ', 'ps')):
            self.assertEqual(cyrtranslit.to_latin(greek_letter, lang_code='el'), latin_digraph)

        for latin_digraph, greek_letter in (('Ps', 'Ψ'), ('ps', 'ψ')):
            self.assertEqual(cyrtranslit.to_cyrillic(latin_digraph, lang_code='el'), greek_letter)

    def test_final_sigma(self):
        ''' Final sigma (ς) yields the same Latin letter as the regular sigma.
        '''
        for greek_letter, latin_letter in (('ς', 's'), ('Σ', 'S'), ('σ', 's')):
            self.assertEqual(cyrtranslit.to_latin(greek_letter, lang_code='el'), latin_letter)

    def test_phrase_transliteration(self):
        ''' A short everyday phrase is transliterated as a whole.
        '''
        # "Γειά σου" is the Greek greeting "Hello".
        self.assertEqual(cyrtranslit.to_latin("Γειά σου", lang_code='el'), "Geia soy")
378 | 
379 | 
class TestBulgarianTransliteration(unittest.TestCase):
    ''' Bulgarian transliteration for lang_code 'bg', in both directions.
    '''

    def test_alphabet_transliteration_cyrillic_to_latin(self):
        ''' The whole Bulgarian Cyrillic alphabet converts to its Latin form.
        '''
        self.assertEqual(
            cyrtranslit.to_latin(bulgarian_alphabet_cyrillic, lang_code='bg'),
            bulgarian_alphabet_latin
        )

    def test_alphabet_transliteration_latin_to_cyrillic(self):
        ''' The whole Bulgarian Latin alphabet converts back to Cyrillic.
        '''
        self.assertEqual(
            cyrtranslit.to_cyrillic(bulgarian_alphabet_latin, lang_code='bg'),
            bulgarian_alphabet_cyrillic
        )

    def test_sh_at_the_end_of_string(self):
        ''' A trailing "sh" digraph at the end of the string is converted without raising.'''
        in_cyrillic = cyrtranslit.to_cyrillic("AaBbsh", lang_code='bg')

        self.assertEqual(in_cyrillic, "АаБбш")
400 | 
401 | 
class TestMongolianTransliterationFromCyrillicToLatin(unittest.TestCase):
    ''' Mongolian ('mn') transliteration; the cases below cover both directions.
    '''

    def test_alphabet_transliteration_cyrillic_to_latin(self):
        ''' The whole Mongolian Cyrillic alphabet converts to its Latin form.
        '''
        self.assertEqual(
            cyrtranslit.to_latin(mongolian_alphabet_cyrillic, 'mn'),
            mongolian_alphabet_latin
        )

    def test_alphabet_transliteration_latin_to_cyrillic(self):
        ''' The whole Mongolian Latin alphabet converts back to Cyrillic.
        '''
        self.assertEqual(
            cyrtranslit.to_cyrillic(mongolian_alphabet_latin, 'mn'),
            mongolian_alphabet_cyrillic
        )

    def test_mixed_casing_transliteration_latin_to_cyrillic(self):
        ''' Digraphs are recognised in every casing combination, e.g. Sh SH sh sH.
        '''
        # Seven digraphs (Kh, Sh, Ts, Ch, Ye, Yo, Ya), each written in four casings.
        latin_digraphs = 'KhKHkhkHShSHshsHTsTStstSChCHchcHYeYEyeyEYoYOyoyOYaYAyayA'
        cyrillic_letters = 'ХХххШШшшЦЦццЧЧччЕЕееЁЁёёЯЯяя'

        self.assertEqual(cyrtranslit.to_cyrillic(latin_digraphs, 'mn'), cyrillic_letters)

    def test_transliteration_cyrillic_to_sh(self):
        ''' Ш/Щ both become Sh and ш/щ both become sh.
            In Mongolian the two letters share one pronunciation (/ʃ/).
        '''
        self.assertEqual(cyrtranslit.to_latin('ШшЩщ', 'mn'), 'ShshShsh')

    def test_transliteration_sh_to_cyrillic_defaults_to_sha(self):
        ''' Latin Sh/sh resolves to Ш rather than Щ.
            Ш is the more common letter in Mongolian; Щ shows up mainly in loanwords.
        '''
        # Whatever the casing of the digraph, the result is Ш/ш with matching case.
        self.assertEqual(cyrtranslit.to_cyrillic('ShSHshsH', 'mn'), 'ШШшш')
449 | 
450 | 
class TestFileEncoding(unittest.TestCase):
    ''' Command-line transliteration of input files that are not UTF-8 encoded.
    '''

    def test_windows1251_encoded_file(self):
        ''' A windows-1251 encoded file is read and transliterated without an explicit encoding.
            Regression test for issue #49, where non-UTF-8 Cyrillic input
            failed with UnicodeDecodeError.
        '''
        import subprocess
        import sys

        # No -e option here: the tool has to detect the encoding by itself.
        command = [sys.executable, '-m', 'cyrtranslit.cyrtranslit',
                   '-l', 'bg', '-i', 'tests/bg_windows1251.txt']
        completed = subprocess.run(command, capture_output=True, text=True)

        # A decoding failure would surface as a non-zero exit status.
        self.assertEqual(completed.returncode, 0, f"Command failed with: {completed.stderr}")

        # Latin text on stdout, plus a note on stderr naming the fallback encoding.
        self.assertIn('Zdravey', completed.stdout)
        self.assertIn('windows-1251', completed.stderr)

    def test_explicit_encoding_parameter(self):
        ''' The -e option lets the caller name the input encoding directly.
        '''
        import subprocess
        import sys

        # Same fixture as above, but with the encoding spelled out via -e.
        command = [sys.executable, '-m', 'cyrtranslit.cyrtranslit',
                   '-l', 'bg', '-i', 'tests/bg_windows1251.txt', '-e', 'windows-1251']
        completed = subprocess.run(command, capture_output=True, text=True)

        self.assertEqual(completed.returncode, 0, f"Command failed with: {completed.stderr}")

        # Latin text on stdout, and no warning because nothing had to be guessed.
        self.assertIn('Zdravey', completed.stdout)
        self.assertNotIn('Warning', completed.stderr)
500 | 
501 | 
class TestMacedonianAccentedCharacters(unittest.TestCase):
    ''' Grave-accented Macedonian vowels and the preserve_accents switch.
        Addresses issue #4.

        ISO 9:1968/1995 (adopted by Macedonian Academy of Arts and Sciences in 1970)
        covers two accented letters:
        - Ѐ (U+0400) / ѐ (U+0450) - Cyrillic Ie with grave
        - Ѝ (U+040D) / ѝ (U+045D) - Cyrillic I with grave

        They tell homographs apart:
        - ѝ (her) vs и (and)
        - нѐ (us) vs не (no)
        - сѐ (everything) vs се (short reflexive pronoun)
    '''

    def test_ie_with_grave_to_latin_preserve_accents_false(self):
        ''' With preserve_accents=False, Ѐ/ѐ come out as plain E/e.
        '''
        for cyrillic, latin in (('Ѐ', 'E'), ('ѐ', 'e'), ('нѐ', 'ne'), ('сѐ', 'se')):
            self.assertEqual(
                cyrtranslit.to_latin(cyrillic, lang_code='mk', preserve_accents=False), latin)

    def test_ie_with_grave_to_latin_preserve_accents_true(self):
        ''' With preserve_accents=True, Ѐ/ѐ come out as È/è.
        '''
        for cyrillic, latin in (('Ѐ', 'È'), ('ѐ', 'è'), ('нѐ', 'nè'), ('сѐ', 'sè')):
            self.assertEqual(
                cyrtranslit.to_latin(cyrillic, lang_code='mk', preserve_accents=True), latin)

    def test_i_with_grave_to_latin_preserve_accents_false(self):
        ''' With preserve_accents=False, Ѝ/ѝ come out as plain I/i.
        '''
        for cyrillic, latin in (('Ѝ', 'I'), ('ѝ', 'i'), ('ѝ је', 'i je')):
            self.assertEqual(
                cyrtranslit.to_latin(cyrillic, lang_code='mk', preserve_accents=False), latin)

    def test_i_with_grave_to_latin_preserve_accents_true(self):
        ''' With preserve_accents=True, Ѝ/ѝ come out as Ì/ì.
        '''
        for cyrillic, latin in (('Ѝ', 'Ì'), ('ѝ', 'ì'), ('ѝ је', 'ì je')):
            self.assertEqual(
                cyrtranslit.to_latin(cyrillic, lang_code='mk', preserve_accents=True), latin)

    def test_latin_e_with_grave_to_cyrillic_preserve_accents_false(self):
        ''' With preserve_accents=False, È/è come out as plain Е/е.
        '''
        for latin, cyrillic in (('È', 'Е'), ('è', 'е'), ('nè', 'не')):
            self.assertEqual(
                cyrtranslit.to_cyrillic(latin, lang_code='mk', preserve_accents=False), cyrillic)

    def test_latin_e_with_grave_to_cyrillic_preserve_accents_true(self):
        ''' With preserve_accents=True, È/è come out as Ѐ/ѐ.
        '''
        for latin, cyrillic in (('È', 'Ѐ'), ('è', 'ѐ'), ('nè', 'нѐ')):
            self.assertEqual(
                cyrtranslit.to_cyrillic(latin, lang_code='mk', preserve_accents=True), cyrillic)

    def test_latin_i_with_grave_to_cyrillic_preserve_accents_false(self):
        ''' With preserve_accents=False, Ì/ì come out as plain И/и.
        '''
        for latin, cyrillic in (('Ì', 'И'), ('ì', 'и'), ('ì je', 'и је')):
            self.assertEqual(
                cyrtranslit.to_cyrillic(latin, lang_code='mk', preserve_accents=False), cyrillic)

    def test_latin_i_with_grave_to_cyrillic_preserve_accents_true(self):
        ''' With preserve_accents=True, Ì/ì come out as Ѝ/ѝ.
        '''
        for latin, cyrillic in (('Ì', 'Ѝ'), ('ì', 'ѝ'), ('ì je', 'ѝ је')):
            self.assertEqual(
                cyrtranslit.to_cyrillic(latin, lang_code='mk', preserve_accents=True), cyrillic)

    def test_default_behavior_strips_accents(self):
        ''' Leaving preserve_accents out behaves like preserve_accents=False.
        '''
        # No preserve_accents argument at all: the grave accent is dropped.
        for cyrillic, latin in (('ѝ', 'i'), ('ѐ', 'e')):
            self.assertEqual(cyrtranslit.to_latin(cyrillic, lang_code='mk'), latin)

    def test_file_transliteration_preserve_accents_false(self):
        ''' Text loaded from the accented fixture file loses its accents by default.
        '''
        with open('tests/mk_accented.txt', 'r', encoding='utf-8') as fixture:
            cyrillic_text = fixture.read()

        latin_text = cyrtranslit.to_latin(cyrillic_text, lang_code='mk', preserve_accents=False)

        # Every phrase appears in its unaccented spelling ...
        for phrase in ('i je tuka', 'ne sme tamu', 'se e dobro'):
            self.assertIn(phrase, latin_text)
        # ... and no grave-accented Latin letter is left anywhere.
        for accented_letter in ('ì', 'è'):
            self.assertNotIn(accented_letter, latin_text)

    def test_file_transliteration_preserve_accents_true(self):
        ''' Text loaded from the accented fixture file keeps its accents on request.
        '''
        with open('tests/mk_accented.txt', 'r', encoding='utf-8') as fixture:
            cyrillic_text = fixture.read()

        latin_text = cyrtranslit.to_latin(cyrillic_text, lang_code='mk', preserve_accents=True)

        # Every phrase appears in its accented spelling.
        for phrase in ('ì je tuka', 'nè sme tamu', 'sè e dobro'):
            self.assertIn(phrase, latin_text)
608 | 
609 | 
class TestBulgarianAccentedCharacters(unittest.TestCase):
    ''' Bulgarian I with grave (stress marking / homograph disambiguation) and the
        preserve_accents switch. Addresses issue #4.

        ISO 9:1995 covers:
        - Ѝ (U+040D) / ѝ (U+045D) - Cyrillic I with grave

        It tells apart:
        - ѝ (her) vs и (and)
    '''

    def test_i_with_grave_to_latin_preserve_accents_false(self):
        ''' With preserve_accents=False, Ѝ/ѝ come out as plain I/i.
        '''
        for cyrillic, latin in (('Ѝ', 'I'), ('ѝ', 'i'), ('ѝ е', 'i e')):
            self.assertEqual(
                cyrtranslit.to_latin(cyrillic, lang_code='bg', preserve_accents=False), latin)

    def test_i_with_grave_to_latin_preserve_accents_true(self):
        ''' With preserve_accents=True, Ѝ/ѝ come out as Ì/ì.
        '''
        for cyrillic, latin in (('Ѝ', 'Ì'), ('ѝ', 'ì'), ('ѝ е', 'ì e')):
            self.assertEqual(
                cyrtranslit.to_latin(cyrillic, lang_code='bg', preserve_accents=True), latin)

    def test_latin_i_with_grave_to_cyrillic_preserve_accents_false(self):
        ''' With preserve_accents=False, Ì/ì come out as plain И/и.
        '''
        for latin, cyrillic in (('Ì', 'И'), ('ì', 'и')):
            self.assertEqual(
                cyrtranslit.to_cyrillic(latin, lang_code='bg', preserve_accents=False), cyrillic)

    def test_latin_i_with_grave_to_cyrillic_preserve_accents_true(self):
        ''' With preserve_accents=True, Ì/ì come out as Ѝ/ѝ.
        '''
        for latin, cyrillic in (('Ì', 'Ѝ'), ('ì', 'ѝ')):
            self.assertEqual(
                cyrtranslit.to_cyrillic(latin, lang_code='bg', preserve_accents=True), cyrillic)

    def test_default_behavior_strips_accents(self):
        ''' Leaving preserve_accents out behaves like preserve_accents=False.
        '''
        latin = cyrtranslit.to_latin('ѝ', lang_code='bg')
        self.assertEqual(latin, 'i')

    def test_file_transliteration_preserve_accents_false(self):
        ''' Text loaded from the accented fixture file loses its accents by default.
        '''
        with open('tests/bg_accented.txt', 'r', encoding='utf-8') as fixture:
            cyrillic_text = fixture.read()

        latin_text = cyrtranslit.to_latin(cyrillic_text, lang_code='bg', preserve_accents=False)

        # The phrase shows up unaccented and no ì is left anywhere.
        self.assertIn('i e tuk', latin_text)
        self.assertNotIn('ì', latin_text)

    def test_file_transliteration_preserve_accents_true(self):
        ''' Text loaded from the accented fixture file keeps its accents on request.
        '''
        with open('tests/bg_accented.txt', 'r', encoding='utf-8') as fixture:
            cyrillic_text = fixture.read()

        latin_text = cyrtranslit.to_latin(cyrillic_text, lang_code='bg', preserve_accents=True)

        # The phrase shows up with its grave accent intact.
        self.assertIn('ì e tuk', latin_text)
674 | 
675 | 
class TestCLI(unittest.TestCase):
    ''' Test command-line interface functionality.

        Every test launches ``python -m cyrtranslit.cyrtranslit`` in a child process.
        Fixture paths are relative, so the tests are expected to run from the
        repository root.
    '''

    def _run_cli(self, *cli_args, **run_kwargs):
        ''' Run the cyrtranslit CLI in a child interpreter and return the CompletedProcess.

            cli_args are appended to the ``python -m cyrtranslit.cyrtranslit`` command line.
            run_kwargs are forwarded to subprocess.run (e.g. encoding='utf-8' when the
            output is expected to contain non-ASCII text). stdout and stderr are always
            captured as text.
        '''
        import subprocess
        import sys

        return subprocess.run(
            [sys.executable, '-m', 'cyrtranslit.cyrtranslit', *cli_args],
            capture_output=True,
            text=True,
            **run_kwargs
        )

    def test_invalid_language_code(self):
        ''' Test that invalid language code produces error. '''
        # 'xx' is not a supported language code.
        result = self._run_cli('-l', 'xx', '-i', 'tests/sr.txt')

        # Should fail
        self.assertNotEqual(result.returncode, 0)

        # Should show error message
        self.assertIn('not supported', result.stderr)

    def test_output_file_creation(self):
        ''' Test that -o creates the output file and writes the transliteration to it. '''
        import os
        import tempfile

        # Point -o at a path that does not exist yet, inside a scratch directory.
        # If the file existed before the CLI ran, the existence check below could
        # never fail. The directory and its contents are removed on exit.
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_file = os.path.join(tmp_dir, 'output.txt')

            result = self._run_cli('-l', 'sr', '-i', 'tests/sr.txt', '-o', output_file)

            # Should succeed
            self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}")

            # Output file should have been created by the CLI
            self.assertTrue(os.path.isfile(output_file))

            # Read and verify content
            with open(output_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Loose sanity check: Latin 'a' differs from Cyrillic 'а', so this only
            # passes when Latin text was written.
            self.assertIn('a', content.lower())

    def test_reverse_transliteration_flag(self):
        ''' Test -c flag for Latin to Cyrillic transliteration. '''
        # Run CLI with -c flag on Latin text
        result = self._run_cli('-l', 'sr', '-c', '-i', 'tests/sr_latinica.txt', encoding='utf-8')

        # Should succeed
        self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}")

        # Output should contain at least one character from the Cyrillic block U+0400-U+04FF
        self.assertTrue(any('\u0400' <= c <= '\u04FF' for c in result.stdout),
                        "Output should contain Cyrillic characters")

    def test_preserve_accents_flag(self):
        ''' Test -p flag for preserving accents. '''
        # Run CLI with -p flag on Macedonian text with accents
        result = self._run_cli('-l', 'mk', '-p', '-i', 'tests/mk_accented.txt', encoding='utf-8')

        # Should succeed
        self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}")

        # Output should contain Latin letters with grave accents
        self.assertIn('ì', result.stdout)

    def test_combined_flags(self):
        ''' Test combining -c and -p flags. '''
        # Run CLI with both -c and -p flags on accented Latin text
        result = self._run_cli(
            '-l', 'mk', '-c', '-p', '-i', 'tests/mk_accented_latin.txt', encoding='utf-8')

        # Should succeed
        self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}")

        # Output should contain Cyrillic with accents
        self.assertIn('ѝ', result.stdout)

    def test_file_not_found(self):
        ''' Test error handling when input file doesn't exist. '''
        # Try to read non-existent file
        result = self._run_cli('-l', 'sr', '-i', 'nonexistent_file.txt')

        # Should fail
        self.assertNotEqual(result.returncode, 0)
806 | 
807 | 
if __name__ == '__main__':
    # Executed as a script: hand control to unittest, which collects and runs
    # the test cases defined in this module.
    unittest.main()
811 | 


--------------------------------------------------------------------------------