├── setup.cfg ├── tests ├── mk_accented_latin.txt ├── bg.txt ├── sr_latinica.txt ├── bg_accented.txt ├── by.txt ├── sr.txt ├── ua.txt ├── mk.txt ├── mn.txt ├── ru.txt ├── me.txt ├── tj.txt ├── mk_accented.txt └── bg_windows1251.txt ├── .gitignore ├── cyrtranslit ├── mapping │ ├── me.py │ ├── sr.py │ ├── tj.py │ ├── ua.py │ ├── by.py │ ├── ru.py │ ├── el.py │ ├── mn.py │ ├── bg.py │ ├── __init__.py │ └── mk.py ├── cyrtranslit.py └── __init__.py ├── LICENSE ├── .github └── workflows │ └── test.yml ├── setup.py ├── README.md └── tests.py /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /tests/mk_accented_latin.txt: -------------------------------------------------------------------------------- 1 | ì e tuka 2 | nè sme tamu 3 | -------------------------------------------------------------------------------- /tests/bg.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГгДдЕеЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя -------------------------------------------------------------------------------- /tests/sr_latinica.txt: -------------------------------------------------------------------------------- 1 | Dobar dan 2 | Kako si? 3 | Ovo je test fajl. 
4 | -------------------------------------------------------------------------------- /tests/bg_accented.txt: -------------------------------------------------------------------------------- 1 | ѝ е тук 2 | и аз съм тук 3 | ѝ каза нещо 4 | Ѝ дойде 5 | -------------------------------------------------------------------------------- /tests/by.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГгДдЕеЁёЖжЗзІіЙйКкЛлМмНнОоПпРрСсТтУуЎўФфХхЦцЧчШшЫыЬьЭэЮюЯя -------------------------------------------------------------------------------- /tests/sr.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГгДдЂђЕеЖжЗзИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШш 2 | -------------------------------------------------------------------------------- /tests/ua.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЮюЯяь -------------------------------------------------------------------------------- /tests/mk.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГгДдЃѓЕеЖжЗзЅѕИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЌќУуФфХхЦцЧчЏџШш 2 | -------------------------------------------------------------------------------- /tests/mn.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГгДдЂђЕеЖжЗзЗ́з́ИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШшС́с́ -------------------------------------------------------------------------------- /tests/ru.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя 2 | -------------------------------------------------------------------------------- /tests/me.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГгДдЂђЕеЖжЗзЗ́з́ИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШшС́с́ 2 | 
-------------------------------------------------------------------------------- /tests/tj.txt: -------------------------------------------------------------------------------- 1 | АаБбВвГгҒғДдЕеЁёЖжЗзИиӢӣЙйКкЛлМмНнОоПпРрСсТтУуӮӯФфХхҲҳЧчҶҷШшъЭэЮюЯя 2 | -------------------------------------------------------------------------------- /tests/mk_accented.txt: -------------------------------------------------------------------------------- 1 | ѝ је тука 2 | нѐ сме таму 3 | сѐ е добро 4 | Ѐдна работа 5 | Ѝ си дошла 6 | -------------------------------------------------------------------------------- /tests/bg_windows1251.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatakosovo/cyrillic-transliteration/HEAD/tests/bg_windows1251.txt -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .pypirc 2 | _build 3 | MANIFEST 4 | .idea 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | venv/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | tests/output.txt 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/me.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Montenegrin (me) transliteration mappings. 4 | 5 | ISO 3166-1 country code: me 6 | 7 | Montenegrin Latin is based on Serbo-Croatian Latin, with the addition of the two letters Ś and Ź, 8 | to replace the digraphs SJ and ZJ. These parallel the two letters of the Montenegrin Cyrillic 9 | alphabet not found in Serbian, С́ and З́. These, respectively, could also be represented in the 10 | original alphabets as šj and žj, and шj and жj. 11 | 12 | Source: https://en.wikipedia.org/wiki/Montenegrin_alphabet#Latin_alphabet 13 | Also see: http://news.bbc.co.uk/2/hi/8520466.stm 14 | """ 15 | 16 | import copy 17 | from .sr import SR_CYR_TO_LAT_DICT 18 | 19 | ME_CYR_TO_LAT_DICT = copy.deepcopy(SR_CYR_TO_LAT_DICT) 20 | ME_CYR_TO_LAT_DICT.update({ 21 | u'С́': u'Ś', u'с́': u'ś', # Montenegrin 22 | u'З́': u'Ź', u'з́': u'ź' # Montenegrin 23 | }) 24 | 25 | # This dictionary is to transliterate from Montenegrin Latin to Cyrillic. 
26 | ME_LAT_TO_CYR_DICT = {y: x for x, y in iter(ME_CYR_TO_LAT_DICT.items())} 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Open Data Kosovo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14'] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install pytest pytest-cov 28 | pip install -e . 29 | 30 | - name: Run tests with coverage 31 | run: | 32 | pytest tests.py --cov=cyrtranslit --cov-report=xml --cov-report=term-missing 33 | 34 | - name: Generate coverage report 35 | uses: irongut/CodeCoverageSummary@v1.3.0 36 | with: 37 | filename: coverage.xml 38 | badge: true 39 | format: markdown 40 | output: both 41 | 42 | - name: Add coverage to job summary 43 | run: | 44 | cat code-coverage-results.md >> $GITHUB_STEP_SUMMARY 45 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/sr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Serbian (sr/rs) transliteration mappings. 4 | 5 | ISO 639-1 language code: sr 6 | ISO 3166-1 country code alias: rs 7 | """ 8 | 9 | # This dictionary is to transliterate from Serbian Cyrillic to Latin. 
10 | SR_CYR_TO_LAT_DICT = { 11 | u'А': u'A', u'а': u'a', 12 | u'Б': u'B', u'б': u'b', 13 | u'В': u'V', u'в': u'v', 14 | u'Г': u'G', u'г': u'g', 15 | u'Д': u'D', u'д': u'd', 16 | u'Ђ': u'Đ', u'ђ': u'đ', 17 | u'Е': u'E', u'е': u'e', 18 | u'Ж': u'Ž', u'ж': u'ž', 19 | u'З': u'Z', u'з': u'z', 20 | u'И': u'I', u'и': u'i', 21 | u'Ј': u'J', u'ј': u'j', 22 | u'К': u'K', u'к': u'k', 23 | u'Л': u'L', u'л': u'l', 24 | u'Љ': u'Lj', u'љ': u'lj', 25 | u'М': u'M', u'м': u'm', 26 | u'Н': u'N', u'н': u'n', 27 | u'Њ': u'Nj', u'њ': u'nj', 28 | u'О': u'O', u'о': u'o', 29 | u'П': u'P', u'п': u'p', 30 | u'Р': u'R', u'р': u'r', 31 | u'С': u'S', u'с': u's', 32 | u'Т': u'T', u'т': u't', 33 | u'Ћ': u'Ć', u'ћ': u'ć', 34 | u'У': u'U', u'у': u'u', 35 | u'Ф': u'F', u'ф': u'f', 36 | u'Х': u'H', u'х': u'h', 37 | u'Ц': u'C', u'ц': u'c', 38 | u'Ч': u'Č', u'ч': u'č', 39 | u'Џ': u'Dž', u'џ': u'dž', 40 | u'Ш': u'Š', u'ш': u'š', 41 | } 42 | 43 | # This dictionary is to transliterate from Serbian Latin to Cyrillic. 44 | # Let's build it by simply swapping keys and values of previous dictionary. 45 | SR_LAT_TO_CYR_DICT = {y: x for x, y in iter(SR_CYR_TO_LAT_DICT.items())} 46 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/tj.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Tajik (tj) transliteration mappings. 4 | 5 | ISO 3166-1 country code: tj (ISO 639-1 language code: tg) 6 | 7 | Transliteration follows ISO 9 (1995). 
8 | https://en.wikipedia.org/wiki/Tajik_alphabet#Cyrillic 9 | """ 10 | 11 | import copy 12 | from .ru import RU_CYR_TO_LAT_DICT 13 | 14 | # Transliterate from Tajik cyrillic to latin 15 | TJ_CYR_TO_LAT_DICT = copy.deepcopy(RU_CYR_TO_LAT_DICT) 16 | # Change Mapping according to ISO 9 (1995) 17 | TJ_CYR_TO_LAT_DICT[u"Э"] = u"È" 18 | TJ_CYR_TO_LAT_DICT[u"э"] = u"è" 19 | TJ_CYR_TO_LAT_DICT[u"ъ"] = u"’" 20 | TJ_CYR_TO_LAT_DICT[u"Х"] = u"H" 21 | TJ_CYR_TO_LAT_DICT[u"х"] = u"h" 22 | TJ_CYR_TO_LAT_DICT[u"Ч"] = u"Č" 23 | TJ_CYR_TO_LAT_DICT[u"ч"] = u"č" 24 | TJ_CYR_TO_LAT_DICT[u"Ж"] = u"Ž" 25 | TJ_CYR_TO_LAT_DICT[u"ж"] = u"ž" 26 | TJ_CYR_TO_LAT_DICT[u"Ё"] = u"Ë" 27 | TJ_CYR_TO_LAT_DICT[u"ё"] = u"ë" 28 | TJ_CYR_TO_LAT_DICT[u"Ш"] = u"Š" 29 | TJ_CYR_TO_LAT_DICT[u"ш"] = u"š" 30 | TJ_CYR_TO_LAT_DICT[u"Ю"] = u"Û" 31 | TJ_CYR_TO_LAT_DICT[u"ю"] = u"û" 32 | TJ_CYR_TO_LAT_DICT[u"Я"] = u"Â" 33 | TJ_CYR_TO_LAT_DICT[u"я"] = u"â" 34 | # delete letters not used 35 | del TJ_CYR_TO_LAT_DICT[u"Ц"] 36 | del TJ_CYR_TO_LAT_DICT[u"ц"] 37 | del TJ_CYR_TO_LAT_DICT[u"Щ"] 38 | del TJ_CYR_TO_LAT_DICT[u"щ"] 39 | del TJ_CYR_TO_LAT_DICT[u"Ы"] 40 | del TJ_CYR_TO_LAT_DICT[u"ы"] 41 | 42 | # update the dict for the additional letters in the tajik cyrillic alphabet ( Ғ, Ӣ, Қ, Ӯ, Ҳ, Ҷ ) 43 | # see https://en.wikipedia.org/wiki/Tajik_alphabet#Cyrillic 44 | TJ_CYR_TO_LAT_DICT.update({ 45 | u"Ғ": u"Ǧ", u"ғ": u"ǧ", 46 | u"Ӣ": u"Ī", u"ӣ": u"ī", 47 | u"Қ": u"Q", u"қ": u"q", 48 | u"Ӯ": u"Ū", u"ӯ": u"ū", 49 | u"Ҳ": u"Ḩ", u"ҳ": u"ḩ", 50 | u"Ҷ": u"Ç", u"ҷ": u"ç" 51 | }) 52 | 53 | # transliterate from latin tajik to cyrillic 54 | TJ_LAT_TO_CYR_DICT = {y: x for x, y in iter(TJ_CYR_TO_LAT_DICT.items())} 55 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/ua.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Ukrainian (ua) transliteration mappings. 
4 | 5 | ISO 3166-1 country code: ua (ISO 639-1 language code: uk) 6 | 7 | Transliteration follows Scientific Ukrainian transliteration system. 8 | """ 9 | 10 | import copy 11 | from .ru import RU_CYR_TO_LAT_DICT 12 | 13 | # Transliterate from Ukrainian 14 | UA_CYR_TO_LAT_DICT = copy.deepcopy(RU_CYR_TO_LAT_DICT) 15 | # Change mapping to match with Scientific Ukrainian 16 | UA_CYR_TO_LAT_DICT[u"Г"] = u"H" 17 | UA_CYR_TO_LAT_DICT[u"г"] = u"h" 18 | UA_CYR_TO_LAT_DICT[u"Ж"] = u"Ž" 19 | UA_CYR_TO_LAT_DICT[u"ж"] = u"ž" 20 | UA_CYR_TO_LAT_DICT[u"И"] = u"Y" 21 | UA_CYR_TO_LAT_DICT[u"и"] = u"y" 22 | UA_CYR_TO_LAT_DICT[u"Х"] = u"X" 23 | UA_CYR_TO_LAT_DICT[u"х"] = u"x" 24 | UA_CYR_TO_LAT_DICT[u"Ц"] = u"C" 25 | UA_CYR_TO_LAT_DICT[u"ц"] = u"c" 26 | UA_CYR_TO_LAT_DICT[u"Ч"] = u"Č" 27 | UA_CYR_TO_LAT_DICT[u"ч"] = u"č" 28 | UA_CYR_TO_LAT_DICT[u"Ш"] = u"Š" 29 | UA_CYR_TO_LAT_DICT[u"ш"] = u"š" 30 | UA_CYR_TO_LAT_DICT[u"Щ"] = u"Šč" 31 | UA_CYR_TO_LAT_DICT[u"щ"] = u"šč" 32 | UA_CYR_TO_LAT_DICT[u"Ю"] = u"Ju" 33 | UA_CYR_TO_LAT_DICT[u"ю"] = u"ju" 34 | UA_CYR_TO_LAT_DICT[u"Я"] = u"Ja" 35 | UA_CYR_TO_LAT_DICT[u"я"] = u"ja" 36 | # Delete unused letters 37 | del UA_CYR_TO_LAT_DICT[u"Ё"] 38 | del UA_CYR_TO_LAT_DICT[u"ё"] 39 | del UA_CYR_TO_LAT_DICT[u"Ъ"] 40 | del UA_CYR_TO_LAT_DICT[u"ъ"] 41 | del UA_CYR_TO_LAT_DICT[u"Ы"] 42 | del UA_CYR_TO_LAT_DICT[u"ы"] 43 | del UA_CYR_TO_LAT_DICT[u"Э"] 44 | del UA_CYR_TO_LAT_DICT[u"э"] 45 | 46 | # Update for Ukrainian letters 47 | UA_CYR_TO_LAT_DICT.update({ 48 | u"Ґ": u"G", u"ґ": u"g", 49 | u"Є": u"Je", u"є": u"je", 50 | u"І": u"I", u"і": u"i", 51 | u"Ї": u"Ji", u"ї": u"ji" 52 | }) 53 | 54 | # Latin to Cyrillic 55 | UA_LAT_TO_CYR_DICT = {y: x for x, y in iter(UA_CYR_TO_LAT_DICT.items())} 56 | UA_LAT_TO_CYR_DICT.update({ 57 | u"JE": u"Є", u"jE": u"є", 58 | u"JI": u"Ї", u"jI": u"ї" 59 | }) 60 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/by.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | """ 3 | Belarusian (by) transliteration mappings. 4 | 5 | ISO 3166-1 country code: by (ISO 639-1 language code: be) 6 | 7 | Transliteration follows ISO 9:1995 and BGN/PCGN romanization standards. 8 | https://en.wikipedia.org/wiki/Belarusian_alphabet 9 | https://en.wikipedia.org/wiki/Romanization_of_Belarusian 10 | """ 11 | 12 | import copy 13 | from .ru import RU_CYR_TO_LAT_DICT 14 | 15 | # Transliterate from Belarusian (based on ISO 9:1995 and BGN/PCGN) 16 | BY_CYR_TO_LAT_DICT = copy.deepcopy(RU_CYR_TO_LAT_DICT) 17 | # Change mapping to match Belarusian scientific transliteration 18 | BY_CYR_TO_LAT_DICT[u"Г"] = u"H" 19 | BY_CYR_TO_LAT_DICT[u"г"] = u"h" 20 | BY_CYR_TO_LAT_DICT[u"Ё"] = u"Ë" 21 | BY_CYR_TO_LAT_DICT[u"ё"] = u"ë" 22 | BY_CYR_TO_LAT_DICT[u"Ж"] = u"Ž" 23 | BY_CYR_TO_LAT_DICT[u"ж"] = u"ž" 24 | BY_CYR_TO_LAT_DICT[u"Х"] = u"X" 25 | BY_CYR_TO_LAT_DICT[u"х"] = u"x" 26 | BY_CYR_TO_LAT_DICT[u"Ц"] = u"C" 27 | BY_CYR_TO_LAT_DICT[u"ц"] = u"c" 28 | BY_CYR_TO_LAT_DICT[u"Ч"] = u"Č" 29 | BY_CYR_TO_LAT_DICT[u"ч"] = u"č" 30 | BY_CYR_TO_LAT_DICT[u"Ш"] = u"Š" 31 | BY_CYR_TO_LAT_DICT[u"ш"] = u"š" 32 | BY_CYR_TO_LAT_DICT[u"Ы"] = u"Y" 33 | BY_CYR_TO_LAT_DICT[u"ы"] = u"y" 34 | BY_CYR_TO_LAT_DICT[u"Ь"] = u"'" 35 | BY_CYR_TO_LAT_DICT[u"ь"] = u"'" 36 | BY_CYR_TO_LAT_DICT[u"Э"] = u"Ė" 37 | BY_CYR_TO_LAT_DICT[u"э"] = u"ė" 38 | BY_CYR_TO_LAT_DICT[u"Ю"] = u"Ju" 39 | BY_CYR_TO_LAT_DICT[u"ю"] = u"ju" 40 | BY_CYR_TO_LAT_DICT[u"Я"] = u"Ja" 41 | BY_CYR_TO_LAT_DICT[u"я"] = u"ja" 42 | # Delete letters not used in Belarusian 43 | del BY_CYR_TO_LAT_DICT[u"Щ"] 44 | del BY_CYR_TO_LAT_DICT[u"щ"] 45 | del BY_CYR_TO_LAT_DICT[u"Ъ"] 46 | del BY_CYR_TO_LAT_DICT[u"ъ"] 47 | # Update for Belarusian-specific letters 48 | BY_CYR_TO_LAT_DICT.update({ 49 | u"І": u"I", u"і": u"i", 50 | u"Ў": u"Ŭ", u"ў": u"ŭ" 51 | }) 52 | 53 | # Latin to Cyrillic 54 | BY_LAT_TO_CYR_DICT = {y: x for x, y in iter(BY_CYR_TO_LAT_DICT.items())} 55 | BY_LAT_TO_CYR_DICT.update({ 56 | u"JU": u"Ю", u"Ju": u"Ю", u"ju": u"ю", 57 | u"JA": 
u"Я", u"Ja": u"Я", u"ja": u"я", 58 | u"''": u"Ьь" # Two apostrophes for Ьь 59 | }) 60 | # Single apostrophe defaults to lowercase ь 61 | BY_LAT_TO_CYR_DICT[u"'"] = u"ь" 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from setuptools import setup 3 | 4 | # use this to read the contents of the README.md file 5 | from pathlib import Path 6 | 7 | setup( 8 | name='cyrtranslit', 9 | packages=['cyrtranslit', 'cyrtranslit.mapping'], 10 | version='1.2.0', 11 | description='Bi-directional Cyrillic transliteration. Transliterate Cyrillic script to Latin script and vice versa. Supports transliteration for Belarusian, Bulgarian, Greek, Montenegrin, Macedonian, Mongolian, Russian, Serbian, Tajik, and Ukrainian.', 12 | long_description=(Path(__file__).parent / "README.md").read_text(), 13 | long_description_content_type='text/markdown', 14 | author='Georges Labrèche, Open Data Kosovo', 15 | author_email='georges@tanagraspace.com', 16 | url='https://github.com/opendatakosovo/cyrillic-transliteration', 17 | download_url='https://github.com/opendatakosovo/cyrillic-transliteration/archive/v1.2.0.tar.gz', 18 | license='MIT', 19 | keywords=['cyrillic', 'latin', 'transliteration', 'transliterate', 'cyrtranslit', 'belarusian', 'bulgarian', 'greek', 'montenegrin', 'macedonian', 'mongolian', 'russian', 'serbian', 'tajik', 'ukrainian'], 20 | classifiers=['Development Status :: 5 - Production/Stable', 21 | 'Intended Audience :: Developers', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Programming Language :: Python', 24 | 'Programming Language :: Python :: 2.7', 25 | 'Programming Language :: Python :: 3', 26 | 'Programming Language :: Python :: 3.1', 27 | 'Programming Language :: Python :: 3.2', 28 | 'Programming Language :: Python :: 3.3', 29 | 'Programming Language :: Python :: 3.4', 30 | 'Programming Language :: Python :: 
3.5', 31 | 'Programming Language :: Python :: 3.6', 32 | 'Programming Language :: Python :: 3.7', 33 | 'Programming Language :: Python :: 3.8', 34 | 'Programming Language :: Python :: 3.9', 35 | 'Programming Language :: Python :: 3.10', 36 | 'Programming Language :: Python :: 3.11', 37 | 'Programming Language :: Python :: 3.12', 38 | 'Programming Language :: Python :: 3.13', 39 | 'Programming Language :: Python :: 3.14'], 40 | entry_points={ 41 | "console_scripts": [ 42 | "cyrtranslit=cyrtranslit.cyrtranslit:main", 43 | ] 44 | } 45 | ) -------------------------------------------------------------------------------- /cyrtranslit/mapping/ru.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Russian (ru) transliteration mappings. 4 | 5 | ISO 639-1 language code: ru 6 | 7 | Transliteration follows GOST 7.79-2000 System B. 8 | """ 9 | 10 | # This dictionary is to transliterate from Russian Cyrillic to Latin (GOST_7.79-2000 System B). 
11 | RU_CYR_TO_LAT_DICT = { 12 | u"А": u"A", u"а": u"a", 13 | u"Б": u"B", u"б": u"b", 14 | u"В": u"V", u"в": u"v", 15 | u"Г": u"G", u"г": u"g", 16 | u"Д": u"D", u"д": u"d", 17 | u"Е": u"E", u"е": u"e", 18 | u"Ё": u"YO", u"ё": u"yo", 19 | u"Ж": u"ZH", u"ж": u"zh", 20 | u"З": u"Z", u"з": u"z", 21 | u"И": u"I", u"и": u"i", 22 | u"Й": u"J", u"й": u"j", 23 | u"К": u"K", u"к": u"k", 24 | u"Л": u"L", u"л": u"l", 25 | u"М": u"M", u"м": u"m", 26 | u"Н": u"N", u"н": u"n", 27 | u"О": u"O", u"о": u"o", 28 | u"П": u"P", u"п": u"p", 29 | u"Р": u"R", u"р": u"r", 30 | u"С": u"S", u"с": u"s", 31 | u"Т": u"T", u"т": u"t", 32 | u"У": u"U", u"у": u"u", 33 | u"Ф": u"F", u"ф": u"f", 34 | u"Х": u"H", u"х": u"h", 35 | u"Ц": u"CZ", u"ц": u"cz", 36 | u"Ч": u"CH", u"ч": u"ch", 37 | u"Ш": u"SH", u"ш": u"sh", 38 | u"Щ": u"SHH", u"щ": u"shh", 39 | u"Ъ": u"''", u"ъ": u"''", 40 | u"Ы": u"Y'", u"ы": u"y'", 41 | u"Ь": u"'", u"ь": u"'", 42 | u"Э": u"E'", u"э": u"e'", 43 | u"Ю": u"Yu", u"ю": u"yu", 44 | u"Я": u"Ya", u"я": u"ya", 45 | } 46 | 47 | # This dictionary is to transliterate from Russian Latin to Cyrillic. 
48 | RU_LAT_TO_CYR_DICT = {y: x for x, y in RU_CYR_TO_LAT_DICT.items()} 49 | RU_LAT_TO_CYR_DICT.update({ 50 | u"''": u"ъ", 51 | u"'": u"ь", 52 | u"C": u"К", u"c": u"к", 53 | u"CK": u"К", u"Ck": u"К", u"ck": u"к", 54 | u"JA": u"ЖА", u"Ja": u"Жа", u"ja": u"жа", 55 | u"JE": u"ЖЕ", u"Je": u"Же", u"je": u"же", 56 | u"JI": u"ЖИ", u"Ji": u"Жи", u"ji": u"жи", 57 | u"JO": u"ЖО", u"Jo": u"Жо", u"jo": u"жо", 58 | u"JU": u"ЖУ", u"Ju": u"Жу", u"ju": u"жу", 59 | u"PH": u"Ф", u"Ph": u"Ф", u"ph": u"ф", 60 | u"TH": u"З", u"Th": u"З", u"th": u"з", 61 | u"W": u"В", u"w": u"в", u"Q": u"К", u"q": u"к", 62 | u"WH": u"В", u"Wh": u"В", u"wh": u"в", 63 | u"Y": u"И", u"y": u"и", 64 | u"YA": u"Я", u"Ya": u"Я", u"ya": u"я", 65 | u"YE": u"Е", u"Ye": u"Е", u"ye": u"е", 66 | u"YI": u"И", u"Yi": u"И", u"yi": u"и", 67 | u"YO": u"Ё", u"Yo": u"Ё", u"yo": u"ё", 68 | u"YU": u"Ю", u"Yu": u"Ю", u"yu": u"ю", 69 | u"Y'": u"ы", u"y'": u"ы", 70 | u"iy": u"ый", u"ij": u"ый", # dobriy => добрый 71 | }) 72 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/el.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Greek (el) transliteration mappings. 4 | 5 | ISO 639-1 language code: el 6 | 7 | Transliteration follows ELOT 743 / ISO 843. 
8 | https://en.wikipedia.org/wiki/Greek_alphabet 9 | https://en.wikipedia.org/wiki/Romanization_of_Greek 10 | """ 11 | 12 | # Transliterate from Greek to Latin (ELOT 743 / ISO 843) 13 | EL_GRE_TO_LAT_DICT = { 14 | u"Α": u"A", u"α": u"a", u"Ά": u"A", u"ά": u"a", # alpha (with/without tonos) 15 | u"Β": u"V", u"β": u"v", 16 | u"Γ": u"G", u"γ": u"g", 17 | u"Δ": u"D", u"δ": u"d", 18 | u"Ε": u"E", u"ε": u"e", u"Έ": u"E", u"έ": u"e", # epsilon (with/without tonos) 19 | u"Ζ": u"Z", u"ζ": u"z", 20 | u"Η": u"H", u"η": u"h", u"Ή": u"H", u"ή": u"h", # eta (with/without tonos) 21 | u"Θ": u"Th", u"θ": u"th", 22 | u"Ι": u"I", u"ι": u"i", u"Ί": u"I", u"ί": u"i", u"Ϊ": u"I", u"ϊ": u"i", # iota (with tonos/dialytika) 23 | u"Κ": u"K", u"κ": u"k", 24 | u"Λ": u"L", u"λ": u"l", 25 | u"Μ": u"M", u"μ": u"m", 26 | u"Ν": u"N", u"ν": u"n", 27 | u"Ξ": u"X", u"ξ": u"x", 28 | u"Ο": u"O", u"ο": u"o", u"Ό": u"O", u"ό": u"o", # omicron (with/without tonos) 29 | u"Π": u"P", u"π": u"p", 30 | u"Ρ": u"R", u"ρ": u"r", 31 | u"Σ": u"S", u"σ": u"s", u"ς": u"s", # sigma (ς is final sigma) 32 | u"Τ": u"T", u"τ": u"t", 33 | u"Υ": u"Y", u"υ": u"y", u"Ύ": u"Y", u"ύ": u"y", u"Ϋ": u"Y", u"ϋ": u"y", # upsilon (with tonos/dialytika) 34 | u"Φ": u"F", u"φ": u"f", 35 | u"Χ": u"Ch", u"χ": u"ch", 36 | u"Ψ": u"Ps", u"ψ": u"ps", 37 | u"Ω": u"W", u"ω": u"w", u"Ώ": u"W", u"ώ": u"w", # omega (with/without tonos) 38 | } 39 | 40 | # This dictionary is to transliterate from Latin to Greek 41 | # Build the reverse mapping, but only include unaccented letters 42 | # (accented vowels transliterate to same Latin as unaccented, so we default to unaccented) 43 | EL_LAT_TO_GRE_DICT = { 44 | u"A": u"Α", u"a": u"α", 45 | u"V": u"Β", u"v": u"β", 46 | u"G": u"Γ", u"g": u"γ", 47 | u"D": u"Δ", u"d": u"δ", 48 | u"E": u"Ε", u"e": u"ε", 49 | u"Z": u"Ζ", u"z": u"ζ", 50 | u"H": u"Η", u"h": u"η", 51 | u"I": u"Ι", u"i": u"ι", 52 | u"K": u"Κ", u"k": u"κ", 53 | u"L": u"Λ", u"l": u"λ", 54 | u"M": u"Μ", u"m": u"μ", 55 | u"N": u"Ν", u"n": u"ν", 56 | 
u"X": u"Ξ", u"x": u"ξ", 57 | u"O": u"Ο", u"o": u"ο", 58 | u"P": u"Π", u"p": u"π", 59 | u"R": u"Ρ", u"r": u"ρ", 60 | u"S": u"Σ", u"s": u"σ", 61 | u"T": u"Τ", u"t": u"τ", 62 | u"Y": u"Υ", u"y": u"υ", 63 | u"F": u"Φ", u"f": u"φ", 64 | u"W": u"Ω", u"w": u"ω", 65 | } 66 | EL_LAT_TO_GRE_DICT.update({ 67 | u"TH": u"Θ", u"Th": u"Θ", u"th": u"θ", 68 | u"CH": u"Χ", u"Ch": u"Χ", u"ch": u"χ", 69 | u"PS": u"Ψ", u"Ps": u"Ψ", u"ps": u"ψ", 70 | }) 71 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/mn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Mongolian (mn) transliteration mappings. 4 | 5 | ISO 639-1 language code: mn 6 | 7 | This version of Mongolian Latin <-> Cyrillic is based on MNS 5217:2012 8 | as far as I know this is the latest standard. Inform me @ https://github.com/Serbipunk 9 | 10 | References: 11 | https://gogo.mn/r/101115 12 | https://en.wikipedia.org/wiki/Mongolian_Cyrillic_alphabet 13 | """ 14 | 15 | # This list contains alternating Cyrillic and Latin mappings 16 | # Format: [Cyrillic_upper, Latin_upper, Cyrillic_lower, Latin_lower, ...] 
17 | MN_CYR_LAT_LIST = [ 18 | u"А", u"A", u"а", u"a", 19 | u"Э", u"E", u"э", u"e", 20 | u"И", u"I", u"и", u"i", # i 21 | u"О", u"O", u"о", u"o", 22 | u"У", u"U", u"у", u"u", 23 | u"Ө", u"Ö", u"ө", u"ö", 24 | u"Ү", u"Ü", u"ү", u"ü", 25 | u"Н", u"N", u"н", u"n", 26 | u"М", u"M", u"м", u"m", 27 | u"Л", u"L", u"л", u"l", 28 | u"В", u"V", u"в", u"v", 29 | u"П", u"P", u"п", u"p", 30 | u"Ф", u"F", u"ф", u"f", 31 | u"К", u"K", u"к", u"k", 32 | u"Х", u"Kh", u"х", u"kh", # lat 1 33 | u"Х", u"KH", u"х", u"kH", # lat 1 34 | u"Г", u"G", u"г", u"g", 35 | u"С", u"S", u"с", u"s", 36 | u"Ш", u"Sh", u"ш", u"sh", # sh # lat2 37 | u"Ш", u"SH", u"ш", u"sH", # sh # lat2 38 | u"Т", u"T", u"т", u"t", 39 | u"Д", u"D", u"д", u"d", 40 | u"Ц", u"Ts", u"ц", u"ts", # lat3 41 | u"Ц", u"TS", u"ц", u"tS", # lat3 42 | u"Ч", u"Ch", u"ч", u"ch", # lat4 43 | u"Ч", u"CH", u"ч", u"cH", # lat4 44 | u"З", u"Z", u"з", u"z", 45 | u"Ж", u"J", u"ж", u"j", 46 | u"Й", u"I", u"й", u"i", # i * 2 47 | u"Р", u"R", u"р", u"r", 48 | u"Б", u"B", u"б", u"b", 49 | u"Е", u"Ye", u"е", u"ye", # lat 5 50 | u"Е", u"YE", u"е", u"yE", # lat 5 51 | u"Ё", u"Yo", u"ё", u"yo", # lat 6 52 | u"Ё", u"YO", u"ё", u"yO", # lat 6 53 | u"Ъ", u"I", u"ъ", u"i", # i * 3 54 | u"Ы", u"Y", u"ы", u"y", 55 | u"Ь", u"I", u"ь", u"i", # i * 4 56 | u"Ю", u"Yu", u"ю", u"yu", # lat 8 57 | u"Ю", u"YU", u"ю", u"yU", # lat 8 58 | u"Я", u"Ya", u"я", u"ya", # lat 9 59 | u"Я", u"YA", u"я", u"yA", # lat 9 60 | ] 61 | # Building the dictionary with the filter to skip pairs with 2-character Latin letters where the second character is uppercase 62 | MN_CYR_TO_LAT_DICT = { 63 | c: l for c, l in zip(MN_CYR_LAT_LIST[::2], MN_CYR_LAT_LIST[1::2]) 64 | if not (len(l) == 2 and l[1].isupper()) 65 | } 66 | 67 | # Handle Щ (shcha): This letter is part of Mongolian Cyrillic (inherited from Russian) 68 | # but is rarely used in practice. It's pronounced the same as Ш (/ʃ/), so both 69 | # transliterate to "Sh/sh" in Latin. When going Latin → Cyrillic, "Sh" defaults to Ш. 
70 | MN_CYR_TO_LAT_DICT['Щ'] = 'Sh' 71 | MN_CYR_TO_LAT_DICT['щ'] = 'sh' 72 | 73 | MN_LAT_TO_CYR_DICT = dict([(l, c) for c, l in zip(MN_CYR_LAT_LIST[-2::-2], MN_CYR_LAT_LIST[-1::-2])]) 74 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/bg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Bulgarian (bg) transliteration mappings. 4 | 5 | ISO 639-1 language code: bg 6 | 7 | Supports accented I with grave for stress marking and homograph disambiguation. 8 | Following ISO 9:1995. 9 | """ 10 | 11 | import copy 12 | from .ru import RU_CYR_TO_LAT_DICT 13 | 14 | # Transliterate from Bulgarian Cyrillic to Latin 15 | BG_CYR_TO_LAT_DICT = copy.deepcopy(RU_CYR_TO_LAT_DICT) 16 | 17 | # There are a couple of letters that don't exist in Bulgarian: 18 | del BG_CYR_TO_LAT_DICT[u"Ё"] 19 | del BG_CYR_TO_LAT_DICT[u"ё"] 20 | del BG_CYR_TO_LAT_DICT[u"Ы"] 21 | del BG_CYR_TO_LAT_DICT[u"ы"] 22 | del BG_CYR_TO_LAT_DICT[u"Э"] 23 | del BG_CYR_TO_LAT_DICT[u"э"] 24 | 25 | # Some letters that are pronounced differently 26 | BG_CYR_TO_LAT_DICT[u"Й"] = u"Y" 27 | BG_CYR_TO_LAT_DICT[u"й"] = u"y" 28 | BG_CYR_TO_LAT_DICT[u"Х"] = u"H" 29 | BG_CYR_TO_LAT_DICT[u"х"] = u"h" 30 | BG_CYR_TO_LAT_DICT[u"Ц"] = u"TS" 31 | BG_CYR_TO_LAT_DICT[u"ц"] = u"ts" 32 | BG_CYR_TO_LAT_DICT[u"Щ"] = u"SHT" 33 | BG_CYR_TO_LAT_DICT[u"щ"] = u"sht" 34 | BG_CYR_TO_LAT_DICT[u"Ю"] = u"YU" 35 | BG_CYR_TO_LAT_DICT[u"ю"] = u"yu" 36 | BG_CYR_TO_LAT_DICT[u"Я"] = u"YA" 37 | BG_CYR_TO_LAT_DICT[u"я"] = u"ya" 38 | 39 | # The following letters use the pre-2012 "Andreichin" system for lettering, 40 | # because in the newest "Ivanov" system "a" and "y" translate to two Bulgarian 41 | # letters and choosing to which one depends on the word and text context 42 | # https://en.wikipedia.org/wiki/Romanization_of_Bulgarian 43 | BG_CYR_TO_LAT_DICT[u"Ъ"] = u"Ă" 44 | BG_CYR_TO_LAT_DICT[u"ъ"] = u"ă" 45 | 
BG_CYR_TO_LAT_DICT[u"Ь"] = u"J" 46 | BG_CYR_TO_LAT_DICT[u"ь"] = u"j" 47 | 48 | # Transliterate from Latin Bulgarian to Cyrillic. 49 | # Build this BEFORE adding accented Cyrillic characters to avoid reverse mapping conflicts 50 | BG_LAT_TO_CYR_DICT = {y: x for x, y in iter(BG_CYR_TO_LAT_DICT.items())} 51 | 52 | # Accented vowels with grave accent (used for stress marking and homograph disambiguation) 53 | # Following ISO 9:1995 54 | # Source: https://en.wikipedia.org/wiki/I_with_grave_(Cyrillic) 55 | # Used to distinguish: ѝ (her) vs и (and) 56 | # 57 | # By default (preserve_accents=False), accented Cyrillic maps to unaccented Latin 58 | BG_CYR_TO_LAT_DICT[u"Ѝ"] = u"I" # Cyrillic I with grave → I (U+040D) 59 | BG_CYR_TO_LAT_DICT[u"ѝ"] = u"i" # Cyrillic i with grave → i (U+045D) 60 | 61 | # Accented map: When preserve_accents=True, these override the standard mappings 62 | BG_CYR_TO_LAT_ACCENTED_DICT = { 63 | u"Ѝ": u"Ì", # Cyrillic I with grave → Ì 64 | u"ѝ": u"ì", # Cyrillic i with grave → ì 65 | } 66 | 67 | BG_LAT_TO_CYR_DICT.update({ 68 | u"ZH": u"Ж", u"Zh": u"Ж", u"zh": u"ж", 69 | u"TS": u"Ц", u"Ts": u"Ц", u"ts": u"ц", 70 | u"CH": u"Ч", u"Ch": u"Ч", u"ch": u"ч", 71 | u"SH": u"Ш", u"Sh": u"Ш", u"sh": u"ш", 72 | u"SHT": u"Щ", u"Sht": u"Щ", u"sht": u"щ", 73 | u"YU": u"Ю", u"Yu": u"Ю", u"yu": u"ю", 74 | u"YA": u"Я", u"Ya": u"Я", u"ya": u"я", 75 | # Accented Latin to unaccented Cyrillic (preserve_accents=False) 76 | u"Ì": u"И", u"ì": u"и", # Latin I with grave → Cyrillic I 77 | }) 78 | 79 | # Accented map for Latin→Cyrillic: When preserve_accents=True, these override 80 | BG_LAT_TO_CYR_ACCENTED_DICT = { 81 | u"Ì": u"Ѝ", u"ì": u"ѝ", # Latin I with grave → Cyrillic Ѝ 82 | } 83 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Cyrillic transliteration mapping package. 
4 | 5 | This package contains transliteration mappings for various Cyrillic and Greek scripts. 6 | Each language has its own module with specific transliteration dictionaries. 7 | 8 | Supported languages: 9 | - bg: Bulgarian 10 | - by: Belarusian 11 | - el: Greek 12 | - me: Montenegrin 13 | - mk: Macedonian 14 | - mn: Mongolian 15 | - rs: Serbian (ISO 3166-1 country code alias) 16 | - ru: Russian 17 | - sr: Serbian 18 | - tj: Tajik 19 | - ua: Ukrainian 20 | """ 21 | 22 | # Import all language-specific mappings 23 | from .sr import SR_CYR_TO_LAT_DICT, SR_LAT_TO_CYR_DICT 24 | from .me import ME_CYR_TO_LAT_DICT, ME_LAT_TO_CYR_DICT 25 | from .mk import ( 26 | MK_CYR_TO_LAT_DICT, 27 | MK_LAT_TO_CYR_DICT, 28 | MK_CYR_TO_LAT_ACCENTED_DICT, 29 | MK_LAT_TO_CYR_ACCENTED_DICT 30 | ) 31 | from .ru import RU_CYR_TO_LAT_DICT, RU_LAT_TO_CYR_DICT 32 | from .tj import TJ_CYR_TO_LAT_DICT, TJ_LAT_TO_CYR_DICT 33 | from .bg import ( 34 | BG_CYR_TO_LAT_DICT, 35 | BG_LAT_TO_CYR_DICT, 36 | BG_CYR_TO_LAT_ACCENTED_DICT, 37 | BG_LAT_TO_CYR_ACCENTED_DICT 38 | ) 39 | from .ua import UA_CYR_TO_LAT_DICT, UA_LAT_TO_CYR_DICT 40 | from .by import BY_CYR_TO_LAT_DICT, BY_LAT_TO_CYR_DICT 41 | from .mn import MN_CYR_TO_LAT_DICT, MN_LAT_TO_CYR_DICT 42 | from .el import EL_GRE_TO_LAT_DICT, EL_LAT_TO_GRE_DICT 43 | 44 | # Bundle up all the dictionaries in a lookup dictionary 45 | TRANSLIT_DICT = { 46 | 'sr': { # Serbian (ISO 639-1 language code) 47 | 'tolatin': SR_CYR_TO_LAT_DICT, 48 | 'tocyrillic': SR_LAT_TO_CYR_DICT 49 | }, 50 | 'rs': { # Serbian (ISO 3166-1 country code alias) 51 | 'tolatin': SR_CYR_TO_LAT_DICT, 52 | 'tocyrillic': SR_LAT_TO_CYR_DICT 53 | }, 54 | 'me': { # Montenegro 55 | 'tolatin': ME_CYR_TO_LAT_DICT, 56 | 'tocyrillic': ME_LAT_TO_CYR_DICT 57 | }, 58 | 'mk': { # Macedonia 59 | 'tolatin': MK_CYR_TO_LAT_DICT, 60 | 'tocyrillic': MK_LAT_TO_CYR_DICT, 61 | 'tolatin_accented': MK_CYR_TO_LAT_ACCENTED_DICT, 62 | 'tocyrillic_accented': MK_LAT_TO_CYR_ACCENTED_DICT 63 | }, 64 | 'ru': { # Russian 65 
| 'tolatin': RU_CYR_TO_LAT_DICT, 66 | 'tocyrillic': RU_LAT_TO_CYR_DICT 67 | }, 68 | 'tj': { # Tajik 69 | 'tolatin': TJ_CYR_TO_LAT_DICT, 70 | 'tocyrillic': TJ_LAT_TO_CYR_DICT 71 | }, 72 | 'bg': { # Bulgarian 73 | 'tolatin': BG_CYR_TO_LAT_DICT, 74 | 'tocyrillic': BG_LAT_TO_CYR_DICT, 75 | 'tolatin_accented': BG_CYR_TO_LAT_ACCENTED_DICT, 76 | 'tocyrillic_accented': BG_LAT_TO_CYR_ACCENTED_DICT 77 | }, 78 | 'ua': { # Ukrainian 79 | 'tolatin': UA_CYR_TO_LAT_DICT, 80 | 'tocyrillic': UA_LAT_TO_CYR_DICT 81 | }, 82 | 'by': { # Belarusian 83 | 'tolatin': BY_CYR_TO_LAT_DICT, 84 | 'tocyrillic': BY_LAT_TO_CYR_DICT 85 | }, 86 | 'mn': { # Mongolian 87 | 'tolatin': MN_CYR_TO_LAT_DICT, 88 | 'tocyrillic': MN_LAT_TO_CYR_DICT 89 | }, 90 | 'el': { # Greek (ISO 639-1 language code) 91 | 'tolatin': EL_GRE_TO_LAT_DICT, 92 | 'tocyrillic': EL_LAT_TO_GRE_DICT 93 | } 94 | } 95 | 96 | # Export the main dictionary for backward compatibility 97 | __all__ = ['TRANSLIT_DICT'] 98 | -------------------------------------------------------------------------------- /cyrtranslit/mapping/mk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Macedonian (mk) transliteration mappings. 4 | 5 | ISO 639-1 language code: mk 6 | 7 | Supports accented vowels with grave accent used for homograph disambiguation: 8 | - Ѐ/ѐ (U+0400/U+0450) - IE with grave 9 | - Ѝ/ѝ (U+040D/U+045D) - I with grave 10 | 11 | Following ISO 9:1968/1995, adopted by Macedonian Academy of Arts and Sciences (1970). 12 | """ 13 | 14 | import copy 15 | from .sr import SR_CYR_TO_LAT_DICT 16 | 17 | # Build the dictionaries to transliterate Macedonian Cyrillic to Latin and vice versa. 18 | MK_CYR_TO_LAT_DICT = copy.deepcopy(SR_CYR_TO_LAT_DICT) 19 | 20 | # Differences with Serbian: 21 | # 1) Between Ze (З з) and I (И и) is the letter Dze (Ѕ ѕ), which looks like the Latin letter S and represents /d͡z/. 
22 | MK_CYR_TO_LAT_DICT[u'Ѕ'] = u'Dz' 23 | MK_CYR_TO_LAT_DICT[u'ѕ'] = u'dz' 24 | 25 | # 2) Dje (Ђ ђ) is replaced by Gje (Ѓ ѓ), which represents /ɟ/ (voiced palatal stop). 26 | # In some dialects, it represents /d͡ʑ/ instead, like Dje 27 | # It is written ⟨Ǵ ǵ⟩ in the corresponding Macedonian Latin alphabet. 28 | del MK_CYR_TO_LAT_DICT[u'Ђ'] 29 | del MK_CYR_TO_LAT_DICT[u'ђ'] 30 | MK_CYR_TO_LAT_DICT[u'Ѓ'] = u'Ǵ' 31 | MK_CYR_TO_LAT_DICT[u'ѓ'] = u'ǵ' 32 | 33 | # 3) Tshe (Ћ ћ) is replaced by Kje (Ќ ќ), which represents /c/ (voiceless palatal stop). 34 | # In some dialects, it represents /t͡ɕ/ instead, like Tshe. 35 | # It is written ⟨Ḱ ḱ⟩ in the corresponding Macedonian Latin alphabet. 36 | del MK_CYR_TO_LAT_DICT[u'Ћ'] 37 | del MK_CYR_TO_LAT_DICT[u'ћ'] 38 | MK_CYR_TO_LAT_DICT[u'Ќ'] = u'Ḱ' 39 | MK_CYR_TO_LAT_DICT[u'ќ'] = u'ḱ' 40 | 41 | # This dictionary is to transliterate from Macedonian Latin to Cyrillic. 42 | # Build this BEFORE adding accented Cyrillic characters to avoid reverse mapping conflicts 43 | MK_LAT_TO_CYR_DICT = {y: x for x, y in iter(MK_CYR_TO_LAT_DICT.items())} 44 | 45 | # 4) Accented vowels with grave accent (used to disambiguate homographs in Macedonian) 46 | # Following ISO 9:1968/1995, adopted by Macedonian Academy of Arts and Sciences in 1970 47 | # Source: https://en.wikipedia.org/wiki/I_with_grave_(Cyrillic) 48 | # These are used to distinguish homographs: 49 | # - ѝ (her) vs и (and) 50 | # - нѐ (us) vs не (no) 51 | # - сѐ (everything) vs се (short reflexive pronoun) 52 | # 53 | # By default (preserve_accents=False), accented Cyrillic maps to unaccented Latin 54 | MK_CYR_TO_LAT_DICT[u'Ѐ'] = u'E' # Cyrillic E with grave → E (U+0400) 55 | MK_CYR_TO_LAT_DICT[u'ѐ'] = u'e' # Cyrillic e with grave → e (U+0450) 56 | MK_CYR_TO_LAT_DICT[u'Ѝ'] = u'I' # Cyrillic I with grave → I (U+040D) 57 | MK_CYR_TO_LAT_DICT[u'ѝ'] = u'i' # Cyrillic i with grave → i (U+045D) 58 | 59 | # Accented map: When preserve_accents=True, these override the standard mappings 60 | 
MK_CYR_TO_LAT_ACCENTED_DICT = { 61 | u'Ѐ': u'È', # Cyrillic E with grave → È 62 | u'ѐ': u'è', # Cyrillic e with grave → è 63 | u'Ѝ': u'Ì', # Cyrillic I with grave → Ì 64 | u'ѝ': u'ì', # Cyrillic i with grave → ì 65 | } 66 | 67 | # Add mappings for accented Latin to unaccented Cyrillic (preserve_accents=False) 68 | MK_LAT_TO_CYR_DICT.update({ 69 | u'È': u'Е', u'è': u'е', # Latin E with grave → Cyrillic E 70 | u'Ì': u'И', u'ì': u'и', # Latin I with grave → Cyrillic I 71 | }) 72 | 73 | # Accented map for Latin→Cyrillic: When preserve_accents=True, these override 74 | MK_LAT_TO_CYR_ACCENTED_DICT = { 75 | u'È': u'Ѐ', u'è': u'ѐ', # Latin E with grave → Cyrillic Ѐ 76 | u'Ì': u'Ѝ', u'ì': u'ѝ', # Latin I with grave → Cyrillic Ѝ 77 | } 78 | -------------------------------------------------------------------------------- /cyrtranslit/cyrtranslit.py: -------------------------------------------------------------------------------- 1 | import cyrtranslit 2 | from cyrtranslit.mapping import TRANSLIT_DICT 3 | from argparse import ArgumentParser, FileType 4 | import os 5 | import sys 6 | 7 | def __is_valid_language_code(parse, arg): 8 | ''' Validates inputted two-letter language code. 9 | :param parse: The argument parser. Used to display error message. 10 | :param arg: The language code argument. 11 | ''' 12 | if arg.lower() not in TRANSLIT_DICT: 13 | parser.error("The language code %s is not supported. Support language codes are: %s." % (arg, ", ".join(TRANSLIT_DICT.keys()).upper())) 14 | else: 15 | return arg 16 | 17 | def main(): 18 | # Setup argument parser 19 | parser = ArgumentParser(description="Transiliterate text in a given file.") 20 | 21 | # Input file. 22 | # Not required. 23 | parser.add_argument("-i", dest="input_file", required=False, 24 | help="input file", 25 | default=None) 26 | 27 | # Output file. 28 | # Not required. If not specified, transliteration will appear as console output. 
29 | parser.add_argument("-o", dest="output_file", required=False, 30 | help="ouput file", 31 | default=None) 32 | 33 | # Language code for cyrillic text in inputted file. 34 | # Required. 35 | parser.add_argument("-l", dest="language_code", required=True, 36 | help="two-letter ISO 639-1 language code of cyrillic text", 37 | type=lambda x: __is_valid_language_code(parser, x)) 38 | 39 | # Flag for reverse transliteration, i.e. from latin/roman alphabet to cyrillic. 40 | parser.add_argument("-c", dest="to_cyrillic", action='store_true', 41 | help="Parse latin characters to cyrillic (reverse of transliteration)") 42 | 43 | # Flag to preserve accent marks in transliteration 44 | parser.add_argument("-p", "--preserve-accents", dest="preserve_accents", action='store_true', 45 | help="Preserve accent marks (e.g., Macedonian/Bulgarian Ѐ→È, ѝ→ì instead of Ѐ→E, ѝ→i)") 46 | 47 | # Input file encoding. 48 | # Not required. Defaults to utf-8 with fallback to common Cyrillic encodings. 49 | parser.add_argument("-e", "--encoding", dest="encoding", required=False, 50 | help="input file encoding (default: utf-8 with automatic fallback to windows-1251, iso-8859-5, koi8-r, cp866)", 51 | default="utf-8") 52 | 53 | # Parse arguments 54 | args = parser.parse_args() 55 | 56 | # Fetch arguments. 
57 | lang_code = args.language_code 58 | to_cyrillic = args.to_cyrillic 59 | preserve_accents = args.preserve_accents 60 | encoding = args.encoding 61 | 62 | # Open input file with proper encoding handling 63 | if args.input_file: 64 | # Try specified encoding first 65 | file_input = None 66 | tried_encodings = [encoding] 67 | 68 | # Helper function to test if an encoding works 69 | def try_encoding(filepath, enc): 70 | try: 71 | f = open(filepath, 'r', encoding=enc) 72 | # Try to read the file to actually test the encoding 73 | f.read() 74 | # If successful, reopen from the beginning 75 | f.close() 76 | return open(filepath, 'r', encoding=enc) 77 | except (UnicodeDecodeError, LookupError): 78 | if f: 79 | f.close() 80 | return None 81 | 82 | file_input = try_encoding(args.input_file, encoding) 83 | 84 | if file_input is None: 85 | # If specified encoding fails, try common Cyrillic encodings as fallback 86 | fallback_encodings = ['windows-1251', 'iso-8859-5', 'koi8-r', 'cp866'] 87 | 88 | for fallback_enc in fallback_encodings: 89 | if fallback_enc == encoding: 90 | continue # Already tried this one 91 | tried_encodings.append(fallback_enc) 92 | file_input = try_encoding(args.input_file, fallback_enc) 93 | if file_input is not None: 94 | print(f"Warning: Failed to decode with {encoding}, using {fallback_enc} instead.", file=sys.stderr) 95 | break 96 | 97 | if file_input is None: 98 | print(f"Error: Unable to decode file with any of the attempted encodings: {', '.join(tried_encodings)}", file=sys.stderr) 99 | print(f"Try specifying the correct encoding with -e/--encoding parameter.", file=sys.stderr) 100 | print(f"Common Cyrillic encodings: windows-1251, iso-8859-5, koi8-r, cp866", file=sys.stderr) 101 | sys.exit(1) 102 | else: 103 | file_input = sys.stdin 104 | 105 | # Open output file 106 | if args.output_file: 107 | file_output = open(args.output_file, 'w', encoding='utf-8') 108 | else: 109 | file_output = sys.stdout 110 | 111 | # Transliterate and write directly 
to output line by line 112 | try: 113 | for line in file_input: 114 | if to_cyrillic is True: 115 | file_output.write(cyrtranslit.to_cyrillic(line, lang_code=lang_code, preserve_accents=preserve_accents)) 116 | else: 117 | file_output.write(cyrtranslit.to_latin(line, lang_code=lang_code, preserve_accents=preserve_accents)) 118 | finally: 119 | # Close streams if they're not stdin/stdout 120 | if args.input_file and file_input: 121 | file_input.close() 122 | if args.output_file and file_output: 123 | file_output.close() 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /cyrtranslit/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .mapping import TRANSLIT_DICT 3 | import sys 4 | 5 | def __encode_utf8(_string): 6 | if sys.version_info < (3, 0): 7 | return _string.encode('utf-8') 8 | else: 9 | return _string 10 | 11 | def __decode_utf8(_string): 12 | if sys.version_info < (3, 0): 13 | return _string.decode('utf-8') 14 | else: 15 | return _string 16 | 17 | def to_latin(string_to_transliterate, lang_code='sr', preserve_accents=False): 18 | ''' Transliterate cyrillic string of characters to latin string of characters. 19 | :param string_to_transliterate: The cyrillic string to transliterate into latin characters. 20 | :param lang_code: Indicates the cyrillic language code we are translating from. Defaults to Serbian (sr). 21 | :param preserve_accents: If False (default), uses standard mappings (accented Cyrillic → unaccented Latin, e.g., Ѐ→E, ѝ→i). 22 | If True, merges accented mappings (accented Cyrillic → accented Latin, e.g., Ѐ→È, ѝ→ì). 23 | :return: A string of latin characters transliterated from the given cyrillic string. 24 | ''' 25 | 26 | # First check if we support the cyrillic alphabet we want to transliterate to latin. 
27 | if lang_code.lower() not in TRANSLIT_DICT: 28 | # If we don't support it, then just return the original string. 29 | return string_to_transliterate 30 | 31 | # If we do support it, check if the implementation is not missing before proceeding. 32 | elif not TRANSLIT_DICT[lang_code.lower()]['tolatin']: 33 | return string_to_transliterate 34 | 35 | # Everything checks out, proceed with transliteration. 36 | else: 37 | 38 | # Get the character per character transliteration dictionary 39 | transliteration_dict = TRANSLIT_DICT[lang_code.lower()]['tolatin'].copy() 40 | 41 | # If preserve_accents=True and accented mappings exist, merge them (accented overrides standard) 42 | if preserve_accents and 'tolatin_accented' in TRANSLIT_DICT[lang_code.lower()]: 43 | transliteration_dict.update(TRANSLIT_DICT[lang_code.lower()]['tolatin_accented']) 44 | 45 | # Initialize the output latin string variable 46 | latinized_str = '' 47 | 48 | # Transliterate by traversing the input string character by character. 49 | string_to_transliterate = __decode_utf8(string_to_transliterate) 50 | 51 | 52 | for c in string_to_transliterate: 53 | 54 | # If character is in dictionary, it means it's a cyrillic so let's transliterate that character. 55 | if c in transliteration_dict: 56 | # Transliterate current character. 57 | latinized_str += transliteration_dict[c] 58 | 59 | # If character is not in character transliteration dictionary, 60 | # it is most likely a number or a special character so just keep it. 61 | else: 62 | latinized_str += c 63 | 64 | # Return the transliterated string. 65 | return __encode_utf8(latinized_str) 66 | 67 | 68 | def to_cyrillic(string_to_transliterate, lang_code='sr', preserve_accents=False): 69 | ''' Transliterate latin string of characters to cyrillic string of characters. 70 | :param string_to_transliterate: The latin string to transliterate into cyrillic characters. 71 | :param lang_code: Indicates the cyrillic language code we are translating to. 
Defaults to Serbian (sr). 72 | :param preserve_accents: If False (default), uses standard mappings (accented Latin → unaccented Cyrillic, e.g., È→Е, ì→и). 73 | If True, merges accented mappings (accented Latin → accented Cyrillic, e.g., È→Ѐ, ì→ѝ). 74 | :return: A string of cyrillic characters transliterated from the given latin string. 75 | ''' 76 | 77 | # First check if we support the cyrillic alphabet we want to transliterate to latin. 78 | if lang_code.lower() not in TRANSLIT_DICT: 79 | # If we don't support it, then just return the original string. 80 | return string_to_transliterate 81 | 82 | # If we do support it, check if the implementation is not missing before proceeding. 83 | elif not TRANSLIT_DICT[lang_code.lower()]['tocyrillic']: 84 | return string_to_transliterate 85 | 86 | else: 87 | # Get the character per character transliteration dictionary 88 | transliteration_dict = TRANSLIT_DICT[lang_code.lower()]['tocyrillic'].copy() 89 | 90 | # If preserve_accents=True and accented mappings exist, merge them (accented overrides standard) 91 | if preserve_accents and 'tocyrillic_accented' in TRANSLIT_DICT[lang_code.lower()]: 92 | transliteration_dict.update(TRANSLIT_DICT[lang_code.lower()]['tocyrillic_accented']) 93 | 94 | # Initialize the output cyrillic string variable 95 | cyrillic_str = '' 96 | 97 | string_to_transliterate = __decode_utf8(string_to_transliterate) 98 | 99 | # Transliterate by traversing the inputted string character by character. 100 | length_of_string_to_transliterate = len(string_to_transliterate) 101 | index = 0 102 | 103 | while index < length_of_string_to_transliterate: 104 | # Grab a character from the string at the current index 105 | c = string_to_transliterate[index] 106 | 107 | # Watch out for Lj and lj. Don't want to interpret Lj/lj as L/l and j. 108 | # Watch out for Nj and nj. Don't want to interpret Nj/nj as N/n and j. 109 | # Watch out for Dž and and dž. Don't want to interpret Dž/dž as D/d and j. 
110 | c_plus_1 = u'' 111 | if index != length_of_string_to_transliterate - 1: 112 | c_plus_1 = string_to_transliterate[index + 1] 113 | 114 | c_plus_2 = u'' 115 | if index + 2 <= length_of_string_to_transliterate - 1: 116 | c_plus_2 = string_to_transliterate[index + 2] 117 | 118 | if ((c == u'L' or c == u'l') and c_plus_1 == u'j') or \ 119 | ((c == u'N' or c == u'n') and c_plus_1 == u'j') or \ 120 | ((c == u'D' or c == u'd') and c_plus_1 == u'ž') or \ 121 | (lang_code == 'mk' and (c == u'D' or c == u'd') and c_plus_1 == u'z') or \ 122 | (lang_code == 'bg' and ( 123 | (c in u'Zz' and c_plus_1 in u'Hh') or # Zh, zh 124 | (c in u'Tt' and c_plus_1 in u'Ss') or # Ts, ts 125 | (c in u'Ss' and c_plus_1 in u'Hh') or # Sh, sh (and also covers Sht, sht) 126 | (c in u'Cc' and c_plus_1 in u'Hh') or # Ch, ch 127 | (c in u'Yy' and c_plus_1 in u'Uu') or # Yu, yu 128 | (c in u'Yy' and c_plus_1 in u'Aa') # Ya, ya 129 | )) or \ 130 | (lang_code == 'ru' and ( 131 | (c in u'Cc' and c_plus_1 in u'HhKkZz') or # c, ch, ck, cz 132 | (c in u'Tt' and c_plus_1 in u'Hh') or # th 133 | (c in u'Ww' and c_plus_1 in u'Hh') or # wh 134 | (c in u'Pp' and c_plus_1 in u'Hh') or # ph 135 | (c in u'Ee' and c_plus_1 == u'\'') or # e' 136 | 137 | (c == u'i' and c_plus_1 == u'y' and 138 | string_to_transliterate[index + 2:index + 3] not in u'aou') or # iy[^AaOoUu] 139 | (c in u'Jj' and c_plus_1 in u'UuAaEeIiOo') or # j, ju, ja, je, ji, jo 140 | (c in u'Ss' and c_plus_1 in u'HhZz') or # s, sh, sz 141 | (c in u'Yy' and c_plus_1 in u'AaOoUuEeIi\'') or # y, ya, yo, yu, ye, yi, y' 142 | (c in u'Zz' and c_plus_1 in u'Hh') or # z, zh 143 | (c == u'\'' and c_plus_1 == u'\'') # '' 144 | )) or \ 145 | (lang_code == 'ua' and ( 146 | (c in u'Jj' and c_plus_1 in u'eEaAuUiI') or # je, ja, ju 147 | (c in u'Šš' and c_plus_1 in u'č') # šč 148 | )) or \ 149 | (lang_code == 'by' and ( 150 | (c in u'Jj' and c_plus_1 in u'uUaA') or # ju, ja 151 | (c == u'\'' and c_plus_1 == u'\'') # '' for Ьь 152 | )) or \ 153 | (lang_code == 
"mn" and ( 154 | (c in u'Kk' and c_plus_1 in u'Hh') or # Х х 155 | (c in u'Ss' and c_plus_1 in u'Hh') or # Ш ш 156 | (c in u'Tt' and c_plus_1 in u'Ss') or # Ц ц 157 | (c in u'Cc' and c_plus_1 in u'Hh') or # Ч ч 158 | (c in u'Yy' and c_plus_1 in u'EeOoUuAa') # Е Ё Ю Я 159 | )) or \ 160 | (lang_code == "el" and ( 161 | (c in u'Tt' and c_plus_1 in u'Hh') or # Θ θ - Theta 162 | (c in u'Cc' and c_plus_1 in u'Hh') or # Χ χ - Chi 163 | (c in u'Pp' and c_plus_1 in u'Ss') # Ψ ψ - Psi 164 | )): 165 | index += 1 166 | c += c_plus_1 167 | 168 | # In Bulgarian, the letter "щ" is represented by three latin letters: "sht", 169 | # so we need this logic to support the third latin letter 170 | if lang_code == 'bg' and \ 171 | index + 2 <= length_of_string_to_transliterate - 1 and \ 172 | (c == 'sh' or c == 'Sh' or c == 'SH') and \ 173 | string_to_transliterate[index + 1] in u'Tt': 174 | index += 1 175 | c += string_to_transliterate[index] 176 | 177 | # Similarly in Russian, the letter "щ" шы represented by "shh". 178 | if lang_code == 'ru' and \ 179 | index + 2 <= length_of_string_to_transliterate - 1 and \ 180 | (c == u'sh' or c == 'Sh' or c == 'SH') and \ 181 | string_to_transliterate[index + 1] in u'Hh': # shh 182 | index += 1 183 | c += string_to_transliterate[index] 184 | 185 | # In Mongolia the begining of if statement is not the truth 186 | # ((c == u'L' or c == u'l') and c_plus_1 == u'j') or \ 187 | # ((c == u'N' or c == u'n') and c_plus_1 == u'j') or \ 188 | # ((c == u'D' or c == u'd') and c_plus_1 == u'ž') or \ 189 | # Sü(nj)idmaa -> Сүнжидмаагаа not Сүnjидмаа 190 | # I add post-processing , wonder if @georgeslabreche would like to change the old code, thx 191 | if lang_code == 'mn' and c in [u'Lj', u'lj', u'Nj', u'nj']: 192 | index -= 1 193 | c = c[:-1] 194 | 195 | # If character is in dictionary, it means it's a cyrillic so let's transliterate that character. 
196 | if c in transliteration_dict: 197 | # ay, ey, iy, oy, uy 198 | if lang_code == 'ru' and c in u'Yy' and \ 199 | cyrillic_str and cyrillic_str[-1].lower() in u"аеиоуэя": 200 | cyrillic_str += u"й" if c == u'y' else u"Й" 201 | else: 202 | # Transliterate current character. 203 | cyrillic_str += transliteration_dict[c] 204 | 205 | # If character is not in character transliteration dictionary, 206 | # it is most likely a number or a special character so just keep it. 207 | else: 208 | cyrillic_str += c 209 | 210 | index += 1 211 | 212 | return __encode_utf8(cyrillic_str) 213 | 214 | 215 | def supported(): 216 | ''' Returns list of supported languages, sorted alphabetically. 217 | :return: 218 | ''' 219 | return sorted(TRANSLIT_DICT.keys()) 220 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17663256.svg)](https://doi.org/10.5281/zenodo.17663256) 2 | 3 | ## What is CyrTranslit? 4 | 5 | A Python package for bi-directional transliteration of Cyrillic script to Latin script and vice versa. 6 | 7 | By default, transliterates for the Serbian language. A language flag can be set in order to transliterate to and from Belarusian, Bulgarian, Greek, Montenegrin, Macedonian, Mongolian, Russian, Serbian, Tajik, and Ukrainian. 8 | 9 | **Note:** Greek is also supported. While Greek uses its own alphabet and is not Cyrillic, it has been included due to user demand and shared transliteration needs. 10 | 11 | ## What is transliteration? 12 | 13 | Transliteration is the conversion of a text from one script to another. For instance, a Latin alphabet transliteration of the Serbian phrase _"Мој ховеркрафт је пун јегуља"_ is _"Moj hoverkraft je pun jegulja"_. 14 | 15 | ## Citation 16 | 17 | A citation would be much appreciated if you use CyrTranslit in a research publication: 18 | 19 | [Georges Labrèche. 
(2025). CyrTranslit (1.2.0). Zenodo. https://doi.org/10.5281/zenodo.17663256](https://doi.org/10.5281/zenodo.17663256) 20 | 21 | BibTex entry: 22 | ```bibtex 23 | @software{georges_labreche_nov2025, 24 | author = {Georges Labrèche}, 25 | title = {CyrTranslit}, 26 | month = nov, 27 | year = 2025, 28 | note = {{A Python package for bi-directional 29 | transliteration of Cyrillic script to Latin script 30 | and vice versa. Supports transliteration for Belarusian, 31 | Bulgarian, Greek, Montenegrin, Macedonian, Mongolian, 32 | Russian, Serbian, Tajik, and Ukrainian.}}, 33 | publisher = {Zenodo}, 34 | version = {1.2.0}, 35 | doi = {10.5281/zenodo.17663256}, 36 | url = {https://doi.org/10.5281/zenodo.17663256} 37 | } 38 | ``` 39 | 40 | 41 | ## Advancing research 42 | 43 | CyrTranslit is actively used as a reliable tool to advance research! Here's an incomplete list of publications for research projects that have relied on CyrTranslit: 44 | 45 | ### Text Normalization, Unicode Perturbations & Robustness 46 | 47 | - Cooper, Portia, Blanco, Eduardo, and Surdeanu, Mihai. (2025). "[The Lies Characters Tell: Utilizing Large Language Models to Normalize Adversarial Unicode Perturbations](https://aclanthology.org/2025.findings-acl.969.pdf)," *Findings of the Association for Computational Linguistics: ACL 2025*. 48 | 49 | - Cooper, Portia, Surdeanu, Mihai, and Blanco, Eduardo. (2023). "[Hiding in Plain Sight: Tweets with Hate Speech Masked by Homoglyphs](https://aclanthology.org/2023.findings-emnlp.192.pdf)," *Findings of the Association for Computational Linguistics: EMNLP 2023*. 50 | 51 | 52 | ### Low-Resource NLP & Machine Translation 53 | 54 | - Cvetanović, Aleksa and Tadić, Predrag. (2024). "[Synthetic Dataset Creation and Fine-Tuning of Transformer Models for Question Answering in Serbian](https://arxiv.org/pdf/2404.08617)," arXiv:2404.08617. 55 | 56 | - Lakew, Surafel Melaku. (2020). 
"[Multilingual Neural Machine Translation for Low Resource Languages](https://surafelml.github.io/phd-thesis/)," PhD Thesis, University of Trento. 57 | 58 | - Filo, Denis. (2020). "[Neuronový strojový překlad pro jazykové páry s malým množstvím trénovacích dat: Low-Resource Neural Machine Translation](https://www.fit.vut.cz/study/thesis/23087/.en)," Master's Thesis, Brno University of Technology. 59 | 60 | - Lakew, Surafel Melaku, Erofeeva, Aliia, and Federico, Marcello. (2018). "[Neural Machine Translation into Language Varieties](https://aclanthology.org/W18-6316/)," *Proceedings of the Third Conference on Machine Translation (WMT 2018)*. 61 | 62 | 63 | ### Serbian Language NLP (Topic Modeling, Sentiment, Lexicons, QA, Abuse Detection) 64 | 65 | - Medvecki, Darija, Bašaragin, Bojana, Ljajić, Adela, and Milošević, Nikola. (2024). "[Multilingual transformer and BERTopic for short text topic modeling: The case of Serbian](https://doi.org/10.1007/978-3-031-50755-7_16)," *Lecture Notes in Networks and Systems* 872:159-169, Springer. 66 | 67 | - Bogdanović, Miloš, Kocić, Jelena, and Stoimenov, Leonid. (2024). "[SRBerta—A Transformer Language Model for Serbian Cyrillic Legal Texts](https://doi.org/10.3390/info15020074)," *Information* 15(2):74. 68 | 69 | - Košprdić, Miloš, Prodanović, Nikola, Ljajić, Adela, Bašaragin, Bojana, and Milošević, Nikola. (2024). "[From Zero to Hero: Harnessing Transformers for Biomedical Named Entity Recognition in Zero- and Few-shot Contexts](https://doi.org/10.1016/j.artmed.2024.102970)," *Artificial Intelligence in Medicine* 157:102970. 70 | 71 | - Ljajić, Adela, Prodanović, Nikola, Medvecki, Darija, Bašaragin, Bojana, and Mitrović, Jelena. (2022). "[Uncovering the Reasons Behind COVID-19 Vaccine Hesitancy in Serbia: Sentiment-Based Topic Modeling](https://doi.org/10.2196/42261)," *Journal of Medical Internet Research* 24(11):e42261. 72 | 73 | - Ljajić, Adela, Prodanović, Nikola, Medvecki, Darija, Bašaragin, Bojana, and Mitrović, Jelena. 
(2022). "[Topic Modeling Technique on Covid19 Tweets in Serbian](https://www.researchgate.net/publication/364302202_Topic_Modeling_Technique_on_Covid19_Tweets_in_Serbian)," *Proceedings of the 12th International Conference on Information Society and Technology (ICIST 2022)*. 74 | 75 | - Jokic, Danka, Stanković, Ranka, Krstev, Cvetana, and Šandrih Todorović, Branislava. (2021). "[A Twitter Corpus and Lexicon for Abusive Speech Detection in Serbian](https://drops.dagstuhl.de/opus/volltexte/2021/14549/)," *Proceedings of the 3rd Conference on Language, Data and Knowledge (LDK 2021)*. 76 | 77 | - Batanović, Vuk and Nikolic, Bosko. (2019). "[Using Language Technologies to Automate the UNDP Rapid Integrated Assessment Mechanism in Serbian](https://www.researchgate.net/publication/339615659_Using_Language_Technologies_to_Automate_the_UNDP_Rapid_Integrated_Assessment_Mechanism_in_Serbian)," *Proceedings of the Conference on Language Technologies for All (LT4All)*. 78 | 79 | - Ljajić, Adela and Marovac, Ulfeta. (2018). "[Improving sentiment analysis for twitter data by handling negation rules in the Serbian language](http://www.doiserbia.nb.rs/Article.aspx?ID=1820-02141800013L)," *Computer Science and Information Systems* 16(1):13-33. 80 | 81 | 82 | ### NLP Applications for Society, Government, and Political Analysis 83 | 84 | - Paula, Katrin and Scholz, Nele. (2025). "[Where do regimes rally their supporters? The geographical distribution of pro-government mobilization in Russia from February to April 2022](https://www.sciencedirect.com/science/article/pii/S096262982500068X)," *Political Geography* 116:103277. 85 | 86 | 87 | ### Engineering, Software Systems, and Backend Development 88 | 89 | - Alyoshin, S.P., Borodina, E.A., Hafiiak, A.M., Zhabran, I.B., and Kikot, A.S. (2019). 
"[Developing Q-Orca site backend using various Python programming language libraries](https://reposit.nupp.edu.ua/bitstream/PoltNTU/5811/1/ME%26IT_Part%203_P%2048_March%202019_Aleshin_Borodina_Hafiiak_Zhabran_Kikot%20%28pdf.io%29.pdf)," *Modern Engineering and Innovative Technologies* 3(7-3):48-53. 90 | 91 | 92 | ### Proceedings, Collections, and Meta-Documents 93 | 94 | - LDK. (2021). "[Complete Volume: Proceedings of the 3rd Conference on Language, Data and Knowledge (LDK 2021)](http://dagstuhl.sunsite.rwth-aachen.de/volltexte/2021/14535/pdf/oasics-vol093-ldk2021-complete.pdf)," *OASIcs* Vol. 93. 95 | 96 | - Brown, J. M. M., Schmidt, Andreas, and Wierzba, Marta (Eds.). (2019). "[Of trees and birds: A Festschrift for Gisbert Fanselow](https://publishup.uni-potsdam.de/opus4-ubp/frontdoor/deliver/index/docId/42654/file/of_trees_and_birds.pdf)," Universitätsverlag Potsdam. 97 | 98 | 99 | ### Addresses, Geocoding, and NLP 100 | 101 | - Mussylmanbay, Meiirgali. (2022). "[Addresses Standardization and Geocoding using Natural Language Processing](https://nur.nu.edu.kz/handle/123456789/6705)," Master's Thesis, Nazarbayev University. 102 | 103 | 104 | ## How do I install this? 105 | 106 | CyrTranslit is [hosted in the Python Package Index (PyPI)](https://pypi.python.org/pypi/cyrtranslit) so it can be installed using pip: 107 | ``` 108 | python3 -m pip install cyrtranslit # latest version 109 | python3 -m pip install cyrtranslit==1.2.0 # specific version 110 | python3 -m pip install cyrtranslit>=1.2.0 # minimum version 111 | ``` 112 | 113 | ## What languages are supported? 114 | 115 | CyrTranslit currently supports bi-directional transliteration of Belarusian, Bulgarian, Greek, Montenegrin, Macedonian, Mongolian, Russian, Serbian, Tajik, and Ukrainian. 116 | 117 | Language codes are based on ISO 639-1 standards. 
For Serbian, both `sr` (ISO 639-1 language code) and `rs` (ISO 3166-1 country code) are accepted: 118 | ```python 119 | >>> import cyrtranslit 120 | >>> cyrtranslit.supported() 121 | ['bg', 'by', 'el', 'me', 'mk', 'mn', 'rs', 'ru', 'sr', 'tj', 'ua'] 122 | ``` 123 | 124 | ## How do I use this? 125 | 126 | CyrTranslit can be used both programmatically and via the command-line interface. 127 | 128 | ### Programmatically 129 | 130 | #### Belarusian 131 | 132 | ```python 133 | >>> import cyrtranslit 134 | >>> cyrtranslit.to_latin("Прывітанне, свет!", "by") 135 | "Pryvitanne, svet!" 136 | >>> cyrtranslit.to_cyrillic("Pryvitanne, svet!", "by") 137 | "Прывітанне, свет!" 138 | ``` 139 | 140 | #### Bulgarian 141 | 142 | ```python 143 | >>> import cyrtranslit 144 | >>> cyrtranslit.to_latin("Съединението прави силата!", "bg") 145 | "Săedinenieto pravi silata!" 146 | >>> cyrtranslit.to_cyrillic("Săedinenieto pravi silata!", "bg") 147 | "Съединението прави силата!" 148 | ``` 149 | 150 | #### Greek 151 | 152 | ```python 153 | >>> import cyrtranslit 154 | >>> cyrtranslit.to_latin("Το χόβερκραφτ μου είναι γεμάτο χέλια", "el") 155 | "To choverkraft moy einai gemato chelia" 156 | >>> cyrtranslit.to_cyrillic("To choverkraft moy einai gemato chelia", "el") 157 | "Το χόβερκραφτ μου είναι γεμάτο χέλια" 158 | ``` 159 | 160 | #### Montenegrin 161 | 162 | ```python 163 | >>> import cyrtranslit 164 | >>> cyrtranslit.to_latin("Република", "me") 165 | "Republika" 166 | >>> cyrtranslit.to_cyrillic("Republika", "me") 167 | "Република" 168 | ``` 169 | 170 | #### Macedonian 171 | 172 | ```python 173 | >>> import cyrtranslit 174 | >>> cyrtranslit.to_latin("Моето летачко возило е полно со јагули", "mk") 175 | "Moeto letačko vozilo e polno so jaguli" 176 | >>> cyrtranslit.to_cyrillic("Moeto letačko vozilo e polno so jaguli", "mk") 177 | "Моето летачко возило е полно со јагули" 178 | ``` 179 | 180 | #### Mongolian 181 | 182 | ```python 183 | >>> import cyrtranslit 184 | >>> cyrtranslit.to_latin("Амрагаа 
Сүнжидмаагаа гэсээр ирлээ дээ хө-хө-хө", "mn") 185 | "Amragaa Sünjidmaagaa geseer irlee dee khö-khö-khö" 186 | >>> cyrtranslit.to_cyrillic("Amragaa Sünjidmaagaa geseer irlee dee khö-khö-khö", "mn") 187 | "Амрагаа Сүнжидмаагаа гэсээр ирлээ дээ хө-хө-хө" 188 | ``` 189 | 190 | #### Russian 191 | 192 | ```python 193 | >>> import cyrtranslit 194 | >>> cyrtranslit.to_latin("Моё судно на воздушной подушке полно угрей", "ru") 195 | "Moyo sudno na vozdushnoj podushke polno ugrej" 196 | >>> cyrtranslit.to_cyrillic("Moyo sudno na vozdushnoj podushke polno ugrej", "ru") 197 | "Моё судно на воздушной подушке полно угрей" 198 | ``` 199 | 200 | #### Serbian 201 | 202 | ```python 203 | >>> import cyrtranslit 204 | >>> cyrtranslit.to_latin("Мој ховеркрафт је пун јегуља") 205 | "Moj hoverkraft je pun jegulja" 206 | >>> cyrtranslit.to_cyrillic("Moj hoverkraft je pun jegulja") 207 | "Мој ховеркрафт је пун јегуља" 208 | ``` 209 | 210 | #### Tajik 211 | 212 | ```python 213 | >>> import cyrtranslit 214 | >>> cyrtranslit.to_latin("Ман мактуб навишта истодам", "tj") 215 | "Man maktub navišta istodam" 216 | >>> cyrtranslit.to_cyrillic("Man maktub navišta istodam", "tj") 217 | "Ман мактуб навишта истодам" 218 | ``` 219 | 220 | #### Ukrainian 221 | 222 | ```python 223 | >>> import cyrtranslit 224 | >>> cyrtranslit.to_latin("Під лежачий камінь вода не тече", "ua") 225 | "Pid ležačyj kamin' voda ne teče" 226 | >>> cyrtranslit.to_cyrillic("Pid ležačyj kamin' voda ne teče", "ua") 227 | "Під лежачий камінь вода не тече" 228 | ``` 229 | 230 | ### Accented Characters (Macedonian & Bulgarian) 231 | 232 | CyrTranslit supports Cyrillic characters with grave accents used in Macedonian and Bulgarian for homograph disambiguation and stress marking. By default, accents are stripped during transliteration for cleaner output. Use the `preserve_accents` parameter to preserve them. 
233 | 234 | #### Supported Accented Characters 235 | 236 | **Macedonian:** 237 | - **Ѐ/ѐ** (U+0400/U+0450) - Cyrillic IE with grave 238 | - **Purpose:** Distinguishes homographs (e.g., нѐ "us" vs не "no", сѐ "everything" vs се "reflexive pronoun") 239 | - **Standard:** ISO 9:1968/1995, adopted by Macedonian Academy of Arts and Sciences (1970) 240 | 241 | - **Ѝ/ѝ** (U+040D/U+045D) - Cyrillic I with grave 242 | - **Purpose:** Distinguishes homographs (e.g., ѝ "her" vs и "and") 243 | - **Standard:** ISO 9:1968/1995 244 | 245 | **Bulgarian:** 246 | - **Ѝ/ѝ** (U+040D/U+045D) - Cyrillic I with grave 247 | - **Purpose:** Stress marking and homograph disambiguation (e.g., ѝ "her" vs и "and") 248 | - **Standard:** ISO 9:1995 249 | 250 | **Sources:** 251 | - ISO 9:1995 - Information and documentation — Transliteration of Cyrillic characters into Latin characters 252 | - [Wikipedia: I with grave (Cyrillic)](https://en.wikipedia.org/wiki/I_with_grave_(Cyrillic)) 253 | - [Wikipedia: Ye with grave](https://en.wikipedia.org/wiki/Ye_with_grave) 254 | 255 | #### Usage Examples 256 | 257 | **Default behavior (accents stripped):** 258 | 259 | ```python 260 | >>> import cyrtranslit 261 | >>> cyrtranslit.to_latin("ѝ је", "mk") 262 | "i je" 263 | >>> cyrtranslit.to_latin("нѐ сме", "mk") 264 | "ne sme" 265 | >>> cyrtranslit.to_cyrillic("i je", "mk") 266 | "и је" 267 | ``` 268 | 269 | **With accents preserved:** 270 | 271 | ```python 272 | >>> import cyrtranslit 273 | >>> cyrtranslit.to_latin("ѝ је", "mk", preserve_accents=True) 274 | "ì je" 275 | >>> cyrtranslit.to_latin("нѐ сме", "mk", preserve_accents=True) 276 | "nè sme" 277 | >>> cyrtranslit.to_cyrillic("ì je", "mk", preserve_accents=True) 278 | "ѝ је" 279 | >>> cyrtranslit.to_cyrillic("nè sme", "mk", preserve_accents=True) 280 | "нѐ сме" 281 | ``` 282 | 283 | **Command-line usage:** 284 | 285 | ```bash 286 | # Default (accents stripped) 287 | $ echo "ѝ је" | cyrtranslit -l mk 288 | i je 289 | 290 | # Preserve accents 291 | $ echo "ѝ 
је" | cyrtranslit -l mk --preserve-accents 292 | ì je 293 | ``` 294 | 295 | ## Command Line Interface 296 | 297 | Sample command line call to transliterate a Russian text file: 298 | ```bash 299 | $ cyrtranslit -l RU -i tests/ru.txt -o tests/output.txt 300 | ``` 301 | 302 | Use the -c argument to accomplish the reverse, that is, to input Latin characters and output Cyrillic. 303 | 304 | Use the -h argument for help. 305 | 306 | You can also omit the input and output files and use standard input/output: 307 | ```bash 308 | $ echo 'Мој ховеркрафт је пун јегуља' | cyrtranslit -l sr 309 | Moj hoverkraft je pun jegulja 310 | $ echo 'Moj hoverkraft je pun jegulja' | cyrtranslit -l sr -c 311 | Мој ховеркрафт је пун јегуља 312 | ``` 313 | 314 | ### File Encodings 315 | 316 | By default, input files are expected to be UTF-8. For files with different encodings, use the `-e/--encoding` parameter: 317 | 318 | ```bash 319 | $ cyrtranslit -l BG -i file.txt -e windows-1251 320 | ``` 321 | 322 | If no encoding is specified and decoding fails with the default UTF-8, then CyrTranslit automatically tries the following common Cyrillic encodings: windows-1251, iso-8859-5, koi8-r, and cp866. 323 | 324 | Try CyrTranslit by running it directly on the Python command line interface, e.g.: 325 | ```python 326 | >>> import sys 327 | >>> import cyrtranslit.cyrtranslit 328 | >>> sys.argv.extend(['-l', 'UA']) 329 | >>> sys.argv.extend(['-i', 'tests/ua.txt']) 330 | >>> sys.argv.extend(['-o', 'tests/output.txt']) 331 | >>> cyrtranslit.cyrtranslit.main() 332 | >>> exit() 333 | ``` 334 | 335 | 336 | ## How can I contribute? 337 | 338 | Include support for other Cyrillic script alphabets. Follow these steps in order to do so: 339 | 340 | 1. 
Create a new transliteration mapping file in the **[mapping/](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping/)** directory (using the language code as the filename, e.g., `xx.py`) and reference it in the _**[TRANSLIT\_DICT](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping/__init__.py)**_ dictionary in **mapping/\_\_init\_\_.py**. If the language uses accented characters (like Macedonian and Bulgarian), create separate accented dictionaries (e.g., `XX_CYR_TO_LAT_ACCENTED_DICT`) following the pattern in **[mk.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping/mk.py)** or **[bg.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping/bg.py)**. 341 | 2. Watch out for cases where two consecutive Latin alphabet letters are meant to transliterate into a single Cyrillic script letter. These cases need to be explicitly checked for inside the **to_cyrillic()** function in **[\_\_init\_\_.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/__init__.py)**. 342 | 3. Add test cases in **[tests.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/tests.py)**. 343 | 4. Add test CLI input files in the **[tests](https://github.com/opendatakosovo/cyrillic-transliteration/tree/master/tests)** directory. 344 | 5. Update the documentation in the **[README.md](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/README.md)**. 345 | 6. List yourself as one of the contributors. 346 | 347 | Before tagging a release version and deploying to [PyPI](https://pypi.org/): 348 | 349 | 1. Update the `version` and `download_url` properties in [setup.py](https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/setup.py). 350 | 2. 
[Reserve a Zenodo DOI](https://cassgvp.github.io/github-for-collaborative-documentation/docs/tut/6-Zenodo-integration.html) for the release and update this readme's Zenodo badge and [citation instructions](https://github.com/opendatakosovo/cyrillic-transliteration#citation). 351 | 352 | A big thank you to everyone who contributed: 353 | 354 | - Bulgarian 🇧🇬: [@Syndamia](https://github.com/Syndamia) and [@Sparkycz](https://github.com/Sparkycz). 355 | - Russian 🇷🇺: [@ratijas](https://github.com/ratijas) and [@rominf](https://github.com/rominf). 356 | - Tajik 🇹🇯: [@diejani](https://github.com/diejani). 357 | - Ukrainian 🇺🇦: [@AnonymousVoice1](https://github.com/AnonymousVoice1). 358 | - Mongolian 🇲🇳: [@Serbipunk](https://github.com/Serbipunk). 359 | - Command Line Interface (CLI): [@ZJaume](https://github.com/ZJaume). -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import cyrtranslit 4 | 5 | # Test inputs and output strings 6 | serbian_alphabet_cyrillic = 'АаБбВвГгДдЂђЕеЖжЗзИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШш' 7 | serbian_alphabet_latin = 'AaBbVvGgDdĐđEeŽžZzIiJjKkLlLjljMmNnNjnjOoPpRrSsTtĆćUuFfHhCcČčDždžŠš' 8 | 9 | montenegrin_alphabet_cyrillic = 'АаБбВвГгДдЂђЕеЖжЗзЗ́з́ИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШшС́с́' 10 | montenegrin_alphabet_latin = 'AaBbVvGgDdĐđEeŽžZzŹźIiJjKkLlLjljMmNnNjnjOoPpRrSsTtĆćUuFfHhCcČčDždžŠšŚś' 11 | 12 | macedonian_alphabet_cyrillic = 'АаБбВвГгДдЃѓЕеЖжЗзЅѕИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЌќУуФфХхЦцЧчЏџШш' 13 | macedonian_alphabet_latin = 'AaBbVvGgDdǴǵEeŽžZzDzdzIiJjKkLlLjljMmNnNjnjOoPpRrSsTtḰḱUuFfHhCcČčDždžŠš' 14 | 15 | russian_alphabet_cyrillic = 'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыьЭэЮюЯя' 16 | russian_alphabet_latin = 'AaBbVvGgDdEeYOyoZHzhZzIiJjKkLlMmNnOoPpRrSsTtUuFfHhCZczCHchSHshSHHshh\'\'\'\'Y\'y\'\'E\'e\'YuyuYaya' 17 | 18 | 
tajik_alphabet_cyrillic = 'АаБбВвГгҒғДдЕеЁёЖжЗзИиӢӣЙйКкЛлМмНнОоПпРрСсТтУуӮӯФфХхҲҳЧчҶҷШшъЭэЮюЯя' 19 | tajik_alphabet_latin = 'AaBbVvGgǦǧDdEeË뎞ZzIiĪīJjKkLlMmNnOoPpRrSsTtUuŪūFfHhḨḩČčÇ犚’ÈèÛûÂâ' 20 | 21 | bulgarian_alphabet_cyrillic = 'АаБбВвГгДдЕеЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя' 22 | bulgarian_alphabet_latin = 'AaBbVvGgDdEeZHzhZzIiYyKkLlMmNnOoPpRrSsTtUuFfHhTStsCHchSHshSHTshtĂăJjYUyuYAya' 23 | 24 | # not testing Ь for the apostrophe, sticking with just ь. Both will transliterate to '. 25 | ukrainian_alphabet_cyrillic = 'АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЮюЯяь' 26 | ukrainian_alphabet_latin = 'AaBbVvHhGgDdEeJejeŽžZzYyIiJijiJjKkLlMmNnOoPpRrSsTtUuFfXxCcČ芚ŠčščJujuJaja\'' 27 | 28 | belarusian_alphabet_cyrillic = 'АаБбВвГгДдЕеЁёЖжЗзІіЙйКкЛлМмНнОоПпРрСсТтУуЎўФфХхЦцЧчШшЫыЬьЭэЮюЯя' 29 | belarusian_alphabet_latin = 'AaBbVvHhDdEeË뎞ZzIiJjKkLlMmNnOoPpRrSsTtUuŬŭFfXxCcČ芚Yy\'\'ĖėJujuJaja' 30 | 31 | greek_alphabet = 'ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω' 32 | greek_alphabet_latin = 'AaVvGgDdEeZzHhThthIiKkLlMmNnXxOoPpRrSssTtYyFfChchPspsWw' 33 | 34 | mongolian_alphabet_cyrillic = 'АаЭэИиОоУуӨөҮүНнМмЛлВвПпФфКкХхГгСсШшТтДдЦцЧчЗзЖжРрБбЕеЁёЫыЮюЯя' # exclude (Й Ъ Ь)<->I Щ<->Sh 35 | mongolian_alphabet_latin = 'AaEeIiOoUuÖöÜüNnMmLlVvPpFfKkKhkhGgSsShshTtDdTstsChchZzJjRrBbYeyeYoyoYyYuyuYaya' 36 | 37 | special_chars = '‘’‚“”„†‡‰‹›♠♣♥♦‾←↑→↓™!"#$%&\'()*+,-./ :;<=>?@[\\]^_`{|}~…–—¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×' 38 | 39 | diacritic_chars = 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïðñòóôõöøùúûüý' 40 | 41 | numerical_chars = '1234567890' 42 | 43 | alphabet_chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz' 44 | 45 | mix_characters_some_cyrillic = '!ЉFљñМ мНÈÆнЊњО)*+,оП>пР?р' 46 | mix_characters_all_latin = '!LjFljñM mNÈÆnNjnjO)*+,oP>pR?r' 47 | 48 | mix_characters_some_cyrillic_no_alpha = '\'Ћ<=>?ћУуФфХхЦцЧчЏ%4џШ12ш♥' 49 | mix_characters_all_latin_no_alpha = '\'Ć<=>?ćUuFfHhCcČčDž%4džŠ12š♥' 50 | 51 | 52 | class 
TestSerbianTransliterationFromCyrillicToLatin(unittest.TestCase): 53 | 54 | def test_alphabet_transliteration(self): 55 | ''' Transliteration of entire Serbian cyrillic alphabet to latin. 56 | ''' 57 | transliterated_serbian_alphabet = cyrtranslit.to_latin(serbian_alphabet_cyrillic) 58 | 59 | self.assertEqual(transliterated_serbian_alphabet, serbian_alphabet_latin) 60 | 61 | 62 | def test_special_characters(self): 63 | ''' Special characters should remain the same. 64 | ''' 65 | transliterated_special_chars = cyrtranslit.to_latin(special_chars) 66 | 67 | self.assertEqual(transliterated_special_chars, special_chars) 68 | 69 | 70 | def test_special_diacritic_characters(self): 71 | ''' Diacritic characters should remain the same. 72 | ''' 73 | transliterated_diacritic_chars = cyrtranslit.to_latin(diacritic_chars) 74 | 75 | self.assertEqual(transliterated_diacritic_chars, diacritic_chars) 76 | 77 | 78 | def test_numerical_characters(self): 79 | ''' Numerical characters should remain the same. 80 | ''' 81 | transliterated_numerical_chars = cyrtranslit.to_latin(numerical_chars) 82 | 83 | self.assertEqual(transliterated_numerical_chars, numerical_chars) 84 | 85 | 86 | def test_latin_alphabet_characters(self): 87 | ''' Alphabet characters should remain the same. 88 | ''' 89 | transliterated_alphabet_chars = cyrtranslit.to_latin(alphabet_chars) 90 | 91 | self.assertEqual(transliterated_alphabet_chars, alphabet_chars) 92 | 93 | 94 | def test_mix_characters(self): 95 | ''' Serbian cyrillic characters should be transliterated but non serbian cyrillic ones shouldn't. 96 | ''' 97 | 98 | transliterated_mix = cyrtranslit.to_latin(mix_characters_some_cyrillic) 99 | 100 | self.assertEqual(transliterated_mix, mix_characters_all_latin) 101 | 102 | 103 | class TestSerbianTransliterationFromLatinToCyrillic(unittest.TestCase): 104 | 105 | def test_alphabet_transliteration(self): 106 | ''' Transliteration of entire Serbian cyrillic alphabet to latin. 
107 | ''' 108 | transliterated_serbian_alphabet = cyrtranslit.to_cyrillic(serbian_alphabet_latin) 109 | 110 | self.assertEqual(transliterated_serbian_alphabet, serbian_alphabet_cyrillic) 111 | 112 | 113 | def test_special_characters(self): 114 | ''' Special characters should remain the same. 115 | ''' 116 | transliterated_special_chars = cyrtranslit.to_cyrillic(special_chars) 117 | 118 | self.assertEqual(transliterated_special_chars, special_chars) 119 | 120 | 121 | def test_special_diacritic_characters(self): 122 | ''' Diacritic characters should remain the same. 123 | ''' 124 | transliterated_diacritic_chars = cyrtranslit.to_cyrillic(diacritic_chars) 125 | 126 | self.assertEqual(transliterated_diacritic_chars, diacritic_chars) 127 | 128 | 129 | def test_numerical_characters(self): 130 | ''' Numerical characters should remain the same. 131 | ''' 132 | transliterated_numerical_chars = cyrtranslit.to_cyrillic(numerical_chars) 133 | 134 | self.assertEqual(transliterated_numerical_chars, numerical_chars) 135 | 136 | def test_mix_characters(self): 137 | ''' Serbian cyrillic characters should be transliterated but non serbian cyrillic ones shouldn't. 138 | ''' 139 | transliterated_mix = cyrtranslit.to_cyrillic(mix_characters_all_latin_no_alpha) 140 | 141 | self.assertEqual(transliterated_mix, mix_characters_some_cyrillic_no_alpha) 142 | 143 | 144 | class TestSerbianCountryCodeAlias(unittest.TestCase): 145 | ''' Test that 'rs' (ISO 3166-1 country code) works as alias for 'sr' (ISO 639-1 language code). 146 | Addresses issue #46. 147 | ''' 148 | 149 | def test_rs_to_latin(self): 150 | ''' Test transliteration using 'rs' country code to latin. 151 | ''' 152 | transliterated = cyrtranslit.to_latin("Мој ховеркрафт је пун јегуља", lang_code='rs') 153 | self.assertEqual(transliterated, "Moj hoverkraft je pun jegulja") 154 | 155 | def test_rs_to_cyrillic(self): 156 | ''' Test transliteration using 'rs' country code to cyrillic. 
157 | ''' 158 | transliterated = cyrtranslit.to_cyrillic("Moj hoverkraft je pun jegulja", lang_code='rs') 159 | self.assertEqual(transliterated, "Мој ховеркрафт је пун јегуља") 160 | 161 | def test_rs_alphabet_to_latin(self): 162 | ''' Test full alphabet transliteration with 'rs' code. 163 | ''' 164 | transliterated = cyrtranslit.to_latin(serbian_alphabet_cyrillic, lang_code='rs') 165 | self.assertEqual(transliterated, serbian_alphabet_latin) 166 | 167 | 168 | class TestMontenegrinTransliteration(unittest.TestCase): 169 | def test_alphabet_transliteration_cyrillic_to_latin(self): 170 | ''' Transliteration of entire cyrillic alphabet to latin. 171 | ''' 172 | transliterated_alphabet = cyrtranslit.to_latin(montenegrin_alphabet_cyrillic, lang_code='me') 173 | 174 | # transliterated_alphabet = u's\u0301' 's\xcc\x81' 175 | self.assertEqual(transliterated_alphabet, montenegrin_alphabet_latin) 176 | 177 | def test_alphabet_transliteration_latin_to_cyrillic(self): 178 | ''' Transliteration of entire latin alphabet to cyrillic. 179 | ''' 180 | transliterated_alphabet = cyrtranslit.to_cyrillic(montenegrin_alphabet_latin, lang_code='me') 181 | 182 | self.assertEqual(transliterated_alphabet, montenegrin_alphabet_cyrillic) 183 | 184 | 185 | class TestMacedonianTransliteration(unittest.TestCase): 186 | def test_alphabet_transliteration_cyrillic_to_latin(self): 187 | ''' Transliteration of entire cyrillic alphabet to latin. 188 | ''' 189 | transliterated_alphabet = cyrtranslit.to_latin(macedonian_alphabet_cyrillic, lang_code='mk') 190 | 191 | # transliterated_alphabet = u's\u0301' 's\xcc\x81' 192 | self.assertEqual(transliterated_alphabet, macedonian_alphabet_latin) 193 | 194 | def test_alphabet_transliteration_latin_to_cyrillic(self): 195 | ''' Transliteration of entire latin alphabet to cyrillic. 
196 | ''' 197 | transliterated_alphabet = cyrtranslit.to_cyrillic(macedonian_alphabet_latin, lang_code='mk') 198 | 199 | self.assertEqual(transliterated_alphabet, macedonian_alphabet_cyrillic) 200 | 201 | 202 | class TestRussianTransliteration(unittest.TestCase): 203 | def test_alphabet_transliteration_cyrillic_to_latin(self): 204 | ''' Transliteration of entire cyrillic alphabet to latin. 205 | ''' 206 | transliterated_alphabet = cyrtranslit.to_latin(russian_alphabet_cyrillic, lang_code='ru') 207 | 208 | self.assertEqual(transliterated_alphabet, russian_alphabet_latin) 209 | 210 | def test_alphabet_transliteration_latin_to_cyrillic(self): 211 | ''' Transliteration of entire latin alphabet to cyrillic. 212 | ''' 213 | transliterated_alphabet = cyrtranslit.to_cyrillic(russian_alphabet_latin, lang_code='ru') 214 | 215 | self.assertEqual(transliterated_alphabet, russian_alphabet_cyrillic.replace('Ъ', 'ъ').replace('Ь', 'ь').replace('Ы', 'ы')) 216 | 217 | def test_h_transliteration(self): 218 | ''' Cyrillic Х should transliterate to H, not X. 219 | ''' 220 | self.assertEqual(cyrtranslit.to_latin('Х', lang_code='ru'), 'H') 221 | self.assertEqual(cyrtranslit.to_latin('х', lang_code='ru'), 'h') 222 | self.assertEqual(cyrtranslit.to_cyrillic('H', lang_code='ru'), 'Х') 223 | self.assertEqual(cyrtranslit.to_cyrillic('h', lang_code='ru'), 'х') 224 | 225 | def test_ya_capitalization(self): 226 | ''' Capital Я should transliterate to Ya, not YA. 
227 | ''' 228 | self.assertEqual(cyrtranslit.to_latin('Я', lang_code='ru'), 'Ya') 229 | self.assertEqual(cyrtranslit.to_latin('я', lang_code='ru'), 'ya') 230 | self.assertEqual(cyrtranslit.to_latin('Янковский', lang_code='ru'), 'Yankovskij') 231 | self.assertEqual(cyrtranslit.to_latin('яблоко', lang_code='ru'), 'yabloko') 232 | self.assertEqual(cyrtranslit.to_cyrillic('Ya', lang_code='ru'), 'Я') 233 | self.assertEqual(cyrtranslit.to_cyrillic('ya', lang_code='ru'), 'я') 234 | 235 | class TestTajikTransliteration(unittest.TestCase): 236 | def test_alphabet_transliteration_cyrillic_to_latin(self): 237 | ''' Transliterate the entire cyrillic alphabet to latin ''' 238 | transliterated_alphabet = cyrtranslit.to_latin(tajik_alphabet_cyrillic, lang_code='tj') 239 | 240 | self.assertEqual(transliterated_alphabet, tajik_alphabet_latin) 241 | 242 | def test_alphabet_transliteration_latin_to_cyrillic(self): 243 | ''' Transliterate the entire latin alphabet to cyrillic ''' 244 | transliterated_alphabet = cyrtranslit.to_cyrillic(tajik_alphabet_latin, lang_code='tj') 245 | 246 | self.assertEqual(transliterated_alphabet, tajik_alphabet_cyrillic) 247 | 248 | class TestUkrainianTransliteration(unittest.TestCase): 249 | def test_alphabet_transliteration_cyrillic_to_latin(self): 250 | ''' Transliterate the entire cyrillic alphabet to latin ''' 251 | transliterated_alphabet = cyrtranslit.to_latin(ukrainian_alphabet_cyrillic, lang_code='ua') 252 | 253 | self.assertEqual(transliterated_alphabet, ukrainian_alphabet_latin) 254 | 255 | def test_alphabet_transliteration_latin_to_cyrillic(self): 256 | ''' Transliterate the entire latin alphabet to cyrillic ''' 257 | transliterated_alphabet = cyrtranslit.to_cyrillic(ukrainian_alphabet_latin, lang_code='ua') 258 | 259 | self.assertEqual(transliterated_alphabet, ukrainian_alphabet_cyrillic) 260 | 261 | 262 | def test_special_diacritic_characters(self): 263 | ''' Diacritic characters should remain the same. 
264 | ''' 265 | transliterated_diacritic_chars = cyrtranslit.to_latin(diacritic_chars, lang_code='tj') 266 | 267 | self.assertEqual(transliterated_diacritic_chars, diacritic_chars) 268 | 269 | 270 | def test_numerical_characters(self): 271 | ''' Numerical characters should remain the same. 272 | ''' 273 | transliterated_numerical_chars = cyrtranslit.to_latin(numerical_chars, lang_code='tj') 274 | 275 | self.assertEqual(transliterated_numerical_chars, numerical_chars) 276 | 277 | 278 | class TestBelarusianTransliteration(unittest.TestCase): 279 | ''' Test Belarusian transliteration. Addresses issue #47. 280 | ''' 281 | 282 | def test_alphabet_transliteration_cyrillic_to_latin(self): 283 | ''' Transliterate the entire Belarusian cyrillic alphabet to latin. 284 | ''' 285 | transliterated_alphabet = cyrtranslit.to_latin(belarusian_alphabet_cyrillic, lang_code='by') 286 | 287 | self.assertEqual(transliterated_alphabet, belarusian_alphabet_latin) 288 | 289 | def test_alphabet_transliteration_latin_to_cyrillic(self): 290 | ''' Transliterate the entire Belarusian latin alphabet to cyrillic. 291 | ''' 292 | transliterated_alphabet = cyrtranslit.to_cyrillic(belarusian_alphabet_latin, lang_code='by') 293 | 294 | self.assertEqual(transliterated_alphabet, belarusian_alphabet_cyrillic) 295 | 296 | def test_phrase_transliteration_to_latin(self): 297 | ''' Test common Belarusian phrase transliteration. 298 | ''' 299 | # "Hello, World!" in Belarusian 300 | cyrillic_text = "Прывітанне, свет!" 301 | expected_latin = "Pryvitanne, svet!" 302 | 303 | transliterated = cyrtranslit.to_latin(cyrillic_text, lang_code='by') 304 | self.assertEqual(transliterated, expected_latin) 305 | 306 | def test_short_u_transliteration(self): 307 | ''' Test Belarusian unique letter Ў (short U). 
308 | ''' 309 | # Ў is unique to Belarusian 310 | self.assertEqual(cyrtranslit.to_latin("Ў", lang_code='by'), "Ŭ") 311 | self.assertEqual(cyrtranslit.to_latin("ў", lang_code='by'), "ŭ") 312 | self.assertEqual(cyrtranslit.to_cyrillic("Ŭ", lang_code='by'), "Ў") 313 | self.assertEqual(cyrtranslit.to_cyrillic("ŭ", lang_code='by'), "ў") 314 | 315 | 316 | class TestGreekTransliteration(unittest.TestCase): 317 | ''' Test Greek transliteration. Addresses issue #40. 318 | ''' 319 | 320 | def test_alphabet_transliteration_to_latin(self): 321 | ''' Transliterate the entire Greek alphabet to latin. 322 | ''' 323 | transliterated_alphabet = cyrtranslit.to_latin(greek_alphabet, lang_code='el') 324 | 325 | self.assertEqual(transliterated_alphabet, greek_alphabet_latin) 326 | 327 | def test_alphabet_transliteration_to_greek(self): 328 | ''' Transliterate the entire latin alphabet to Greek. 329 | Note: Final sigma (ς) converts to regular sigma (σ) when going Latin→Greek 330 | since we can't determine word endings from Latin text. 331 | ''' 332 | transliterated_alphabet = cyrtranslit.to_cyrillic(greek_alphabet_latin, lang_code='el') 333 | 334 | # Replace final sigma with regular sigma for comparison 335 | expected_greek = greek_alphabet.replace('ς', 'σ') 336 | self.assertEqual(transliterated_alphabet, expected_greek) 337 | 338 | def test_theta_transliteration(self): 339 | ''' Test Greek Theta (Θθ) transliterates to Th/th. 340 | ''' 341 | self.assertEqual(cyrtranslit.to_latin('Θ', lang_code='el'), 'Th') 342 | self.assertEqual(cyrtranslit.to_latin('θ', lang_code='el'), 'th') 343 | self.assertEqual(cyrtranslit.to_cyrillic('Th', lang_code='el'), 'Θ') 344 | self.assertEqual(cyrtranslit.to_cyrillic('th', lang_code='el'), 'θ') 345 | 346 | def test_chi_transliteration(self): 347 | ''' Test Greek Chi (Χχ) transliterates to Ch/ch. 
348 | ''' 349 | self.assertEqual(cyrtranslit.to_latin('Χ', lang_code='el'), 'Ch') 350 | self.assertEqual(cyrtranslit.to_latin('χ', lang_code='el'), 'ch') 351 | self.assertEqual(cyrtranslit.to_cyrillic('Ch', lang_code='el'), 'Χ') 352 | self.assertEqual(cyrtranslit.to_cyrillic('ch', lang_code='el'), 'χ') 353 | 354 | def test_psi_transliteration(self): 355 | ''' Test Greek Psi (Ψψ) transliterates to Ps/ps. 356 | ''' 357 | self.assertEqual(cyrtranslit.to_latin('Ψ', lang_code='el'), 'Ps') 358 | self.assertEqual(cyrtranslit.to_latin('ψ', lang_code='el'), 'ps') 359 | self.assertEqual(cyrtranslit.to_cyrillic('Ps', lang_code='el'), 'Ψ') 360 | self.assertEqual(cyrtranslit.to_cyrillic('ps', lang_code='el'), 'ψ') 361 | 362 | def test_final_sigma(self): 363 | ''' Test Greek final sigma (ς) transliterates same as regular sigma. 364 | ''' 365 | self.assertEqual(cyrtranslit.to_latin('ς', lang_code='el'), 's') 366 | self.assertEqual(cyrtranslit.to_latin('Σ', lang_code='el'), 'S') 367 | self.assertEqual(cyrtranslit.to_latin('σ', lang_code='el'), 's') 368 | 369 | def test_phrase_transliteration(self): 370 | ''' Test common Greek phrase transliteration. 371 | ''' 372 | # "Hello" in Greek (Γειά σου) 373 | greek_text = "Γειά σου" 374 | expected_latin = "Geia soy" 375 | 376 | transliterated = cyrtranslit.to_latin(greek_text, lang_code='el') 377 | self.assertEqual(transliterated, expected_latin) 378 | 379 | 380 | class TestBulgarianTransliteration(unittest.TestCase): 381 | def test_alphabet_transliteration_cyrillic_to_latin(self): 382 | ''' Transliteration of entire cyrillic alphabet to latin. 383 | ''' 384 | transliterated_alphabet = cyrtranslit.to_latin(bulgarian_alphabet_cyrillic, lang_code='bg') 385 | 386 | self.assertEqual(transliterated_alphabet, bulgarian_alphabet_latin) 387 | 388 | def test_alphabet_transliteration_latin_to_cyrillic(self): 389 | ''' Transliteration of entire latin alphabet to cyrillic. 
390 | ''' 391 | transliterated_alphabet = cyrtranslit.to_cyrillic(bulgarian_alphabet_latin, lang_code='bg') 392 | 393 | self.assertEqual(transliterated_alphabet, bulgarian_alphabet_cyrillic) 394 | 395 | def test_sh_at_the_end_of_string(self): 396 | ''' Check if "sh" at the of the string doesn't cause any exception.''' 397 | transliterated_alphabet = cyrtranslit.to_cyrillic("AaBbsh", lang_code='bg') 398 | 399 | self.assertEqual(transliterated_alphabet, "АаБбш") 400 | 401 | 402 | class TestMongolianTransliterationFromCyrillicToLatin(unittest.TestCase): 403 | 404 | def test_alphabet_transliteration_cyrillic_to_latin(self): 405 | ''' Transliteration of entire Mongolian cyrillic alphabet to latin. 406 | ''' 407 | transliterated_mongolian_alphabet = cyrtranslit.to_latin(mongolian_alphabet_cyrillic, 'mn') 408 | 409 | self.assertEqual(transliterated_mongolian_alphabet, mongolian_alphabet_latin) 410 | 411 | def test_alphabet_transliteration_latin_to_cyrillic(self): 412 | ''' Transliteration of entire latin alphabet to cyrillic. 413 | ''' 414 | transliterated_mongolian_alphabet = cyrtranslit.to_cyrillic(mongolian_alphabet_latin, 'mn') 415 | 416 | self.assertEqual(transliterated_mongolian_alphabet, mongolian_alphabet_cyrillic) 417 | 418 | def test_mixed_casing_transliteration_latin_to_cyrillic(self): 419 | ''' Transliteration from latin with mixed casing, e.g. Sh SH sh sH. 420 | ''' 421 | input_latin = 'KhKHkhkHShSHshsHTsTStstSChCHchcHYeYEyeyEYoYOyoyOYaYAyayA' 422 | expected_output_cyrillic = 'ХХххШШшшЦЦццЧЧччЕЕееЁЁёёЯЯяя' 423 | 424 | actual_output_cyrillic = cyrtranslit.to_cyrillic(input_latin, 'mn') 425 | 426 | self.assertEqual(actual_output_cyrillic, expected_output_cyrillic) 427 | 428 | def test_transliteration_cyrillic_to_sh(self): 429 | ''' Transliteration from Ш/Щ and ш/щ should be Sh and sh. 430 | Both Ш and Щ are pronounced the same (/ʃ/) in Mongolian. 
431 | ''' 432 | input_cyrillic = 'ШшЩщ' 433 | expected_output_latin = 'ShshShsh' 434 | 435 | actual_output_latin = cyrtranslit.to_latin(input_cyrillic, 'mn') 436 | 437 | self.assertEqual(actual_output_latin, expected_output_latin) 438 | 439 | def test_transliteration_sh_to_cyrillic_defaults_to_sha(self): 440 | ''' Transliteration from Latin Sh/sh should default to Ш (not Щ). 441 | Ш is more commonly used in Mongolian than Щ (which appears mainly in loanwords). 442 | ''' 443 | input_latin = 'ShSHshsH' 444 | expected_output_cyrillic = 'ШШшш' # All variants should produce Ш (with proper casing) 445 | 446 | actual_output_cyrillic = cyrtranslit.to_cyrillic(input_latin, 'mn') 447 | 448 | self.assertEqual(actual_output_cyrillic, expected_output_cyrillic) 449 | 450 | 451 | class TestFileEncoding(unittest.TestCase): 452 | ''' Test transliteration from files with different encodings. 453 | ''' 454 | 455 | def test_windows1251_encoded_file(self): 456 | ''' Test that we can read and transliterate a windows-1251 encoded file. 457 | This addresses issue #49 where files with non-UTF-8 Cyrillic encodings 458 | would fail with UnicodeDecodeError. 459 | ''' 460 | import subprocess 461 | import sys 462 | 463 | # Run the CLI tool on a windows-1251 encoded file (auto-detection) 464 | result = subprocess.run( 465 | [sys.executable, '-m', 'cyrtranslit.cyrtranslit', '-l', 'bg', '-i', 'tests/bg_windows1251.txt'], 466 | capture_output=True, 467 | text=True 468 | ) 469 | 470 | # Should not fail with UnicodeDecodeError 471 | self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}") 472 | 473 | # Should produce Latin output 474 | self.assertIn('Zdravey', result.stdout) 475 | 476 | # Should show a warning that it fell back to windows-1251 477 | self.assertIn('windows-1251', result.stderr) 478 | 479 | def test_explicit_encoding_parameter(self): 480 | ''' Test that we can explicitly specify the encoding with -e parameter. 
481 | ''' 482 | import subprocess 483 | import sys 484 | 485 | # Run the CLI tool with explicit encoding parameter 486 | result = subprocess.run( 487 | [sys.executable, '-m', 'cyrtranslit.cyrtranslit', '-l', 'bg', '-i', 'tests/bg_windows1251.txt', '-e', 'windows-1251'], 488 | capture_output=True, 489 | text=True 490 | ) 491 | 492 | # Should not fail 493 | self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}") 494 | 495 | # Should produce Latin output 496 | self.assertIn('Zdravey', result.stdout) 497 | 498 | # Should NOT show a warning when correct encoding is specified 499 | self.assertNotIn('Warning', result.stderr) 500 | 501 | 502 | class TestMacedonianAccentedCharacters(unittest.TestCase): 503 | ''' Test Macedonian accented vowels with grave accent for homograph disambiguation. 504 | Addresses issue #4. 505 | 506 | According to ISO 9:1968/1995 (adopted by Macedonian Academy of Arts and Sciences in 1970): 507 | - Ѐ (U+0400) / ѐ (U+0450) - Cyrillic Ie with grave 508 | - Ѝ (U+040D) / ѝ (U+045D) - Cyrillic I with grave 509 | 510 | These are used to distinguish homographs: 511 | - ѝ (her) vs и (and) 512 | - нѐ (us) vs не (no) 513 | - сѐ (everything) vs се (short reflexive pronoun) 514 | ''' 515 | 516 | def test_ie_with_grave_to_latin_preserve_accents_false(self): 517 | ''' Ѐ/ѐ should transliterate to E/e when preserve_accents=False (default). 518 | ''' 519 | self.assertEqual(cyrtranslit.to_latin('Ѐ', lang_code='mk', preserve_accents=False), 'E') 520 | self.assertEqual(cyrtranslit.to_latin('ѐ', lang_code='mk', preserve_accents=False), 'e') 521 | self.assertEqual(cyrtranslit.to_latin('нѐ', lang_code='mk', preserve_accents=False), 'ne') 522 | self.assertEqual(cyrtranslit.to_latin('сѐ', lang_code='mk', preserve_accents=False), 'se') 523 | 524 | def test_ie_with_grave_to_latin_preserve_accents_true(self): 525 | ''' Ѐ/ѐ should transliterate to È/è when preserve_accents=True. 
526 | ''' 527 | self.assertEqual(cyrtranslit.to_latin('Ѐ', lang_code='mk', preserve_accents=True), 'È') 528 | self.assertEqual(cyrtranslit.to_latin('ѐ', lang_code='mk', preserve_accents=True), 'è') 529 | self.assertEqual(cyrtranslit.to_latin('нѐ', lang_code='mk', preserve_accents=True), 'nè') 530 | self.assertEqual(cyrtranslit.to_latin('сѐ', lang_code='mk', preserve_accents=True), 'sè') 531 | 532 | def test_i_with_grave_to_latin_preserve_accents_false(self): 533 | ''' Ѝ/ѝ should transliterate to I/i when preserve_accents=False (default). 534 | ''' 535 | self.assertEqual(cyrtranslit.to_latin('Ѝ', lang_code='mk', preserve_accents=False), 'I') 536 | self.assertEqual(cyrtranslit.to_latin('ѝ', lang_code='mk', preserve_accents=False), 'i') 537 | self.assertEqual(cyrtranslit.to_latin('ѝ је', lang_code='mk', preserve_accents=False), 'i je') 538 | 539 | def test_i_with_grave_to_latin_preserve_accents_true(self): 540 | ''' Ѝ/ѝ should transliterate to Ì/ì when preserve_accents=True. 541 | ''' 542 | self.assertEqual(cyrtranslit.to_latin('Ѝ', lang_code='mk', preserve_accents=True), 'Ì') 543 | self.assertEqual(cyrtranslit.to_latin('ѝ', lang_code='mk', preserve_accents=True), 'ì') 544 | self.assertEqual(cyrtranslit.to_latin('ѝ је', lang_code='mk', preserve_accents=True), 'ì je') 545 | 546 | def test_latin_e_with_grave_to_cyrillic_preserve_accents_false(self): 547 | ''' È/è should transliterate to Е/е when preserve_accents=False (default). 548 | ''' 549 | self.assertEqual(cyrtranslit.to_cyrillic('È', lang_code='mk', preserve_accents=False), 'Е') 550 | self.assertEqual(cyrtranslit.to_cyrillic('è', lang_code='mk', preserve_accents=False), 'е') 551 | self.assertEqual(cyrtranslit.to_cyrillic('nè', lang_code='mk', preserve_accents=False), 'не') 552 | 553 | def test_latin_e_with_grave_to_cyrillic_preserve_accents_true(self): 554 | ''' È/è should transliterate to Ѐ/ѐ when preserve_accents=True. 
555 | ''' 556 | self.assertEqual(cyrtranslit.to_cyrillic('È', lang_code='mk', preserve_accents=True), 'Ѐ') 557 | self.assertEqual(cyrtranslit.to_cyrillic('è', lang_code='mk', preserve_accents=True), 'ѐ') 558 | self.assertEqual(cyrtranslit.to_cyrillic('nè', lang_code='mk', preserve_accents=True), 'нѐ') 559 | 560 | def test_latin_i_with_grave_to_cyrillic_preserve_accents_false(self): 561 | ''' Ì/ì should transliterate to И/и when preserve_accents=False (default). 562 | ''' 563 | self.assertEqual(cyrtranslit.to_cyrillic('Ì', lang_code='mk', preserve_accents=False), 'И') 564 | self.assertEqual(cyrtranslit.to_cyrillic('ì', lang_code='mk', preserve_accents=False), 'и') 565 | self.assertEqual(cyrtranslit.to_cyrillic('ì je', lang_code='mk', preserve_accents=False), 'и је') 566 | 567 | def test_latin_i_with_grave_to_cyrillic_preserve_accents_true(self): 568 | ''' Ì/ì should transliterate to Ѝ/ѝ when preserve_accents=True. 569 | ''' 570 | self.assertEqual(cyrtranslit.to_cyrillic('Ì', lang_code='mk', preserve_accents=True), 'Ѝ') 571 | self.assertEqual(cyrtranslit.to_cyrillic('ì', lang_code='mk', preserve_accents=True), 'ѝ') 572 | self.assertEqual(cyrtranslit.to_cyrillic('ì je', lang_code='mk', preserve_accents=True), 'ѝ је') 573 | 574 | def test_default_behavior_strips_accents(self): 575 | ''' When preserve_accents parameter is omitted, accents should be stripped (default=False). 576 | ''' 577 | # Default behavior should strip accents 578 | self.assertEqual(cyrtranslit.to_latin('ѝ', lang_code='mk'), 'i') 579 | self.assertEqual(cyrtranslit.to_latin('ѐ', lang_code='mk'), 'e') 580 | 581 | def test_file_transliteration_preserve_accents_false(self): 582 | ''' Test file-based transliteration with preserve_accents=False (default). 
583 | ''' 584 | with open('tests/mk_accented.txt', 'r', encoding='utf-8') as f: 585 | content = f.read() 586 | 587 | result = cyrtranslit.to_latin(content, lang_code='mk', preserve_accents=False) 588 | 589 | # Accents should be stripped 590 | self.assertIn('i je tuka', result) 591 | self.assertIn('ne sme tamu', result) 592 | self.assertIn('se e dobro', result) 593 | self.assertNotIn('ì', result) 594 | self.assertNotIn('è', result) 595 | 596 | def test_file_transliteration_preserve_accents_true(self): 597 | ''' Test file-based transliteration with preserve_accents=True. 598 | ''' 599 | with open('tests/mk_accented.txt', 'r', encoding='utf-8') as f: 600 | content = f.read() 601 | 602 | result = cyrtranslit.to_latin(content, lang_code='mk', preserve_accents=True) 603 | 604 | # Accents should be preserved 605 | self.assertIn('ì je tuka', result) 606 | self.assertIn('nè sme tamu', result) 607 | self.assertIn('sè e dobro', result) 608 | 609 | 610 | class TestBulgarianAccentedCharacters(unittest.TestCase): 611 | ''' Test Bulgarian accented I with grave for stress marking and homograph disambiguation. 612 | Addresses issue #4. 613 | 614 | According to ISO 9:1995: 615 | - Ѝ (U+040D) / ѝ (U+045D) - Cyrillic I with grave 616 | 617 | Used to distinguish: 618 | - ѝ (her) vs и (and) 619 | ''' 620 | 621 | def test_i_with_grave_to_latin_preserve_accents_false(self): 622 | ''' Ѝ/ѝ should transliterate to I/i when preserve_accents=False (default). 623 | ''' 624 | self.assertEqual(cyrtranslit.to_latin('Ѝ', lang_code='bg', preserve_accents=False), 'I') 625 | self.assertEqual(cyrtranslit.to_latin('ѝ', lang_code='bg', preserve_accents=False), 'i') 626 | self.assertEqual(cyrtranslit.to_latin('ѝ е', lang_code='bg', preserve_accents=False), 'i e') 627 | 628 | def test_i_with_grave_to_latin_preserve_accents_true(self): 629 | ''' Ѝ/ѝ should transliterate to Ì/ì when preserve_accents=True. 
630 | ''' 631 | self.assertEqual(cyrtranslit.to_latin('Ѝ', lang_code='bg', preserve_accents=True), 'Ì') 632 | self.assertEqual(cyrtranslit.to_latin('ѝ', lang_code='bg', preserve_accents=True), 'ì') 633 | self.assertEqual(cyrtranslit.to_latin('ѝ е', lang_code='bg', preserve_accents=True), 'ì e') 634 | 635 | def test_latin_i_with_grave_to_cyrillic_preserve_accents_false(self): 636 | ''' Ì/ì should transliterate to И/и when preserve_accents=False (default). 637 | ''' 638 | self.assertEqual(cyrtranslit.to_cyrillic('Ì', lang_code='bg', preserve_accents=False), 'И') 639 | self.assertEqual(cyrtranslit.to_cyrillic('ì', lang_code='bg', preserve_accents=False), 'и') 640 | 641 | def test_latin_i_with_grave_to_cyrillic_preserve_accents_true(self): 642 | ''' Ì/ì should transliterate to Ѝ/ѝ when preserve_accents=True. 643 | ''' 644 | self.assertEqual(cyrtranslit.to_cyrillic('Ì', lang_code='bg', preserve_accents=True), 'Ѝ') 645 | self.assertEqual(cyrtranslit.to_cyrillic('ì', lang_code='bg', preserve_accents=True), 'ѝ') 646 | 647 | def test_default_behavior_strips_accents(self): 648 | ''' When preserve_accents parameter is omitted, accents should be stripped (default=False). 649 | ''' 650 | self.assertEqual(cyrtranslit.to_latin('ѝ', lang_code='bg'), 'i') 651 | 652 | def test_file_transliteration_preserve_accents_false(self): 653 | ''' Test file-based transliteration with preserve_accents=False (default). 654 | ''' 655 | with open('tests/bg_accented.txt', 'r', encoding='utf-8') as f: 656 | content = f.read() 657 | 658 | result = cyrtranslit.to_latin(content, lang_code='bg', preserve_accents=False) 659 | 660 | # Accents should be stripped 661 | self.assertIn('i e tuk', result) 662 | self.assertNotIn('ì', result) 663 | 664 | def test_file_transliteration_preserve_accents_true(self): 665 | ''' Test file-based transliteration with preserve_accents=True. 
666 | ''' 667 | with open('tests/bg_accented.txt', 'r', encoding='utf-8') as f: 668 | content = f.read() 669 | 670 | result = cyrtranslit.to_latin(content, lang_code='bg', preserve_accents=True) 671 | 672 | # Accents should be preserved 673 | self.assertIn('ì e tuk', result) 674 | 675 | 676 | class TestCLI(unittest.TestCase): 677 | ''' Test command-line interface functionality. ''' 678 | 679 | def test_invalid_language_code(self): 680 | ''' Test that invalid language code produces error. ''' 681 | import subprocess 682 | import sys 683 | 684 | # Try to use an invalid language code 685 | result = subprocess.run( 686 | [sys.executable, '-m', 'cyrtranslit.cyrtranslit', '-l', 'xx', '-i', 'tests/sr.txt'], 687 | capture_output=True, 688 | text=True 689 | ) 690 | 691 | # Should fail 692 | self.assertNotEqual(result.returncode, 0) 693 | 694 | # Should show error message 695 | self.assertIn('not supported', result.stderr) 696 | 697 | def test_output_file_creation(self): 698 | ''' Test that output file is created correctly. 
''' 699 | import subprocess 700 | import sys 701 | import os 702 | import tempfile 703 | 704 | # Create a temporary output file path 705 | with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as tmp: 706 | output_file = tmp.name 707 | 708 | try: 709 | # Run CLI with output file 710 | result = subprocess.run( 711 | [sys.executable, '-m', 'cyrtranslit.cyrtranslit', '-l', 'sr', '-i', 'tests/sr.txt', '-o', output_file], 712 | capture_output=True, 713 | text=True 714 | ) 715 | 716 | # Should succeed 717 | self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}") 718 | 719 | # Output file should exist 720 | self.assertTrue(os.path.exists(output_file)) 721 | 722 | # Read and verify content 723 | with open(output_file, 'r', encoding='utf-8') as f: 724 | content = f.read() 725 | # Should contain transliterated Serbian text 726 | self.assertIn('a', content.lower()) 727 | 728 | finally: 729 | # Clean up 730 | if os.path.exists(output_file): 731 | os.remove(output_file) 732 | 733 | def test_reverse_transliteration_flag(self): 734 | ''' Test -c flag for Latin to Cyrillic transliteration. ''' 735 | import subprocess 736 | import sys 737 | 738 | # Run CLI with -c flag on Latin text 739 | result = subprocess.run( 740 | [sys.executable, '-m', 'cyrtranslit.cyrtranslit', '-l', 'sr', '-c', '-i', 'tests/sr_latinica.txt'], 741 | capture_output=True, 742 | text=True, 743 | encoding='utf-8' 744 | ) 745 | 746 | # Should succeed 747 | self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}") 748 | 749 | # Output should contain Cyrillic characters 750 | # Check for common Serbian Cyrillic letters 751 | self.assertTrue(any(ord(c) >= 0x0400 and ord(c) <= 0x04FF for c in result.stdout), 752 | "Output should contain Cyrillic characters") 753 | 754 | def test_preserve_accents_flag(self): 755 | ''' Test -p flag for preserving accents. 
''' 756 | import subprocess 757 | import sys 758 | 759 | # Run CLI with -p flag on Macedonian text with accents 760 | result = subprocess.run( 761 | [sys.executable, '-m', 'cyrtranslit.cyrtranslit', '-l', 'mk', '-p', '-i', 'tests/mk_accented.txt'], 762 | capture_output=True, 763 | text=True, 764 | encoding='utf-8' 765 | ) 766 | 767 | # Should succeed 768 | self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}") 769 | 770 | # Output should contain Latin letters with grave accents 771 | self.assertIn('ì', result.stdout) 772 | 773 | def test_combined_flags(self): 774 | ''' Test combining -c and -p flags. ''' 775 | import subprocess 776 | import sys 777 | 778 | # Run CLI with both -c and -p flags 779 | result = subprocess.run( 780 | [sys.executable, '-m', 'cyrtranslit.cyrtranslit', '-l', 'mk', '-c', '-p', '-i', 'tests/mk_accented_latin.txt'], 781 | capture_output=True, 782 | text=True, 783 | encoding='utf-8' 784 | ) 785 | 786 | # Should succeed 787 | self.assertEqual(result.returncode, 0, f"Command failed with: {result.stderr}") 788 | 789 | # Output should contain Cyrillic with accents 790 | self.assertIn('ѝ', result.stdout) 791 | 792 | def test_file_not_found(self): 793 | ''' Test error handling when input file doesn't exist. ''' 794 | import subprocess 795 | import sys 796 | 797 | # Try to read non-existent file 798 | result = subprocess.run( 799 | [sys.executable, '-m', 'cyrtranslit.cyrtranslit', '-l', 'sr', '-i', 'nonexistent_file.txt'], 800 | capture_output=True, 801 | text=True 802 | ) 803 | 804 | # Should fail 805 | self.assertNotEqual(result.returncode, 0) 806 | 807 | 808 | if __name__ == '__main__': 809 | # Run all tests. 810 | unittest.main() 811 | --------------------------------------------------------------------------------