├── tests ├── __init__.py ├── test_CTS2UZBEK.py ├── test_CTS2IPA.py ├── test_UAS2UCS.py ├── test_UAS2CTS.py ├── test_UCS2UAS.py ├── test_CTS2XJUS.py ├── test_UAS2ULS.py ├── test_CTS2UAS.py ├── test_ULS2UAS.py ├── test_XJUS2CTS.py ├── test_UZBEK2CTS.py └── test_XJUS2UAS.py ├── umsc ├── __init__.py └── umsc.py ├── demo.py ├── CITATION.cff ├── setup.py ├── .gitignore ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /umsc/__init__.py: -------------------------------------------------------------------------------- 1 | from .umsc import UgMultiScriptConverter -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from umsc import UgMultiScriptConverter 2 | 3 | converter = UgMultiScriptConverter('CTS', 'IPA') 4 | 5 | input = "encür" 6 | 7 | print(converter(input)) 8 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: Osman 5 | given-names: Tursun 6 | orcid: https://orcid.org/0000-0002-0592-0864 7 | title: "Uyghur Multi-Script Converter" 8 | version: 1.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='umsc', 5 | version='0.4.0', 6 | author='Osman Tursun', 7 | author_email='osmanjan.t@gmail.com', 8 | maintainer='Osman Tursun', 9 | maintainer_email='mpcabd@gmail.com', 10 | description='Script Converter for Uyghur Language', 11 | long_description=open('README.md').read(), 12 | long_description_content_type='text/markdown', 13 | keywords='uyghur script converter arabic latin cyrillic IPA ئۇيغۇر', 14 | url='https://github.com/neouyghur/ScriptConverter4Uyghur', 15 | packages=find_packages(), 16 | install_requires=[ 17 | "regex" 18 | ], 19 | classifiers=[ 20 | # Trove classifiers to categorize the package (https://pypi.org/classifiers/) 21 | 'Programming Language :: Python :: 3', 22 | 'License :: OSI Approved :: Apache Software License', 23 | 'Operating System :: OS Independent', 24 | ], 25 | python_requires='>=3.0', # Minimum version requirement of the package 26 | ) 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject 
date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | .idea/ -------------------------------------------------------------------------------- /tests/test_CTS2UZBEK.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | 5 | test_data = [("qol", "qol"), 6 | ("baş", "bosh"), 7 | ("put", "put"), 8 | ("köz", "ko‘z"), 9 | ("ceñçi", "jangchi"), 10 | ("cudé", "jude"), 11 | ("san", "son"), 12 | ("sey", "say"), 13 | ("é", "e"), 14 | ("şir", "shir"), 15 | ("şañxey", "shongxay"), 16 | ("kitab", "kitob"), 17 | ("veten", "vatan"), 18 | ("tomur", "tomur"), 19 | ("kömür", "ko‘mu‘r"), 20 | ("éliktir", "eliktir"), 21 | ("vyétnam", "vyetnom"), 22 | ("şincañ", "shinjong"), 23 | ("anar", "onor"), 24 | ("encür", "anju‘r"), 25 | ("orda", "ordo"), 26 | ("uruş", "urush"), 27 | ("ördek", "o‘rdak"), 28 | ("üzüm", "u‘zu‘m"), 29 | ("élan", "elon"), 30 | ("inkas", "inkos"), 31 | ("inik'ana", "inik'ono"), 32 | ("es'et", "as'at"), 33 | ("radio", "rodio"), 34 | ("mes'ul", "mas'ul"), 35 | ("qariörük", "qorio‘ru‘k"), 36 | ("naümid", "nou‘mid"), 37 | ("it'éyiq", "it'eyiq"), 38 | ("cem'iy", "jam'iy"), 39 | ("nemen'gan", "naman'gon"), 40 | ("özxan", "o‘zxon"), 41 | ("pasxa", "posxo"), 42 | ("bayrimi", "boyrimi"), 43 | ("maarip", "moorip"), 44 | ("muellim", "muallim"), 45 | ("daire", "doira"), 46 | ("mueyyen", "muayyan"), 47 | ("tebiiy", "tabiiy"), 48 | ("paaliyet", "pooliyat"), 49 | ("ishaq", "ishoq"), 50 | ("özbékistanğa", 
"o‘zbekistong‘o"), 51 | ("hingan", "hingon"), 52 | ("çeklengen", "chaklangan"), 53 | ("gañgirap", "gonggirop"), 54 | ("başlanğuç", "boshlong‘uch"), 55 | ("cem'iyet", "jam'iyat"), 56 | ] 57 | 58 | @pytest.mark.parametrize("input,expected", test_data) 59 | def test_UAS2CTS(input, expected): 60 | converter = UgMultiScriptConverter("CTS", "UZLS") 61 | assert converter(input) == expected 62 | -------------------------------------------------------------------------------- /tests/test_CTS2IPA.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | # Pair each input with its expected output 5 | test_data = [ 6 | ('qol', 'qol'), 7 | ('baş', 'bɑʃ'), 8 | ('put', 'put'), 9 | ('köz', 'køz'), 10 | ('ceñçi', 'dʒæŋtʃi'), 11 | ('cudé', 'dʒudɛ'), 12 | ('san', 'sɑn'), 13 | ('sey', 'sæj'), 14 | ('é', 'ɛ'), 15 | ('şir', 'ʃir'), 16 | ('şañxey', 'ʃɑŋχæj'), 17 | ('kitab', 'kitɑb'), 18 | ('veten', 'wætæn'), 19 | ('tomur', 'tomur'), 20 | ('kömür', 'kømyr'), 21 | ('éliktir', 'ɛliktir'), 22 | ('vyétnam', 'wjɛtnɑm'), 23 | ('şincañ', 'ʃindʒɑŋ'), 24 | ('anar', 'ɑnɑr'), 25 | ('encür', 'ændʒyr'), 26 | ('orda', 'ordɑ'), 27 | ('uruş', 'uruʃ'), 28 | ('ördek', 'ørdæk'), 29 | ('üzüm', 'yzym'), 30 | ('élan', 'ɛlɑn'), 31 | ('inkas', 'inkɑs'), 32 | ("inik'ana", "inik'ɑnɑ"), 33 | ("es'et", "æs'æt"), 34 | ('radio', 'rɑdio'), 35 | ("mes'ul", "mæs'ul"), 36 | ('qariörük', 'qɑriøryk'), 37 | ('naümid', 'nɑymid'), 38 | ("it'éyiq", "it'ɛjiq"), 39 | ("cem'iy", "dʒæm'ij"), 40 | ('nemengan', 'næmænɡɑn'), 41 | ('özxan', 'øzχɑn'), 42 | ('pasxa', 'pɑsχɑ'), 43 | ('bayrimi', 'bɑjrimi'), 44 | ('maarip', 'mɑɑrip'), 45 | ('muellim', 'muællim'), 46 | ('daire', 'dɑiræ'), 47 | ('mueyyen', 'muæjjæn'), 48 | ('tebiiy', 'tæbiij'), 49 | ('paaliyet', 'pɑɑlijæt'), 50 | ('ishaq', 'ishɑq'), 51 | ('özbékistanğa', 'øzbɛkistɑnʁɑ'), 52 | ('hingan', 'hinɡɑn'), 53 | ('çeklengen', 'tʃæklænɡæn'), 54 | ('gañgirap', 'ɡɑŋɡirɑp'), 55 | 
('başlanğuç', 'bɑʃlɑnʁutʃ'), 56 | ("cem'iyet", "dʒæm'ijæt") 57 | ] 58 | @pytest.mark.parametrize("input,expected", test_data) 59 | def test_CTS2UAS(input, expected): 60 | converter = UgMultiScriptConverter("CTS", "IPA") 61 | assert converter(input) == expected 62 | -------------------------------------------------------------------------------- /tests/test_UAS2UCS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | 5 | test_data = [ 6 | ("قول", "қол"), 7 | ("باش", "баш"), 8 | ("پۇت", "пут"), 9 | ("كۆز", "көз"), 10 | ("جەڭچى", "җәңчи"), 11 | ("جۇدې", "җуде"), 12 | ("سان", "сан"), 13 | ("سەي", "сәй"), 14 | ("ئې", "е"), 15 | ("شىر", "шир"), 16 | ("شاڭخەي", "шаңхәй"), 17 | ("كىتاب", "китаб"), 18 | ("ۋەتەن", "вәтән"), 19 | ("تومۇر", "томур"), 20 | ("كۆمۈر", "көмүр"), 21 | ("ئېلىكتىر", "еликтир"), 22 | ("ۋيېتنام", "вйетнам"), 23 | ("شىنجاڭ", "шинҗаң"), 24 | ("ئانار", "анар"), 25 | ("ئەنجۈر", "әнҗүр"), 26 | ("ئوردا", "орда"), 27 | ("ئۇرۇش", "уруш"), 28 | ("ئۆردەك", "өрдәк"), 29 | ("ئۈزۈم", "үзүм"), 30 | ("ئېلان", "елан"), 31 | ("ئىنكاس", "инкас"), 32 | ("ئىنىكئانا", "иник'ана"), 33 | ("ئەسئەت", "әс'әт"), 34 | ("رادىئو", "ради'о"), 35 | ("مەسئۇل", "мәс'ул"), 36 | ("قارىئۆرۈك", "қари'өрүк"), 37 | ("نائۈمىد", "на'үмид"), 38 | ("ئىتئېيىق", "ит'ейиқ"), 39 | ("جەمئىي", "җәм'ий"), 40 | ("نەمەنگان", "нәмәнган"), 41 | ("ئۆزخان", "өзхан"), 42 | ("پاسخا", "пасха"), 43 | ("بايرىمى", "байрими"), 44 | ("مائارىپ", "ма'арип"), 45 | ("مۇئەللىم", "му'әллим"), 46 | ("دائىرە", "да'ирә"), 47 | ("مۇئەييەن", "му'әййән"), 48 | ("تەبىئىي", "тәби'ий"), 49 | ("پائالىيەت", "па'алийәт"), 50 | ("ئىسھاق", "исһақ"), 51 | ("ئۆزبېكىستانغا", "өзбекистанға"), 52 | ("ھىنگان", "һинган"), 53 | ("چەكلەنگەن", "чәкләнгән"), 54 | ("گاڭگىراپ", "гаңгирап"), 55 | ("باشلانغۇچ", "башланғуч"), 56 | ("جەمئىيەت", "җәм'ийәт"), 57 | # ("جۇڭخۇا", "җуңхуа|ⱪol"), 58 | ] 59 | 60 | 61 | 
@pytest.mark.parametrize("input,expected", test_data) 62 | def test_UAS2CTS(input, expected): 63 | converter = UgMultiScriptConverter("UAS", "UCS") 64 | assert converter(input) == expected 65 | -------------------------------------------------------------------------------- /tests/test_UAS2CTS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | # Pair each input with its expected output 5 | test_data = [ 6 | ("قول", "qol"), 7 | ("باش", "baş"), 8 | ("پۇت", "put"), 9 | ("كۆز", "köz"), 10 | ("جەڭچى", "ceñçi"), 11 | ("جۇدې", "cudé"), 12 | ("سان", "san"), 13 | ("سەي", "sey"), 14 | ("ئې", "é"), 15 | ("شىر", "şir"), 16 | ("شاڭخەي", "şañxey"), 17 | ("كىتاب", "kitab"), 18 | ("ۋەتەن", "veten"), 19 | ("تومۇر", "tomur"), 20 | ("كۆمۈر", "kömür"), 21 | ("ئېلىكتىر", "éliktir"), 22 | ("ۋيېتنام", "vyétnam"), 23 | ("شىنجاڭ", "şincañ"), 24 | ("ئانار", "anar"), 25 | ("ئەنجۈر", "encür"), 26 | ("ئوردا", "orda"), 27 | ("ئۇرۇش", "uruş"), 28 | ("ئۆردەك", "ördek"), 29 | ("ئۈزۈم", "üzüm"), 30 | ("ئېلان", "élan"), 31 | ("ئىنكاس", "inkas"), 32 | ("ئىنىكئانا", "inik'ana"), 33 | ("ئەسئەت", "es'et"), 34 | ("رادىئو", "radio"), 35 | ("مەسئۇل", "mes'ul"), 36 | ("قارىئۆرۈك", "qariörük"), 37 | ("نائۈمىد", "naümid"), 38 | ("ئىتئېيىق", "it'éyiq"), 39 | ("جەمئىي", "cem'iy"), 40 | ("نەمەنگان", "nemengan"), 41 | ("ئۆزخان", "özxan"), 42 | ("پاسخا", "pasxa"), 43 | ("بايرىمى", "bayrimi"), 44 | ("مائارىپ", "maarip"), 45 | ("مۇئەللىم", "muellim"), 46 | ("دائىرە", "daire"), 47 | ("مۇئەييەن", "mueyyen"), 48 | ("تەبىئىي", "tebiiy"), 49 | ("پائالىيەت", "paaliyet"), 50 | ("ئىسھاق", "ishaq"), 51 | ("ئۆزبېكىستانغا", "özbékistanğa"), 52 | ("ھىنگان", "hingan"), 53 | ("چەكلەنگەن", "çeklengen"), 54 | ("گاڭگىراپ", "gañgirap"), 55 | ("باشلانغۇچ", "başlanğuç"), 56 | ("جەمئىيەت", "cem'iyet"), 57 | # ("جۇڭخۇا", "cuñxua"), 58 | ] 59 | 60 | 61 | @pytest.mark.parametrize("input,expected", test_data) 62 | def 
test_UAS2CTS(input, expected): 63 | converter = UgMultiScriptConverter("UAS", "CTS") 64 | assert converter(input) == expected 65 | -------------------------------------------------------------------------------- /tests/test_UCS2UAS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | 5 | test_data = [ 6 | ("قول", "қол"), 7 | ("باش", "баш"), 8 | ("پۇت", "пут"), 9 | ("كۆز", "көз"), 10 | ("جەڭچى", "җәңчи"), 11 | ("جۇدې", "җуде"), 12 | ("سان", "сан"), 13 | ("سەي", "сәй"), 14 | ("ئې", "е"), 15 | ("شىر", "шир"), 16 | ("شاڭخەي", "шаңхәй"), 17 | ("كىتاب", "китаб"), 18 | ("ۋەتەن", "вәтән"), 19 | ("تومۇر", "томур"), 20 | ("كۆمۈر", "көмүр"), 21 | ("ئېلىكتىر", "еликтир"), 22 | ("ۋيېتنام", "вйетнам"), 23 | ("شىنجاڭ", "шинҗаң"), 24 | ("ئانار", "анар"), 25 | ("ئەنجۈر", "әнҗүр"), 26 | ("ئوردا", "орда"), 27 | ("ئۇرۇش", "уруш"), 28 | ("ئۆردەك", "өрдәк"), 29 | ("ئۈزۈم", "үзүм"), 30 | ("ئېلان", "елан"), 31 | ("ئىنكاس", "инкас"), 32 | ("ئىنىكئانا", "иник'ана"), 33 | ("ئەسئەت", "әс'әт"), 34 | ("رادىئو", "ради'о"), 35 | ("مەسئۇل", "мәс'ул"), 36 | ("قارىئۆرۈك", "қари'өрүк"), 37 | ("نائۈمىد", "на'үмид"), 38 | ("ئىتئېيىق", "ит'ейиқ"), 39 | ("جەمئىي", "җәм'ий"), 40 | ("نەمەنگان", "нәмәнган"), 41 | ("ئۆزخان", "өзхан"), 42 | ("پاسخا", "пасха"), 43 | ("بايرىمى", "байрими"), 44 | ("مائارىپ", "ма'арип"), 45 | ("مۇئەللىم", "му'әллим"), 46 | ("دائىرە", "да'ирә"), 47 | ("مۇئەييەن", "му'әййән"), 48 | ("تەبىئىي", "тәби'ий"), 49 | ("پائالىيەت", "па'алийәт"), 50 | ("ئىسھاق", "исһақ"), 51 | ("ئۆزبېكىستانغا", "өзбекистанға"), 52 | ("ھىنگان", "һинган"), 53 | ("چەكلەنگەن", "чәкләнгән"), 54 | ("گاڭگىراپ", "гаңгирап"), 55 | ("باشلانغۇچ", "башланғуч"), 56 | ("جەمئىيەت", "җәм'ийәт"), 57 | # ("جۇڭخۇا", "җуңхуа|ⱪol"), 58 | ] 59 | 60 | test_data = [(value, key) for key, value in test_data] 61 | 62 | @pytest.mark.parametrize("input,expected", test_data) 63 | def test_UAS2CTS(input, expected): 64 | converter = 
UgMultiScriptConverter("UCS", "UAS") 65 | assert converter(input) == expected 66 | -------------------------------------------------------------------------------- /tests/test_CTS2XJUS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | # Pair each input with its expected output 5 | test_data = [ 6 | ("qol", "qol"), 7 | ("baş", "bax"), 8 | ("put", "put"), 9 | ("köz", "kOz"), 10 | ("ceñçi", "jANci"), 11 | ("cudé", "jude"), 12 | ("san", "san"), 13 | ("sey", "sAy"), 14 | ("é", "ve"), 15 | ("şir", "xir"), 16 | ("şañxey", "xaNHAy"), 17 | ("kitab", "kitab"), 18 | ("veten", "wAtAn"), 19 | ("tomur", "tomur"), 20 | ("kömür", "kOmUr"), 21 | ("éliktir", "veliktir"), 22 | ("vyétnam", "wyetnam"), 23 | ("şincañ", "xinjaN"), 24 | ("anar", "vanar"), 25 | ("encür", "vAnjUr"), 26 | ("orda", "vorda"), 27 | ("uruş", "vurux"), 28 | ("ördek", "vOrdAk"), 29 | ("üzüm", "vUzUm"), 30 | ("élan", "velan"), 31 | ("inkas", "vinkas"), 32 | ("inik'ana", "vinikvana"), 33 | ("es'et", "vAsvAt"), 34 | ("radio", "radivo"), 35 | ("mes'ul", "mAsvul"), 36 | ("qariörük", "qarivOrUk"), 37 | ("naümid", "navUmid"), 38 | ("it'éyiq", "vitveyiq"), 39 | ("cem'iy", "jAmviy"), 40 | ("nemengan", "nAmAngan"), 41 | ("özxan", "vOzHan"), 42 | ("pasxa", "pasHa"), 43 | ("bayrimi", "bayrimi"), 44 | ("maarip", "mavarip"), 45 | ("muellim", "muvAllim"), 46 | ("daire", "davirA"), 47 | ("mueyyen", "muvAyyAn"), 48 | ("tebiiy", "tAbiviy"), 49 | ("paaliyet", "pavaliyAt"), 50 | ("ishaq", "vishaq"), 51 | ("özbékistanğa", "vOzbekistanGa"), 52 | ("hingan", "hingan"), 53 | ("çeklengen", "cAklAngAn"), 54 | ("gañgirap", "gaNgirap"), 55 | ("başlanğuç", "baxlanGuc"), 56 | ("cem'iyet", "jAmviyAt"), 57 | # ('cuñxua', 'جۇڭخۇا'), 58 | # ('cuñxua', 'جۇڭخۇئا'), 59 | ] 60 | 61 | 62 | 63 | @pytest.mark.parametrize("input,expected", test_data) 64 | def test_CTS2UAS(input, expected): 65 | converter = UgMultiScriptConverter("CTS", "XJUS") 66 | 
assert converter(input) == expected 67 | -------------------------------------------------------------------------------- /tests/test_UAS2ULS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | # Pair each input with its expected output 5 | test_data = [ 6 | ("قول", "qol"), 7 | ("باش", "bash"), 8 | ("پۇت", "put"), 9 | ("كۆز", "köz"), 10 | ("جەڭچى", "jengchi"), 11 | ("جۇدې", "judé"), 12 | ("سان", "san"), 13 | ("سەي", "sey"), 14 | ("ئې", "é"), 15 | ("شىر", "shir"), 16 | ("شاڭخەي", "shangxey"), 17 | ("كىتاب", "kitab"), 18 | ("ۋەتەن", "weten"), 19 | ("تومۇر", "tomur"), 20 | ("كۆمۈر", "kömür"), 21 | ("ئېلىكتىر", "éliktir"), 22 | ("ۋيېتنام", "wyétnam"), 23 | ("شىنجاڭ", "shinjang"), 24 | ("ئانار", "anar"), 25 | ("ئەنجۈر", "enjür"), 26 | ("ئوردا", "orda"), 27 | ("ئۇرۇش", "urush"), 28 | ("ئۆردەك", "ördek"), 29 | ("ئۈزۈم", "üzüm"), 30 | ("ئېلان", "élan"), 31 | ("ئىنكاس", "inkas"), 32 | ("ئىنىكئانا", "inik'ana"), 33 | ("ئەسئەت", "es'et"), 34 | ("رادىئو", "radi'o"), 35 | ("مەسئۇل", "mes'ul"), 36 | ("قارىئۆرۈك", "qari'örük"), 37 | ("نائۈمىد", "na'ümid"), 38 | ("ئىتئېيىق", "it'éyiq"), 39 | ("جەمئىي", "jem'iy"), 40 | ("نەمەنگان", "nemen'gan"), 41 | ("ئۆزخان", "özxan"), 42 | ("پاسخا", "pasxa"), 43 | ("بايرىمى", "bayrimi"), 44 | ("مائارىپ", "ma'arip"), 45 | ("مۇئەللىم", "mu'ellim"), 46 | ("دائىرە", "da'ire"), 47 | ("مۇئەييەن", "mu'eyyen"), 48 | ("تەبىئىي", "tebi'iy"), 49 | ("پائالىيەت", "pa'aliyet"), 50 | ("ئىسھاق", "is'haq"), 51 | ("ئۆزبېكىستانغا", "özbékistan'gha"), 52 | ("ھىنگان", "hin'gan"), 53 | ("چەكلەنگەن", "cheklen'gen"), 54 | ("گاڭگىراپ", "ganggirap"), 55 | ("باشلانغۇچ", "bashlan'ghuch"), 56 | ("جەمئىيەت", "jem'iyet"), 57 | ("جۇڭخۇا", "jungxua"), 58 | ("ئەدەب-ئەخلاق", "edeb-exlaq"), 59 | ] 60 | 61 | @pytest.mark.parametrize("input,expected", test_data) 62 | def test_UAS2CTS(input, expected): 63 | converter = UgMultiScriptConverter("UAS", "ULS") 64 | assert 
converter(input) == expected 65 | -------------------------------------------------------------------------------- /tests/test_CTS2UAS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | # Pair each input with its expected output 5 | test_data = [ 6 | ("qol", "قول"), 7 | ("baş", "باش"), 8 | ("put", "پۇت"), 9 | ("köz", "كۆز"), 10 | ("ceñçi", "جەڭچى"), 11 | ("cudé", "جۇدې"), 12 | ("san", "سان"), 13 | ("sey", "سەي"), 14 | ("é", "ئې"), 15 | ("şir", "شىر"), 16 | ("şañxey", "شاڭخەي"), 17 | ("kitab", "كىتاب"), 18 | ("veten", "ۋەتەن"), 19 | ("tomur", "تومۇر"), 20 | ("kömür", "كۆمۈر"), 21 | ("éliktir", "ئېلىكتىر"), 22 | ("veten", "ۋەتەن"), 23 | ("vyétnam", "ۋيېتنام"), 24 | ("şincañ", "شىنجاڭ"), 25 | ("anar", "ئانار"), 26 | ("encür", "ئەنجۈر"), 27 | ("orda", "ئوردا"), 28 | ("uruş", "ئۇرۇش"), 29 | ("ördek", "ئۆردەك"), 30 | ("üzüm", "ئۈزۈم"), 31 | ("élan", "ئېلان"), 32 | ("inkas", "ئىنكاس"), 33 | ("inik'ana", "ئىنىكئانا"), 34 | ("es'et", "ئەسئەت"), 35 | ("radio", "رادىئو"), 36 | ("mes'ul", "مەسئۇل"), 37 | ("qariörük", "قارىئۆرۈك"), 38 | ("naümid", "نائۈمىد"), 39 | ("it'éyiq", "ئىتئېيىق"), 40 | ("cem'iy", "جەمئىي"), 41 | ("nemengan", "نەمەنگان"), 42 | ("özxan", "ئۆزخان"), 43 | ("pasxa", "پاسخا"), 44 | ("bayrimi", "بايرىمى"), 45 | ("maarip", "مائارىپ"), 46 | ("muellim", "مۇئەللىم"), 47 | ("daire", "دائىرە"), 48 | ("mueyyen", "مۇئەييەن"), 49 | ("tebiiy", "تەبىئىي"), 50 | ("paaliyet", "پائالىيەت"), 51 | ("ishaq", "ئىسھاق"), 52 | ("özbékistanğa", "ئۆزبېكىستانغا"), 53 | ("hingan", "ھىنگان"), 54 | ("çeklengen", "چەكلەنگەن"), 55 | ("gañgirap", "گاڭگىراپ"), 56 | ("başlanğuç", "باشلانغۇچ"), 57 | ("cem'iyet", "جەمئىيەت"), 58 | # ('cuñxua', 'جۇڭخۇا'), 59 | # ('cuñxua', 'جۇڭخۇئا'), 60 | ] 61 | 62 | 63 | 64 | @pytest.mark.parametrize("input,expected", test_data) 65 | def test_CTS2UAS(input, expected): 66 | converter = UgMultiScriptConverter("CTS", "UAS") 67 | assert converter(input) 
== expected 68 | -------------------------------------------------------------------------------- /tests/test_ULS2UAS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | # Pair each input with its expected output 5 | test_data = [ 6 | ("قول", "qol"), 7 | ("باش", "bash"), 8 | ("پۇت", "put"), 9 | ("كۆز", "köz"), 10 | ("جەڭچى", "jengchi"), 11 | ("جۇدې", "judé"), 12 | ("سان", "san"), 13 | ("سەي", "sey"), 14 | ("ئې", "é"), 15 | ("شىر", "shir"), 16 | ("شاڭخەي", "shangxey"), 17 | ("كىتاب", "kitab"), 18 | ("ۋەتەن", "weten"), 19 | ("تومۇر", "tomur"), 20 | ("كۆمۈر", "kömür"), 21 | ("ئېلىكتىر", "éliktir"), 22 | ("ۋيېتنام", "wyétnam"), 23 | ("شىنجاڭ", "shinjang"), 24 | ("ئانار", "anar"), 25 | ("ئەنجۈر", "enjür"), 26 | ("ئوردا", "orda"), 27 | ("ئۇرۇش", "urush"), 28 | ("ئۆردەك", "ördek"), 29 | ("ئۈزۈم", "üzüm"), 30 | ("ئېلان", "élan"), 31 | ("ئىنكاس", "inkas"), 32 | ("ئىنىكئانا", "inik'ana"), 33 | ("ئەسئەت", "es'et"), 34 | ("رادىئو", "radio"), 35 | ("مەسئۇل", "mes'ul"), 36 | ("قارىئۆرۈك", "qariörük"), 37 | ("نائۈمىد", "naümid"), 38 | ("ئىتئېيىق", "it'éyiq"), 39 | ("جەمئىي", "jem'iy"), 40 | ("نەمەنگان", "nemen'gan"), 41 | ("ئۆزخان", "özxan"), 42 | ("پاسخا", "pasxa"), 43 | ("بايرىمى", "bayrimi"), 44 | ("مائارىپ", "maarip"), 45 | ("مۇئەللىم", "muellim"), 46 | ("دائىرە", "daire"), 47 | ("مۇئەييەن", "mueyyen"), 48 | ("تەبىئىي", "tebiiy"), 49 | ("پائالىيەت", "paaliyet"), 50 | ("ئىسھاق", "is'haq"), 51 | ("ئۆزبېكىستانغا", "özbékistan'gha"), 52 | ("ھىنگان", "hin'gan"), 53 | ("چەكلەنگەن", "cheklen'gen"), 54 | ("گاڭگىراپ", "ganggirap"), 55 | ("باشلانغۇچ", "bashlan'ghuch"), 56 | ("جەمئىيەت", "jem'iyet"), 57 | # ("جۇڭخۇا", "jungxua"), 58 | ] 59 | 60 | test_data = [(value, key) for key, value in test_data] 61 | 62 | @pytest.mark.parametrize("input,expected", test_data) 63 | def test_UAS2CTS(input, expected): 64 | converter = UgMultiScriptConverter("ULS", "UAS") 65 | assert 
converter(input) == expected 66 | -------------------------------------------------------------------------------- /tests/test_XJUS2CTS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | # Pair each input with its expected output 5 | test_data = [ 6 | ("qol", "qol"), 7 | ("baş", "bax"), 8 | ("put", "put"), 9 | ("köz", "kOz"), 10 | ("ceñçi", "jANci"), 11 | ("cudé", "jude"), 12 | ("san", "san"), 13 | ("sey", "sAy"), 14 | ("é", "ve"), 15 | ("şir", "xir"), 16 | ("şañxey", "xaNHAy"), 17 | ("kitab", "kitab"), 18 | ("veten", "wAtAn"), 19 | ("tomur", "tomur"), 20 | ("kömür", "kOmUr"), 21 | ("éliktir", "veliktir"), 22 | ("vyétnam", "wyetnam"), 23 | ("şincañ", "xinjaN"), 24 | ("anar", "vanar"), 25 | ("encür", "vAnjUr"), 26 | ("orda", "vorda"), 27 | ("uruş", "vurux"), 28 | ("ördek", "vOrdAk"), 29 | ("üzüm", "vUzUm"), 30 | ("élan", "velan"), 31 | ("inkas", "vinkas"), 32 | ("inik'ana", "vinikvana"), 33 | ("es'et", "vAsvAt"), 34 | ("radio", "radivo"), 35 | ("mes'ul", "mAsvul"), 36 | ("qariörük", "qarivOrUk"), 37 | ("naümid", "navUmid"), 38 | ("it'éyiq", "vitveyiq"), 39 | ("cem'iy", "jAmviy"), 40 | ("nemengan", "nAmAngan"), 41 | ("özxan", "vOzHan"), 42 | ("pasxa", "pasHa"), 43 | ("bayrimi", "bayrimi"), 44 | ("maarip", "mavarip"), 45 | ("muellim", "muvAllim"), 46 | ("daire", "davirA"), 47 | ("mueyyen", "muvAyyAn"), 48 | ("tebiiy", "tAbiviy"), 49 | ("paaliyet", "pavaliyAt"), 50 | ("ishaq", "vishaq"), 51 | ("özbékistanğa", "vOzbekistanGa"), 52 | ("hingan", "hingan"), 53 | ("çeklengen", "cAklAngAn"), 54 | ("gañgirap", "gaNgirap"), 55 | ("başlanğuç", "baxlanGuc"), 56 | ("cem'iyet", "jAmviyAt"), 57 | # ('cuñxua', 'جۇڭخۇا'), 58 | # ('cuñxua', 'جۇڭخۇئا'), 59 | ] 60 | 61 | 62 | test_data = [(value, key) for key, value in test_data] 63 | @pytest.mark.parametrize("input,expected", test_data) 64 | def test_CTS2UAS(input, expected): 65 | converter = UgMultiScriptConverter("XJUS", 
"CTS") 66 | assert converter(input) == expected 67 | -------------------------------------------------------------------------------- /tests/test_UZBEK2CTS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | test_data = [('qol', 'qal'), 5 | ('bosh', 'baş'), 6 | ('put', 'put'), 7 | ('ko‘z', 'köz'), 8 | ('jangchi', 'ceñçi'), 9 | ('jude', 'cudé'), 10 | ('son', 'san'), 11 | ('say', 'sey'), 12 | ('e', 'é'), 13 | ('shir', 'şir'), 14 | ('shongxay', 'şañxey'), 15 | ('kitob', 'kitab'), 16 | ('vatan', 'veten'), 17 | ('tomur', 'tamur'), 18 | ('ko‘mu‘r', 'kömür'), 19 | ('eliktir', 'éliktir'), 20 | ('vyetnom', 'vyétnam'), 21 | ('shinjong', 'şincañ'), 22 | ('onor', 'anar'), 23 | ('anju‘r', 'encür'), 24 | ('ordo', 'arda'), 25 | ('urush', 'uruş'), 26 | ('o‘rdak', 'ördek'), 27 | ('u‘zu‘m', 'üzüm'), 28 | ('elon', 'élan'), 29 | ('inkos', 'inkas'), 30 | ("inik'ono", "inik'ana"), 31 | ("as'at", "es'et"), 32 | ('rodio', 'radia'), 33 | ("mas'ul", "mes'ul"), 34 | ('qorio‘ru‘k', 'qariörük'), 35 | ('nou‘mid', 'naümid'), 36 | ("it'eyiq", "it'éyiq"), 37 | ("jam'iy", "cem'iy"), 38 | ("naman'gon", "nemengan"), 39 | ('o‘zxon', 'özxan'), 40 | ('posxo', 'pasxa'), 41 | ('boyrimi', 'bayrimi'), 42 | ('moorip', 'maarip'), 43 | ('muallim', 'muellim'), 44 | ('doira', 'daire'), 45 | ('muayyan', 'mueyyen'), 46 | ('tabiiy', 'tebiiy'), 47 | ('pooliyat', 'paaliyet'), 48 | ("is'hoq", 'ishaq'), 49 | ("o‘zbekiston'g‘o", "özbékistanğa"), 50 | ("hin'gon", "hingan"), 51 | ("chaklan'gan", 'çeklengen'), 52 | ('gonggirop', 'gañgirap'), 53 | ("boshlon'g‘uch", 'başlanğuç'), 54 | ("jam'iyat", "cem'iyet") 55 | ] 56 | 57 | @pytest.mark.parametrize("input,expected", test_data) 58 | def test_UAS2CTS(input, expected): 59 | converter = UgMultiScriptConverter("UZLS", "CTS") 60 | assert converter(input) == expected 61 | -------------------------------------------------------------------------------- 
/tests/test_XJUS2UAS.py: -------------------------------------------------------------------------------- 1 | from umsc.umsc import UgMultiScriptConverter 2 | import pytest 3 | 4 | test_data = [ 5 | ("qol", "قول"), 6 | ("bax", "باش"), 7 | ("put", "پۇت"), 8 | ("kOz", "كۆز"), 9 | ("wAdA", "ۋەدە"), 10 | ("juda", "جۇدا"), 11 | ("sAn", "سەن"), 12 | ("samsaq", "سامساق"), 13 | ("haywan", "ھايۋان"), 14 | ("xir", "شىر"), 15 | ("Gulja", "غۇلجا"), 16 | ("kitab", "كىتاب"), 17 | ("wAtAn", "ۋەتەن"), 18 | ("tomur", "تومۇر"), 19 | ("kOmUr", "كۆمۈر"), 20 | ("asman", "ئاسمان"), 21 | ("muAllim", "مۇئەللىم"), 22 | ("sincay", "سىنچاي"), 23 | ("anar", "ئانار"), 24 | ("vAnjur", "ئەنجۇر"), 25 | ("orda", "ئوردا"), 26 | ("urux", "ئۇرۇش"), 27 | ("OrdAk", "ئۆردەك"), 28 | ("UzUm", "ئۈزۈم"), 29 | ("elan", "ئېلان"), 30 | ("inkas", "ئىنكاس"), 31 | ("ana", "ئانا"), 32 | ("bizniN muAllim", "بىزنىڭ مۇئەللىم"), 33 | ("uzun sApAr", "ئۇزۇن سەپەر"), 34 | ("mAktAp mudir", "مەكتەپ مۇدىر"), 35 | ("yoGan bax", "يوغان باش"), 36 | ("maymun", "مايمۇن"), 37 | ("bayraq", "بايراق"), 38 | ("bArkAtlik", "بەركەتلىك"), 39 | ("poyuz", "پويۇز"), 40 | ("bizniN Oy", "بىزنىڭ ئۆي"), 41 | ("kUnlAr", "كۈنلەر"), 42 | ("bayramlar", "بايراملار"), 43 | ("tirixcan", "تىرىشچان"), 44 | ("yalGuz", "يالغۇز"), 45 | ("bir kuni", "بىر كۇنى"), 46 | ("balaN azsa ah urma yol tapidu bir kuni", 47 | "بالاڭ ئازسا ئاھ ئۇرما يول تاپىدۇ بىر كۇنى"), 48 | ("aman bolsa ularmu pul tapidu bir kuni", 49 | "ئامان بولسا ئۇلارمۇ پۇل تاپىدۇ بىر كۇنى"), 50 | ("Hata basqan qAdAmlAr qaldurGanda qAdAmdin", 51 | "خاتا باسقان قەدەملەر قالدۇرغاندا قەدەمدىن"), 52 | ("Ozi maNGan catqalGa ot yaqidu bir kuni", 53 | "ئۆزى ماڭغان چاتقالغا ئوت ياقىدۇ بىر كۇنى"), 54 | ("gAdAnkAxlik hickimdin udum AmAs ularGa", 55 | "گەدەنكەشلىك ھىچكىمدىن ئۇدۇم ئەمەس ئۇلارغا"), 56 | ("Ozi kilip aldiNGa gAp acidu bir kuni", 57 | "ئۆزى كىلىپ ئالدىڭغا گەپ ئاچىدۇ بىر كۇنى"), 58 | ("at bolGicA asawtay hAryan cipip baqmaydu", 59 | "ئات بولغىچە ئاساۋتاي ھەريان چىپىپ باقمايدۇ"), 60 | ("ularGimu 
bu dunya yUk artidu bir kuni", 61 | "ئۇلارغىمۇ بۇ دۇنيا يۈك ئارتىدۇ بىر كۇنى"), 62 | ] 63 | 64 | 65 | @pytest.mark.parametrize("input,expected", test_data) 66 | def test_CTS2UAS(input, expected): 67 | converter = UgMultiScriptConverter("XJUS", "UAS") 68 | assert converter(input) == expected 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Script Converter for Uyghur Language 2 | This converter supports multiple Uyghur writing systems: 3 | - **ULS** — Uyghur Latin Script 4 | - **UAS** — Uyghur Arabic Script 5 | - **CTS** — Common Turkic Script 6 | - **UCS** — Uyghur Cyrillic Script 7 | - **UYS** — Uyghur Yengi (New) Script 8 | - **IPA** — International Phonetic Alphabet 9 | - **UZLS** — Uzbek Latin Script 10 | - **XJUS** — Xinjiang University Script 11 | 12 | ## Installation 13 | ``` 14 | pip install umsc 15 | ``` 16 | 17 | 18 | ## Mapping table 19 | | UAS | CTS | ULS| UCS|UYS| IPA | UZLS | XJUS | 20 | |-----|----| ---- | --- | -- |-------|---|------| 21 | | ا | a | a | а |a | /ɑ/ | o | a | 22 | | ە | e | e | ә |ə | /æ/ | a | A | 23 | | ب | b | b | б |b | /b/ | b | b | 24 | | پ | p | p | п |p | /p/ | p | p | 25 | | ت | t | t | т |t | /t/ | t | t | 26 | | ج | c | j | җ |j | /d͡ʒ/ | j | j | 27 | | چ | ç | ch | ч |q | /t͡ʃ/ | ch | c | 28 | | خ | x | x | х |h | /χ/ | x | H | 29 | | د | d | d | д |d | /d/ | d | d | 30 | | ر | r | r | р |r | /r/ | r | r | 31 | | ز | z | z | з |z | /z/ | z | z | 32 | | ژ | j | zh | ж |ⱬ | /ʒ/ | j | J | 33 | | س | s | s | с |s | /s/ | s | s | 34 | | ش | ş | sh | ш |x | /ʃ/ | sh | x | 35 | | ف | f | f | ф |f | /f/ | f | f | 36 | | ڭ | ñ | ng | ң |ng | /ŋ/ | ng | N | 37 | | ل | l | l | л |l | /l/ | l | l | 38 | | م | m | m | м |m | /m/ | m | m | 39 | | ھ | h | h | һ |ⱨ | /h/ | h | h | 40 | | و | o | o | о |o | /o/ | oʻ | o | 41 | | ۇ | u | u | у |u | /u/ | u | u | 42 | | ۆ | ö | ö | ө |ɵ | /ø/ | oʻ | O | 43 
| | ۈ | ü | ü | ү |ü | /y/ | uʻ | U | 44 | | ۋ | v | w | в |w | /w/ | v | w | 45 | | ې | é | é | е |e | /ɛ/ | e | e | 46 | | ى | i | i | и |i | /i/ | i | i | 47 | | ي | y | y | й |y | /j/ | y | y | 48 | | ق | q | q | қ |ⱪ | /q/ | q | q | 49 | | ك | k | k | к |k | /k/ | k | k | 50 | | گ | g | g | г |g | /ɡ/ | g | g | 51 | | ن | n | n | н |n | /n/ | n | n | 52 | | غ | ğ | gh | ғ |ƣ | /ʁ/ | gʻ | G | 53 | | ئ | | | | | | | v | 54 | | يا | ya | ya | я |ya | | ya | ya | 55 | | يۇ | yu | yu | ю |yu | | yu | yu | 56 | 57 | ## Sample input and output examples 58 | 59 | Review the files in the tests directory for examples of converting between different scripts. 60 | 61 | ## Usage 62 | 63 | ``` 64 | from umsc import UgMultiScriptConverter 65 | # To convert text, you need to define source and target scripts 66 | # Abbreviations of the scripts 67 | # ULS | Uyghur Latin Script 68 | # UYS | Uyghur Yengi (New) Script 69 | # CPS | Chinese Pinyin Script 70 | # UAS | Uyghur Arabic Script 71 | # CTS | Common Turkic Script 72 | # UCS | Uyghur Cyrillic Script 73 | # XJUS | Xinjiang University Script (English letters, case-sensitive) 74 | # UZLS | Uzbek Latin Script 75 | # Convert Uyghur Arabic Script to Uyghur Latin Script 76 | source_script = 'UAS' 77 | target_script = 'ULS' 78 | converter = UgMultiScriptConverter(source_script, target_script) 79 | text1 = 'ياخشىمۇسىز!' 80 | text1 = converter(text1) 81 | print(text1) 82 | # Convert Uyghur Latin Script to Uyghur Arabic Script 83 | source_script = 'ULS' 84 | target_script = 'UAS' 85 | converter = UgMultiScriptConverter(source_script, target_script) 86 | text2 = 'yaxshimusiz!' 87 | text2 = converter(text2) 88 | print(text2) 89 | ``` 90 | 91 | ## Notes 92 | - IPA and Uzbek support are still under development, so their mappings might not be very accurate. For Uzbek in particular, it is not yet clear how "ئا" and "ئە" should be mapped. 93 | 94 | 95 | ## Citation 96 | 97 | If you wish to cite this project, please use `cite this repository`. 
98 | 99 | ## Contributing 100 | Feel free to open an issue or a pull request. 101 | 102 | ## License 103 | Distributed under the Apache 2.0 License. See [`LICENSE`](LICENSE) for more information. 104 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /umsc/umsc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | 6 | # Original Author: neouyghur 7 | # Mail: osmanjan.t@gmail.com 8 | # Licence: MIT License 9 | 10 | This is a simple script to convert Uyghur texts written in different Uyghur scripts. It supports Uyghur Arabic, 11 | Latin Common Turkish scripts, Uyghur Latin Script (also known as computer script), Uyghur Yengi (new) script and Uyghur 12 | Cyrillic script. It is written in Python and uses PyQt5 for GUI. The source script will be converted to common turkic script, 13 | then converted to target script. Therefore, the program is not very efficient but easy to add new scripts. 
#
# Abbreviations used in this file:
#
#   ULS  | Uyghur Latin Script
#   UYS  | Uyghur Yengi (New) Script
#   CPS  | Chinese Pinyin Script
#   UAS  | Uyghur Arabic Script
#   CTS  | Common Turkic Script
#   UCS  | Uyghur Cyrillic Script
#   XJUS | Xinjiang University English Case Sensitive
#   UZLS | Uzbek Latin Script
#   IPA  | International Phonetic Alphabet
#
# '''  # closes the module docstring
import argparse

try:
    import regex as re
except ImportError:
    # Every pattern in this module uses only syntax that the standard library
    # accepts as well (fixed-width look-behinds), so ``re`` is a drop-in
    # replacement when the third-party ``regex`` package is not installed.
    import re


class UgMultiScriptConverter:
    """Convert Uyghur text from one script to another.

    Conversions are expressed as methods named ``<SOURCE>2<TARGET>``; calling
    an instance looks that method up by name and applies it. Common Turkic
    Script (CTS) is the pivot: ``<SOURCE>2CTS`` methods normalise the input
    and ``CTS2<TARGET>`` methods render it, and the remaining methods simply
    chain the two.
    """

    # Ordered ``(old, new)`` substitution rules. They are applied one after
    # another with ``str.replace``, so the ORDER IS SIGNIFICANT: later rules
    # see the output of earlier ones.
    _ULS_TO_CTS = (
        ("j", "c"), ("ng", "ñ"), ("n'g", "ng"), ("'ng", "ñ"), ("ch", "ç"),
        ("zh", "j"), ("sh", "ş"), ("'gh", "ğ"), ("gh", "ğ"), ("w", "v"),
    )
    _UYS_TO_CTS = (
        ("e", "é"), ("ə", "e"), ("j", "c"), ("q", "ç"), ("ⱬ", "j"),
        ("x", "ş"), ("h", "x"), ("ⱨ", "h"), ("ng", "ñ"), ("ø", "ö"),
        ("ü", "ü"),  # kept from the original rule list
        ("w", "v"), ("ⱪ", "q"), ("ƣ", "ğ"),
    )
    _XJUS_TO_CTS = (
        ("v", "\u0626"), ("c", "ç"),
        # "j" -> "c" must run BEFORE "J" -> "j"; otherwise "J" would be
        # rewritten twice and end up as "c".
        ("j", "c"), ("J", "j"),
        ("x", "ş"), ("H", "x"), ("N", "ñ"), ("O", "ö"), ("U", "ü"),
        ("e", "é"), ("A", "e"), ("G", "ğ"), ("w", "v"),
    )
    _UZLS_TO_CTS = (
        ("ch", "ç"), ("sh", "ş"), ("s'h", "sh"), ("ng", "ñ"), ("n'g", "ng"),
        ("g‘", "ğ"), ("o‘", "ö"), ("u‘", "ü"), ("e", "é"), ("a", "e"),
        ("o", "a"), ("j", "c"),
    )
    _CTS_TO_ULS = (
        ("ng", "n'g"), ("sh", "s'h"), ("ch", "c'h"), ("zh", "z'h"),
        ("gh", "g'h"), ("nğ", "n'gh"), ("ñ", "ng"), ("j", "zh"), ("c", "j"),
        ("ç", "ch"), ("ş", "sh"), ("ğ", "gh"), ("v", "w"),
    )
    _CTS_TO_UYS = (
        ("ng", "n'g"), ("e", "ə"), ("j", "ⱬ"), ("c", "j"), ("q", "ⱪ"),
        ("ç", "q"), ("h", "ⱨ"), ("x", "h"), ("ş", "x"), ("ñ", "ng"),
        ("ö", "ø"), ("v", "w"), ("é", "e"), ("ğ", "ƣ"),
    )
    _CTS_TO_UZLS = (
        ("a", "o"), ("e", "a"), ("c", "j"), ("ç", "ch"), ("ş", "sh"),
        ("ñ", "ng"), ("ö", "o‘"), ("ü", "u‘"), ("é", "e"), ("ğ", "g‘"),
    )
    _CTS_TO_XJUS = (
        ("e", "A"), ("x", "H"), ("j", "J"), ("c", "j"), ("ç", "c"),
        ("ş", "x"), ("ñ", "N"), ("ö", "O"), ("ü", "U"), ("é", "e"),
        ("ğ", "G"), ("v", "w"),
    )

    # U+0626 that is NOT directly preceded by a CTS letter (this includes the
    # very start of the text, where nothing precedes it).
    _HAMZA_NOT_AFTER_LETTER = re.compile(
        r'(?<![aeuoöübptcçxdzrjsşfñlmhvéiyqkgnğ])\u0626')
    # A CTS vowel immediately followed by U+0626.
    _VOWEL_THEN_HAMZA = re.compile(r'([aeéiouöü])\u0626')
    # A CTS vowel that is NOT directly preceded by a CTS consonant.
    _CTS_VOWEL_NOT_AFTER_CONSONANT = re.compile(
        r'(?<![bptcçxdrzjsşfñlmhvyqkgnğ])[aeéiouöü]')
    # An XJUS vowel that is NOT directly preceded by an XJUS consonant.
    _XJUS_VOWEL_NOT_AFTER_CONSONANT = re.compile(
        r'(?<![bptcxdrzjJsxfNlmhHyqkgnGw])[aAeiouOU]')
    # A UAS vowel at the start of the text or right after "-", whitespace or
    # another UAS vowel.
    _UAS_BARE_VOWEL = re.compile(r"(^|-|\s|[اەېىوۇۆۈ])([اەېىوۇۆۈ])")
    _ARABIC_BLOCK_CHAR = re.compile('[\u0621-\u06ff]')

    def __init__(self, source_script, target_script, less_apostrophe=False):
        """Remember the default source/target scripts and build the tables.

        Parameters
        ----------
        source_script : str
            Abbreviation of the script the input is written in.
        target_script : str
            Abbreviation of the script to produce.
        less_apostrophe : bool
            Accepted for backward compatibility; currently unused.
        """
        self.source_script = source_script
        self.target_script = target_script

        # The four tables below are parallel: entry i of each table is the
        # same sound (or punctuation mark) in UAS, CTS, UCS and IPA.
        self.__uas_group1 = (
            u'ا', u'ە', u'ب', u'پ', u'ت', u'ج', u'چ', u'خ', u'د', u'ر',
            u'ز', u'ژ', u'س', u'ش', u'ف', u'ڭ', u'ل', u'لا', u'م', u'ھ',
            u'و', u'ۇ', u'ۆ', u'ۈ', u'ۋ', u'ې', u'ى', u'ي', u'ق', u'ك',
            u'گ', u'ن', u'غ', u'؟', u'،', u'؛', u'٭')
        self.__cts_group1 = (
            u'a', u'e', u'b', u'p', u't', u'c', u'ç', u'x', u'd', u'r',
            u'z', u'j', u's', u'ş', u'f', u'ñ', u'l', u'la', u'm', u'h',
            u'o', u'u', u'ö', u'ü', u'v', u'é', u'i', u'y', u'q', u'k',
            u'g', u'n', u'ğ', u'?', u',', u';', u'*')
        self.__ucs_group1 = (
            u'а', u'ә', u'б', u'п', u'т', u'җ', u'ч', u'х', u'д', u'р',
            u'з', u'ж', u'с', u'ш', u'ф', u'ң', u'л', u'ла', u'м', u'һ',
            u'о', u'у', u'ө', u'ү', u'в', u'е', u'и', u'й', u'қ', u'к',
            u'г', u'н', u'ғ', u'?', u',', u';', u'*')
        # The original author notes that the IPA table is not complete yet.
        self.__ipa_group1 = (
            "ɑ", "æ", "b", "p", "t", "dʒ", "tʃ", "χ", "d", "r",
            "z", "ʒ", "s", "ʃ", "f", "ŋ", "l", "la", "m", "h",
            "o", "u", "ø", "y", "w", "ɛ", "i", "j", "q", "k",
            "ɡ", "n", "ʁ", u'?', u',', u';', u'*')

    def __call__(self, text, source_script=None, target_script=None):
        """Convert ``text`` from the source script to the target script.

        Parameters
        ----------
        text : str
            Text to convert.
        source_script, target_script : str, optional
            When given, they replace the scripts stored on the instance (the
            new values are remembered for later calls). Script names are
            upper-cased before use.

        Returns
        -------
        str
            The converted text, or ``text`` itself when both scripts match.

        Raises
        ------
        ValueError
            If the class has no ``<SOURCE>2<TARGET>`` method.
        """
        self.source_script = (source_script or self.source_script).upper()
        self.target_script = (target_script or self.target_script).upper()

        if self.target_script == self.source_script:
            return text  # No conversion needed

        method_name = f'{self.source_script}2{self.target_script}'
        convert_method = getattr(self, method_name, None)
        if not convert_method:
            raise ValueError(
                f'Conversion from {self.source_script} to {self.target_script} not supported')
        return convert_method(text)

    @staticmethod
    def isPureUyghurScript(herp):
        """Return True if ``herp`` contains a character in U+0621..U+06FF."""
        return UgMultiScriptConverter._ARABIC_BLOCK_CHAR.search(herp) is not None

    @staticmethod
    def _apply_rules(text, rules):
        """Apply ``(old, new)`` replacements to ``text`` one after another."""
        for old, new in rules:
            text = text.replace(old, new)
        return text

    def _repalce_via_table(self, text, tab1, tab2):
        """Replace each entry of ``tab1`` with the same-index entry of ``tab2``."""
        return self._apply_rules(text, zip(tab1, tab2))

    # ----------------------------------------------
    # Source script to common turkic script
    def UAS2CTS(self, text, keep_apstrophe=False):
        """Convert Uyghur Arabic Script to CTS.

        Parameters
        ----------
        text : str
        keep_apstrophe : bool
            Passed on to the U+0626 clean-up: when true, a U+0626 that
            follows a vowel is kept (as an apostrophe) instead of dropped.

        Returns
        -------
        str
        """
        text = self._repalce_via_table(
            text, self.__uas_group1, self.__cts_group1)
        return self.__revise_CTS(text, keep_apstrophe)

    def __revise_CTS(self, text, keep_apostrophes):
        """Resolve the U+0626 characters left in CTS text.

        1. A U+0626 that does not directly follow a CTS letter (for example
           at the beginning of a word) is removed.
        2. Unless ``keep_apostrophes`` is true, a U+0626 that directly
           follows a vowel is removed as well.
        3. Every U+0626 that is still present becomes an apostrophe.
        """
        text = self._HAMZA_NOT_AFTER_LETTER.sub('', text)
        if not keep_apostrophes:
            text = self._VOWEL_THEN_HAMZA.sub(lambda m: m.group(1), text)
        return text.replace('\u0626', u"'")

    def ULS2CTS(self, text):
        """Convert Uyghur Latin Script to CTS (the text is lower-cased)."""
        return self._apply_rules(text.lower(), self._ULS_TO_CTS)

    def UYS2CTS(self, text):
        """Convert Uyghur Yengi Script to CTS (the text is lower-cased)."""
        return self._apply_rules(text.lower(), self._UYS_TO_CTS)

    def UCS2CTS(self, text):
        """Convert Uyghur Cyrillic Script to CTS (the text is lower-cased)."""
        text = self._repalce_via_table(
            text.lower(), self.__ucs_group1, self.__cts_group1)
        return text.replace("я", "ya").replace("ю", "yu")

    def XJUS2CTS(self, text):
        """Convert the case-sensitive XJUS encoding to CTS.

        The input is NOT lower-cased because upper- and lower-case letters
        stand for different sounds in XJUS.
        """
        text = self._apply_rules(text, self._XJUS_TO_CTS)
        return self.__revise_CTS(text, False)

    def XJUS2UAS(self, text):
        """Convert XJUS to Uyghur Arabic Script via CTS."""
        return self.CTS2UAS(self.XJUS2CTS(text))

    def UZLS2CTS(self, text):
        """Convert Uzbek Latin Script to CTS."""
        text = self._apply_rules(text, self._UZLS_TO_CTS)
        return self.__revise_CTS(text, False)

    # ----------------------------------------------
    # Common turkic script to target script

    def CTS2UAS(self, text):
        """Convert CTS to Uyghur Arabic Script.

        A U+0626 is inserted before every vowel that does not directly
        follow a consonant (start of a word, after another vowel, ...), e.g.
        "ait" gains one before "a" and one before "i". Apostrophes are
        dropped from the result.
        """
        text = self._CTS_VOWEL_NOT_AFTER_CONSONANT.sub(
            lambda m: u'\u0626' + m.group(), text)
        text = self._repalce_via_table(
            text, self.__cts_group1, self.__uas_group1)
        text = text.replace(u"'", '')
        return self._revise_UAS(text)

    def _revise_UAS(self, text):
        """Insert U+0626 before a UAS vowel that starts the text or follows
        "-", whitespace or another UAS vowel."""
        return self._UAS_BARE_VOWEL.sub(
            lambda m: m.group(1) + "ئ" + m.group(2), text)

    def CTS2ULS(self, text):
        """Convert CTS to Uyghur Latin Script (the text is lower-cased)."""
        return self._apply_rules(text.lower(), self._CTS_TO_ULS)

    def CTS2UYS(self, text):
        """Convert CTS to Uyghur Yengi Script (the text is lower-cased)."""
        return self._apply_rules(text.lower(), self._CTS_TO_UYS)

    def CTS2IPA(self, text):
        """Convert CTS to IPA.

        The table entry whose IPA value is "y" (CTS "ü") is skipped during
        the table pass and applied last; otherwise the later "y" -> "j" entry
        would rewrite it. The shared tables are only read, never modified,
        so the method can be called repeatedly and does not disturb the
        other conversions.
        """
        rules = [(cts, ipa)
                 for cts, ipa in zip(self.__cts_group1, self.__ipa_group1)
                 if ipa != 'y']
        text = self._apply_rules(text, rules)
        return text.replace('ü', 'y')

    def CTS2UZLS(self, text):
        """Convert CTS to Uzbek Latin Script (the text is lower-cased)."""
        return self._apply_rules(text.lower(), self._CTS_TO_UZLS)

    def CTS2XJUS(self, text):
        """Convert CTS to the case-sensitive XJUS encoding.

        After the letter substitutions, "v" is inserted before every vowel
        that does not directly follow a consonant, and apostrophes are
        dropped.
        """
        text = self._apply_rules(text.lower(), self._CTS_TO_XJUS)
        text = self._XJUS_VOWEL_NOT_AFTER_CONSONANT.sub(
            lambda m: 'v' + m.group(), text)
        return text.replace(u"'", '')

    def CTS2UCS(self, text):
        """Convert CTS to Uyghur Cyrillic Script (the text is lower-cased)."""
        text = text.lower()
        # "ya"/"yu" have dedicated Cyrillic letters; handle them before the
        # per-letter table would split them.
        text = text.replace("ya", "я").replace("yu", "ю")
        return self._repalce_via_table(
            text, self.__cts_group1, self.__ucs_group1)

    # ----------------------------------------------
    # Uyghur Latin script to target script
    def ULS2UAS(self, text):
        """Convert ULS to UAS via CTS."""
        return self.CTS2UAS(self.ULS2CTS(text))

    def ULS2UCS(self, text):
        """Convert ULS to UCS via CTS."""
        return self.CTS2UCS(self.ULS2CTS(text))

    def ULS2UYS(self, text):
        """Convert ULS to UYS via CTS."""
        return self.CTS2UYS(self.ULS2CTS(text))

    # ----------------------------------------------
    # Uyghur Arabic script to target script

    def UAS2ULS(self, text):
        """Convert UAS to ULS via CTS, keeping apostrophes after vowels."""
        return self.CTS2ULS(self.UAS2CTS(text, True))

    def UAS2UCS(self, text):
        """Convert UAS to UCS via CTS, keeping apostrophes after vowels."""
        return self.CTS2UCS(self.UAS2CTS(text, True))

    def UAS2UYS(self, text):
        """Convert UAS to UYS via CTS, keeping apostrophes after vowels."""
        return self.CTS2UYS(self.UAS2CTS(text, True))

    def UAS2UZLS(self, text):
        """Convert UAS to UZLS via CTS, keeping apostrophes after vowels."""
        return self.CTS2UZLS(self.UAS2CTS(text, True))

    # Historical name; it does not match the "<SRC>2<DST>" pattern used by
    # ``__call__`` but is kept so existing direct callers keep working.
    UAStoUZLS = UAS2UZLS

    # ----------------------------------------------
    # Uyghur Cyrillic script to target script

    def UCS2UAS(self, text):
        """Convert UCS to UAS via CTS."""
        return self.CTS2UAS(self.UCS2CTS(text))

    def UCS2ULS(self, text):
        """Convert UCS to ULS via CTS."""
        return self.CTS2ULS(self.UCS2CTS(text))

    def UCS2UYS(self, text):
        """Convert UCS to UYS via CTS."""
        return self.CTS2UYS(self.UCS2CTS(text))

    # ----------------------------------------------
    # Uyghur Yengi script to target script

    def UYS2UAS(self, text):
        """Convert UYS to UAS via CTS."""
        return self.CTS2UAS(self.UYS2CTS(text))

    def UYS2ULS(self, text):
        """Convert UYS to ULS via CTS."""
        return self.CTS2ULS(self.UYS2CTS(text))

    def UYS2UCS(self, text):
        """Convert UYS to UCS via CTS."""
        return self.CTS2UCS(self.UYS2CTS(text))


def args_parser(argv=None):
    """Parse the command-line options.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse; ``None`` (the default) lets ``argparse`` read
        the process arguments.

    Returns
    -------
    argparse.Namespace
        With the attributes ``source``, ``target``, ``input`` and ``output``.
    """
    parser = argparse.ArgumentParser(
        description='Convert text from one script to another')
    parser.add_argument('-s', '--source', help='source script', required=True)
    parser.add_argument('-t', '--target', help='target script', required=True)
    parser.add_argument('-i', '--input', help='input file', required=True)
    parser.add_argument('-o', '--output', help='output file', required=True)
    return parser.parse_args(argv)


def main():
    """Read the input file, convert it and write the output file."""
    args = args_parser()
    # Be explicit about the encoding: the platform default is not always
    # able to represent Arabic, Cyrillic or IPA characters.
    with open(args.input, 'r', encoding='utf-8') as f:
        text = f.read()

    converter = UgMultiScriptConverter(args.source, args.target)
    text = converter(text)
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(text)

    print("Done")


if __name__ == "__main__":
    main()