├── tests
    ├── __init__.py
    ├── test_CTS2UZBEK.py
    ├── test_CTS2IPA.py
    ├── test_UAS2UCS.py
    ├── test_UAS2CTS.py
    ├── test_UCS2UAS.py
    ├── test_CTS2XJUS.py
    ├── test_UAS2ULS.py
    ├── test_CTS2UAS.py
    ├── test_ULS2UAS.py
    ├── test_XJUS2CTS.py
    ├── test_UZBEK2CTS.py
    └── test_XJUS2UAS.py
├── umsc
    ├── __init__.py
    └── umsc.py
├── demo.py
├── CITATION.cff
├── setup.py
├── .gitignore
├── README.md
└── LICENSE


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/umsc/__init__.py:
--------------------------------------------------------------------------------
1 | from .umsc import UgMultiScriptConverter


--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | from umsc import UgMultiScriptConverter
2 | 
3 | converter = UgMultiScriptConverter('CTS', 'IPA')
4 | 
5 | input = "encür"
6 | 
7 | print(converter(input))
8 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 |   - family-names: Osman
5 |     given-names: Tursun
6 |     orcid: https://orcid.org/0000-0002-0592-0864
7 | title: "Uyghur Multi-Script Converter"
8 | version: 1.0


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='umsc',
 5 |     version='0.4.0',
 6 |     author='Osman Tursun',
 7 |     author_email='osmanjan.t@gmail.com',
 8 |     maintainer='Osman Tursun',
 9 |     maintainer_email='mpcabd@gmail.com',
10 |     description='Script Converter for Uyghur Language',
11 |     long_description=open('README.md').read(),
12 |     long_description_content_type='text/markdown',
13 |     keywords='uyghur script converter arabic latin cyrillic IPA ئۇيغۇر',
14 |     url='https://github.com/neouyghur/ScriptConverter4Uyghur',
15 |     packages=find_packages(),
16 |     install_requires=[
17 |         "regex"
18 |     ],
19 |     classifiers=[
20 |         # Trove classifiers to categorize the package (https://pypi.org/classifiers/)
21 |         'Programming Language :: Python :: 3',
22 |         'License :: OSI Approved :: Apache Software License',
23 |         'Operating System :: OS Independent',
24 |     ],
25 |     python_requires='>=3.0',  # Minimum version requirement of the package
26 | )
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | # IPython Notebook
62 | .ipynb_checkpoints
63 | .idea/


--------------------------------------------------------------------------------
/tests/test_CTS2UZBEK.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | 
 5 | test_data = [("qol", "qol"),
 6 |     ("baş", "bosh"),
 7 |     ("put", "put"),
 8 |     ("köz", "ko‘z"),
 9 |     ("ceñçi", "jangchi"),
10 |     ("cudé", "jude"),
11 |     ("san", "son"),
12 |     ("sey", "say"),
13 |     ("é", "e"),
14 |     ("şir", "shir"),
15 |     ("şañxey", "shongxay"),
16 |     ("kitab", "kitob"),
17 |     ("veten", "vatan"),
18 |     ("tomur", "tomur"),
19 |     ("kömür", "ko‘mu‘r"),
20 |     ("éliktir", "eliktir"),
21 |     ("vyétnam", "vyetnom"),
22 |     ("şincañ", "shinjong"),
23 |     ("anar", "onor"),
24 |     ("encür", "anju‘r"),
25 |     ("orda", "ordo"),
26 |     ("uruş", "urush"),
27 |     ("ördek", "o‘rdak"),
28 |     ("üzüm", "u‘zu‘m"),
29 |     ("élan", "elon"),
30 |     ("inkas", "inkos"),
31 |     ("inik'ana", "inik'ono"),
32 |     ("es'et", "as'at"),
33 |     ("radio", "rodio"),
34 |     ("mes'ul", "mas'ul"),
35 |     ("qariörük", "qorio‘ru‘k"),
36 |     ("naümid", "nou‘mid"),
37 |     ("it'éyiq", "it'eyiq"),
38 |     ("cem'iy", "jam'iy"),
39 |     ("nemen'gan", "naman'gon"),
40 |     ("özxan", "o‘zxon"),
41 |     ("pasxa", "posxo"),
42 |     ("bayrimi", "boyrimi"),
43 |     ("maarip", "moorip"),
44 |     ("muellim", "muallim"),
45 |     ("daire", "doira"),
46 |     ("mueyyen", "muayyan"),
47 |     ("tebiiy", "tabiiy"),
48 |     ("paaliyet", "pooliyat"),
49 |     ("ishaq", "ishoq"),
50 |     ("özbékistanğa", "o‘zbekistong‘o"),
51 |     ("hingan", "hingon"),
52 |     ("çeklengen", "chaklangan"),
53 |     ("gañgirap", "gonggirop"),
54 |     ("başlanğuç", "boshlong‘uch"),
55 |     ("cem'iyet", "jam'iyat"),
56 | ]
57 | 
58 | @pytest.mark.parametrize("input,expected", test_data)
59 | def test_UAS2CTS(input, expected):
60 |     converter = UgMultiScriptConverter("CTS", "UZLS")
61 |     assert converter(input) == expected
62 | 


--------------------------------------------------------------------------------
/tests/test_CTS2IPA.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | # Pair each input with its expected output
 5 | test_data = [
 6 |     ('qol', 'qol'),
 7 |     ('baş', 'bɑʃ'),
 8 |     ('put', 'put'),
 9 |     ('köz', 'køz'),
10 |     ('ceñçi', 'dʒæŋtʃi'),
11 |     ('cudé', 'dʒudɛ'),
12 |     ('san', 'sɑn'),
13 |     ('sey', 'sæj'),
14 |     ('é', 'ɛ'),
15 |     ('şir', 'ʃir'),
16 |     ('şañxey', 'ʃɑŋχæj'),
17 |     ('kitab', 'kitɑb'),
18 |     ('veten', 'wætæn'),
19 |     ('tomur', 'tomur'),
20 |     ('kömür', 'kømyr'),
21 |     ('éliktir', 'ɛliktir'),
22 |     ('vyétnam', 'wjɛtnɑm'),
23 |     ('şincañ', 'ʃindʒɑŋ'),
24 |     ('anar', 'ɑnɑr'),
25 |     ('encür', 'ændʒyr'),
26 |     ('orda', 'ordɑ'),
27 |     ('uruş', 'uruʃ'),
28 |     ('ördek', 'ørdæk'),
29 |     ('üzüm', 'yzym'),
30 |     ('élan', 'ɛlɑn'),
31 |     ('inkas', 'inkɑs'),
32 |     ("inik'ana", "inik'ɑnɑ"),
33 |     ("es'et", "æs'æt"),
34 |     ('radio', 'rɑdio'),
35 |     ("mes'ul", "mæs'ul"),
36 |     ('qariörük', 'qɑriøryk'),
37 |     ('naümid', 'nɑymid'),
38 |     ("it'éyiq", "it'ɛjiq"),
39 |     ("cem'iy", "dʒæm'ij"),
40 |     ('nemengan', 'næmænɡɑn'),
41 |     ('özxan', 'øzχɑn'),
42 |     ('pasxa', 'pɑsχɑ'),
43 |     ('bayrimi', 'bɑjrimi'),
44 |     ('maarip', 'mɑɑrip'),
45 |     ('muellim', 'muællim'),
46 |     ('daire', 'dɑiræ'),
47 |     ('mueyyen', 'muæjjæn'),
48 |     ('tebiiy', 'tæbiij'),
49 |     ('paaliyet', 'pɑɑlijæt'),
50 |     ('ishaq', 'ishɑq'),
51 |     ('özbékistanğa', 'øzbɛkistɑnʁɑ'),
52 |     ('hingan', 'hinɡɑn'),
53 |     ('çeklengen', 'tʃæklænɡæn'),
54 |     ('gañgirap', 'ɡɑŋɡirɑp'),
55 |     ('başlanğuç', 'bɑʃlɑnʁutʃ'),
56 |     ("cem'iyet", "dʒæm'ijæt")
57 | ]
58 | @pytest.mark.parametrize("input,expected", test_data)
59 | def test_CTS2UAS(input, expected):
60 |     converter = UgMultiScriptConverter("CTS", "IPA")
61 |     assert converter(input) == expected
62 | 


--------------------------------------------------------------------------------
/tests/test_UAS2UCS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | 
 5 | test_data = [
 6 |     ("قول", "қол"),
 7 |     ("باش", "баш"),
 8 |     ("پۇت", "пут"),
 9 |     ("كۆز", "көз"),
10 |     ("جەڭچى", "җәңчи"),
11 |     ("جۇدې", "җуде"),
12 |     ("سان", "сан"),
13 |     ("سەي", "сәй"),
14 |     ("ئې", "е"),
15 |     ("شىر", "шир"),
16 |     ("شاڭخەي", "шаңхәй"),
17 |     ("كىتاب", "китаб"),
18 |     ("ۋەتەن", "вәтән"),
19 |     ("تومۇر", "томур"),
20 |     ("كۆمۈر", "көмүр"),
21 |     ("ئېلىكتىر", "еликтир"),
22 |     ("ۋيېتنام", "вйетнам"),
23 |     ("شىنجاڭ", "шинҗаң"),
24 |     ("ئانار", "анар"),
25 |     ("ئەنجۈر", "әнҗүр"),
26 |     ("ئوردا", "орда"),
27 |     ("ئۇرۇش", "уруш"),
28 |     ("ئۆردەك", "өрдәк"),
29 |     ("ئۈزۈم", "үзүм"),
30 |     ("ئېلان", "елан"),
31 |     ("ئىنكاس", "инкас"),
32 |     ("ئىنىكئانا", "иник'ана"),
33 |     ("ئەسئەت", "әс'әт"),
34 |     ("رادىئو", "ради'о"),
35 |     ("مەسئۇل", "мәс'ул"),
36 |     ("قارىئۆرۈك", "қари'өрүк"),
37 |     ("نائۈمىد", "на'үмид"),
38 |     ("ئىتئېيىق", "ит'ейиқ"),
39 |     ("جەمئىي", "җәм'ий"),
40 |     ("نەمەنگان", "нәмәнган"),
41 |     ("ئۆزخان", "өзхан"),
42 |     ("پاسخا", "пасха"),
43 |     ("بايرىمى", "байрими"),
44 |     ("مائارىپ", "ма'арип"),
45 |     ("مۇئەللىم", "му'әллим"),
46 |     ("دائىرە", "да'ирә"),
47 |     ("مۇئەييەن", "му'әййән"),
48 |     ("تەبىئىي", "тәби'ий"),
49 |     ("پائالىيەت", "па'алийәт"),
50 |     ("ئىسھاق", "исһақ"),
51 |     ("ئۆزبېكىستانغا", "өзбекистанға"),
52 |     ("ھىنگان", "һинган"),
53 |     ("چەكلەنگەن", "чәкләнгән"),
54 |     ("گاڭگىراپ", "гаңгирап"),
55 |     ("باشلانغۇچ", "башланғуч"),
56 |     ("جەمئىيەت", "җәм'ийәт"),
57 |     # ("جۇڭخۇا", "җуңхуа|ⱪol"),
58 | ]
59 | 
60 | 
61 | @pytest.mark.parametrize("input,expected", test_data)
62 | def test_UAS2CTS(input, expected):
63 |     converter = UgMultiScriptConverter("UAS", "UCS")
64 |     assert converter(input) == expected
65 | 


--------------------------------------------------------------------------------
/tests/test_UAS2CTS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | # Pair each input with its expected output
 5 | test_data = [
 6 |     ("قول", "qol"),
 7 |     ("باش", "baş"),
 8 |     ("پۇت", "put"),
 9 |     ("كۆز", "köz"),
10 |     ("جەڭچى", "ceñçi"),
11 |     ("جۇدې", "cudé"),
12 |     ("سان", "san"),
13 |     ("سەي", "sey"),
14 |     ("ئې", "é"),
15 |     ("شىر", "şir"),
16 |     ("شاڭخەي", "şañxey"),
17 |     ("كىتاب", "kitab"),
18 |     ("ۋەتەن", "veten"),
19 |     ("تومۇر", "tomur"),
20 |     ("كۆمۈر", "kömür"),
21 |     ("ئېلىكتىر", "éliktir"),
22 |     ("ۋيېتنام", "vyétnam"),
23 |     ("شىنجاڭ", "şincañ"),
24 |     ("ئانار", "anar"),
25 |     ("ئەنجۈر", "encür"),
26 |     ("ئوردا", "orda"),
27 |     ("ئۇرۇش", "uruş"),
28 |     ("ئۆردەك", "ördek"),
29 |     ("ئۈزۈم", "üzüm"),
30 |     ("ئېلان", "élan"),
31 |     ("ئىنكاس", "inkas"),
32 |     ("ئىنىكئانا", "inik'ana"),
33 |     ("ئەسئەت", "es'et"),
34 |     ("رادىئو", "radio"),
35 |     ("مەسئۇل", "mes'ul"),
36 |     ("قارىئۆرۈك", "qariörük"),
37 |     ("نائۈمىد", "naümid"),
38 |     ("ئىتئېيىق", "it'éyiq"),
39 |     ("جەمئىي", "cem'iy"),
40 |     ("نەمەنگان", "nemengan"),
41 |     ("ئۆزخان", "özxan"),
42 |     ("پاسخا", "pasxa"),
43 |     ("بايرىمى", "bayrimi"),
44 |     ("مائارىپ", "maarip"),
45 |     ("مۇئەللىم", "muellim"),
46 |     ("دائىرە", "daire"),
47 |     ("مۇئەييەن", "mueyyen"),
48 |     ("تەبىئىي", "tebiiy"),
49 |     ("پائالىيەت", "paaliyet"),
50 |     ("ئىسھاق", "ishaq"),
51 |     ("ئۆزبېكىستانغا", "özbékistanğa"),
52 |     ("ھىنگان", "hingan"),
53 |     ("چەكلەنگەن", "çeklengen"),
54 |     ("گاڭگىراپ", "gañgirap"),
55 |     ("باشلانغۇچ", "başlanğuç"),
56 |     ("جەمئىيەت", "cem'iyet"),
57 |     # ("جۇڭخۇا", "cuñxua"),
58 | ]
59 | 
60 | 
61 | @pytest.mark.parametrize("input,expected", test_data)
62 | def test_UAS2CTS(input, expected):
63 |     converter = UgMultiScriptConverter("UAS", "CTS")
64 |     assert converter(input) == expected
65 | 


--------------------------------------------------------------------------------
/tests/test_UCS2UAS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | 
 5 | test_data = [
 6 |     ("قول", "қол"),
 7 |     ("باش", "баш"),
 8 |     ("پۇت", "пут"),
 9 |     ("كۆز", "көз"),
10 |     ("جەڭچى", "җәңчи"),
11 |     ("جۇدې", "җуде"),
12 |     ("سان", "сан"),
13 |     ("سەي", "сәй"),
14 |     ("ئې", "е"),
15 |     ("شىر", "шир"),
16 |     ("شاڭخەي", "шаңхәй"),
17 |     ("كىتاب", "китаб"),
18 |     ("ۋەتەن", "вәтән"),
19 |     ("تومۇر", "томур"),
20 |     ("كۆمۈر", "көмүр"),
21 |     ("ئېلىكتىر", "еликтир"),
22 |     ("ۋيېتنام", "вйетнам"),
23 |     ("شىنجاڭ", "шинҗаң"),
24 |     ("ئانار", "анар"),
25 |     ("ئەنجۈر", "әнҗүр"),
26 |     ("ئوردا", "орда"),
27 |     ("ئۇرۇش", "уруш"),
28 |     ("ئۆردەك", "өрдәк"),
29 |     ("ئۈزۈم", "үзүм"),
30 |     ("ئېلان", "елан"),
31 |     ("ئىنكاس", "инкас"),
32 |     ("ئىنىكئانا", "иник'ана"),
33 |     ("ئەسئەت", "әс'әт"),
34 |     ("رادىئو", "ради'о"),
35 |     ("مەسئۇل", "мәс'ул"),
36 |     ("قارىئۆرۈك", "қари'өрүк"),
37 |     ("نائۈمىد", "на'үмид"),
38 |     ("ئىتئېيىق", "ит'ейиқ"),
39 |     ("جەمئىي", "җәм'ий"),
40 |     ("نەمەنگان", "нәмәнган"),
41 |     ("ئۆزخان", "өзхан"),
42 |     ("پاسخا", "пасха"),
43 |     ("بايرىمى", "байрими"),
44 |     ("مائارىپ", "ма'арип"),
45 |     ("مۇئەللىم", "му'әллим"),
46 |     ("دائىرە", "да'ирә"),
47 |     ("مۇئەييەن", "му'әййән"),
48 |     ("تەبىئىي", "тәби'ий"),
49 |     ("پائالىيەت", "па'алийәт"),
50 |     ("ئىسھاق", "исһақ"),
51 |     ("ئۆزبېكىستانغا", "өзбекистанға"),
52 |     ("ھىنگان", "һинган"),
53 |     ("چەكلەنگەن", "чәкләнгән"),
54 |     ("گاڭگىراپ", "гаңгирап"),
55 |     ("باشلانغۇچ", "башланғуч"),
56 |     ("جەمئىيەت", "җәм'ийәт"),
57 |     # ("جۇڭخۇا", "җуңхуа|ⱪol"),
58 | ]
59 | 
60 | test_data = [(value, key) for key, value in test_data]
61 | 
62 | @pytest.mark.parametrize("input,expected", test_data)
63 | def test_UAS2CTS(input, expected):
64 |     converter = UgMultiScriptConverter("UCS", "UAS")
65 |     assert converter(input) == expected
66 | 


--------------------------------------------------------------------------------
/tests/test_CTS2XJUS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | # Pair each input with its expected output
 5 | test_data = [
 6 |     ("qol", "qol"),
 7 |     ("baş", "bax"),
 8 |     ("put", "put"),
 9 |     ("köz", "kOz"),
10 |     ("ceñçi", "jANci"),
11 |     ("cudé", "jude"),
12 |     ("san", "san"),
13 |     ("sey", "sAy"),
14 |     ("é", "ve"),
15 |     ("şir", "xir"),
16 |     ("şañxey", "xaNHAy"),
17 |     ("kitab", "kitab"),
18 |     ("veten", "wAtAn"),
19 |     ("tomur", "tomur"),
20 |     ("kömür", "kOmUr"),
21 |     ("éliktir", "veliktir"),
22 |     ("vyétnam", "wyetnam"),
23 |     ("şincañ", "xinjaN"),
24 |     ("anar", "vanar"),
25 |     ("encür", "vAnjUr"),
26 |     ("orda", "vorda"),
27 |     ("uruş", "vurux"),
28 |     ("ördek", "vOrdAk"),
29 |     ("üzüm", "vUzUm"),
30 |     ("élan", "velan"),
31 |     ("inkas", "vinkas"),
32 |     ("inik'ana", "vinikvana"),
33 |     ("es'et", "vAsvAt"),
34 |     ("radio", "radivo"),
35 |     ("mes'ul", "mAsvul"),
36 |     ("qariörük", "qarivOrUk"),
37 |     ("naümid", "navUmid"),
38 |     ("it'éyiq", "vitveyiq"),
39 |     ("cem'iy", "jAmviy"),
40 |     ("nemengan", "nAmAngan"),
41 |     ("özxan", "vOzHan"),
42 |     ("pasxa", "pasHa"),
43 |     ("bayrimi", "bayrimi"),
44 |     ("maarip", "mavarip"),
45 |     ("muellim", "muvAllim"),
46 |     ("daire", "davirA"),
47 |     ("mueyyen", "muvAyyAn"),
48 |     ("tebiiy", "tAbiviy"),
49 |     ("paaliyet", "pavaliyAt"),
50 |     ("ishaq", "vishaq"),
51 |     ("özbékistanğa", "vOzbekistanGa"),
52 |     ("hingan", "hingan"),
53 |     ("çeklengen", "cAklAngAn"),
54 |     ("gañgirap", "gaNgirap"),
55 |     ("başlanğuç", "baxlanGuc"),
56 |     ("cem'iyet", "jAmviyAt"),
57 |     # ('cuñxua', 'جۇڭخۇا'),
58 |     # ('cuñxua', 'جۇڭخۇئا'),
59 | ]
60 | 
61 | 
62 | 
63 | @pytest.mark.parametrize("input,expected", test_data)
64 | def test_CTS2UAS(input, expected):
65 |     converter = UgMultiScriptConverter("CTS", "XJUS")
66 |     assert converter(input) == expected
67 | 


--------------------------------------------------------------------------------
/tests/test_UAS2ULS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | # Pair each input with its expected output
 5 | test_data = [
 6 |     ("قول", "qol"),
 7 |     ("باش", "bash"),
 8 |     ("پۇت", "put"),
 9 |     ("كۆز", "köz"),
10 |     ("جەڭچى", "jengchi"),
11 |     ("جۇدې", "judé"),
12 |     ("سان", "san"),
13 |     ("سەي", "sey"),
14 |     ("ئې", "é"),
15 |     ("شىر", "shir"),
16 |     ("شاڭخەي", "shangxey"),
17 |     ("كىتاب", "kitab"),
18 |     ("ۋەتەن", "weten"),
19 |     ("تومۇر", "tomur"),
20 |     ("كۆمۈر", "kömür"),
21 |     ("ئېلىكتىر", "éliktir"),
22 |     ("ۋيېتنام", "wyétnam"),
23 |     ("شىنجاڭ", "shinjang"),
24 |     ("ئانار", "anar"),
25 |     ("ئەنجۈر", "enjür"),
26 |     ("ئوردا", "orda"),
27 |     ("ئۇرۇش", "urush"),
28 |     ("ئۆردەك", "ördek"),
29 |     ("ئۈزۈم", "üzüm"),
30 |     ("ئېلان", "élan"),
31 |     ("ئىنكاس", "inkas"),
32 |     ("ئىنىكئانا", "inik'ana"),
33 |     ("ئەسئەت", "es'et"),
34 |     ("رادىئو", "radi'o"),
35 |     ("مەسئۇل", "mes'ul"),
36 |     ("قارىئۆرۈك", "qari'örük"),
37 |     ("نائۈمىد", "na'ümid"),
38 |     ("ئىتئېيىق", "it'éyiq"),
39 |     ("جەمئىي", "jem'iy"),
40 |     ("نەمەنگان", "nemen'gan"),
41 |     ("ئۆزخان", "özxan"),
42 |     ("پاسخا", "pasxa"),
43 |     ("بايرىمى", "bayrimi"),
44 |     ("مائارىپ", "ma'arip"),
45 |     ("مۇئەللىم", "mu'ellim"),
46 |     ("دائىرە", "da'ire"),
47 |     ("مۇئەييەن", "mu'eyyen"),
48 |     ("تەبىئىي", "tebi'iy"),
49 |     ("پائالىيەت", "pa'aliyet"),
50 |     ("ئىسھاق", "is'haq"),
51 |     ("ئۆزبېكىستانغا", "özbékistan'gha"),
52 |     ("ھىنگان", "hin'gan"),
53 |     ("چەكلەنگەن", "cheklen'gen"),
54 |     ("گاڭگىراپ", "ganggirap"),
55 |     ("باشلانغۇچ", "bashlan'ghuch"),
56 |     ("جەمئىيەت", "jem'iyet"),
57 |     ("جۇڭخۇا", "jungxua"),
58 |     ("ئەدەب-ئەخلاق", "edeb-exlaq"),
59 | ]
60 | 
61 | @pytest.mark.parametrize("input,expected", test_data)
62 | def test_UAS2CTS(input, expected):
63 |     converter = UgMultiScriptConverter("UAS", "ULS")
64 |     assert converter(input) == expected
65 | 


--------------------------------------------------------------------------------
/tests/test_CTS2UAS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | # Pair each input with its expected output
 5 | test_data = [
 6 |     ("qol", "قول"),
 7 |     ("baş", "باش"),
 8 |     ("put", "پۇت"),
 9 |     ("köz", "كۆز"),
10 |     ("ceñçi", "جەڭچى"),
11 |     ("cudé", "جۇدې"),
12 |     ("san", "سان"),
13 |     ("sey", "سەي"),
14 |     ("é", "ئې"),
15 |     ("şir", "شىر"),
16 |     ("şañxey", "شاڭخەي"),
17 |     ("kitab", "كىتاب"),
18 |     ("veten", "ۋەتەن"),
19 |     ("tomur", "تومۇر"),
20 |     ("kömür", "كۆمۈر"),
21 |     ("éliktir", "ئېلىكتىر"),
22 |     ("veten", "ۋەتەن"),
23 |     ("vyétnam", "ۋيېتنام"),
24 |     ("şincañ", "شىنجاڭ"),
25 |     ("anar", "ئانار"),
26 |     ("encür", "ئەنجۈر"),
27 |     ("orda", "ئوردا"),
28 |     ("uruş", "ئۇرۇش"),
29 |     ("ördek", "ئۆردەك"),
30 |     ("üzüm", "ئۈزۈم"),
31 |     ("élan", "ئېلان"),
32 |     ("inkas", "ئىنكاس"),
33 |     ("inik'ana", "ئىنىكئانا"),
34 |     ("es'et", "ئەسئەت"),
35 |     ("radio", "رادىئو"),
36 |     ("mes'ul", "مەسئۇل"),
37 |     ("qariörük", "قارىئۆرۈك"),
38 |     ("naümid", "نائۈمىد"),
39 |     ("it'éyiq", "ئىتئېيىق"),
40 |     ("cem'iy", "جەمئىي"),
41 |     ("nemengan", "نەمەنگان"),
42 |     ("özxan", "ئۆزخان"),
43 |     ("pasxa", "پاسخا"),
44 |     ("bayrimi", "بايرىمى"),
45 |     ("maarip", "مائارىپ"),
46 |     ("muellim", "مۇئەللىم"),
47 |     ("daire", "دائىرە"),
48 |     ("mueyyen", "مۇئەييەن"),
49 |     ("tebiiy", "تەبىئىي"),
50 |     ("paaliyet", "پائالىيەت"),
51 |     ("ishaq", "ئىسھاق"),
52 |     ("özbékistanğa", "ئۆزبېكىستانغا"),
53 |     ("hingan", "ھىنگان"),
54 |     ("çeklengen", "چەكلەنگەن"),
55 |     ("gañgirap", "گاڭگىراپ"),
56 |     ("başlanğuç", "باشلانغۇچ"),
57 |     ("cem'iyet", "جەمئىيەت"),
58 |     # ('cuñxua', 'جۇڭخۇا'),
59 |     # ('cuñxua', 'جۇڭخۇئا'),
60 | ]
61 | 
62 | 
63 | 
64 | @pytest.mark.parametrize("input,expected", test_data)
65 | def test_CTS2UAS(input, expected):
66 |     converter = UgMultiScriptConverter("CTS", "UAS")
67 |     assert converter(input) == expected
68 | 


--------------------------------------------------------------------------------
/tests/test_ULS2UAS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | # Pair each input with its expected output
 5 | test_data = [
 6 |     ("قول", "qol"),
 7 |     ("باش", "bash"),
 8 |     ("پۇت", "put"),
 9 |     ("كۆز", "köz"),
10 |     ("جەڭچى", "jengchi"),
11 |     ("جۇدې", "judé"),
12 |     ("سان", "san"),
13 |     ("سەي", "sey"),
14 |     ("ئې", "é"),
15 |     ("شىر", "shir"),
16 |     ("شاڭخەي", "shangxey"),
17 |     ("كىتاب", "kitab"),
18 |     ("ۋەتەن", "weten"),
19 |     ("تومۇر", "tomur"),
20 |     ("كۆمۈر", "kömür"),
21 |     ("ئېلىكتىر", "éliktir"),
22 |     ("ۋيېتنام", "wyétnam"),
23 |     ("شىنجاڭ", "shinjang"),
24 |     ("ئانار", "anar"),
25 |     ("ئەنجۈر", "enjür"),
26 |     ("ئوردا", "orda"),
27 |     ("ئۇرۇش", "urush"),
28 |     ("ئۆردەك", "ördek"),
29 |     ("ئۈزۈم", "üzüm"),
30 |     ("ئېلان", "élan"),
31 |     ("ئىنكاس", "inkas"),
32 |     ("ئىنىكئانا", "inik'ana"),
33 |     ("ئەسئەت", "es'et"),
34 |     ("رادىئو", "radio"),
35 |     ("مەسئۇل", "mes'ul"),
36 |     ("قارىئۆرۈك", "qariörük"),
37 |     ("نائۈمىد", "naümid"),
38 |     ("ئىتئېيىق", "it'éyiq"),
39 |     ("جەمئىي", "jem'iy"),
40 |     ("نەمەنگان", "nemen'gan"),
41 |     ("ئۆزخان", "özxan"),
42 |     ("پاسخا", "pasxa"),
43 |     ("بايرىمى", "bayrimi"),
44 |     ("مائارىپ", "maarip"),
45 |     ("مۇئەللىم", "muellim"),
46 |     ("دائىرە", "daire"),
47 |     ("مۇئەييەن", "mueyyen"),
48 |     ("تەبىئىي", "tebiiy"),
49 |     ("پائالىيەت", "paaliyet"),
50 |     ("ئىسھاق", "is'haq"),
51 |     ("ئۆزبېكىستانغا", "özbékistan'gha"),
52 |     ("ھىنگان", "hin'gan"),
53 |     ("چەكلەنگەن", "cheklen'gen"),
54 |     ("گاڭگىراپ", "ganggirap"),
55 |     ("باشلانغۇچ", "bashlan'ghuch"),
56 |     ("جەمئىيەت", "jem'iyet"),
57 |     # ("جۇڭخۇا", "jungxua"),
58 | ]
59 | 
60 | test_data = [(value, key) for key, value in test_data]
61 | 
62 | @pytest.mark.parametrize("input,expected", test_data)
63 | def test_UAS2CTS(input, expected):
64 |     converter = UgMultiScriptConverter("ULS", "UAS")
65 |     assert converter(input) == expected
66 | 


--------------------------------------------------------------------------------
/tests/test_XJUS2CTS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | # Pair each input with its expected output
 5 | test_data = [
 6 |     ("qol", "qol"),
 7 |     ("baş", "bax"),
 8 |     ("put", "put"),
 9 |     ("köz", "kOz"),
10 |     ("ceñçi", "jANci"),
11 |     ("cudé", "jude"),
12 |     ("san", "san"),
13 |     ("sey", "sAy"),
14 |     ("é", "ve"),
15 |     ("şir", "xir"),
16 |     ("şañxey", "xaNHAy"),
17 |     ("kitab", "kitab"),
18 |     ("veten", "wAtAn"),
19 |     ("tomur", "tomur"),
20 |     ("kömür", "kOmUr"),
21 |     ("éliktir", "veliktir"),
22 |     ("vyétnam", "wyetnam"),
23 |     ("şincañ", "xinjaN"),
24 |     ("anar", "vanar"),
25 |     ("encür", "vAnjUr"),
26 |     ("orda", "vorda"),
27 |     ("uruş", "vurux"),
28 |     ("ördek", "vOrdAk"),
29 |     ("üzüm", "vUzUm"),
30 |     ("élan", "velan"),
31 |     ("inkas", "vinkas"),
32 |     ("inik'ana", "vinikvana"),
33 |     ("es'et", "vAsvAt"),
34 |     ("radio", "radivo"),
35 |     ("mes'ul", "mAsvul"),
36 |     ("qariörük", "qarivOrUk"),
37 |     ("naümid", "navUmid"),
38 |     ("it'éyiq", "vitveyiq"),
39 |     ("cem'iy", "jAmviy"),
40 |     ("nemengan", "nAmAngan"),
41 |     ("özxan", "vOzHan"),
42 |     ("pasxa", "pasHa"),
43 |     ("bayrimi", "bayrimi"),
44 |     ("maarip", "mavarip"),
45 |     ("muellim", "muvAllim"),
46 |     ("daire", "davirA"),
47 |     ("mueyyen", "muvAyyAn"),
48 |     ("tebiiy", "tAbiviy"),
49 |     ("paaliyet", "pavaliyAt"),
50 |     ("ishaq", "vishaq"),
51 |     ("özbékistanğa", "vOzbekistanGa"),
52 |     ("hingan", "hingan"),
53 |     ("çeklengen", "cAklAngAn"),
54 |     ("gañgirap", "gaNgirap"),
55 |     ("başlanğuç", "baxlanGuc"),
56 |     ("cem'iyet", "jAmviyAt"),
57 |     # ('cuñxua', 'جۇڭخۇا'),
58 |     # ('cuñxua', 'جۇڭخۇئا'),
59 | ]
60 | 
61 | 
62 | test_data = [(value, key) for key, value in test_data]
63 | @pytest.mark.parametrize("input,expected", test_data)
64 | def test_CTS2UAS(input, expected):
65 |     converter = UgMultiScriptConverter("XJUS", "CTS")
66 |     assert converter(input) == expected
67 | 


--------------------------------------------------------------------------------
/tests/test_UZBEK2CTS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | test_data = [('qol', 'qal'),
 5 |              ('bosh', 'baş'),
 6 |              ('put', 'put'),
 7 |              ('ko‘z', 'köz'),
 8 |              ('jangchi', 'ceñçi'),
 9 |              ('jude', 'cudé'),
10 |              ('son', 'san'),
11 |              ('say', 'sey'),
12 |              ('e', 'é'),
13 |              ('shir', 'şir'),
14 |              ('shongxay', 'şañxey'),
15 |              ('kitob', 'kitab'),
16 |              ('vatan', 'veten'),
17 |              ('tomur', 'tamur'),
18 |              ('ko‘mu‘r', 'kömür'),
19 |              ('eliktir', 'éliktir'),
20 |              ('vyetnom', 'vyétnam'),
21 |              ('shinjong', 'şincañ'),
22 |              ('onor', 'anar'),
23 |              ('anju‘r', 'encür'),
24 |              ('ordo', 'arda'),
25 |              ('urush', 'uruş'),
26 |              ('o‘rdak', 'ördek'),
27 |              ('u‘zu‘m', 'üzüm'),
28 |              ('elon', 'élan'),
29 |              ('inkos', 'inkas'),
30 |              ("inik'ono", "inik'ana"),
31 |              ("as'at", "es'et"),
32 |              ('rodio', 'radia'),
33 |              ("mas'ul", "mes'ul"),
34 |              ('qorio‘ru‘k', 'qariörük'),
35 |              ('nou‘mid', 'naümid'),
36 |              ("it'eyiq", "it'éyiq"),
37 |              ("jam'iy", "cem'iy"),
38 |              ("naman'gon", "nemengan"),
39 |              ('o‘zxon', 'özxan'),
40 |              ('posxo', 'pasxa'),
41 |              ('boyrimi', 'bayrimi'),
42 |              ('moorip', 'maarip'),
43 |              ('muallim', 'muellim'),
44 |              ('doira', 'daire'),
45 |              ('muayyan', 'mueyyen'),
46 |              ('tabiiy', 'tebiiy'),
47 |              ('pooliyat', 'paaliyet'),
48 |              ("is'hoq", 'ishaq'),
49 |              ("o‘zbekiston'g‘o", "özbékistanğa"),
50 |              ("hin'gon", "hingan"),
51 |              ("chaklan'gan", 'çeklengen'),
52 |              ('gonggirop', 'gañgirap'),
53 |              ("boshlon'g‘uch", 'başlanğuç'),
54 |              ("jam'iyat", "cem'iyet")
55 |              ]
56 | 
57 | @pytest.mark.parametrize("input,expected", test_data)
58 | def test_UAS2CTS(input, expected):
59 |     converter = UgMultiScriptConverter("UZLS", "CTS")
60 |     assert converter(input) == expected
61 | 


--------------------------------------------------------------------------------
/tests/test_XJUS2UAS.py:
--------------------------------------------------------------------------------
 1 | from umsc.umsc import UgMultiScriptConverter
 2 | import pytest
 3 | 
 4 | test_data = [
 5 |     ("qol", "قول"),
 6 |     ("bax", "باش"),
 7 |     ("put", "پۇت"),
 8 |     ("kOz", "كۆز"),
 9 |     ("wAdA", "ۋەدە"),
10 |     ("juda", "جۇدا"),
11 |     ("sAn", "سەن"),
12 |     ("samsaq", "سامساق"),
13 |     ("haywan", "ھايۋان"),
14 |     ("xir", "شىر"),
15 |     ("Gulja", "غۇلجا"),
16 |     ("kitab", "كىتاب"),
17 |     ("wAtAn", "ۋەتەن"),
18 |     ("tomur", "تومۇر"),
19 |     ("kOmUr", "كۆمۈر"),
20 |     ("asman", "ئاسمان"),
21 |     ("muAllim", "مۇئەللىم"),
22 |     ("sincay", "سىنچاي"),
23 |     ("anar", "ئانار"),
24 |     ("vAnjur", "ئەنجۇر"),
25 |     ("orda", "ئوردا"),
26 |     ("urux", "ئۇرۇش"),
27 |     ("OrdAk", "ئۆردەك"),
28 |     ("UzUm", "ئۈزۈم"),
29 |     ("elan", "ئېلان"),
30 |     ("inkas", "ئىنكاس"),
31 |     ("ana", "ئانا"),
32 |     ("bizniN muAllim", "بىزنىڭ مۇئەللىم"),
33 |     ("uzun sApAr", "ئۇزۇن سەپەر"),
34 |     ("mAktAp mudir", "مەكتەپ مۇدىر"),
35 |     ("yoGan bax", "يوغان باش"),
36 |     ("maymun", "مايمۇن"),
37 |     ("bayraq", "بايراق"),
38 |     ("bArkAtlik", "بەركەتلىك"),
39 |     ("poyuz", "پويۇز"),
40 |     ("bizniN Oy", "بىزنىڭ ئۆي"),
41 |     ("kUnlAr", "كۈنلەر"),
42 |     ("bayramlar", "بايراملار"),
43 |     ("tirixcan", "تىرىشچان"),
44 |     ("yalGuz", "يالغۇز"),
45 |     ("bir kuni", "بىر كۇنى"),
46 |     ("balaN azsa ah urma yol tapidu bir kuni",
47 |      "بالاڭ ئازسا ئاھ ئۇرما يول تاپىدۇ بىر كۇنى"),
48 |     ("aman bolsa ularmu pul tapidu bir kuni",
49 |      "ئامان بولسا ئۇلارمۇ پۇل تاپىدۇ بىر كۇنى"),
50 |     ("Hata basqan qAdAmlAr qaldurGanda qAdAmdin",
51 |      "خاتا باسقان قەدەملەر قالدۇرغاندا قەدەمدىن"),
52 |     ("Ozi maNGan catqalGa ot yaqidu bir kuni",
53 |      "ئۆزى ماڭغان چاتقالغا ئوت ياقىدۇ بىر كۇنى"),
54 |     ("gAdAnkAxlik hickimdin udum AmAs ularGa",
55 |      "گەدەنكەشلىك ھىچكىمدىن ئۇدۇم ئەمەس ئۇلارغا"),
56 |     ("Ozi kilip aldiNGa gAp acidu bir kuni",
57 |      "ئۆزى كىلىپ ئالدىڭغا گەپ ئاچىدۇ بىر كۇنى"),
58 |     ("at bolGicA asawtay hAryan cipip baqmaydu",
59 |      "ئات بولغىچە ئاساۋتاي ھەريان چىپىپ باقمايدۇ"),
60 |     ("ularGimu bu dunya yUk artidu bir kuni",
61 |      "ئۇلارغىمۇ بۇ دۇنيا يۈك ئارتىدۇ بىر كۇنى"),
62 | ]
63 | 
64 | 
65 | @pytest.mark.parametrize("input,expected", test_data)
66 | def test_CTS2UAS(input, expected):
67 |     converter = UgMultiScriptConverter("XJUS", "UAS")
68 |     assert converter(input) == expected
69 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Script Converter for Uyghur Language
  2 | This converter supports multiple Uyghur writing systems:
  3 | - **ULS** — Uyghur Latin Script  
  4 | - **UAS** — Uyghur Arabic Script  
  5 | - **CTS** — Common Turkic Script  
  6 | - **UCS** — Uyghur Cyrillic Script  
  7 | - **UYS** — Uyghur Yengi (New) Script  
  8 | - **IPA** — International Phonetic Alphabet  
  9 | - **UZLS** — Uzbek Latin Script  
 10 | - **XJUS** — Xinjiang University Script  
 11 | 
 12 | ## Installation
 13 | ```
 14 | pip install umsc
 15 | ```
 16 | 
 17 | 
 18 | ## Mapping table
 19 | | UAS | CTS | ULS| UCS|UYS| IPA   | UZLS | XJUS |
 20 | |-----|----| ---- | --- | -- |-------|---|------|
 21 | | ا   | a  | a    | а   |a | /ɑ/   | o | a    |
 22 | | ە   | e  | e    | ә   |ə | /æ/   | a | A    |
 23 | | ب   | b  | b    | б   |b | /b/   | b | b    |
 24 | | پ   | p  | p    | п   |p | /p/   | p | p    |
 25 | | ت   | t  | t    | т   |t | /t/   | t | t    |
 26 | | ج   | c  | j    | җ   |j | /d͡ʒ/ |  j | j   |
 27 | | چ   | ç  | ch   | ч   |q | /t͡ʃ/ | ch | c   |
 28 | | خ   | x  | x    | х   |h | /χ/   | x | H    |
 29 | | د   | d  | d    | д   |d | /d/   | d | d    |
 30 | | ر   | r  | r    | р   |r | /r/   | r | r    |
 31 | | ز   | z  | z    | з   |z | /z/   | z | z    |
 32 | | ژ   | j  | zh   | ж   |ⱬ | /ʒ/   |  j | J   |
 33 | | س   | s  | s    | с   |s | /s/   | s | s    |
 34 | | ش   | ş  | sh   | ш   |x | /ʃ/   | sh | x   |
 35 | | ف   | f  | f    | ф   |f | /f/   | f | f    |
 36 | | ڭ   | ñ  | ng   | ң   |ng | /ŋ/   | ng | N  |
 37 | | ل   | l  | l    | л   |l | /l/   | l | l    |
 38 | | م   | m  | m    | м   |m | /m/   | m | m    |
 39 | | ھ   | h  | h    | һ   |ⱨ | /h/   | h | h    |
 40 | | و   | o  | o    | о   |o | /o/   | oʻ | o    |
 41 | | ۇ   | u  | u    | у   |u | /u/   | u | u    |
 42 | | ۆ   | ö  | ö    | ө   |ɵ | /ø/   | oʻ | O   |
 43 | | ۈ   | ü  | ü    | ү   |ü | /y/   | uʻ | U   |
 44 | | ۋ   | v  | w    | в   |w | /w/   | v | w    |
 45 | | ې   | é  | é    | е   |e | /ɛ/   | e | e    |
 46 | | ى   | i  | i    | и   |i | /i/   | i | i    |
 47 | | ي   | y  | y    | й   |y | /j/   | y | y    |
 48 | | ق   | q  | q    | қ   |ⱪ | /q/   | q | q    |
 49 | | ك   | k  | k    | к   |k | /k/   | k | k    |
 50 | | گ   | g  | g    | г   |g | /ɡ/   | g | g    |
 51 | | ن   | n  | n    | н   |n | /n/   | n | n    |
 52 | | غ   | ğ  | gh   | ғ   |ƣ | /ʁ/   | gʻ | G   |
 53 | | ئ   |    |      |     | |       |   | v    |
 54 | | يا  | ya | ya   | я   |ya |       | ya | ya   |
 55 | | يۇ  | yu | yu   | ю   |yu |       | yu | yu   |
 56 | 
 57 | ## Sample input and output examples
 58 | 
 59 | Review the files in the tests directory for examples of converting between different scripts.
 60 | 
 61 | ## Usage
 62 | 
 63 | ```
 64 | from umsc import UgMultiScriptConverter
 65 | # To convert text, you need to define source and target scripts
 66 | # Abbreviations of the scripts:
 67 | # ULS | Uyghur Latin Script
 68 | # UYS | Uyghur Yengi (New) Script
 69 | # CPS | Chinese Pinyin Script
 70 | # UAS | Uyghur Arabic Script
 71 | # CTS |Common Turkic Script
 72 | # UCS | Uyghur Cyrillic Script
 73 | # XJUS | Xinjiang University Script (case-sensitive)
 74 | # UZLS | Uzbek Latin Script
 75 | # Convert Uyghur Arabic Script to Uyghur Latin Script
 76 | source_script = 'UAS'
 77 | target_script = 'ULS'
 78 | converter = UgMultiScriptConverter(source_script, target_script)
 79 | text1 = 'ياخشىمۇسىز!'
 80 | text1 = converter(text1)
 81 | print(text1)
 82 | # Convert Uyghur Latin Script to Uyghur Arabic Script
 83 | source_script = 'ULS'
 84 | target_script = 'UAS'
 85 | converter = UgMultiScriptConverter(source_script, target_script)
 86 | text2 = 'yaxshimusiz!'
 87 | text2 = converter(text2)
 88 | print(text2)
 89 | ```
 90 | 
 91 | ## Notes
 92 | - IPA and Uzbek support are still under development, so the mappings might not be very accurate. For Uzbek in particular, it is not yet clear how "ئا" and "ئە" should be mapped.
 93 | 
 94 | 
 95 | ## Citation
 96 | 
 97 | If you wish to cite this project, please use the "Cite this repository" option of the repository page (the citation data is in `CITATION.cff`).
 98 | 
 99 | ## Contributing
100 | Feel free to open issues and submit pull requests.
101 | 
102 | ## License
103 | Distributed under the Apache 2.0 License. See [`LICENSE`](LICENSE) for more information.
104 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/umsc/umsc.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | '''
  5 | 
  6 | # Original Author: neouyghur
  7 | # Mail: osmanjan.t@gmail.com
  8 | # Licence: MIT License
  9 | 
 10 | This is a simple script to convert Uyghur texts written in different Uyghur scripts. It supports Uyghur Arabic,
 11 | Latin Common Turkish scripts, Uyghur Latin Script (also known as computer script), Uyghur Yengi (new) script and Uyghur
 12 | Cyrillic script. It is written in Python and uses PyQt5 for GUI. The source script will be converted to common turkic script,
 13 | then converted to target script. Therefore, the program is not very efficient but easy to add new scripts.
 14 | 
 15 | Abbreviations used in this file:
 16 | 
 17 | ULS | Uyghur Latin Script
 18 | UYS | Uyghur Yengi (New) Script
 19 | CPS | Chinese Pinyin Script
 20 | UAS | Uyghur Arabic Script
 21 | CTS |Common Turkic Script
 22 | UCS | Uyghur Cyrillic Script
 23 | XJUS | Xinjiang University Script (English letters, case sensitive)
 24 | UZLS | Uzbek Latin Script
 25 | 
 26 | '''
 27 | import regex as re
 28 | import argparse
 29 | 
 30 | 
 31 | class UgMultiScriptConverter:
 32 |     def __init__(self, source_script, target_script, less_apostrophe=False):
 33 |         self.source_script = source_script
 34 |         self.target_script = target_script
 35 |         # self.less_apostrophe = less_apostrophe
 36 | 
 37 |         self.__uas_group1 = [u'ا', u'ە', u'ب', u'پ', u'ت', u'ج', u'چ', u'خ', u'د', u'ر',
 38 |                              u'ز', u'ژ', u'س', u'ش', u'ف', u'ڭ', u'ل', u'لا', u'م', u'ھ',
 39 |                              u'و', u'ۇ', u'ۆ', u'ۈ', u'ۋ', u'ې', u'ى', u'ي', u'ق', u'ك',
 40 |                              u'گ', u'ن', u'غ', u'؟', u'،', u'؛', u'٭']  # u'ئ',
 41 |         # following may be not necessary, u'«', u'«', u'«', u'«', u'»', u'»', u'»', u'»']
 42 |         self.__cts_group1 = [u'a', u'e', u'b', u'p', u't', u'c', u'ç', u'x', u'd', u'r',
 43 |                              u'z', u'j', u's', u'ş', u'f', u'ñ', u'l', u'la', u'm', u'h',
 44 |                              u'o', u'u', u'ö', u'ü', u'v', u'é', u'i', u'y', u'q', u'k',
 45 |                              u'g', u'n', u'ğ', u'?', u',', u';', u'*']
 46 |         self.__ucs_group1 = [u'а', u'ә', u'б', u'п', u'т', u'җ', u'ч', u'х', u'д', u'р',
 47 |                              u'з', u'ж', u'с', u'ш', u'ф', u'ң', u'л', u'ла', u'м', u'һ',
 48 |                              u'о', u'у', u'ө', u'ү', u'в', u'е', u'и', u'й', u'қ', u'к',
 49 |                              u'г', u'н', u'ғ', u'?', u',', u';', u'*']
 50 | 
 51 |         # I have to improve this. It is not complete
 52 |         self.__ipa_group1 = ["ɑ", "æ", "b", "p", "t", "dʒ", "tʃ", "χ", "d", "r",
 53 |                              "z", "ʒ", "s", "ʃ", "f", "ŋ", "l", "la", "m", "h",
 54 |                              "o", "u", "ø", "y", "w", "ɛ", "i", "j", "q", "k",
 55 |                              "ɡ", "n", "ʁ", u'?', u',', u';', u'*']
 56 | 
 57 |     def __call__(self, text, source_script=None, target_script=None):
 58 |         if source_script:
 59 |             self.source_script = source_script.upper()
 60 |         else:
 61 |             self.source_script = self.source_script.upper()
 62 |         if target_script:
 63 |             self.target_script = target_script.upper()
 64 |         else:
 65 |             self.target_script = self.target_script.upper()
 66 | 
 67 |         # If source and target are same, then return original text
 68 |         if self.target_script == self.source_script:
 69 |             return text  # No conversion needed
 70 | 
 71 |         method_name = f'{self.source_script}2{self.target_script}'
 72 | 
 73 |         convert_method = getattr(self, method_name, None)
 74 | 
 75 |         if convert_method:
 76 |             return convert_method(text)
 77 |         else:
 78 |             raise ValueError(
 79 |                 f'Conversion from {self.source_script} to {self.target_script} not supported')
 80 | 
 81 |     def isPureUyghurScript(herp):
 82 |         m = re.search('[\u0621-\u06ff]', herp)
 83 |         if m == None:
 84 |             return False
 85 |         else:
 86 |             return True
 87 | 
 88 |     def _repalce_via_table(self, text, tab1, tab2):
 89 |         for i, j in zip(tab1, tab2):
 90 |             text = text.replace(i, j)
 91 |         return text
 92 | 
 93 |     # ----------------------------------------------
 94 |     # Source script to common turkic script
 95 |     def UAS2CTS(self, text, keep_apstrophe=False):
 96 |         """
 97 |         UAS to CTS
 98 |         Parameters
 99 |         ----------
100 |         text : str
101 | 
102 |         Returns
103 |         -------
104 |         text
105 |         """
106 |         text = self._repalce_via_table(
107 |             text, self.__uas_group1, self.__cts_group1)
108 |         text = self.__revise_CTS(text, keep_apstrophe)
109 |         return text
110 | 
111 |     def __revise_CTS(self, text, keep_apostrophes):
112 |         """
113 |         revise CTS
114 |         Parameters
115 |         ----------
116 |         text : str
117 | 
118 |         Returns
119 |             Text
120 |         -------
121 | 
122 |         """
123 |         # Remove a "U+0626" if it is a beginning of a word, if it is not after a alphabet in CTS
124 |         text = re.sub(
125 |             r'(?<=[^aeuoöübptcçxdzrjsşfñlmhvéiyqkgnğ]|^)\u0626', '', text)
126 |         # Replace a "U+0626" with "'" if "U+0626" is appeared in a word and its previous character is not in
127 |         # [u'a', u'e', u'é', u'i', u'o', u'u', u'ö', u'ü']
128 |         if not keep_apostrophes:
129 |             text = re.sub(r'(([aeéiouöü])\u0626)',
130 |                           lambda m: m.group()[0], text)
131 |         text = text.replace('\u0626', u"'")
132 |         return text
133 | 
134 |     def ULS2CTS(self, text):
135 |         text = text.lower()
136 |         # ch ç # zh j # sh ş # gh ğ
137 |         text = text.replace(u"j", u'c') \
138 |             .replace(u"ng", u'ñ') \
139 |             .replace(u"n'g", u'ng') \
140 |             .replace(u"'ng", u'ñ') \
141 |             .replace(u'ch', u'ç') \
142 |             .replace(u'zh', u'j') \
143 |             .replace(u'sh', u'ş') \
144 |             .replace(u"'gh", u'ğ') \
145 |             .replace(u"gh", u'ğ') \
146 |             .replace(u"w", u'v') \
147 |             .replace(u'ch', u'ç')
148 |         return text
149 | 
150 |     def UYS2CTS(self, text):
151 |         text = text.lower()
152 |         # e:ə c:j ç:q x:h j:ⱬ ş:x ñ:ng ö:ø ü:ü  v:w é:e
153 |         # q:ⱪ ğ:ƣ
154 |         text = text.replace(u"e", u'é') \
155 |             .replace(u"ə", u'e') \
156 |             .replace(u"j", u'c') \
157 |             .replace(u"q", u'ç') \
158 |             .replace(u"ⱬ", u'j') \
159 |             .replace(u"x", u'ş') \
160 |             .replace(u"h", u'x') \
161 |             .replace(u"ⱨ", u'h') \
162 |             .replace(u"ng", u'ñ') \
163 |             .replace(u"ø", u'ö') \
164 |             .replace(u"ü", u'ü') \
165 |             .replace(u"w", u'v') \
166 |             .replace(u"ⱪ", u'q') \
167 |             .replace(u"ƣ", u'ğ')
168 |         return text
169 | 
170 |     def UCS2CTS(self, text):
171 |         text = text.lower()
172 |         text = self._repalce_via_table(
173 |             text, self.__ucs_group1, self.__cts_group1)
174 |         text = text.replace("я", "ya").replace("ю", "yu")
175 |         return text
176 | 
177 |     def XJUS2CTS(self, text):
178 |         text = text.replace('v', "\u0626") \
179 |             .replace(u'J', u"j") \
180 |             .replace(u'c', u"ç") \
181 |             .replace(u'j', u"c") \
182 |             .replace(u'x', u"ş") \
183 |             .replace(u'H', u"x") \
184 |             .replace(u'N', u"ñ") \
185 |             .replace(u'O', u"ö") \
186 |             .replace(u'U', u"ü") \
187 |             .replace(u'e', u"é") \
188 |             .replace(u"A", u'e') \
189 |             .replace(u'G', u"ğ") \
190 |             .replace(u'w', u"v")
191 |         text = self.__revise_CTS(text, False)
192 |         return text
193 | 
194 |     def XJUS2UAS(self, text):
195 |         text = text.replace('v', "\u0626") \
196 |             .replace(u'c', u"ç") \
197 |             .replace(u'j', u"c") \
198 |             .replace(u'J', u"j") \
199 |             .replace(u'x', u"ş") \
200 |             .replace(u'H', u"x") \
201 |             .replace(u'N', u"ñ") \
202 |             .replace(u'O', u"ö") \
203 |             .replace(u'U', u"ü") \
204 |             .replace(u'e', u"é") \
205 |             .replace(u"A", u'e') \
206 |             .replace(u'G', u"ğ") \
207 |             .replace(u'w', u"v")
208 |         text = self.CTS2UAS(self.__revise_CTS(text, False))
209 |         return text
210 | 
211 |     def UZLS2CTS(self, text):
212 |         text = text.replace(u'ch', u'ç') \
213 |             .replace('sh', u'ş') \
214 |             .replace("s'h", 'sh') \
215 |             .replace('ng', u"ñ") \
216 |             .replace("n'g", 'ng') \
217 |             .replace(u"g‘", u'ğ') \
218 |             .replace("o‘", u"ö") \
219 |             .replace("u‘", u"ü") \
220 |             .replace("e", u"é") \
221 |             .replace('a', 'e') \
222 |             .replace(u'o', 'a') \
223 |             .replace(u'j', 'c')
224 |         text = self.__revise_CTS(text, False)
225 |         return text
226 | 
227 |     # ----------------------------------------------
228 |     # Common turkic script to target script
229 | 
230 |     def CTS2UAS(self, text):
231 |         """
232 |         CTS to UAS
233 |         Parameters
234 |         ----------
235 |         text : str
236 | 
237 |         Returns
238 |         -------
239 |           text
240 |         """
241 | 
242 |         text = re.sub(r'(?<=[^bptcçxdrzjsşfñlmhvyqkgnğ]|^)[aeéiouöü]',
243 |                       lambda m: u'\u0626' + m.group(), text)
244 |         # add a "U+0626" before a vowel if it is the beginning of a word or after a vowel but not at the end of the word
245 |         # for example
246 |         # "ait" -> "U+0626aU+0626it" ئائىت
247 |         # cuñxua -> cuñxua. cuñxu'a is wrong جۇڭخۇا
248 |         text = self._repalce_via_table(
249 |             text, self.__cts_group1, self.__uas_group1)
250 |         # replace "'\u0626" with ""
251 |         text = text.replace(u"'", '')
252 |         text = self._revise_UAS(text)
253 |         return text
254 | 
255 |     def _revise_UAS(self, text):
256 |         return re.sub(r"(^|-|\s|[اەېىوۇۆۈ])([اەېىوۇۆۈ])", lambda m: m.group(1) + "ئ" + m.group(2), text)
257 | 
258 |     def CTS2ULS(self, text):
259 |         text = text.lower()
260 |         text = text.replace(u'ng', u"n'g") \
261 |             .replace(u'sh', u"s'h") \
262 |             .replace(u'ch', u"c'h") \
263 |             .replace(u'zh', u"z'h") \
264 |             .replace(u'gh', u"g'h") \
265 |             .replace(u'ng', u"n'g") \
266 |             .replace(u'nğ', u"n'gh") \
267 |             .replace(u'ñ', u"ng") \
268 |             .replace(u'j', u'zh') \
269 |             .replace(u"c", u'j') \
270 |             .replace(u'ç', u'ch') \
271 |             .replace(u'ş', u'sh') \
272 |             .replace(u"ğ", u"gh") \
273 |             .replace(u"v", u'w')
274 |         return text
275 | 
276 |     def CTS2UYS(self, text):
277 |         text = text.lower()
278 |         text = text.replace(u'ng', u"n'g") \
279 |             .replace(u"e", u'ə') \
280 |             .replace(u'j', u"ⱬ") \
281 |             .replace(u'c', u"j") \
282 |             .replace(u'q', u"ⱪ") \
283 |             .replace(u'ç', u"q") \
284 |             .replace(u'h', u"ⱨ") \
285 |             .replace(u'x', u"h") \
286 |             .replace(u'ş', u"x") \
287 |             .replace(u'ñ', u"ng") \
288 |             .replace(u'ö', u"ø") \
289 |             .replace(u'v', u"w") \
290 |             .replace(u'é', u"e") \
291 |             .replace(u'ğ', u"ƣ")
292 |         return text
293 | 
294 |     def CTS2IPA(self, text):
295 |         position = self.__ipa_group1.index('y')
296 |         self.__cts_group1 = self.__cts_group1[:position] + \
297 |             self.__cts_group1[position+1:]
298 |         self.__ipa_group1 = self.__ipa_group1[:position] + \
299 |             self.__ipa_group1[position + 1:]
300 | 
301 |         text = self._repalce_via_table(
302 |             text, self.__cts_group1, self.__ipa_group1)
303 |         text = text.replace('ü', 'y')
304 |         return text
305 | 
306 |     def CTS2UZLS(self, text):
307 |         text = text.lower()
308 |         text = text.replace(u"a", u'o')\
309 |             .replace(u"e", u'a') \
310 |             .replace(u'c', u"j") \
311 |             .replace(u'q', u"q") \
312 |             .replace(u'ç', u"ch") \
313 |             .replace(u'ş', u"sh") \
314 |             .replace(u'ñ', u"ng") \
315 |             .replace(u'ö', u"o‘") \
316 |             .replace(u'ü', u"u‘") \
317 |             .replace(u'é', u"e") \
318 |             .replace(u'ğ', u"g‘")
319 |         return text
320 | 
321 |     def CTS2XJUS(self, text):
322 |         text = text.lower()
323 |         text = text.replace(u"e", u'A') \
324 |             .replace(u'x', u"H") \
325 |             .replace(u'j', u"J") \
326 |             .replace(u'c', u"j") \
327 |             .replace(u'ç', u"c") \
328 |             .replace(u'ş', u"x") \
329 |             .replace(u'ñ', u"N") \
330 |             .replace(u'ö', u"O") \
331 |             .replace(u'ü', u"U") \
332 |             .replace(u'é', u"e") \
333 |             .replace(u'ğ', u"G") \
334 |             .replace(u'v', u"w")
335 | 
336 |         text = re.sub(
337 |             r'(?<=[^bptcxdrzjJsxfNlmhHyqkgnGw]|^)[aAeiouOU]', lambda m: 'v' + m.group(), text)
338 |         text = text.replace(u"'", '')
339 |         return text
340 | 
341 |     def CTS2UCS(self, text):
342 |         text = text.lower()
343 |         text = text.replace("ya", "я").replace("yu", "ю")
344 |         text = self._repalce_via_table(
345 |             text, self.__cts_group1, self.__ucs_group1)
346 |         # return text.replace("'", "")
347 |         return text
348 | 
349 |     # ----------------------------------------------
350 |     # Uyghur Latin script to target script
351 |     def ULS2UAS(self, text):
352 |         return self.CTS2UAS(self.ULS2CTS(text))
353 | 
354 |     def ULS2UCS(self, text):
355 |         return self.CTS2UCS(self.ULS2CTS(text))
356 | 
357 |     def ULS2UYS(self, text):
358 |         return self.CTS2UYS(self.ULS2CTS(text))
359 | 
360 |     # ----------------------------------------------
361 |     # Uyghur Arabic script to target script
362 | 
363 |     def UAS2ULS(self, text):
364 |         return self.CTS2ULS(self.UAS2CTS(text, True))
365 | 
366 |     def UAS2UCS(self, text):
367 |         return self.CTS2UCS(self.UAS2CTS(text, True))
368 | 
369 |     def UAS2UYS(self, text):
370 |         return self.CTS2UYS(self.UAS2CTS(text, True))
371 | 
372 |     # ----------------------------------------------
373 |     # Uyghur Cyrillic script to target script
374 | 
375 |     def UCS2UAS(self, text):
376 |         return self.CTS2UAS(self.UCS2CTS(text))
377 | 
378 |     def UCS2ULS(self, text):
379 |         return self.CTS2ULS(self.UCS2CTS(text))
380 | 
381 |     def UCS2ULS(self, text):
382 |         return self.CTS2ULS(self.UCS2CTS(text))
383 | 
384 |     def UCS2UYS(self, text):
385 |         return self.CTS2UYS(self.UCS2CTS(text))
386 | 
387 |     # ----------------------------------------------
388 |     # Uyghur Yengi script to target script
389 | 
390 |     def UYS2UAS(self, text):
391 |         return self.CTS2UAS(self.UYS2CTS(text))
392 | 
393 |     def UYS2ULS(self, text):
394 |         return self.CTS2ULS(self.UYS2CTS(text))
395 | 
396 |     def UYS2UCS(self, text):
397 |         return self.CTS2UCS(self.UYS2CTS(text))
398 | 
399 |     def UAStoUZLS(self, text):
400 |         return self.CTS2UZLS(self.UAS2CTS(text, True))
401 | 
402 | 
def args_parser():
    """Parse the command-line options of the converter script.

    All four options (source script, target script, input file and output
    file) are mandatory.  The values are read from ``sys.argv`` and returned
    as an ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(
        description='Convert text from one script to another')
    option_specs = (
        ('-s', '--source', 'source script'),
        ('-t', '--target', 'target script'),
        ('-i', '--input', 'input file'),
        ('-o', '--output', 'output file'),
    )
    for short_flag, long_flag, description in option_specs:
        cli.add_argument(short_flag, long_flag, help=description,
                         required=True)
    return cli.parse_args()
413 | 
414 | 
if __name__ == "__main__":
    # Command-line entry point: read the input file, convert it from the
    # source script to the target script and write the result.
    args = args_parser()

    # The texts handled here are non-ASCII, so the encoding is fixed to UTF-8
    # instead of relying on the platform default.
    with open(args.input, 'r', encoding='utf-8') as f:
        text = f.read()

    converter = UgMultiScriptConverter(args.source, args.target)
    text = converter(text)
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(text)

    print("Done")
427 | 


--------------------------------------------------------------------------------