├── .gitignore ├── LICENSE ├── README.md ├── colab ├── en_IN_on_colab.ipynb ├── locale-colab-snippet.ipynb └── locale_module_colab.ipynb ├── data ├── bn_global_popl.tsv ├── demographics.tsv ├── din.txt ├── dolar_endeksi.tsv ├── fa_stats.tsv ├── klpt_stopwords.json ├── myanmar-regions.tsv ├── myanmar_ethnic_groups.tsv ├── rbbi │ ├── Default.rbbi │ ├── Lao.rbbi │ ├── lucene │ │ ├── source.md │ │ └── uax29 │ │ │ ├── Default.rbbi │ │ │ └── MyanmarSyllable.rbbi │ ├── solrcene │ │ ├── Hebrew.rbbi │ │ ├── Khmer.rbbi │ │ ├── Lao.rbbi │ │ ├── Myanmar.rbbi │ │ └── source.md │ └── source.md ├── régions_métropolitaines.tsv ├── sorani_alphabet.tsv ├── sorani_alphabet_wikipedia.tsv ├── source.md ├── türkiye'ninz-illeri.tsv └── wordlists │ └── source.md ├── docs ├── DRAFT_icu_transforms.pdf ├── README.md └── matplotlib.md ├── notebooks ├── Collation.ipynb ├── Sorting_emoji.ipynb ├── armenian_pandas.ipynb ├── bangla_df.ipynb ├── ckb_sort.ipynb ├── complex_script_support_images.ipynb ├── data │ └── allkeys.txt ├── ethiopic_numbers.ipynb ├── icu_transforms.ipynb ├── images │ ├── sorani_plotly.png │ ├── sorani_plotly2.png │ └── sorani_plotly_inline.png ├── img │ ├── 1440px-Lake_Dukan_12.jpg │ ├── ckb_IQ_collation.png │ ├── khamti.jpg │ ├── linux1.png │ ├── macos1.png │ ├── mplcairo_output.png │ ├── sibe.jpg │ ├── std_matplotlib_output.png │ ├── tai_aiton.jpg │ ├── tai_aiton_text_to_image.png │ └── yolngu.jpg ├── is_IS.ipynb ├── kn_demographics_pandas_matplotlib.ipynb ├── kn_demographics_pandas_plottly.ipynb ├── matplotlib_locale.ipynb ├── matplotlib_mplcairo.ipynb ├── matplotlib_mplcairo2.ipynb ├── matplotlib_pyicu.ipynb ├── my-segmentation.ipynb ├── pandas_plot_mplcairo.ipynb ├── pandas_plot_plotly.ipynb ├── persian_df.ipynb ├── plotly.ipynb ├── plotly2.ipynb ├── seaborn.ipynb ├── sorting_pandas.ipynb ├── strings_casing_matching.ipynb ├── turkish_df.ipynb └── vietnamese_pandas.ipynb ├── py ├── am_ET_numbers_icu.py ├── am_ET_numbers_icu_1.png ├── am_ET_numbers_icu_1.py ├── 
am_ET_numbers_icu_2.png ├── am_ET_numbers_icu_2.py ├── arabic_reshaper_example.py ├── hi_IN_numbers_icu.png ├── hi_IN_numbers_icu.py ├── matplotlib_kurdish.png ├── matplotlib_kurdish.py ├── pandas_plot_kurdish.png ├── pandas_plot_kurdish.py ├── pyuca_test.py ├── seaborn_kurdish.png ├── seaborn_kurdish.py ├── wordcloud_kurdish.png └── wordcloud_kurdish.py ├── requirements.txt ├── rules ├── collation │ ├── README.md │ ├── allkeys_CLDR.txt │ ├── allkeys_DUCET.txt │ ├── cldr │ │ ├── ckb.xml │ │ ├── ckb_IQ.xml │ │ ├── ckb_IR.xml │ │ └── dtd │ │ │ └── ldml.dtd │ ├── collation_rules.py │ ├── glibc │ │ ├── ckb_IQ@academy │ │ └── en_SS │ ├── icu │ │ ├── ckb.txt │ │ └── ckb_IQ.txt │ ├── sorani_alphabet.tsv │ └── temp.ipynb └── segmentation │ ├── regex_patterns.md │ └── syllables │ ├── Khmer.rbbi │ ├── Lao.rbbi │ └── Myanmar.rbbi ├── snippets ├── break_iterator.py ├── convert_digits.py ├── data_cleaning.py ├── matching.py ├── regex_segmentation.py └── sort_key_normalise.py └── utils ├── cesu8.py ├── el_utils.py └── elle.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | ### JupyterNotebooks ### 132 | # gitignore template for Jupyter Notebooks 133 | # website: http://jupyter.org/ 134 | 135 | .ipynb_checkpoints 136 | */.ipynb_checkpoints/* 137 | 138 | # IPython 139 | profile_default/ 140 | ipython_config.py 141 | 142 | # Remove previous ipynb_checkpoints 143 | # git rm -r .ipynb_checkpoints/ 144 | 145 | ### Linux ### 146 | *~ 147 | 148 | # temporary files which can be created if a process still has a handle open of a deleted file 149 | .fuse_hidden* 150 | 151 | # KDE directory preferences 152 | .directory 153 | 154 | # Linux trash folder which might appear on any partition or disk 155 | .Trash-* 156 | 157 | # .nfs files are created when an open file is removed but is still being accessed 158 | .nfs* 159 | 160 | ### macOS ### 161 | # General 162 | .DS_Store 163 | .AppleDouble 164 | .LSOverride 165 | 166 | # Icon must end with two \r 167 | Icon 168 | 169 | 170 | # Thumbnails 171 | ._* 172 | 173 | # Files that might appear in the root of a volume 174 | .DocumentRevisions-V100 175 | .fseventsd 176 | .Spotlight-V100 177 | .TemporaryItems 178 | .Trashes 179 | .VolumeIcon.icns 180 | .com.apple.timemachine.donotpresent 181 | 182 | # Directories potentially created on remote AFP share 183 | .AppleDB 184 | .AppleDesktop 185 | Network Trash Folder 186 | Temporary Items 187 | .apdisk 188 | 189 | ### VisualStudioCode ### 190 | .vscode/* 191 | 
!.vscode/settings.json 192 | !.vscode/tasks.json 193 | !.vscode/launch.json 194 | !.vscode/extensions.json 195 | *.code-workspace 196 | 197 | # Local History for Visual Studio Code 198 | .history/ 199 | 200 | ### VisualStudioCode Patch ### 201 | # Ignore all local history of files 202 | .history 203 | .ionide 204 | 205 | ### Windows ### 206 | # Windows thumbnail cache files 207 | Thumbs.db 208 | Thumbs.db:encryptable 209 | ehthumbs.db 210 | ehthumbs_vista.db 211 | 212 | # Dump file 213 | *.stackdump 214 | 215 | # Folder config file 216 | [Dd]esktop.ini 217 | 218 | # Recycle Bin used on file shares 219 | $RECYCLE.BIN/ 220 | 221 | # Windows Installer files 222 | *.cab 223 | *.msi 224 | *.msix 225 | *.msm 226 | *.msp 227 | 228 | # Windows shortcuts 229 | *.lnk 230 | 231 | # Repo specfic 232 | notes/ 233 | print/ 234 | archive/ 235 | .vscode/ 236 | data/wordlists/kurdi_words.txt 237 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-2 Enabling Languages 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python internationalisation 2 | 3 | There is limited information on Python internationalisation or Python internationalisation best practices. What little that's available is scattered, and most available articles and tutorials on Python internationalisation are specifically on localisation. 4 | 5 | The EL notebooks contain notes on various aspects of Python internationalisation, and new topics will be added over time. 6 | 7 | Feedback is welcome. 8 | 9 | ## Python internationalisation notes 10 | 11 | * Collation 12 | 1. [Sorting](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/Collation.ipynb) 13 | 2. [Sorting emoji](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/Sorting_emoji.ipynb) 14 | 3. [Sorting pandas](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/sorting_pandas.ipynb) 15 | * Data visualisation 16 | 1. [Matplotlib, pandas plot, seaborn, wordcloud](https://github.com/enabling-languages/python-i18n/blob/main/docs/matplotlib.md) 17 | 2. [Locale specific formatting of numeric tick labels on matplotlib](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_locale.ipynb) 18 | 3. [Using PyICU to format matplotlib numeric tick labels](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_pyicu.ipynb) 19 | * Working with digits 20 | 1. [snippets](https://github.com/enabling-languages/python-i18n/blob/main/snippets/convert_digits.py) 21 | 22 | ## Google Colab notes 23 | 24 | 1. 
[Setting the locale of a notebook for Google Colab](https://github.com/enabling-languages/python-i18n/blob/main/colab/locale_module_colab.ipynb) 25 | 26 | ## Resources 27 | 28 | Python documentation: 29 | 30 | * [Internationalization](https://docs.python.org/3/library/i18n.html) 31 | * [Unicode HOWTO](https://docs.python.org/3/howto/unicode.html) 32 | * [Unicode Objects and Codecs](https://docs.python.org/3/c-api/unicode.html) 33 | -------------------------------------------------------------------------------- /colab/locale-colab-snippet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Loading locales in Colab: snippet\n", 8 | "\n", 9 | "Refer to [locale_module_colab.ipynb](locale_module_colab.ipynb)." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Import locale module\n", 19 | "import locale" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "try:\n", 29 | " import google.colab\n", 30 | " IN_COLAB = True\n", 31 | "except ImportError:\n", 32 | " IN_COLAB = False\n", 33 | "if IN_COLAB:\n", 34 | " try:\n", 35 | " locale.setlocale(locale.LC_ALL, \"en_AU.UTF-8\")\n", 36 | " except locale.Error:\n", 37 | " !sudo apt-get install language-pack-en language-pack-fr language-pack-sv language-pack-de\n", 38 | " #!sudo apt autoremove\n", 39 | " import os\n", 40 | " os.kill(os.getpid(), 9)\n", 41 | "else:\n", 42 | " locale.setlocale(locale.LC_ALL, \"en_AU.UTF-8\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "print(\"IN_COLAB: \" + str(IN_COLAB))\n", 52 | "print(locale.getlocale())" 53 | ] 54 | } 55 | ], 56 | "metadata": { 57 | "interpreter": { 58 | "hash": 
"bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3" 59 | }, 60 | "kernelspec": { 61 | "display_name": "Python 3.8.1 ('el')", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.8.1" 76 | }, 77 | "orig_nbformat": 4 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 2 81 | } 82 | -------------------------------------------------------------------------------- /data/demographics.tsv: -------------------------------------------------------------------------------- 1 | "---" "جیھانی" "تورکیا" "ئێران" "عێراق" "سووریا" 2 | "کرمانجی" "١٤٬٤١٩٬٠٠٠" "٧٬٩١٩٬٠٠٠" "٤٤٣٬٠٠٠" "٣٬١٨٥٬٠٠٠" "١٬٦٦١٬٠٠٠ 3 | " 4 | "ئەوانەی بە تورکی دەدوێن" "٥٬٧٣٢٬٠٠٠" "٥٬٧٣٢٬٠٠٠" "-" "-" "- 5 | " 6 | "باشوور" "٣٬٣٨١٬٠٠٠" "-" "٣٬٣٨١٬٠٠٠" "-" "- 7 | " 8 | "سۆرانی" "١٬٥٧٦٬٠٠٠" "-" "٥٠٢٬٠٠٠" "٥٦٧٬٠٠٠" "- 9 | " 10 | "زازایی - دەملی" "١٬١٢٥٬٠٠٠" "١٬١٢٥٬٠٠٠" "-" "-" "- 11 | " 12 | "زازایی - ئەلڤێکا" "١٨٤٬٠٠٠" "١٧٩٬٠٠٠" "-" "-" "- 13 | " 14 | "ڕەوەند" "٩٠٬٠٠٠" "٣٨٬٠٠٠" "٢٠٬٠٠٠" "٣٣٬٠٠٠" "- 15 | " 16 | "ھەورامی" "٥٤٬٠٠٠" "-" "٢٦٬٠٠٠" "٢٨٬٠٠٠" "- 17 | " 18 | "شکاکی" "٤٩٬٠٠٠" "٢٣٬٠٠٠" "٢٦٬٠٠٠" "-" "- 19 | " 20 | "کۆی گشتی" "٢٦٬٧١٢٬٠٠٠" "١٥٬٠١٦٬٠٠٠" "٤٬٣٩٨٬٠٠٠" "٣٬٩١٦٬٠٠٠" "١٬٦٦١٬٠٠٠" -------------------------------------------------------------------------------- /data/fa_stats.tsv: -------------------------------------------------------------------------------- 1 | سال ولادت وفات 2 | "۱۳۳۸ " "۸۶۴٬۸۴۶ " ۱۷۶٬۲۸۸ 3 | "۱۳۳۹ " "۸۷۶٬۲۰۶ " ۱۷۱٬۰۴۰ 4 | "۱۳۴۰ " "۹۰۲٬۲۶۰ " ۱۵۹٬۳۷۱ 5 | "۱۳۴۱ " "۹۵۷٬۵۰۰ " ۱۶۵٬۴۸۸ 6 | "۱۳۴۲ " "۹۲۰٬۹۶۷ " ۱۳۵٬۹۱۲ 7 | "۱۳۴۳ " "۱٬۱۱۸٬۹۱۱ " ۱۴۵٬۱۷۴ 8 | "۱۳۴۴ " "۱٬۱۸۸٬۳۴۶ " ۱۷۱٬۹۴۰ 9 | "۱۳۴۵ " "۱٬۱۰۲٬۸۴۸ " ۱۷۸٬۹۹۱ 10 | "۱۳۴۶ " "۱٬۰۱۴٬۳۲۱ " ۱۷۸٬۷۴۹ 11 | "۱۳۴۷ " "۱٬۰۴۶٬۱۳۴ " ۱۷۳٬۳۵۲ 12 | "۱۳۴۸ " "۱٬۱۰۷٬۹۱۰ 
" ۱۶۷٬۵۷۵ 13 | "۱۳۴۹ " "۱٬۱۹۰٬۹۵۷ " ۱۶۳٬۸۹۶ 14 | "۱۳۵۰ " "۱٬۲۳۵٬۰۲۵ " ۱۴۹٬۰۱۱ 15 | "۱۳۵۱ " "۱٬۱۳۸٬۸۴۳ " ۱۵۳٬۹۲۰ 16 | "۱۳۵۲ " "۱٬۱۹۹٬۷۷۷ " ۱۵۵٬۳۰۵ 17 | "۱۳۵۳ " "۱٬۲۴۸٬۲۵۶ " ۱۴۹٬۸۷۵ 18 | "۱۳۵۴ " "۱٬۳۳۹٬۲۶۷ " ۱۴۸٬۵۴۳ 19 | "۱۳۵۵ " "۱٬۳۹۹٬۹۷۷ " ۱۵۶٬۰۱۰ 20 | "۱۳۵۶ " "۱٬۴۰۶٬۲۰۴ " ۱۴۶٬۳۶۹ 21 | "۱۳۵۷ " "۱٬۳۷۳٬۷۳۸ " ۱۲۷٬۸۸۳ 22 | "۱۳۵۸ " "۱٬۶۸۸٬۹۴۲ " ۱۴۲٬۴۰۱ 23 | "۱۳۵۹ " "۲٬۴۵۱٬۷۶۵ " ۱۶۲٬۱۷۵ 24 | "۱۳۶۰ " "۲٬۴۱۹٬۹۵۱ " ۱۷۸٬۰۶۵ 25 | "۱۳۶۱ " "۲٬۰۹۷٬۹۵۷ " ۲۰۰٬۶۱۴ 26 | "۱۳۶۲ " "۲٬۲۰۳٬۵۶۰ " ۲۰۷٬۲۲۸ 27 | "۱۳۶۳ " "۲٬۰۶۸٬۲۷۹ " ۱۸۶٬۴۴۰ 28 | "۱۳۶۴ " "۲٬۰۳۱٬۹۶۹ " ۱۹۰٬۰۶۱ 29 | "۱۳۶۵ " "۲٬۲۵۶٬۹۷۱ " ۱۹۹٬۵۱۱ 30 | "۱۳۶۶ " "۱٬۸۳۲٬۷۲۲ " ۲۰۴٬۲۳۰ 31 | "۱۳۶۷ " "۱٬۹۴۲٬۹۳۶ " ۲۳۸٬۳۹۰ 32 | "۱۳۶۸ " "۱٬۷۸۹٬۸۱۷ " ۱۹۹٬۶۴۵ 33 | "۱۳۶۹ " "۱٬۷۲۶٬۴۸۸ " ۲۱۷٬۶۱۵ 34 | "۱۳۷۰ " "۱٬۵۹۲٬۸۹۸ " ۲۱۷٬۶۰۴ 35 | "۱۳۷۱ " "۱٬۴۳۳٬۲۴۳ " ۱۸۸٬۶۴۷ 36 | "۱۳۷۲ " "۱٬۳۸۸٬۰۱۷ " ۲۰۸٬۱۶۱ 37 | "۱۳۷۳ " "۱٬۴۲۶٬۷۸۴ " ۲٬۵۳۸٬۰۷۸ 38 | "۱۳۷۴ " "۱٬۲۰۵٬۳۷۲ " ۲٬۷۵۶٬۴۸۲ 39 | "۱۳۷۵ " "۱٬۱۸۷٬۹۰۳ " ۱٬۲۴۰٬۹۷۵ 40 | "۱۳۷۶ " "۱٬۱۷۹٬۲۶۰ " ۱٬۰۳۱٬۸۳۶ 41 | "۱۳۷۷ " "۱٬۱۸۶٬۶۵۹ " ۵۵۱٬۳۴۵ 42 | "۱۳۷۸ " "۱٬۱۷۴٬۲۷۹ " ۵۰۶٬۹۴۵ 43 | "۱۳۷۹ " "۱٬۰۹۵٬۱۶۵ " ۳۸۲٬۶۷۴ 44 | "۱۳۸۰ " "۱٬۱۱۰٬۸۳۶ " ۴۲۱٬۵۲۵ 45 | "۱۳۸۱ " "۱٬۱۲۲٬۱۰۴ " ۳۳۷٬۲۳۷ 46 | "۱۳۸۲ " "۱٬۱۷۱٬۵۷۳ " ۳۶۸٬۵۱۸ 47 | "۱۳۸۳ " "۱٬۱۵۴٬۳۶۸ " ۳۵۵٬۲۱۳ 48 | "۱۳۸۴ " "۱٬۲۳۹٬۴۰۸ " ۳۶۳٬۷۲۳ 49 | "۱۳۸۵ " "۱٬۲۵۳٬۹۱۲ " ۴۰۸٬۵۶۶ 50 | "۱۳۸۶ " "۱٬۲۸۶٬۷۱۶ " ۴۱۲٬۷۳۶ 51 | "۱۳۸۷ " "۱٬۳۰۰٬۱۶۶ " ۴۱۷٬۷۹۸ 52 | "۱۳۸۸ " "۱٬۳۴۸٬۵۲۶ " ۳۹۳٬۵۱۴ 53 | "۱۳۸۹ " "۱٬۳۶۴٬۵۲۳ " ۴۴۰٬۵۳۸ 54 | "۱۳۹۰ " "۱٬۳۸۲٬۲۲۹ " ۴۲۲٬۱۳۳ 55 | "۱۳۹۱ " "۱٬۴۲۱٬۶۸۹ " ۳۶۷٬۵۳۹ 56 | "۱۳۹۲ " "۱٬۴۷۱٬۷۵۸ " ۳۶۱٬۲۲۷ 57 | "۱۳۹۳ " "۱٬۵۳۴٬۳۱۱ " ۴۳۶٬۸۴۰ 58 | "۱۳۹۴ " "۱٬۵۷۰٬۱۸۳ " ۳۶۶٬۶۸۴ 59 | "۱۳۹۵ " "۱٬۵۲۸٬۰۰۳ " ۳۶۹٬۱۵۲ 60 | "۱۳۹۶ " "۱٬۴۸۷٬۸۶۱ " ۳۷۶٬۳۱۳ 61 | "۱۳۹۷ " "۱٬۳۶۶٬۴۹۱ " ۳۷۷٬۰۲۴ 62 | "۱۳۹۸ " "۱٬۱۹۶٬۱۳۵ " ۳۹۵٬۳۹۲ 63 | "۱۳۹۹ " "۱٬۱۱۳٬۹۶۴ " ۵۰۷٬۵۱۱ -------------------------------------------------------------------------------- /data/klpt_stopwords.json: -------------------------------------------------------------------------------- 1 | { 2 | "Sorani": { 3 | "Arabic": [ 4 | "ئاستی", 5 | 
"ئێستا", 6 | "ئێمە", 7 | "ئێوە", 8 | "ئەم", 9 | "ئەمساڵ", 10 | "ئەمه", 11 | "ئەمڕۆ", 12 | "ئەمەش", 13 | "ئەنجام", 14 | "ئەنجامدانی", 15 | "ئەو", 16 | "ئەوان", 17 | "ئەوانەی", 18 | "ئەوه", 19 | "ئەویش", 20 | "ئەوەش", 21 | "ئەوەشی", 22 | "ئەوەی", 23 | "ئەڤ", 24 | "ئەگەر", 25 | "ب", 26 | "بارەی", 27 | "باس", 28 | "باسی", 29 | "باش", 30 | "باشترین", 31 | "بدات", 32 | "بن", 33 | "به", 34 | "بواری", 35 | "بوو", 36 | "بوون", 37 | "بوونی", 38 | "بووە", 39 | "بڕی", 40 | "بکات", 41 | "بکرێت", 42 | "بکەن", 43 | "بکەین", 44 | "بۆ", 45 | "بۆیه", 46 | "بی", 47 | "بێ", 48 | "بێت", 49 | "بێجگە", 50 | "بە", 51 | "بەبێ", 52 | "بەدەست", 53 | "بەدەم", 54 | "بەر", 55 | "بەرامبەر", 56 | "بەردەم", 57 | "بەردەوام", 58 | "بەرلە", 59 | "بەرەو", 60 | "بەرەوی", 61 | "بەرەوە", 62 | "بەسەر", 63 | "بەشی", 64 | "بەشێکی", 65 | "بەلای", 66 | "بەم", 67 | "بەمەبەستی", 68 | "بەهۆی", 69 | "بەو", 70 | "بەپێی", 71 | "بەڵام", 72 | "بەڵکو", 73 | "تا", 74 | "تاوەکو", 75 | "تاکو", 76 | "تر", 77 | "تری", 78 | "تووشی", 79 | "تۆ", 80 | "تیادا", 81 | "تیایدا", 82 | "تێ", 83 | "تێدا", 84 | "تێیدا", 85 | "تەنها", 86 | "تەنیا", 87 | "تەواو", 88 | "تەواوی", 89 | "جار", 90 | "جگە", 91 | "جۆره", 92 | "جێگەی", 93 | "جێی", 94 | "خۆی", 95 | "خۆیان", 96 | "داهاتوو", 97 | "داهاتوودا", 98 | "داهاتووی", 99 | "داوای", 100 | "داوه", 101 | "در", 102 | "درێژەی", 103 | "دوا", 104 | "دواتر", 105 | "دوای", 106 | "دوێنێ", 107 | "دژی", 108 | "دی", 109 | "دیکه", 110 | "دیکەش", 111 | "دیکەی", 112 | "دێ", 113 | "دێت", 114 | "دە", 115 | "دەبن", 116 | "دەبێت", 117 | "دەبێته", 118 | "دەدات", 119 | "دەدرێت", 120 | "دەربارەی", 121 | "دەرەوەی", 122 | "دەکات", 123 | "دەکرێت", 124 | "دەکەن", 125 | "دەکەین", 126 | "دەگەڵ", 127 | "زۆر", 128 | "زۆربەی", 129 | "زۆری", 130 | "زیاتر", 131 | "ساڵ", 132 | "سبەی", 133 | "سەبارەت", 134 | "سەر", 135 | "سەرجەم", 136 | "سەرەکی", 137 | "شوێنی", 138 | "شێوەی", 139 | "شێوەیەکی", 140 | "لای", 141 | "لایەن", 142 | "لایەنه", 143 | "لایەنی", 144 | "لێ", 145 | "لە", 146 | "لەبابەت", 147 | "لەباتی", 148 | "لەبارەی", 
149 | "لەبرێتی", 150 | "لەبەر", 151 | "لەبەینی", 152 | "لەدەم", 153 | "لەرێ", 154 | "لەرێگا", 155 | "لەسەر", 156 | "لەلایەن", 157 | "لەم", 158 | "لەناو", 159 | "لەنێو", 160 | "لەو", 161 | "لەپێناوی", 162 | "لەژێر", 163 | "لەگەڵ", 164 | "ماوەی", 165 | "ملیۆن", 166 | "من", 167 | "میانەی", 168 | "مەبەستی", 169 | "ناو", 170 | "ناوخۆی", 171 | "ناوی", 172 | "نییه", 173 | "نێو", 174 | "نێوان", 175 | "هات", 176 | "هاته", 177 | "هاتووە", 178 | "هاوکات", 179 | "هۆکاری", 180 | "هۆڵی", 181 | "هۆی", 182 | "هیچ", 183 | "هێڵی", 184 | "هەبێت", 185 | "هەر", 186 | "هەردوو", 187 | "هەردوولا", 188 | "هەروەها", 189 | "هەریەک", 190 | "هەفتەی", 191 | "هەمان", 192 | "هەموو", 193 | "هەندێک", 194 | "هەیە", 195 | "هەیەو", 196 | "و", 197 | "واته", 198 | "وایه", 199 | "وتی", 200 | "وەک", 201 | "وەکوو", 202 | "پاش", 203 | "پلەی", 204 | "پێ", 205 | "پێش", 206 | "پێشتر", 207 | "پێشووی", 208 | "پێویسته", 209 | "پێی", 210 | "چوونکه", 211 | "چەند", 212 | "چەندین", 213 | "ڕوو", 214 | "ڕووی", 215 | "ژمارەیەک", 216 | "ژمارەیەکی", 217 | "ژێر", 218 | "کاتێک", 219 | "کرا", 220 | "کران", 221 | "کرد", 222 | "کردبوو", 223 | "کردن", 224 | "کردنی", 225 | "کردنەوەی", 226 | "کردووه", 227 | "کردووەو", 228 | "کردەوه", 229 | "کە", 230 | "کەس", 231 | "کەم", 232 | "یا", 233 | "یان", 234 | "یێ", 235 | "یەک", 236 | "یەکێک", 237 | "یەکەم", 238 | "یەکەمی", 239 | "یەکەمین" 240 | ], 241 | "Latin": [] 242 | }, 243 | "Kurmanji": { 244 | "Latin": [ 245 | "a", 246 | "an", 247 | "bareya", 248 | "bareyê", 249 | "barên", 250 | "basa", 251 | "be", 252 | "belê", 253 | "ber", 254 | "bereya", 255 | "berê", 256 | "berî", 257 | "bi", 258 | "bibe", 259 | "bila", 260 | "bin", 261 | "bo", 262 | "bê", 263 | "bû", 264 | "bûn", 265 | "bûye", 266 | "da", 267 | "dawî", 268 | "dawîyê", 269 | "daye", 270 | "de", 271 | "dema", 272 | "demekê", 273 | "demê", 274 | "derbarê", 275 | "derve", 276 | "dev", 277 | "di", 278 | "dibe", 279 | "digel", 280 | "dijî", 281 | "dikir", 282 | "din", 283 | "dinê", 284 | "divê", 285 | "diçe", 286 | "doh", 287 | 
"du", 288 | "dê", 289 | "dîsan", 290 | "e", 291 | "eger", 292 | "em", 293 | "encam", 294 | "ev", 295 | "evan", 296 | "eve", 297 | "evê", 298 | "evî", 299 | "ew", 300 | "ewa", 301 | "ewan", 302 | "ewê", 303 | "ewên", 304 | "ewî", 305 | "ez", 306 | "gelek", 307 | "gelekî", 308 | "gelê", 309 | "gerek", 310 | "giştî", 311 | "gor", 312 | "han", 313 | "heger", 314 | "hejmarek", 315 | "hem", 316 | "heman", 317 | "hember", 318 | "hemû", 319 | "hene", 320 | "her", 321 | "herdem", 322 | "herdu", 323 | "herweha", 324 | "herwiha", 325 | "herwisa", 326 | "herî", 327 | "heta", 328 | "hev", 329 | "hevdu", 330 | "heye", 331 | "hin", 332 | "hinek", 333 | "hîngê", 334 | "hûn", 335 | "in", 336 | "ji", 337 | "jiber", 338 | "jibo", 339 | "jê", 340 | "jêr", 341 | "jî", 342 | "ka", 343 | "ke", 344 | "kes", 345 | "kir", 346 | "kirîye", 347 | "ku", 348 | "kû", 349 | "layê", 350 | "le", 351 | "li", 352 | "ligel", 353 | "lê", 354 | "me", 355 | "min", 356 | "nav", 357 | "nava", 358 | "navbera", 359 | "navê", 360 | "navîn", 361 | "ne", 362 | "nêvbera", 363 | "nêzîkî", 364 | "nîne", 365 | "piştî", 366 | "pê", 367 | "pêk", 368 | "pêş", 369 | "re", 370 | "ser", 371 | "serê", 372 | "tenê", 373 | "ti", 374 | "tiştekî", 375 | "tu", 376 | "tê", 377 | "u", 378 | "van", 379 | "ve", 380 | "vir", 381 | "vê", 382 | "vî", 383 | "wan", 384 | "we", 385 | "weha", 386 | "wek", 387 | "weke", 388 | "wekî", 389 | "wiha", 390 | "wir", 391 | "wisa", 392 | "wê", 393 | "wî", 394 | "xwarê", 395 | "xwe", 396 | "ya", 397 | "yan", 398 | "ye", 399 | "yek", 400 | "yekê", 401 | "yê", 402 | "yên", 403 | "zêde", 404 | "zêdetir", 405 | "çawa", 406 | "çend", 407 | "çendê", 408 | "çendîn", 409 | "çi", 410 | "ê", 411 | "êdî", 412 | "ên", 413 | "îro", 414 | "û" 415 | ], 416 | "Arabic": [] 417 | } 418 | } -------------------------------------------------------------------------------- /data/myanmar-regions.tsv: -------------------------------------------------------------------------------- 1 | အင်္ဂလိပ်အမည် မြန်မာအမည် မြို့တော် 
ISO နေရာဒေသ လူဦးရေ _၂၀၁၄ ဧရိယာ အမျိုးအစား 2 | Ayeyarwady ဧရာဝတီ ပုသိမ်မြို့ MM-07 အောက်ပိုင်း ၆,၁၈၄,၈၂၉ ၃၅,၀၃၁.၈ တိုင်းဒေသကြီး 3 | Bago ပဲခူး ပဲခူးမြို့ MM-02 အောက်ပိုင်း ၄,၈၆၇,၃၇၃ ၃၉,၄၀၂.၃ တိုင်းဒေသကြီး 4 | Chin ချင်း ဟားခါးမြို့ MM-14 အနောက်ပိုင်း ၄၇၈,၈၀၁ ၃၆,၀၁၈.၈ ပြည်နယ် 5 | Kachin ကချင် မြစ်ကြီးနားမြို့ MM-11 မြောက်ပိုင်း ၁,၆၈၉,၄၄၁ ၈၉,၀၄၁.၈ ပြည်နယ် 6 | Kayah ကယားကယား လွိုင်ကော်မြို့ MM-12 အရှေ့ပိုင်း ၂၈၆,၆၂၇ ၁၁,၇၃၁.၅ ပြည်နယ် 7 | Kayin ကရင် ဘားအံမြို့ MM-13 တောင်ပိုင်း ၁,၅၇၄,၀၇၉ ၃၀,၃၈၃ ပြည်နယ် 8 | Magway မကွေး မကွေးမြို့ MM-03 အလယ်ပိုင်း ၃,၉၁၇,၀၅၅ ၄၄,၈၂၀.၆ တိုင်းဒေသကြီး 9 | Mandalay မန္တလေး မန္တလေးမြို့ MM-04 အလယ်ပိုင်း ၆,၁၆၅,၇၂၃ ၃၇,၉၄၅.၆ တိုင်းဒေသကြီး 10 | Mon မွန် မော်လမြိုင်မြို့ MM-15 တောင်ပိုင်း ၂,၀၅၄,၃၉၃ ၁၂,၂၉၆.၆ ပြည်နယ် 11 | Rakhine ရခိုင် စစ်တွေမြို့ MM-16 အနောက်ပိုင်း ၃,၁၈၈,၈၀၇ ၃၆,၇၇၈.၀ ပြည်နယ် 12 | Shan ရှမ်း တောင်ကြီးမြို့၂ MM-17 အရှေ့ပိုင်း ၅,၈၂၄,၄၃၂ ၁၅၅,၈၀၁.၃ ပြည်နယ် 13 | Sagaing စစ်ကိုင်း မုံရွာမြို့ MM-01 မြောက်ပိုင်း ၅,၃၂၅,၃၄၇ ၉၃,၇၀၄.၈ တိုင်းဒေသကြီး 14 | Tanintharyi တနင်္သာရီ ထားဝယ်မြို့ MM-05 တောင်ပိုင်း ၁,၄၀၈,၄၀၁ ၄၄,၃၄၄.၉ တိုင်းဒေသကြီး 15 | Yangon ရန်ကုန် ရန်ကုန်မြို့ MM-06 အောက်ပိုင်း ၇,၃၆၀,၇၀၃ ၁၀,၂၆၇.၇ တိုင်းဒေသကြီး 16 | Naypyidaw နေပြည်တော် နေပြည်တော် MM-18 အလယ်ပိုင်း ၁,၁၆၀,၂၄၂ ၇,၀၅၄ ပြည်ထောင်စုနယ်မြေ -------------------------------------------------------------------------------- /data/myanmar_ethnic_groups.tsv: -------------------------------------------------------------------------------- 1 | မြန်မာတိုင်းရင်းသားများ အကြမ်းဖျင်းခန့်မှန်း အကြမ်းဖျင်းခန့်မှန်း 2 | ကချင် 1.50 ၁.၅၀ 3 | ကယား 0.75 ၀.၇၅ 4 | ကရင် 7.00 ၇.၀၀ 5 | တရုတ် 2.50 ၂.၅၀ 6 | ဗမာ 68.00 ၆၈.၀၀ 7 | မွန် 2.00 ၂.၀၀ 8 | ရခိုင် 1.7 ၁.၇၀ 9 | ရိုဟင်ဂျာ 1.8 ၁.၈၀ 10 | ရှမ်း 9.00 ၉.၀၀ 11 | အခြားအုပ်စုများ 4.50 ၄.၅၀ 12 | အိန္ဒိယ 1.25 ၁.၂၅ -------------------------------------------------------------------------------- /data/rbbi/Default.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under 
one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # This file is from ICU (with some small modifications, to avoid CJK dictionary break, 18 | # and status code change related to that) 19 | # 20 | # Copyright (C) 2016 and later: Unicode, Inc. and others. 21 | # License & terms of use: http://www.unicode.org/copyright.html 22 | # Copyright (C) 2002-2016, International Business Machines Corporation 23 | # and others. All Rights Reserved. 24 | # 25 | # file: word.txt 26 | # 27 | # ICU Word Break Rules 28 | # See Unicode Standard Annex #29. 29 | # These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 30 | # with additions for Emoji Sequences from https://goo.gl/cluFCn 31 | # Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html 32 | # 33 | # Note: Updates to word.txt will usually need to be merged into 34 | # word_POSIX.txt also. 35 | 36 | ############################################################################## 37 | # 38 | # Character class definitions from TR 29 39 | # 40 | ############################################################################## 41 | 42 | !!chain; 43 | !!quoted_literals_only; 44 | 45 | 46 | # 47 | # Character Class Definitions. 
48 | # 49 | 50 | $CR = [\p{Word_Break = CR}]; 51 | $LF = [\p{Word_Break = LF}]; 52 | $Newline = [\p{Word_Break = Newline} ]; 53 | $Extend = [\p{Word_Break = Extend}]; 54 | $ZWJ = [\p{Word_Break = ZWJ}]; 55 | $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; 56 | $Format = [\p{Word_Break = Format}]; 57 | $Katakana = [\p{Word_Break = Katakana}]; 58 | $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; 59 | $ALetter = [\p{Word_Break = ALetter}]; 60 | $Single_Quote = [\p{Word_Break = Single_Quote}]; 61 | $Double_Quote = [\p{Word_Break = Double_Quote}]; 62 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 63 | $MidLetter = [\p{Word_Break = MidLetter}]; 64 | $MidNum = [\p{Word_Break = MidNum}]; 65 | $Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; 66 | 67 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 68 | $WSegSpace = [\p{Word_Break = WSegSpace}]; 69 | $Extended_Pict = [:ExtPict:]; 70 | 71 | $Han = [:Han:]; 72 | $Hiragana = [:Hiragana:]; 73 | 74 | 75 | # Dictionary character set, for triggering language-based break engines. Currently 76 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode 77 | # 5.0 or later as the definition of Complex_Context was corrected to include all 78 | # characters requiring dictionary break. 79 | 80 | $Control = [\p{Grapheme_Cluster_Break = Control}]; 81 | $HangulSyllable = [\uac00-\ud7a3]; 82 | $ComplexContext = [:LineBreak = Complex_Context:]; 83 | $KanaKanji = [$Han $Hiragana $Katakana]; 84 | $dictionaryCJK = [$Han $Hiragana $HangulSyllable]; 85 | $dictionary = [$ComplexContext]; 86 | 87 | # leave CJK scripts out of ALetterPlus 88 | $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; 89 | 90 | 91 | # 92 | # Rules 4 Ignore Format and Extend characters, 93 | # except when they appear at the beginning of a region of text. 
94 | # 95 | # TODO: check if handling of katakana in dictionary makes rules incorrect/void 96 | $KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*; 97 | $Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*; 98 | $ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*; 99 | $Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*; 100 | $Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*; 101 | $MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*; 102 | $MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*; 103 | $MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*; 104 | $NumericEx = $Numeric ($Extend | $Format | $ZWJ)*; 105 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*; 106 | $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*; 107 | 108 | $Ideographic = [\p{Ideographic}]; 109 | $HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*; 110 | $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; 111 | 112 | ## ------------------------------------------------- 113 | 114 | # Rule 3 - CR x LF 115 | # 116 | $CR $LF; 117 | 118 | # Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed. 119 | # 120 | $ZWJ $Extended_Pict; 121 | 122 | # Rule 3d - Keep horizontal whitespace together. 123 | # 124 | $WSegSpace $WSegSpace; 125 | 126 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning 127 | # of a region of Text. The rule here comes into play when the start of text 128 | # begins with a group of Format chars, or with a "word" consisting of a single 129 | # char that is not in any of the listed word break categories followed by 130 | # format char(s), or is not a CJK dictionary character. 131 | [^$CR $LF $Newline]? 
($Extend | $Format | $ZWJ)+; 132 | 133 | $NumericEx {100}; 134 | $ALetterEx {200}; 135 | $HangulSyllable {200}; 136 | $Hebrew_LetterEx{200}; 137 | $KatakanaEx {300}; # note: these status values override those from rule 5 138 | $HiraganaEx {300}; # by virtue of being numerically larger. 139 | $IdeographicEx {400}; # 140 | 141 | $Extended_Pict ($Extend | $Format | $ZWJ)*; 142 | 143 | # 144 | # rule 5 145 | # Do not break between most letters. 146 | # 147 | ($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200}; 148 | 149 | # rule 6 and 7 150 | ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200}; 151 | 152 | # rule 7a 153 | $Hebrew_LetterEx $Single_QuoteEx {200}; 154 | 155 | # rule 7b and 7c 156 | $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200}; 157 | 158 | # rule 8 159 | 160 | $NumericEx $NumericEx {100}; 161 | 162 | # rule 9 163 | 164 | ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200}; 165 | 166 | # rule 10 167 | 168 | $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; 169 | 170 | # rule 11 and 12 171 | 172 | $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; 173 | 174 | # rule 13 175 | $KatakanaEx $KatakanaEx {300}; 176 | 177 | # rule 13a/b 178 | 179 | $ALetterEx $ExtendNumLetEx {200}; # (13a) 180 | $Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a) 181 | $NumericEx $ExtendNumLetEx {100}; # (13a) 182 | $KatakanaEx $ExtendNumLetEx {300}; # (13a) 183 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) 184 | 185 | $ExtendNumLetEx $ALetterEx {200}; # (13b) 186 | $ExtendNumLetEx $Hebrew_Letter {200}; # (13b) 187 | $ExtendNumLetEx $NumericEx {100}; # (13b) 188 | $ExtendNumLetEx $KatakanaEx {300}; # (13b) 189 | 190 | # rules 15 - 17 191 | # Pairs of Regional Indicators stay together. 192 | # With rule chaining disabled by ^, this rule will match exactly two of them. 193 | # No other rule begins with a Regional_Indicator, so chaining cannot extend the match. 
194 | # 195 | ^$Regional_IndicatorEx $Regional_IndicatorEx; 196 | 197 | # special handling for CJK characters: chain for later dictionary segmentation 198 | $HangulSyllable $HangulSyllable {200}; 199 | 200 | # Rule 999 201 | # Match a single code point if no other rule applies. 202 | .; 203 | -------------------------------------------------------------------------------- /data/rbbi/Lao.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Parses Lao text, with syllable as token. 18 | # 19 | # The definition of Lao syllable is based on: 20 | # 21 | # Syllabification of Lao Script for Line Breaking 22 | # Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, 23 | # Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP 24 | # http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf 25 | # http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf 26 | # 27 | # NOTE: 28 | # There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper. 
29 | # For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work. 30 | # 31 | # Syllable structure, where X is the nuclear consonant: 32 | # 33 | # +----+ 34 | # | X5 | 35 | # +----+ 36 | # | X4 | 37 | # +----+----+----+----+----+----+----+-----+ 38 | # | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 | 39 | # +----+----+----+----+----+----+----+-----+ 40 | # | X2 | 41 | # +----+ 42 | # | X3 | 43 | # +----+ 44 | # 45 | # X0 represents a vowel which occurs before the nuclear consonant. 46 | # It can always define the beginning of syllable. 47 | $X0 = [\u0EC0-\u0EC4]; 48 | # X1 is a combination consonant which comes before the nuclear consonant, 49 | # but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ} 50 | $X1 = [\u0EAB]; 51 | # X represents the nuclear consonant. 52 | $X = [\u0E81-\u0EAE\u0EDC\u0EDD]; 53 | # X2 is a combination consonant which comes after the nuclear consonant, 54 | # which is placed under or next to the nuclear consonant. 55 | $X2 = [\u0EBC\u0EA3\u0EA7\u0EA5]; 56 | # X3 represents a vowel which occurs under the nuclear consonant. 57 | $X3 = [\u0EB8\u0EB9]; 58 | # X4 represents a vowel which occurs above the nuclear consonant. 59 | $X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1]; 60 | # X5 represents a tone mark which occurs above the nuclear consonant or upper vowel. 61 | $X5 = [\u0EC8-\u0ECB]; 62 | # X6 represents a consonant vowel, which occurs after the nuclear consonant. 63 | # It functions when the syllable doesn’t have any vowels. And it always exists with X8. 64 | $X6 = [\u0EA7\u0EAD\u0EBD]; 65 | # X7 represents a final vowel. 66 | # However X7_1 always represents the end of syllable and it never exists with tone mark. 67 | $X7 = [\u0EB0\u0EB2\u0EB3]; 68 | # X8 represents an alternate consonant. 69 | $X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7]; 70 | # X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3. 
71 | $X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5]; 72 | # X10 represents a sign mark. 73 | # It always occurs at the end of a syllable, but mostly people keep it separate from syllable. 74 | $X10 = [\u0EAF\u0EC6\u0ECC]; 75 | 76 | # Section 1 77 | $X0_1 = [\u0EC0]; 78 | $X4_1_2 = [\u0EB4\u0EB5]; 79 | $X4_3_4 = [\u0EB6\u0EB7]; 80 | $X4_6 = [\u0EBB]; 81 | $X4_7 = [\u0EB1]; 82 | $X6_2 = [\u0EAD]; 83 | $X6_3 = [\u0EBD]; 84 | $X7_1 = [\u0EB0]; 85 | $X7_2 = [\u0EB2]; 86 | $X10_1 = [\u0EAF]; 87 | $X10_2 = [\u0EC6]; 88 | $X10_3 = [\u0ECC]; 89 | 90 | $Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 91 | $Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 92 | $Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 93 | $Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1; 94 | $Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 95 | $Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 96 | $Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 97 | 98 | $Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7); 99 | 100 | # Section 2 101 | $X0_2 = [\u0EC1]; 102 | 103 | $Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 104 | $Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1; 105 | $Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 106 | 107 | $Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3); 108 | 109 | # Section 3 110 | $X0_3 = [\u0EC2]; 111 | $X8_3 = [\u0E8D]; 112 | $X8_8 = [\u0EA7]; 113 | 114 | $Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 115 | $Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1; 116 | $Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? 
($X8_3 | $X8_8); 117 | 118 | $Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3); 119 | 120 | # Section 4 121 | $X0_4 = [\u0EC4]; 122 | $X6_1 = [\u0EA7]; 123 | 124 | $Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 125 | 126 | # Section 5 127 | $X0_5 = [\u0EC3]; 128 | 129 | $Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 130 | 131 | # Section 6 132 | $Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 133 | 134 | # Section 7 135 | $X4_1_4 = [\u0EB4-\u0EB7]; 136 | 137 | $Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 138 | 139 | # Section 8 140 | $X4_5 = [\u0ECD]; 141 | 142 | $Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 143 | 144 | # Section 9 145 | 146 | $Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 147 | $Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1; 148 | 149 | $Rule9 = ($Rule9_1 | $Rule9_2); 150 | 151 | # Section 10 152 | $Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 153 | 154 | # Section 11 155 | $Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 156 | 157 | # Section 12 158 | $Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1; 159 | 160 | # Section 13 161 | $Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 162 | 163 | # Section 14 164 | $X7_3 = [\u0EB3]; 165 | 166 | $Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? 
($X10_1)?; 167 | 168 | $LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14); 169 | 170 | $WordJoin = [:Line_Break=Word_Joiner:]; 171 | 172 | $LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*; 173 | 174 | # 175 | # default numerical definitions 176 | # 177 | $Extend = [\p{Word_Break = Extend}]; 178 | $Format = [\p{Word_Break = Format}]; 179 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 180 | $MidNum = [\p{Word_Break = MidNum}]; 181 | $Numeric = [\p{Word_Break = Numeric}]; 182 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 183 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 184 | $MidNumEx = $MidNum ($Extend | $Format)*; 185 | $NumericEx = $Numeric ($Extend | $Format)*; 186 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 187 | 188 | !!forward; 189 | 190 | $LaoJoinedSyllableEx {200}; 191 | # default numeric rules 192 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; 193 | -------------------------------------------------------------------------------- /data/rbbi/lucene/source.md: -------------------------------------------------------------------------------- 1 | * https://gitbox.apache.org/repos/asf?p=lucene.git;a=tree;f=lucene/analysis/icu/src/data;h=e7275ffa9541dab51e4b9a62166aeef457c5c22f;hb=refs/heads/main -------------------------------------------------------------------------------- /data/rbbi/lucene/uax29/Default.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # This file is from ICU (with some small modifications, to avoid CJK dictionary break, 18 | # and status code change related to that) 19 | # 20 | # Copyright (C) 2016 and later: Unicode, Inc. and others. 21 | # License & terms of use: http://www.unicode.org/copyright.html 22 | # Copyright (C) 2002-2016, International Business Machines Corporation 23 | # and others. All Rights Reserved. 24 | # 25 | # file: word.txt 26 | # 27 | # ICU Word Break Rules 28 | # See Unicode Standard Annex #29. 29 | # These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 30 | # with additions for Emoji Sequences from https://goo.gl/cluFCn 31 | # Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html 32 | # 33 | # Note: Updates to word.txt will usually need to be merged into 34 | # word_POSIX.txt also. 35 | 36 | ############################################################################## 37 | # 38 | # Character class definitions from TR 29 39 | # 40 | ############################################################################## 41 | 42 | !!chain; 43 | !!quoted_literals_only; 44 | 45 | 46 | # 47 | # Character Class Definitions. 
48 | # 49 | 50 | $CR = [\p{Word_Break = CR}]; 51 | $LF = [\p{Word_Break = LF}]; 52 | $Newline = [\p{Word_Break = Newline} ]; 53 | $Extend = [\p{Word_Break = Extend}]; 54 | $ZWJ = [\p{Word_Break = ZWJ}]; 55 | $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; 56 | $Format = [\p{Word_Break = Format}]; 57 | $Katakana = [\p{Word_Break = Katakana}]; 58 | $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; 59 | $ALetter = [\p{Word_Break = ALetter}]; 60 | $Single_Quote = [\p{Word_Break = Single_Quote}]; 61 | $Double_Quote = [\p{Word_Break = Double_Quote}]; 62 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 63 | $MidLetter = [\p{Word_Break = MidLetter}]; 64 | $MidNum = [\p{Word_Break = MidNum}]; 65 | $Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; 66 | 67 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 68 | $WSegSpace = [\p{Word_Break = WSegSpace}]; 69 | $Extended_Pict = [:ExtPict:]; 70 | 71 | $Han = [:Han:]; 72 | $Hiragana = [:Hiragana:]; 73 | 74 | 75 | # Dictionary character set, for triggering language-based break engines. Currently 76 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode 77 | # 5.0 or later as the definition of Complex_Context was corrected to include all 78 | # characters requiring dictionary break. 79 | 80 | $Control = [\p{Grapheme_Cluster_Break = Control}]; 81 | $HangulSyllable = [\uac00-\ud7a3]; 82 | $ComplexContext = [:LineBreak = Complex_Context:]; 83 | $KanaKanji = [$Han $Hiragana $Katakana]; 84 | $dictionaryCJK = [$Han $Hiragana $HangulSyllable]; 85 | $dictionary = [$ComplexContext]; 86 | 87 | # leave CJK scripts out of ALetterPlus 88 | $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; 89 | 90 | 91 | # 92 | # Rules 4 Ignore Format and Extend characters, 93 | # except when they appear at the beginning of a region of text. 
94 | # 95 | # TODO: check if handling of katakana in dictionary makes rules incorrect/void 96 | $KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*; 97 | $Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*; 98 | $ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*; 99 | $Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*; 100 | $Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*; 101 | $MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*; 102 | $MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*; 103 | $MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*; 104 | $NumericEx = $Numeric ($Extend | $Format | $ZWJ)*; 105 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*; 106 | $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*; 107 | 108 | $Ideographic = [\p{Ideographic}]; 109 | $HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*; 110 | $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; 111 | 112 | ## ------------------------------------------------- 113 | 114 | # Rule 3 - CR x LF 115 | # 116 | $CR $LF; 117 | 118 | # Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed. 119 | # 120 | $ZWJ $Extended_Pict; 121 | 122 | # Rule 3d - Keep horizontal whitespace together. 123 | # 124 | $WSegSpace $WSegSpace; 125 | 126 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning 127 | # of a region of Text. The rule here comes into play when the start of text 128 | # begins with a group of Format chars, or with a "word" consisting of a single 129 | # char that is not in any of the listed word break categories followed by 130 | # format char(s), or is not a CJK dictionary character. 131 | [^$CR $LF $Newline]? 
($Extend | $Format | $ZWJ)+; 132 | 133 | $NumericEx {100}; 134 | $ALetterEx {200}; 135 | $HangulSyllable {200}; 136 | $Hebrew_LetterEx{200}; 137 | $KatakanaEx {300}; # note: these status values override those from rule 5 138 | $HiraganaEx {300}; # by virtue of being numerically larger. 139 | $IdeographicEx {400}; # 140 | 141 | $Extended_Pict ($Extend | $Format | $ZWJ)*; 142 | 143 | # 144 | # rule 5 145 | # Do not break between most letters. 146 | # 147 | ($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200}; 148 | 149 | # rule 6 and 7 150 | ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200}; 151 | 152 | # rule 7a 153 | $Hebrew_LetterEx $Single_QuoteEx {200}; 154 | 155 | # rule 7b and 7c 156 | $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200}; 157 | 158 | # rule 8 159 | 160 | $NumericEx $NumericEx {100}; 161 | 162 | # rule 9 163 | 164 | ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200}; 165 | 166 | # rule 10 167 | 168 | $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; 169 | 170 | # rule 11 and 12 171 | 172 | $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; 173 | 174 | # rule 13 175 | $KatakanaEx $KatakanaEx {300}; 176 | 177 | # rule 13a/b 178 | 179 | $ALetterEx $ExtendNumLetEx {200}; # (13a) 180 | $Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a) 181 | $NumericEx $ExtendNumLetEx {100}; # (13a) 182 | $KatakanaEx $ExtendNumLetEx {300}; # (13a) 183 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) 184 | 185 | $ExtendNumLetEx $ALetterEx {200}; # (13b) 186 | $ExtendNumLetEx $Hebrew_Letter {200}; # (13b) 187 | $ExtendNumLetEx $NumericEx {100}; # (13b) 188 | $ExtendNumLetEx $KatakanaEx {300}; # (13b) 189 | 190 | # rules 15 - 17 191 | # Pairs of Regional Indicators stay together. 192 | # With rule chaining disabled by ^, this rule will match exactly two of them. 193 | # No other rule begins with a Regional_Indicator, so chaining cannot extend the match. 
194 | # 195 | ^$Regional_IndicatorEx $Regional_IndicatorEx; 196 | 197 | # special handling for CJK characters: chain for later dictionary segmentation 198 | $HangulSyllable $HangulSyllable {200}; 199 | 200 | # Rule 999 201 | # Match a single code point if no other rule applies. 202 | .; 203 | -------------------------------------------------------------------------------- /data/rbbi/lucene/uax29/MyanmarSyllable.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # Parses Myanmar text, with syllable as token. 
19 | # 20 | 21 | $Cons = [[:Other_Letter:]&[:Myanmar:]]; 22 | $Virama = [\u1039]; 23 | $Asat = [\u103A]; 24 | 25 | $WordJoin = [:Line_Break=Word_Joiner:]; 26 | 27 | # 28 | # default numerical definitions 29 | # 30 | $Extend = [\p{Word_Break = Extend}]; 31 | $Format = [\p{Word_Break = Format}]; 32 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 33 | $MidNum = [\p{Word_Break = MidNum}]; 34 | $Numeric = [\p{Word_Break = Numeric}]; 35 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 36 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 37 | $MidNumEx = $MidNum ($Extend | $Format)*; 38 | $NumericEx = $Numeric ($Extend | $Format)*; 39 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 40 | 41 | $ConsEx = $Cons ($Extend | $Format)*; 42 | $AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*; 43 | $MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*; 44 | $MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*; 45 | 46 | !!forward; 47 | $MyanmarJoinedSyllableEx {200}; 48 | 49 | # default numeric rules 50 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; 51 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/Hebrew.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # This is an example of rule tailoring for Hebrew. 19 | # In this example the single-quote is added to the Extend category 20 | # The double-quote is added to the MidLetter category. 21 | # 22 | !!chain; 23 | $CR = [\p{Word_Break = CR}]; 24 | $LF = [\p{Word_Break = LF}]; 25 | $Newline = [\p{Word_Break = Newline}]; 26 | $Extend = [\p{Word_Break = Extend}\u0027]; 27 | $Format = [\p{Word_Break = Format}]; 28 | $ALetter = [\p{Word_Break = ALetter}]; 29 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 30 | $MidLetter = [\p{Word_Break = MidLetter}\u0022]; 31 | $MidNum = [\p{Word_Break = MidNum}]; 32 | $Numeric = [\p{Word_Break = Numeric}]; 33 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 34 | $dictionary = [:LineBreak = Complex_Context:]; 35 | $Control = [\p{Grapheme_Cluster_Break = Control}]; 36 | $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; 37 | 38 | $ALetterEx = $ALetterPlus ($Extend | $Format)*; 39 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 40 | $MidLetterEx = $MidLetter ($Extend | $Format)*; 41 | $MidNumEx = $MidNum ($Extend | $Format)*; 42 | $NumericEx = $Numeric ($Extend | $Format)*; 43 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 44 | 45 | !!forward; 46 | 47 | $CR $LF; 48 | [^$CR $LF $Newline]? 
($Extend | $Format)+; 49 | $NumericEx {100}; 50 | $ALetterEx {200}; 51 | $ALetterEx $ALetterEx {200}; 52 | $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; 53 | $NumericEx $NumericEx {100}; 54 | $ALetterEx $NumericEx {200}; 55 | $NumericEx $ALetterEx {200}; 56 | $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; 57 | $ALetterEx $ExtendNumLetEx {200}; 58 | $NumericEx $ExtendNumLetEx {100}; 59 | $ExtendNumLetEx $ExtendNumLetEx {200}; 60 | $ExtendNumLetEx $ALetterEx {200}; 61 | $ExtendNumLetEx $NumericEx {100}; 62 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/Khmer.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # Parses Khmer text, with orthographic syllable as token. 19 | # 20 | # The definition of Khmer orthographic syllable is taken from the Unicode Standard. 
21 | # 22 | # B = base character (consonant, independent vowel, etc) 23 | $KhmerBase = [\u1780-\u17B3]; 24 | # R = robat 25 | $KhmerRobat = [\u17CC]; 26 | # C = consonant shifter 27 | $KhmerShifter = [\u17C9\u17CA]; 28 | # S = subscript consonant or independent vowel sign 29 | $KhmerSub = ([\u17D2] $KhmerBase); 30 | # V = dependent vowel sign 31 | $KhmerVowel = [\u17B4-\u17C5]; 32 | # Z = zero-width joiner or non-joiner 33 | $KhmerZWC = [\u200C\u200D]; 34 | # O = any other sign 35 | $KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; 36 | 37 | $WordJoin = [:Line_Break=Word_Joiner:]; 38 | 39 | $KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?; 40 | 41 | $KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*; 42 | 43 | # 44 | # default numerical definitions 45 | # 46 | $Extend = [\p{Word_Break = Extend}]; 47 | $Format = [\p{Word_Break = Format}]; 48 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 49 | $MidNum = [\p{Word_Break = MidNum}]; 50 | $Numeric = [\p{Word_Break = Numeric}]; 51 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 52 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 53 | $MidNumEx = $MidNum ($Extend | $Format)*; 54 | $NumericEx = $Numeric ($Extend | $Format)*; 55 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 56 | 57 | !!forward; 58 | $KhmerJoinedSyllableEx {200}; 59 | 60 | # default numeric rules 61 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; 62 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/Lao.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Parses Lao text, with syllable as token. 18 | # 19 | # The definition of Lao syllable is based on: 20 | # 21 | # Syllabification of Lao Script for Line Breaking 22 | # Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, 23 | # Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP 24 | # http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf 25 | # http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf 26 | # 27 | # NOTE: 28 | # There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper. 29 | # For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work. 30 | # 31 | # Syllable structure, where X is the nuclear consonant: 32 | # 33 | # +----+ 34 | # | X5 | 35 | # +----+ 36 | # | X4 | 37 | # +----+----+----+----+----+----+----+-----+ 38 | # | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 | 39 | # +----+----+----+----+----+----+----+-----+ 40 | # | X2 | 41 | # +----+ 42 | # | X3 | 43 | # +----+ 44 | # 45 | # X0 represents a vowel which occurs before the nuclear consonant. 46 | # It can always define the beginning of syllable. 
47 | $X0 = [\u0EC0-\u0EC4]; 48 | # X1 is a combination consonant which comes before the nuclear consonant, 49 | # but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ} 50 | $X1 = [\u0EAB]; 51 | # X represents the nuclear consonant. 52 | $X = [\u0E81-\u0EAE\u0EDC\u0EDD]; 53 | # X2 is a combination consonant which comes after the nuclear consonant, 54 | # which is placed under or next to the nuclear consonant. 55 | $X2 = [\u0EBC\u0EA3\u0EA7\u0EA5]; 56 | # X3 represents a vowel which occurs under the nuclear consonant. 57 | $X3 = [\u0EB8\u0EB9]; 58 | # X4 represents a vowel which occurs above the nuclear consonant. 59 | $X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1]; 60 | # X5 represents a tone mark which occurs above the nuclear consonant or upper vowel. 61 | $X5 = [\u0EC8-\u0ECB]; 62 | # X6 represents a consonant vowel, which occurs after the nuclear consonant. 63 | # It functions when the syllable doesn’t have any vowels. And it always exists with X8. 64 | $X6 = [\u0EA7\u0EAD\u0EBD]; 65 | # X7 represents a final vowel. 66 | # However X7_1 always represents the end of syllable and it never exists with tone mark. 67 | $X7 = [\u0EB0\u0EB2\u0EB3]; 68 | # X8 represents an alternate consonant. 69 | $X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7]; 70 | # X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3. 71 | $X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5]; 72 | # X10 represents a sign mark. 73 | # It always occurs at the end of a syllable, but mostly people keep it separate from syllable. 74 | $X10 = [\u0EAF\u0EC6\u0ECC]; 75 | 76 | # Section 1 77 | $X0_1 = [\u0EC0]; 78 | $X4_1_2 = [\u0EB4\u0EB5]; 79 | $X4_3_4 = [\u0EB6\u0EB7]; 80 | $X4_6 = [\u0EBB]; 81 | $X4_7 = [\u0EB1]; 82 | $X6_2 = [\u0EAD]; 83 | $X6_3 = [\u0EBD]; 84 | $X7_1 = [\u0EB0]; 85 | $X7_2 = [\u0EB2]; 86 | $X10_1 = [\u0EAF]; 87 | $X10_2 = [\u0EC6]; 88 | $X10_3 = [\u0ECC]; 89 | 90 | $Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? 
($X10_1)?; 91 | $Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 92 | $Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 93 | $Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1; 94 | $Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 95 | $Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 96 | $Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 97 | 98 | $Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7); 99 | 100 | # Section 2 101 | $X0_2 = [\u0EC1]; 102 | 103 | $Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 104 | $Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1; 105 | $Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 106 | 107 | $Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3); 108 | 109 | # Section 3 110 | $X0_3 = [\u0EC2]; 111 | $X8_3 = [\u0E8D]; 112 | $X8_8 = [\u0EA7]; 113 | 114 | $Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 115 | $Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1; 116 | $Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8); 117 | 118 | $Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3); 119 | 120 | # Section 4 121 | $X0_4 = [\u0EC4]; 122 | $X6_1 = [\u0EA7]; 123 | 124 | $Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 125 | 126 | # Section 5 127 | $X0_5 = [\u0EC3]; 128 | 129 | $Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 130 | 131 | # Section 6 132 | $Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 133 | 134 | # Section 7 135 | $X4_1_4 = [\u0EB4-\u0EB7]; 136 | 137 | $Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 138 | 139 | # Section 8 140 | $X4_5 = [\u0ECD]; 141 | 142 | $Rule8 = ($X1)? 
$X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 143 | 144 | # Section 9 145 | 146 | $Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 147 | $Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1; 148 | 149 | $Rule9 = ($Rule9_1 | $Rule9_2); 150 | 151 | # Section 10 152 | $Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 153 | 154 | # Section 11 155 | $Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 156 | 157 | # Section 12 158 | $Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1; 159 | 160 | # Section 13 161 | $Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 162 | 163 | # Section 14 164 | $X7_3 = [\u0EB3]; 165 | 166 | $Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 167 | 168 | $LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14); 169 | 170 | $WordJoin = [:Line_Break=Word_Joiner:]; 171 | 172 | $LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*; 173 | 174 | # 175 | # default numerical definitions 176 | # 177 | $Extend = [\p{Word_Break = Extend}]; 178 | $Format = [\p{Word_Break = Format}]; 179 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 180 | $MidNum = [\p{Word_Break = MidNum}]; 181 | $Numeric = [\p{Word_Break = Numeric}]; 182 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 183 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 184 | $MidNumEx = $MidNum ($Extend | $Format)*; 185 | $NumericEx = $Numeric ($Extend | $Format)*; 186 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 187 | 188 | !!forward; 189 | 190 | $LaoJoinedSyllableEx {200}; 191 | # default numeric rules 192 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? 
$NumericEx $ExtendNumLetEx?)* {100}; 193 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/Myanmar.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # Parses Myanmar text, with syllable as token. 19 | # 20 | 21 | $Cons = [[:Other_Letter:]&[:Myanmar:]]; 22 | $Virama = [\u1039]; 23 | $Asat = [\u103A]; 24 | 25 | $WordJoin = [:Line_Break=Word_Joiner:]; 26 | 27 | # 28 | # default numerical definitions 29 | # 30 | $Extend = [\p{Word_Break = Extend}]; 31 | $Format = [\p{Word_Break = Format}]; 32 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 33 | $MidNum = [\p{Word_Break = MidNum}]; 34 | $Numeric = [\p{Word_Break = Numeric}]; 35 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 36 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 37 | $MidNumEx = $MidNum ($Extend | $Format)*; 38 | $NumericEx = $Numeric ($Extend | $Format)*; 39 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 40 | 41 | $ConsEx = $Cons ($Extend | $Format)*; 42 | $AsatEx = $Cons $Asat ($Virama $ConsEx)? 
($Extend | $Format)*; 43 | $MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*; 44 | $MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*; 45 | 46 | !!forward; 47 | $MyanmarJoinedSyllableEx {200}; 48 | 49 | # default numeric rules 50 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; 51 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/source.md: -------------------------------------------------------------------------------- 1 | * https://github.com/chrismattmann/solrcene/tree/master/modules/analysis/icu/src/data -------------------------------------------------------------------------------- /data/rbbi/source.md: -------------------------------------------------------------------------------- 1 | # RBBI files 2 | 3 | * [icu::RuleBasedBreakIterator Class Reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedBreakIterator.html) 4 | * [Boundary Analysis](https://unicode-org.github.io/icu/userguide/boundaryanalysis/) 5 | * [Break Rules](https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html) 6 | * [Updating ICU's built-in Break Iterator rules](https://github.com/unicode-org/icu/blob/main/docs/processes/rules_update.md) 7 | 8 | ## Current Lucene 9 | 10 | * [gitbox.apache.org - lucene](https://gitbox.apache.org/repos/asf?p=lucene.git;a=tree;f=lucene/analysis/icu/src/data/uax29;h=8423b0c7713159c3dffb549f18a37c425eb96001;hb=HEAD) 11 | 12 | ## Old Lucene 13 | 14 | * [apache/lucene-solr](https://github.com/apache/lucene-solr/tree/releases/lucene-solr/4.0.0/lucene/analysis/icu/src/data/uax29) 15 | 16 | 17 | ## Misc 18 | 19 | * https://stackoverflow.com/questions/559949/the-word-break-rule-file 20 | 21 | ```txt 22 | RuleBasedBreakIterator (icu) 23 | RuleBasedCollator (icu) 24 | RuleBasedNumberFormat (icu) 25 | RuleBasedTimeZone (icu) 26 | ``` 27 | 28 | 
-------------------------------------------------------------------------------- /data/régions_métropolitaines.tsv: -------------------------------------------------------------------------------- 1 | "Dénomination " "Chef-lieu de région " Superficie (km2) Population (2019) "Population estimée (2022) " "Densité (2019) (hab./km2) " Code Insee 2 | Occitanie Toulouse 72 724 5 933 185 6 053 548 81,6 76 3 | Grand Est Strasbourg 57 441 5 556 219 5 542 094 96,7 44 4 | Normandie Rouen 29 907 3 325 032 3 307 286 111,2 28 5 | Bretagne Rennes 27 208 3 354 854 3 402 932 123,3 53 6 | Île-de-France Paris 12 011 12 262 544 12 395 148 1020,9 11 7 | Centre-Val de Loire Orléans 39 151 2 573 180 2 564 915 65,7 24 8 | Pays de la Loire Nantes 32 082 3 806 461 3 873 096 118,6 52 9 | Provence-Alpes-Côte d'Azur Marseille 31 400 5 081 101 5 131 187 161,8 93 10 | Auvergne-Rhône-Alpes Lyon 69 711 8 042 936 8 153 233 115,4 84 11 | Hauts-de-France Lille 31 806 6 004 947 5 987 172 188,8 32 12 | Bourgogne-Franche-Comté Dijon 47 784 2 805 580 2 785 393 58,7 27 13 | Nouvelle-Aquitaine Bordeaux 84 036 6 010 289 6 081 985 71,5 75 14 | Corse Ajaccio 8 680 340 440 349 465 39,2 94 -------------------------------------------------------------------------------- /data/sorani_alphabet.tsv: -------------------------------------------------------------------------------- 1 | Order Character Codepoint 2 | 1 ئ U+0626 3 | 2 ا U+0627 4 | 3 ب U+0628 5 | 4 پ U+067E 6 | 5 ت U+062A 7 | 6 ج U+062C 8 | 7 چ U+0686 9 | 8 ح U+062D 10 | 9 خ U+062E 11 | 10 د U+062F 12 | 11 ر U+0631 13 | 12 ڕ U+0695 14 | 13 ز U+0632 15 | 14 ژ U+0698 16 | 15 س U+0633 17 | 16 ش U+0634 18 | 17 ع U+0639 19 | 18 غ U+063A 20 | 19 ف U+0641 21 | 20 ڤ U+06A4 22 | 21 ق U+0642 23 | 22 ک U+06A9 24 | 23 گ U+06AF 25 | 24 ل U+0644 26 | 25 ڵ U+06B5 27 | 26 م U+0645 28 | 27 ن U+0646 29 | 28 ه U+0647 30 | 29 ە U+06D5 31 | 30 و U+0648 32 | 31 وو U+0648 U+0648 33 | 32 ۆ U+06C6 34 | 33 ی U+06CC 35 | 34 ێ U+06CE 
-------------------------------------------------------------------------------- /data/sorani_alphabet_wikipedia.tsv: -------------------------------------------------------------------------------- 1 | Order Character Codepoint 2 | 1 ئ U+0626 3 | 2 ا U+0627 4 | 3 ب U+0628 5 | 4 پ U+067E 6 | 5 ت U+062A 7 | 6 ج U+062C 8 | 7 چ U+0686 9 | 8 ح U+062D 10 | 9 خ U+062E 11 | 10 د U+062F 12 | 11 ر U+0631 13 | 12 ڕ U+0695 14 | 13 ز U+0632 15 | 14 ژ U+0698 16 | 15 س U+0633 17 | 16 ش U+0634 18 | 17 ع U+0639 19 | 18 غ U+063A 20 | 19 ف U+0641 21 | 20 ڤ U+06A4 22 | 21 ق U+0642 23 | 22 ک U+06A9 24 | 23 گ U+06AF 25 | 24 ل U+0644 26 | 25 ڵ U+06B5 27 | 26 م U+0645 28 | 27 ن U+0646 29 | 28 ه U+0647 30 | 29 ە U+06D5 31 | 30 و U+0648 32 | 32 ۆ U+06C6 33 | 31 وو U+0648 U+0648 34 | 33 ی U+06CC 35 | 34 ێ U+06CE -------------------------------------------------------------------------------- /data/source.md: -------------------------------------------------------------------------------- 1 | # Sources 2 | 3 | ## klpt_stopwords 4 | 5 | The [stopword list](https://github.com/sinaahmadi/klpt/blob/master/klpt/data/stopwords.json) is from Sina Ahmadi's [Kurdish Language Processing Toolkit](https://github.com/sinaahmadi/klpt), which was released under an [Attribution-ShareAlike 4.0 International Public License](https://github.com/sinaahmadi/klpt/blob/master/LICENSE). 
6 | 7 | -------------------------------------------------------------------------------- /data/türkiye'ninz-illeri.tsv: -------------------------------------------------------------------------------- 1 | Ad Alan (km²) Nüfus (2019) NY kişi/km² Plaka kodu Telefon kodu Vali 2 | İstanbul 5.461 15.519.267 2.841,83 34 212, 216 Ali Yerlikaya 3 | Eskişehir 13.960 887.475 63,57 26 222 Erol Ayyıldız 4 | Bursa 10.813 3.056.120 282,63 16 224 Yakup Canbolat 5 | Yalova 798 270.976 339,56 77 226 Muammer Erol 6 | Bilecik 4.179 219.427 52,50 11 228 Bilal Şentürk 7 | İzmir 11.891 4.367.251 367,27 35 232 Yavuz Selim Köşger 8 | Manisa 13.339 1.440.611 107,99 45 236 Yaşar Karadeniz 9 | Antalya 20.177 2.511.700 124,48 07 242 Ersin Yazıcı 10 | Isparta 8.946 444.914 49,73 32 246 Ömer Seymenoğlu 11 | Burdur 7.175 270.796 37,74 15 248 Ali Arslantaş 12 | Muğla 12.654 983.142 77,69 48 252 Orhan Tavlı 13 | Aydın 8.116 1.110.972 136,88 09 256 Hüseyin Aksoy 14 | Denizli 12.134 1.037.208 85,47 20 258 Ali Fuat Atik 15 | Kocaeli 3.397 1.953.035 574,92 41 262 Seddar Yavuz 16 | Sakarya 4.824 1.029.650 213,44 54 264 Çetin Oktay Kaldırım 17 | Balıkesir 14.583 1.228.620 84,25 10 266 Hasan Şıldak 18 | Afyonkarahisar 14.016 729.483 52,04 03 272 Gökmen Çiçek 19 | Kütahya 11.634 579.257 49,79 43 274 Ali Çelik 20 | Uşak 5.555 370.509 66,69 64 276 Funda Kocabıyık 21 | Tekirdağ 6.190 1.055.412 170,50 59 282 Aziz Yıldırım 22 | Edirne 6.145 413.903 67,35 22 284 Ekrem Canalp 23 | Çanakkale 9.817 542.157 55,22 17 286 İlhami Aktaş 24 | Kırklareli 6.459 361.836 56,02 39 288 Osman Bilgin 25 | Ankara 25.632 5.639.076 220 06 312 Vasip Şahin 26 | Kırıkkale 4.791 283.017 59,07 71 318 Yunus Sezer 27 | Adana 13.844 2.237.940 161,65 01 322 Süleyman Elban 28 | Mersin 16.010 1.840.425 114,95 33 324 Ali İhsan Su 29 | Hatay 5.524 1.628.894 294,87 31 326 Rahmi Doğan 30 | Osmaniye 3.320 538.759 162,27 80 328 Erdinç Yılmaz 31 | Konya 40.838 2.232.374 54,66 42 332 Vahdettin Özkan 32 | Karaman 8.678 253.279 29,18 70 338 Mehmet 
Alpaslan Işık 33 | Gaziantep 6.803 2.069.364 304,18 27 342 Davut Gül 34 | Kahramanmaraş 14.520 1.154.102 79,48 46 344 Ömer Faruk Coşkun 35 | Sivas 28.164 638.956 22,68 58 346 Salih Ayhan 36 | Kilis 1.412 142.490 100,91 79 348 Recep Soytürk 37 | Kayseri 16.970 1.407.409 82,93 38 352 Şehmus Günaydın 38 | Yozgat 13.690 421.200 30,76 66 354 Ziya Polat 39 | Tokat 10.042 612.747 61,01 60 356 Ozan Balcı 40 | Amasya 5.628 337.800 60,02 05 358 Mustafa Masatlı 41 | Samsun 9.725 1.348.542 138,66 55 362 Zülkif Dağlı 42 | Çorum 12.428 530.864 42,71 19 364 Mustafa Çiftçi 43 | Kastamonu 13.064 379.405 29,04 37 366 Avni Çakır 44 | Sinop 5.717 218.243 38,17 57 368 Erol Karaömeroğlu 45 | Karabük 4.142 248.458 59,98 78 370 Fuat Gürel 46 | Zonguldak 3.342 596.053 178,35 67 372 Mustafa Tutulmaz 47 | Bolu 8.313 316.126 38,02 14 374 Ahmet Ümit 48 | Çankırı 7.542 195.789 25,95 18 376 Abdullah Ayaz 49 | Bartın 2.330 198.249 85,08 74 378 Sinan Güner 50 | Düzce 2.492 392.166 157,36 81 380 Cevdet Atay 51 | Aksaray 7.659 416.367 54,36 68 382 Hamza Aydoğdu 52 | Nevşehir 5.485 303.010 55,24 50 384 İnci Sezer Becel 53 | Kırşehir 6.584 242.938 36,89 40 386 İbrahim Akın 54 | Niğde 7.234 362.861 48,59 51 388 Yılmaz Şimşek 55 | Diyarbakır 15.168 1.756.353 115,79 21 412 Münir Karaloğlu 56 | Şanlıurfa 19.242 2.073.614 107,76 63 414 Abdullah Erin 57 | Adıyaman 7.337 626.465 85,38 02 416 Aykut Pekmez 58 | Malatya 12.259 800.165 65,27 44 422 Aydın Baruş 59 | Elazığ 9.383 591.098 62,99 23 424 Erkaya Yırık 60 | Bingöl 8.004 279.812 34,95 12 426 Kadir Ekinci 61 | Tunceli 7.582 84.660 11,16 62 428 Mehmet Ali Özkan 62 | Van 20.921 1.136.757 54,33 65 432 Mehmet Emin Bilmez 63 | Bitlis 8.294 348.115 41,97 13 434 Oktay Çağatay 64 | Muş 8.650 408.809 47,26 49 436 İlker Gündüzöz 65 | Hakkâri 7.095 280.991 39,60 30 438 İdris Akbıyık 66 | Erzurum 25.006 762.062 30,47 25 442 Okay Memiş 67 | Erzincan 11.815 234.747 19,86 24 446 Mehmet Makas 68 | Ordu 5.861 754.198 128,68 52 452 Tuncay Sonel 69 | Giresun 7.025 448.400 
63,82 28 454 Enver Ünlü 70 | Gümüşhane 6.668 164.521 24,67 29 456 Kamuran Taşbilek 71 | Bayburt 3.746 84.843 22,64 69 458 Cüneyt Epcim 72 | Trabzon 4.628 808.974 174,79 61 462 İsmail Ustaoğlu 73 | Rize 3.835 343.212 89,49 53 464 Kemal Çeber 74 | Artvin 7.393 170.875 23,11 08 466 Yılmaz Doruk 75 | Ağrı 11.099 536.199 48,31 04 472 Osman Varol 76 | Kars 10.193 285.410 28 36 474 Türker Öksüz 77 | Iğdır 3.664 199.442 54,43 76 476 Hüseyin Engin Sarıibrahim 78 | Ardahan 4.934 97.319 19,72 75 478 Hüseyin Öner 79 | Mardin 8.780 838.778 95,53 47 482 Mahmut Demirtaş 80 | Siirt 5.717 330.280 57,77 56 484 Osman Hacıbektaşoğlu 81 | Şırnak 7.078 529.615 74,82 73 486 Ali Hamza Pehlivan 82 | Batman 4.477 608.659 135,95 72 488 Hulusi Şahin -------------------------------------------------------------------------------- /data/wordlists/source.md: -------------------------------------------------------------------------------- 1 | # Data sources – wordlists 2 | 3 | * [kurdi_words.txt](https://raw.githubusercontent.com/0xdolan/kurdi/master/corpus/kurdi_words.txt) (Sorani) -------------------------------------------------------------------------------- /docs/DRAFT_icu_transforms.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/docs/DRAFT_icu_transforms.pdf -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Python internationalisation -------------------------------------------------------------------------------- /docs/matplotlib.md: -------------------------------------------------------------------------------- 1 | appropriate # Python and Pandas internationalisation 2 | 3 | ## Data visualisation issues for languages that need bidirectional support or complex font rendering. 
4 | 5 | _Matplotlib_ is a commonly used tool for basic data visualisation in Python, and is the default plotting tool with _pandas.Dataframe.plot_. It is also used by _seaborn_ and _wordcloud_, along with other libraries and tools. 6 | 7 | The default backends for _Matplotlib_ have a number of limitations: 8 | 9 | 1. No support for the Unicode bidirectional algorithm, 10 | 2. No support for complex font rendering 11 | 12 | This places severe limits on what natural languages can be used in titles, labels, legends, and other text elements in plots. 13 | 14 | The package [mplcairo](https://github.com/matplotlib/mplcairo) provides an alternative backend for _matplotlib_ that uses [Raqm](https://github.com/HOST-Oman/libraqm) and [GNU FriBidi](https://github.com/fribidi/fribidi) for bidirectional text layout and complex rendering of OpenType features. This allows most languages to be supported in plots. 15 | 16 | The key limitations for _mplcairo_ are bugs in IPython and the lack of support for _Jupyter notebooks_. 17 | 18 | Using the _mplcairo_ backend for _matplotlib_ we can display plot titles, axes labels and categorical tick labels in any language we need to support. 19 | 20 | There are two missing pieces at this point: 21 | 22 | 1. Display of numeric tick labels in a numeral system appropriate for the UI language. 23 | 2. Choice of bidirectional layout appropriate to the requirements of the 24 | data visualisation. 25 | 26 | 27 | ## Numeral systems 28 | 29 | Regarding the first issue, it is possible to use `matplotlib.ticker.FuncFormatter()` to apply a function to convert to the target numeral system, and apply necessary grouping and decimal separators. 30 | 31 | ### RTL layout and data visualisation 32 | 33 | It isn't always necessary to change the layout of the plot. If the plot is using a cartesian coordinate system, it is best to use the default layout.
34 | The layout used, combined with user expectations, will impact the interpretation of trends in data visualisations. User interpretation of the visualisations, combined with user experience, are critical inputs into a data visualisation design. 35 | 36 | If an RTL layout is required: 37 | 38 | 1. Use `yaxis.tick_right()` and `yaxis.set_label_position("right")` to reposition the y-axis to the right side of the plot 39 | 2. Use `plt.gca().invert_xaxis()` to invert the x-axis. This step may not be necessary. UX is an important consideration. 40 | 41 | ### Examples 42 | 43 | The following Python scripts use [Sorani Kurdish data](https://github.com/enabling-languages/python-i18n/blob/main/data/demographics.tsv): 44 | 45 | * [matplotlib](https://github.com/enabling-languages/python-i18n/blob/main/py/matplotlib_kurdish.py) 46 | * [pandas.Dataframe.plot](https://github.com/enabling-languages/python-i18n/blob/main/py/pandas_plot_kurdish.py) 47 | * [seaborn](https://github.com/enabling-languages/python-i18n/blob/main/py/seaborn_kurdish.py) 48 | * [wordcloud](https://github.com/enabling-languages/python-i18n/blob/main/py/wordcloud_kurdish.py) 49 | 50 |
Kurdish plot using Seaborn
Fig.1 - Kurdish bar charts in both LTR and RTL layouts.
51 | 52 |
Kurdish wordcloud
Fig.2 - Kurdish wordcloud.
53 | -------------------------------------------------------------------------------- /notebooks/Sorting_emoji.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sorting emoji\n", 8 | "\n", 9 | "Python's inbuild sorting algorithms sort emoji by codepoint.\n", 10 | "\n", 11 | "Codepoint order, as well as the default collation rules provided by the Unicode Collation Algorithm do not provide adequate [ordering and grouping](https://www.unicode.org/reports/tr51/#Sorting) of emoji.\n", 12 | "\n", 13 | "The Unicode Common Locale Data Repository (CLDR) provides colation rules for emoji. [Conformant emoji collation](https://www.unicode.org/reports/tr51/#Collation_Conformance) is defined in CLDR tailoring rules for the Unicode Collation Algorthim (UCA).\n", 14 | "\n", 15 | "CLDR groups emoji into broad conceptual categories in order to group related emoji together." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Emoji only collation\n", 23 | "\n", 24 | "For the following discussion we will use the following emoji:\n", 25 | "\n", 26 | "|Character |Codepoint |Description |Category |\n", 27 | "|--------- |--------- |----------- |-------- |\n", 28 | "|🦜 |U+1F99C |Parrot |animal-bird |\n", 29 | "|🥚 |U+1F95A |Egg |food-prepared |\n", 30 | "|🐔 |U+1F414 |Chicken |animal-bird |\n", 31 | "\n", 32 | "The default python sort algorithm will order then in terms of the emoji's codepoint: U+1F414 (chicken), U+1F95A (egg), and then U+1F99C (parrot).\n", 33 | "\n", 34 | "The CLDR ordering would be to sort the two bids together (U+1F414 then U+1F99C), followed by U+1F95A." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "['🐔', '🥚', '🦜']" 46 | ] 47 | }, 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "a = ['🦜', '🥚', '🐔']\n", 55 | "sorted(a)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "Using PyICU, it is possible to sort emoji according to CLDR's collation rules for Emoji. The `-u-co-emoji` Unicode BCP-47 extension will enable CLDR based emoji collation. When sorting just wmoji we can use the langauge subtag `und` (undetermined) as the base for the locale identifier: `und-u-co-emoji`." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "['🐔', '🦜', '🥚']\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "from icu import Collator, Locale\n", 80 | "coll = Collator.createInstance(Locale.createCanonical(\"und-u-co-emoji\"))\n", 81 | "print(sorted(a, key=coll.getSortKey))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "This yields a CLDR based sort using the CLDR emoji collation rules." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Sorting text and emoji\n", 96 | "\n", 97 | "A more complex scenario is sorting a set of text and emoji.\n", 98 | "\n", 99 | "[UTS #35](https://unicode.org/reports/tr35/tr35-collation.html#Combining_Rules) provides a discussion of tailoring and combining rules in relation to sorting emoji and text. 
We'll implement the example given in UTS #35 in Python.\n", 100 | "\n", 101 | "The following characters are used:\n", 102 | "\n", 103 | "|Character |Codepoint |Description |\n", 104 | "|---------- |---------- |------------ |\n", 105 | "|😀 |U+1F600 |Grinning Face |\n", 106 | "|글 |U+AE00 |Hangul Syllable Geul |\n", 107 | "|Z |U+005A |Latin Capital Letter Z |\n", 108 | "|ü |U+00FC |Latin Small Letter U with Diaeresis |\n", 109 | "|, |U+002C |Comma |\n", 110 | "|✈️️ |U+2708 U+FE0F |Airplane |\n", 111 | "|y |U+0079 |Latin Small Letter Y |\n", 112 | "|☹️ |U+2639 U+FE0F |White Frowning Face |\n", 113 | "|a |U+0061 |Latin Small Letter A |\n", 114 | "\n", 115 | "Enabling emoji collation overrides language specific tailorings. This has no impact on text for languages that use the root collation, but will have a negative impact on languages that do require tailoring to obtain the correct collation order.\n", 116 | "\n", 117 | "The python sort algorithm will order content by codepoint:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 11, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "[',', 'Z', 'a', 'y', 'ü', '☹️', '✈️️', '글', '😀']" 129 | ] 130 | }, 131 | "execution_count": 11, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "# List to be sorted\n", 138 | "b = ['😀', '글', 'Z', 'ü', ',', '✈️️', 'y', '☹️', 'a']\n", 139 | "\n", 140 | "#Default Python sort\n", 141 | "sorted(b)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "The `en` locale identifier will use the CLDR root collation. 
Emoji are not sorted using the CLDR emoji collation rules:" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 25, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "[',', '☹️', '✈️️', '😀', 'a', 'ü', 'y', 'Z', '글']" 160 | ] 161 | }, 162 | "execution_count": 25, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# locale: en\n", 169 | "en_coll = Collator.createInstance(Locale.forLanguageTag(\"en\"));\n", 170 | "sorted(b, key=en_coll.getSortKey)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Enabling emoji collation using the `en-u-co-emoji` locale will sort the emoji based on the emoji collation rules and the remaining characters are sorted as per the root collation algorithm." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 24, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']" 189 | ] 190 | }, 191 | "execution_count": 24, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "# locale for en-u-co-emoji\n", 198 | "en_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"en-u-co-emoji\"));\n", 199 | "sorted(b, key=en_emoji_coll.getSortKey)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "`en-u-co-emoji\"`will yield the same result as `und-u-co-emoji`, i.e. sort emoji according to the CLDR emoji collation order and sort other characters according to the root collation algorithm." 
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 23, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']" 218 | ] 219 | }, 220 | "execution_count": 23, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "# locale for und-u-co-emoji\n", 227 | "und_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"und-u-co-emoji\"));\n", 228 | "sorted(b, key=und_emoji_coll.getSortKey)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "The `da` locale has tailored collation rules to order text in the sequence required for Danish:" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 22, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "[',', '☹️', '✈️️', '😀', 'a', 'y', 'ü', 'Z', '글']" 247 | ] 248 | }, 249 | "execution_count": 22, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "# locale for da\n", 256 | "da_coll = Collator.createInstance(Locale.forLanguageTag(\"da\"));\n", 257 | "sorted(b, key=da_coll.getSortKey)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Adding emoji collation support overrides the Danish language tailorings. Look at the order of __ü__ in the list for the `da` and `da-u-co-emoji` locales." 
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 20, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']" 276 | ] 277 | }, 278 | "execution_count": 20, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "# locale for da-u-co-emoji\n", 285 | "da_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"da-u-co-emoji\"));\n", 286 | "sorted(b, key=da_emoji_coll.getSortKey)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "To overcome this, it is possible to combine the collation rules for the `da` and `da_and_emoji_rules`. We can do this by:\n", 294 | "\n", 295 | "1. Initiating collator instances for each locale, and retrieve the rules\n", 296 | "2. Concatenate the rule sets\n", 297 | "3. Initiate a collator instance using `RuleBasedCollator`\n", 298 | "\n", 299 | "This will order emoji according to the emoji collation rules and order Latin script text according to Danish collation rules." 
300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 19, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "[',', '😀', '☹️', '✈️️', 'a', 'y', 'ü', 'Z', '글']" 311 | ] 312 | }, 313 | "execution_count": 19, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "# Combinded rules\n", 320 | "from icu import RuleBasedCollator\n", 321 | "#da_and_emoji_rules = Collator.createInstance(Locale.forLanguageTag('da')).getRules() + Collator.createInstance(Locale.forLanguageTag('und-u-co-emoji')).getRules()\n", 322 | "da_rules = Collator.createInstance(Locale.forLanguageTag('da')).getRules()\n", 323 | "emoji_rules = Collator.createInstance(Locale.forLanguageTag('und-u-co-emoji')).getRules()\n", 324 | "da_and_emoji_rules = da_rules + emoji_rules\n", 325 | "combined_coll = RuleBasedCollator(da_and_emoji_rules)\n", 326 | "sorted(b, key=combined_coll.getSortKey)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "The same approach is needed for other languages that are not supported by the CLDR root collation algorithm and require tailored rules." 
334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## Resources\n", 341 | "\n", 342 | "* [Emoji ordering chart](https://www.unicode.org/emoji/charts/emoji-ordering.html)\n", 343 | "* [CLDR Root collation rules](https://github.com/unicode-org/cldr/blob/353527cdabf1e8870d261beb3c908de6deb1915b/common/collation/root.xml#L951)" 344 | ] 345 | } 346 | ], 347 | "metadata": { 348 | "interpreter": { 349 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3" 350 | }, 351 | "kernelspec": { 352 | "display_name": "Python 3.8.1 64-bit ('el': venv)", 353 | "language": "python", 354 | "name": "python3" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 3 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython3", 366 | "version": "3.8.1" 367 | }, 368 | "orig_nbformat": 4 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 2 372 | } 373 | -------------------------------------------------------------------------------- /notebooks/ethiopic_numbers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Working with Ethiopic numbers\n", 8 | "\n", 9 | "CLDR sets the default number system for languages written in the Ethiopic script to the Arabic (Latin) Number System. The Ethiopic number system is marked as an alternative (traditional) numbering system, and is not used by default.\n", 10 | "\n", 11 | "CLDR defines decimal and algorithmic [number systems](https://github.com/unicode-org/cldr/blob/main/common/supplemental/numberingSystems.xml). 
The Ethiopic number system is an algorithmic alphabetic numeral system.\n", 12 | "\n", 13 | "For a description of the number system refer to [Ethiopic number system](http://www.geez.org/Numerals/) for more details. A list of [sample numbers](http://www.geez.org/Numerals/NumberSamples.html) is available.\n", 14 | "\n", 15 | "ICU provides a number of classes used for [formatting numbers](https://unicode-org.github.io/icu/userguide/format_parse/numbers/), but the class needed to format Ethiopic numbers is the [RuleBasedNumberFormat](https://unicode-org.github.io/icu/userguide/format_parse/numbers/rbnf.html) class.\n", 16 | "\n", 17 | "Refer to the ICU4C API [RuleBasedNumberFormat class reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html). The RBNF rule set for [Ethiopic](https://github.com/unicode-org/cldr/blob/6c8ad511801043124d6ce25e0388412fe9b7b2f4/common/rbnf/root.xml#L246) is defined in the CLDR root locale.\n", 18 | "\n", 19 | "The most common use for the `RuleBasedNumberFormat` class is to format numbers as ordinals or as words in the target locale. It is also the nechanism for formating and parsing algorithmic number systems.\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Spelling out numbers in Amharic\n", 27 | "\n", 28 | "1. Create a locale instance\n", 29 | "2. create a number formatter instance using `RuleBasedNumberFormat` class\n", 30 | "3. 
Format the number\n", 31 | "\n", 32 | "We start by importing the necessary classes from PyICU:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 15, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from icu import Locale, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "- [Locale](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Locale.html) – methods for initiating and working with ICU's locale objects.\n", 49 | "- [Formattable](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Formattable.html) – a wrapper that converts between numeric types, strings and date objects. It's primary use is in formatting.\n", 50 | "- [RulebasedNumberFormat](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html) – formats numbers according to a set of rules. The rules maybe inbuilt set of rules, or custom rules.\n", 51 | "- [URBNFRuleSetTag](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/namespaceicu.html#a55dbbbdd4946251c23988013e06e695e) – tags for predefined rule sets to use with `RulebasedNumberFormat`." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "First, create a Locale instance, and a formatter instance. There are a number of methods for building a Locale instance. To keep things simple, we'll just pass a locale identifier directly to the class." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 16, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "loc = Locale('am_ET')\n", 68 | "formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, loc)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "We can control what rule sets are used. 
The following rule sets are available:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 17, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "%spellout-numbering-year\n", 88 | "%spellout-numbering\n", 89 | "%spellout-cardinal\n", 90 | "%spellout-ordinal\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "for n in range(formatter.getNumberOfRuleSetNames()):\n", 96 | " print(formatter.getRuleSetName(n))" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 26, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "%spellout-numbering\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "print(formatter.getDefaultRuleSetName())" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "The `%spellout-numbering` is the default for Amharic, but `%spellout-numbering-year`, `%spellout-cardinal`, and `%spellout-ordinal` are alternative rule sets available. Use the [setDefaultRuleSet](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html#aa0fbc19602d99cfcb550e2c11cb9ca91) method, if required." 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "`RuleBasedNumberFormat` can be used in a number of ways, refer to the API documentation. In this particular case we want to create a formatter that uses the Amharic spellout rule set. We passed the relevant rule set identifier and the required locale to create a formatter instance.\n", 128 | "\n", 129 | "The same Python code can be used for any locale that has spellout [rule sets](https://github.com/unicode-org/icu/tree/main/icu4c/source/data/rbnf).\n", 130 | "\n", 131 | "To convert the number to its word representation, use the `format` method." 
132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 18, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "አስር ሁለት ሺ ሦስት መቶ አራት አስር አምስት\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "number = 12345\n", 149 | "r = formatter.format(number)\n", 150 | "print(r)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "You can use the `parse` method to convert the word representation back into a formatted number:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 19, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "" 169 | ] 170 | }, 171 | "execution_count": 19, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "rreverse = formatter.parse(r)\n", 178 | "rreverse" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "This returns a Formattable object, which you can either render as a formatted string, or convert to an integer or float, as required." 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 20, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "12,345\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "rreverse_string = str(rreverse)\n", 203 | "print(rreverse_string)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "Available methods are:\n", 211 | "\n", 212 | "- getDouble – returns a floating point number\n", 213 | "- getInt64 – returns an integer\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 21, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "12345\n", 226 | "12345.0\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "ireverse = rreverse.getInt64()\n", 232 | "print(ireverse)\n", 233 | "\n", 234 | "dreverse = rreverse.getDouble()\n", 235 | "print(dreverse)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Working with the Ethiopic numeral system\n", 243 | "\n", 244 | "Creating a formatter for Ethiopic numbers is a two step process, we need to create a formatter passing a rule set identifier for number systems and a locale, then we need to set the actual rule set needed. Locales may support multiple rule sets. \n", 245 | "\n", 246 | "1. Create a locale instance\n", 247 | "2. Create a formatter instance\n", 248 | "3. Set the rule set required\n", 249 | "\n", 250 | "We'll reuse the existing Locale instance." 
251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 22, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "%armenian-lower\n", 263 | "%armenian-upper\n", 264 | "%cyrillic-lower\n", 265 | "%ethiopic\n", 266 | "%georgian\n", 267 | "%greek-lower\n", 268 | "%greek-upper\n", 269 | "%hebrew\n", 270 | "%hebrew-item\n", 271 | "%roman-lower\n", 272 | "%roman-upper\n", 273 | "%tamil\n", 274 | "%zz-default\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "eformatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, loc)\n", 280 | "\n", 281 | "for n in range(eformatter.getNumberOfRuleSetNames()):\n", 282 | " print(eformatter.getRuleSetName(n))\n" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "The public name of the rule set we need is `%ethiopic`, so we set this as our default rule set:" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 23, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "eformatter.setDefaultRuleSet('%ethiopic')" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Then format the number as above:" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 24, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "፳፫፻፵፩\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "number = 2341\n", 323 | "r = eformatter.format(number)\n", 324 | "print(r)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "And likewise, we can parse the ethiopic digits back to the Arabic (Latin) number system:" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 25, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "name": "stdout", 341 | "output_type": 
"stream", 342 | "text": [ 343 | "2,341\n", 344 | "2341\n", 345 | "2341.0\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "rreverse = eformatter.parse(r)\n", 351 | "print(str(rreverse))\n", 352 | "print(rreverse.getInt64())\n", 353 | "print(rreverse.getDouble())" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Further information\n", 361 | "\n", 362 | "- Unicode Locale Data Markup Language (LDML) [Part 3: Numbers](https://www.unicode.org/reports/tr35/tr35-numbers.html#unicode-locale-data-markup-language-ldmlpart-3-numbers)\n", 363 | " - [Number Systems](http://www.unicode.org/reports/tr35/tr35-numbers.html#Numbering_Systems)\n", 364 | " - [Rule-Based Number Formatting](https://www.unicode.org/reports/tr35/tr35-numbers.html#Rule-Based_Number_Formatting)" 365 | ] 366 | } 367 | ], 368 | "metadata": { 369 | "kernelspec": { 370 | "display_name": "athinkra", 371 | "language": "python", 372 | "name": "python3" 373 | }, 374 | "language_info": { 375 | "codemirror_mode": { 376 | "name": "ipython", 377 | "version": 3 378 | }, 379 | "file_extension": ".py", 380 | "mimetype": "text/x-python", 381 | "name": "python", 382 | "nbconvert_exporter": "python", 383 | "pygments_lexer": "ipython3", 384 | "version": "3.11.0" 385 | }, 386 | "orig_nbformat": 4 387 | }, 388 | "nbformat": 4, 389 | "nbformat_minor": 2 390 | } 391 | -------------------------------------------------------------------------------- /notebooks/images/sorani_plotly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly.png -------------------------------------------------------------------------------- /notebooks/images/sorani_plotly2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly2.png -------------------------------------------------------------------------------- /notebooks/images/sorani_plotly_inline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly_inline.png -------------------------------------------------------------------------------- /notebooks/img/1440px-Lake_Dukan_12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/1440px-Lake_Dukan_12.jpg -------------------------------------------------------------------------------- /notebooks/img/ckb_IQ_collation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/ckb_IQ_collation.png -------------------------------------------------------------------------------- /notebooks/img/khamti.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/khamti.jpg -------------------------------------------------------------------------------- /notebooks/img/linux1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/linux1.png -------------------------------------------------------------------------------- /notebooks/img/macos1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/macos1.png -------------------------------------------------------------------------------- /notebooks/img/mplcairo_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/mplcairo_output.png -------------------------------------------------------------------------------- /notebooks/img/sibe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/sibe.jpg -------------------------------------------------------------------------------- /notebooks/img/std_matplotlib_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/std_matplotlib_output.png -------------------------------------------------------------------------------- /notebooks/img/tai_aiton.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/tai_aiton.jpg -------------------------------------------------------------------------------- /notebooks/img/tai_aiton_text_to_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/tai_aiton_text_to_image.png -------------------------------------------------------------------------------- 
/notebooks/img/yolngu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/yolngu.jpg -------------------------------------------------------------------------------- /notebooks/pandas_plot_mplcairo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sorani Kurdish data using Pandas plot\n", 8 | "\n", 9 | "Enabling `mplcairo`, with `raqm`, as the backend for `matplotlib` will allow us to reuse the [Kurdish matplotlib example](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_mplcairo.ipynb) with Pandas `plot`.\n", 10 | "\n", 11 | "__Please note:__ This notebook will run on MacOS, but tends to be buggy on other platforms. The _mplcairo_ package does not currently support Jupyter. It is better to use _mplcairo_ in a script, rather than a notebook. See [pandas_plot_kurdish.py](../py/pandas_plot_kurdish.py)." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Setup" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "import locale, platform\n", 29 | "import mplcairo\n", 30 | "import matplotlib as mpl\n", 31 | "if platform.system() == \"Darwin\":\n", 32 | " mpl.use(\"module://mplcairo.macosx\")\n", 33 | "else:\n", 34 | " mpl.use(\"module://mplcairo.qt\")\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import matplotlib.ticker as ticker\n", 37 | "import unicodedata as ud, regex as re" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Helper functions" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "def convert_digits(s, sep = (\",\", \".\")):\n", 54 | " nd = re.compile(r'^-?\\p{Nd}[,.\\u066B\\u066C\\u0020\\u2009\\u202F\\p{Nd}]*$')\n", 55 | " tsep, dsep = sep\n", 56 | " if nd.match(s):\n", 57 | " s = s.replace(tsep, \"\")\n", 58 | " s = ''.join([str(ud.decimal(c, c)) for c in s])\n", 59 | " if dsep in s:\n", 60 | " return float(s.replace(dsep, \".\")) if dsep != \".\" else float(s)\n", 61 | " return int(s)\n", 62 | " return s\n", 63 | "\n", 64 | "seps = (\"\\u066C\", \"\\u066B\")\n", 65 | "digitsconv = lambda x: convert_digits(x.replace(\"-\", \"٠\"), sep = seps)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Process data and plot data" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/html": [ 83 | "
\n", 84 | "\n", 97 | "\n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
---جیھانیتورکیائێرانعێراقسووریا
0کرمانجی14419000791900044300031850001661000
1ئەوانەی بە تورکی دەدوێن57320005732000000
2باشوور33810000338100000
3سۆرانی157600005020005670000
4زازایی - دەملی11250001125000000
5زازایی - ئەلڤێکا184000179000000
6ڕەوەند900003800020000330000
7ھەورامی54000026000280000
8شکاکی49000230002600000
9کۆی گشتی2671200015016000439800039160001661000
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " --- جیھانی تورکیا ئێران عێراق سووریا\n", 206 | "0 کرمانجی 14419000 7919000 443000 3185000 1661000\n", 207 | "1 ئەوانەی بە تورکی دەدوێن 5732000 5732000 0 0 0\n", 208 | "2 باشوور 3381000 0 3381000 0 0\n", 209 | "3 سۆرانی 1576000 0 502000 567000 0\n", 210 | "4 زازایی - دەملی 1125000 1125000 0 0 0\n", 211 | "5 زازایی - ئەلڤێکا 184000 179000 0 0 0\n", 212 | "6 ڕەوەند 90000 38000 20000 33000 0\n", 213 | "7 ھەورامی 54000 0 26000 28000 0\n", 214 | "8 شکاکی 49000 23000 26000 0 0\n", 215 | "9 کۆی گشتی 26712000 15016000 4398000 3916000 1661000" 216 | ] 217 | }, 218 | "execution_count": 5, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "import pandas as pd\n", 225 | "conv = {\n", 226 | " 'سووریا': digitsconv,\n", 227 | " 'عێراق': digitsconv,\n", 228 | " 'ئێران': digitsconv,\n", 229 | " 'تورکیا': digitsconv,\n", 230 | " 'جیھانی': digitsconv\n", 231 | "}\n", 232 | "df = pd.read_table(\"../data/demographics.tsv\", converters=conv)\n", 233 | "df" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 6, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "تورکیا 30032000\n", 246 | "ئێران 8796000\n", 247 | "عێراق 7729000\n", 248 | "سووریا 3322000\n", 249 | "dtype: int64\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "col_list=[\"تورکیا\" ,\"ئێران\" ,\"عێراق\" ,\"سووریا\"]\n", 255 | "\n", 256 | "total_df = df[col_list].sum(axis=0)\n", 257 | "print(total_df)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Using indicies and values of the `total_df` series:" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 20, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "def convert_to_arab_ns(n, p=None, decimal=2, sep_in=[\"\", \".\"], sep_out=[\"\\u066C\", \"\\u066B\"], scale=None):\n", 274 | " 
locale.setlocale(locale.LC_ALL, \"en_US.UTF-8\")\n", 275 | " decimal_places = decimal\n", 276 | " if sep_in == [\"\", \".\"]:\n", 277 | " n = n * scale if scale else n\n", 278 | " format_string = '%0.' + str(decimal_places) + 'f' if type(n) == float else '%d'\n", 279 | " n = locale.format_string(format_string, n, grouping=True, monetary=True)\n", 280 | " n = n.replace(\",\", \"ṯ\").replace(\".\", \"ḏ\")\n", 281 | " #n = str(n)\n", 282 | " if sep_in[0] in [\" \", \",\", \"٬\", \"\\u2009\"]:\n", 283 | " n = n.replace(r'[\\u0020,٬\\u2009]', \"ṯ\")\n", 284 | " elif sep_in[0] == \".\":\n", 285 | " n = n.replace(\".\", \"ṯ\")\n", 286 | " if sep_in[1] in [\",\", \".\", \"٫\"]:\n", 287 | " n = n.replace(r'[,.٫]', \"ḏ\")\n", 288 | " sep = sep_out\n", 289 | " t = n.maketrans(\"0123456789\", \"٠١٢٣٤٥٦٧٨٩\")\n", 290 | " locale.setlocale(locale.LC_ALL, \"\")\n", 291 | " return n.translate(t).replace(\"ṯ\", sep[0] ).replace(\"ḏ\", sep[1])" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 23, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "\n", 301 | "# ax = total_df.plot(kind=\"bar\", title='ڕێژەی دانیشتووانی کورد', xlabel=\"ناوچە\", ylabel=\"ڕێژەی دانیشتووان\" ,rot=0)\n", 302 | "# DEFAULT_NUMERAL_SYSYEM = \"arab\"\n", 303 | "# ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_arab_ns(x, p, scale=0.000001))\n", 304 | "# ax.get_yaxis().set_major_formatter(ns_formatter)\n", 305 | "# plt.show()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# fig = px.bar(x=total_df.index, y=total_df.values)\n", 315 | "fig = total_df.plot(kind=\"bar\", title='ڕێژەی دانیشتووانی کورد', xlabel=\"ناوچە\", ylabel=\"ڕێژەی دانیشتووان\" ,rot=0)\n", 316 | "\n", 317 | "fig.update_layout(\n", 318 | " title={\n", 319 | " 'text': 'ڕێژەی دانیشتووانی کورد',\n", 320 | " 'y':0.95,\n", 321 | " 'x':0.5,\n", 322 | " 'xanchor': 'center',\n", 323 | " 
'yanchor': 'top'},\n", 324 | " xaxis_title=\"ناوچە\",\n", 325 | " yaxis_title=\"ڕێژەی دانیشتووان\",\n", 326 | " font=dict(\n", 327 | " family=\"Vazirmatn\",\n", 328 | " size=14,\n", 329 | " color=\"Grey\"\n", 330 | " )\n", 331 | ")\n", 332 | "\n", 333 | "fig.show()" 334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "interpreter": { 339 | "hash": "05c935ee2b4ff45f26d355be2499c84aedc5a4939bfa2f7a9b7f00dda4a86ade" 340 | }, 341 | "kernelspec": { 342 | "display_name": "Python 3.10.1 ('el-test')", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 3 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.10.1" 357 | }, 358 | "orig_nbformat": 4 359 | }, 360 | "nbformat": 4, 361 | "nbformat_minor": 2 362 | } 363 | -------------------------------------------------------------------------------- /notebooks/persian_df.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas internationalisation: Persian (فارسی) data example\n", 8 | "\n", 9 | "An example of reading in Persian data in Pandas.\n", 10 | "\n", 11 | "The file `fa_stats.tsv` is a tab delimited file in Persian. Column 1 contains a four digit year based on the Islamic calendar. Columns 2 and 3 contain integers using Eastern Arabic-Indic digits, using the Arabic thousands seperator.\n", 12 | "\n", 13 | "A set of conversion functions are used with `pd.read_table()` to convert the data to a format that cen be used in Pandas.\n", 14 | "\n", 15 | "Column 1 is converted to the Gregorian Calendar, using a combination of the `convert_digits()` function and PyICU's `icu.Calendar` and `icu.GregorianCalendar` modules. 
After the dataframe is available, we use `pandas.Series.dt.year` to convert the datetime objects in the column to Four digit year display.\n", 16 | "\n", 17 | "The `convert_digits()` function is used to convert the Eastern Arabic-Indic digits in columns 2 and 3 to Arabic digits that can be manipulated by Pandas." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import unicodedataplus as ud, regex as re, pandas as pd\n", 27 | "from icu import Locale, Calendar, GregorianCalendar\n", 28 | "\n", 29 | "def convert_digits(s, sep = (\",\", \".\")):\n", 30 | " nd = re.compile(r'^-?\\p{Nd}[,.\\u066B\\u066C\\u0020\\u2009\\u202F\\p{Nd}]*$')\n", 31 | " tsep, dsep = sep\n", 32 | " if nd.match(s):\n", 33 | " s = s.replace(tsep, \"\")\n", 34 | " s = ''.join([str(ud.decimal(c, c)) for c in s])\n", 35 | " if dsep in s:\n", 36 | " return float(s.replace(dsep, \".\")) if dsep != \".\" else float(s)\n", 37 | " return int(s)\n", 38 | " return s\n", 39 | "\n", 40 | "loc = \"fa_IR\"\n", 41 | "in_c = Calendar.createInstance(Locale(loc + \"@calendar=persian\"))\n", 42 | "out_c = GregorianCalendar(Locale(loc + \"@calendar=gregorian\"))\n", 43 | "\n", 44 | "def convert_islamic_year(y, in_c, out_c):\n", 45 | " y = convert_digits(y.strip())\n", 46 | " in_c.set(Calendar.YEAR, y)\n", 47 | " out_c.setTime(in_c.getTime())\n", 48 | " return out_c.get(Calendar.YEAR)\n", 49 | "\n", 50 | "seps = (\"\\u066C\", \"\\u066B\")\n", 51 | "digitf = lambda x: convert_digits(x.strip(), sep = seps)\n", 52 | "datef = lambda x: convert_islamic_year(x, in_c=in_c, out_c=out_c)\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
سالولادتوفات
01959864846176288
11960876206171040
21961902260159371
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " سال ولادت وفات\n", 111 | "0 1959 864846 176288\n", 112 | "1 1960 876206 171040\n", 113 | "2 1961 902260 159371" 114 | ] 115 | }, 116 | "execution_count": 2, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "conv = {\"سال\": datef ,\"ولادت\": digitf, \"وفات\": digitf}\n", 123 | "df = pd.read_table(\"../data/csv/fa_stats.tsv\", converters=conv, parse_dates=['سال'])\n", 124 | "df[\"سال\"] = df[\"سال\"].dt.year\n", 125 | "df.head(3)" 126 | ] 127 | } 128 | ], 129 | "metadata": { 130 | "interpreter": { 131 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3" 132 | }, 133 | "kernelspec": { 134 | "display_name": "Python 3", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.8.1" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 2 153 | } 154 | -------------------------------------------------------------------------------- /notebooks/strings_casing_matching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python string operations: casing and matching" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "|Operation |Python |Pandas |PyICU |\n", 15 | "|----------- |-------- |------- |------ |\n", 16 | "|Lowercasing |[str.lower()](https://docs.python.org/3/library/stdtypes.html#str.lower) |[pandas.Series.str.lower()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.lower.html?highlight=lower#pandas-series-str-lower) |icu.UnicodeString.toLower() |\n", 17 | 
"|Uppercasing |[str.upper()](https://docs.python.org/3/library/stdtypes.html#str.upper) |[pandas.Series.str.upper()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.upper.html#pandas-series-str-upper) |icu.UnicodeString.toUpper() |\n", 18 | "|Titlecasing |[str.title()](https://docs.python.org/3/library/stdtypes.html#str.title) |[pandas.Series.str.title](pandas.Series.str.title) |icu.UnicodeString.toTitle() |\n", 19 | "|Casefolding |[str.casefold()](https://docs.python.org/3/library/stdtypes.html#str.casefold) |[pandas.Series.str.casefold()]() |icu.UnicodeString.CaseFold() |\n", 20 | "\n", 21 | "The operations [str.capitalize()](https://docs.python.org/3/library/stdtypes.html#str.capitalize)/[pandas.Series.str.capitalize()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.capitalize.html#pandas-series-str-capitalize) and [str.swapcase()](https://docs.python.org/3/library/stdtypes.html#str.swapcase)/[pandas.Series.str.swapcase()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.swapcase.html#pandas-series-str-swapcase), although string operations, aren't necessarily casing operations.\n", 22 | "\n", 23 | "N.B. we will not explore the differences between an [object and `StringDtype`](https://pandas.pydata.org/docs/user_guide/text.html#behavior-differences) in Pandas." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from el_internationalisation import cp, cpnames, udata" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Python casing operations\n", 40 | "\n", 41 | "Unicode contains a set of special casing mappings. These are divided intto unconditional and conditional mappings. 
All casing operations should support unconditional special mappings by default.\n", 42 | "\n", 43 | "Python's casing operations are language insensitive, that is language is not taken into account when casing operations occur. The current locale has no impact on casing operations, therefore language sensitive mappings are unsupported.\n", 44 | "\n", 45 | "Unconditional mappings:\n", 46 | "\n", 47 | " * Eszett (ß) casing \n", 48 | " * Preserving canonical equivalence of I WITH DOT ABOVE (İ)\n", 49 | " * Ligatures (Latin and Armenian script)\n", 50 | " * When a lowercase character has no corresponding uppercase precomposed character\n", 51 | " * Greek letters with hupogegramménē (ὑπογεγραμμένη) or prosgráphō (προσγράφω) have special uppercase equivalents.\n", 52 | " * Some Greek letters with hupogegramménē (ὑπογεγραμμένη) have no titlecase\n", 53 | "\n", 54 | "Conditional mappings:\n", 55 | " 1. Language-Insensitive Mappings\n", 56 | " * Final form of Greek sigma\n", 57 | " 2. Language-Sensitive Mappings\n", 58 | " * Lithuanian retains the dot in a lowercase i/j when followed by accents\n", 59 | " * For Turkish and Azeri, I and i-dotless; I-dot and i are case pairs\n", 60 | "\n", 61 | "See [Special Casings](https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt), which forms part of the Unicode Character database (UCD)." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Unconditional mappings\n", 69 | "\n", 70 | "Python lowercasing and uppercasing support the unconditional mappings of Unicode's special mappings.\n", 71 | "\n", 72 | "|Character |Lowercase |Titlecase |Uppercase |Notes |\n", 73 | "|---------- |---------- |---------- |---------- |------ |\n", 74 | "\n", 75 | "#### Latin script" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 20, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "ß (00DF) ⇒ SS (0053 0053)\n", 88 | "Titlecase: should not appear word initial.\n", 89 | "i̇ (0069 0307) ⇐ İ (0130)\n", 90 | "Titlecase: İ (0049 0307)\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "# ß\n", 96 | "ESZETT = \"ß\"\n", 97 | "print(f'{ESZETT} ({cp(ESZETT)}) ⇒ {ESZETT.upper()} ({cp(ESZETT.upper())})')\n", 98 | "print(\"Titlecase: should not appear word initial.\")\n", 99 | "\n", 100 | "# I WITH DOT ABOVE\n", 101 | "IDOT = \"\\u0130\"\n", 102 | "print(f'{IDOT.lower()} ({cp(IDOT.lower())}) ⇐ {IDOT} ({cp(IDOT)})')\n", 103 | "print(f'Titlecase: {\"i̇\".title()} ({cp(\"i̇\".title())})')" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Note that Python titlecasing does not resolve back to the precomosed U+0130, but this is part of a wider issue with Python titlecasing, unlike uppercasing and lowercasing, titlecasing does not adhere to the Unicode specification\n", 111 | "\n", 112 | "If we take the name of the Turkish city İstanbul:" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 26, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "İstanbul: 0130 0073 0074 0061 006E 0062 0075 006C\n", 125 | "i̇stanbul: 0069 0307 0073 0074 0061 006E 0062 0075 006C\n", 126 | "Titlecase: İStanbul (0049 0307 
0053 0074 0061 006E 0062 0075 006C)\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print(f'İstanbul: {cp(\"İstanbul\")}')\n", 132 | "istanbul = \"İstanbul\".lower()\n", 133 | "print(f'{istanbul}: {cp(istanbul)}')\n", 134 | "istanbul_title = istanbul.title()\n", 135 | "print(f'Titlecase: {istanbul_title} ({cp(istanbul_title)})')" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "The first three characters in the titlecased string are U+0049 U+0307 U+0053. Python titlecases the first alphabetic character after a non-alphabetic character. Combining diacritics are not considered alphabetic characaters:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 27, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "False" 154 | ] 155 | }, 156 | "execution_count": 27, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "istanbul.isalpha()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "So __i__ is uppercased to __I__, U+0307 is treated as a non-alphabetic character and the titlecasing operation titlecases the __s__, giving us İStanbul as the titlecased version of the string.\n", 170 | "\n", 171 | "It is important to note that the Unicode definition also excludes marks, like combining diacrtics, but Unicode titlecasing does not apply an alphabetic mask to titlecasing." 
172 | ] 173 | } 174 | ], 175 | "metadata": { 176 | "kernelspec": { 177 | "display_name": "Python 3.8.1 ('el')", 178 | "language": "python", 179 | "name": "python3" 180 | }, 181 | "language_info": { 182 | "codemirror_mode": { 183 | "name": "ipython", 184 | "version": 3 185 | }, 186 | "file_extension": ".py", 187 | "mimetype": "text/x-python", 188 | "name": "python", 189 | "nbconvert_exporter": "python", 190 | "pygments_lexer": "ipython3", 191 | "version": "3.8.1" 192 | }, 193 | "orig_nbformat": 4, 194 | "vscode": { 195 | "interpreter": { 196 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3" 197 | } 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 2 202 | } 203 | -------------------------------------------------------------------------------- /py/am_ET_numbers_icu.py: -------------------------------------------------------------------------------- 1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag 2 | # lang = "hi-IN-u-nu-deva" 3 | # lang = "en-IN" 4 | lang = input("Enter language tag: ") 5 | LOC = Locale.createCanonical(lang) 6 | 7 | number = 123452.54 8 | formatter = LocalizedNumberFormatter(LOC) 9 | r = formatter.formatDouble(number) 10 | print(r) 11 | # १,२३,४५२.५४ 12 | 13 | rb_formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC) 14 | r2 = rb_formatter.format(number) 15 | print(r2) 16 | # एक लाख तेईस हज़ार चार सौ बावन दशमलव पाँच चार 17 | 18 | r3 = rb_formatter.parse(r2) 19 | print(Formattable.getDouble(r3)) 20 | # 123452.54 21 | 22 | -------------------------------------------------------------------------------- /py/am_ET_numbers_icu_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/am_ET_numbers_icu_1.png -------------------------------------------------------------------------------- /py/am_ET_numbers_icu_1.py: 
-------------------------------------------------------------------------------- 1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag 2 | lang = "am-ET-u-nu-ethi" 3 | LOC = Locale.createCanonical(lang) 4 | number = 123452 5 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC) 6 | 7 | # 8 | # Spellout (in Amharic) 9 | # 10 | r = formatter.format(number) 11 | print(r) 12 | # መቶ ሁለት አስር ሦስት ሺ አራት መቶ አምስት አስር ሁለት 13 | 14 | # 15 | # Convert back 16 | # 17 | n = formatter.parse(r) 18 | print(n) 19 | # 123,452 20 | print(Formattable.getInt64(n)) 21 | # 123452 22 | 23 | 24 | -------------------------------------------------------------------------------- /py/am_ET_numbers_icu_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/am_ET_numbers_icu_2.png -------------------------------------------------------------------------------- /py/am_ET_numbers_icu_2.py: -------------------------------------------------------------------------------- 1 | from icu import Locale, RuleBasedNumberFormat, URBNFRuleSetTag 2 | lang = "am-ET-u-nu-ethi" 3 | LOC = Locale.createCanonical(lang) 4 | number = 123452 5 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, LOC) 6 | formatter.setDefaultRuleSet('%ethiopic') 7 | r = formatter.format(number) 8 | print(r) 9 | # ፲፪፼፴፬፻፶፪ 10 | 11 | # http://www.geez.org/Numerals/NumberSamples.html 12 | 13 | 14 | def toEthiopicNS(n): 15 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, Locale("am_ET")) 16 | formatter.setDefaultRuleSet('%ethiopic') 17 | return formatter.format(n) 18 | 19 | import pytest 20 | arabic_numbers = [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 100010000, 100100000, 100200000, 100110000, 1, 11, 111, 1111, 11111, 111111, 
1111111, 11111111, 111111111, 1111111111, 11111111111, 111111111111, 1111111111111, 1, 12, 123, 1234, 12345, 7654321, 17654321, 51615131, 15161513, 10101011, 101, 1001, 1010, 1011, 1100, 1101, 1111, 10001, 10010, 10100, 10101, 10110, 10111, 100001, 100010, 100011, 100100, 101010, 1000001, 1000101, 1000100, 1010000, 1010001, 1100001, 1010101, 101010101, 100010000, 100010100, 101010100, 3, 30, 33, 303, 3003, 3030, 3033, 3300, 3303, 3333, 30003, 30303, 300003, 303030, 3000003, 3000303, 3030003, 3300003, 3030303, 303030303, 333333333] 21 | ethiopic_numbers = ["a፩", "፲", "፻", "፲፻", "፼", "፲፼", "፻፼", "፲፻፼", "፼፼", "፲፼፼", "፻፼፼", "፲፻፼፼", "፼፼፼", "፼፩፼", "፼፲፼", "፼፳፼", "፼፲፩፼", "፩", "፲፩", "፻፲፩", "፲፩፻፲፩", "፼፲፩፻፲፩", "፲፩፼፲፩፻፲፩", "፻፲፩፼፲፩፻፲፩", "፲፩፻፲፩፼፲፩፻፲፩", "፼፲፩፻፲፩፼፲፩፻፲፩", "፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፲፩፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፼፲፩፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፩", "፲፪", "፻፳፫", "፲፪፻፴፬", "፼፳፫፻፵፭", "፯፻፷፭፼፵፫፻፳፩", "፲፯፻፷፭፼፵፫፻፳፩", "፶፩፻፷፩፼፶፩፻፴፩", "፲፭፻፲፮፼፲፭፻፲፫", "፲፻፲፼፲፻፲፩", "፻፩", "፲፻፩", "፲፻፲", "፲፻፲፩", "፲፩፻", "፲፩፻፩", "፲፩፻፲፩", "፼፩", "፼፲", "፼፻", "፼፻፩", "፼፻፲", "፼፻፲፩", "፲፼፩", "፲፼፲", "፲፼፲፩", "፲፼፻", "፲፼፲፻፲", "፻፼፩", "፻፼፻፩", "፻፼፻", "፻፩፼", "፻፩፼፩", "፻፲፼፩", "፻፩፼፻፩", "፼፻፩፼፻፩", "፼፩፼", "፼፩፼፻", "፼፻፩፼፻", "፫", "፴", "፴፫", "፫፻፫", "፴፻፫", "፴፻፴", "፴፻፴፫", "፴፫፻", "፴፫፻፫", "፴፫፻፴፫", "፫፼፫", "፫፼፫፻፫", "፴፼፫", "፴፼፴፻፴", "፫፻፼፫", "፫፻፼፫፻፫", "፫፻፫፼፫", "፫፻፴፼፫", "፫፻፫፼፫፻፫", "፫፼፫፻፫፼፫፻፫", "፫፼፴፫፻፴፫፼፴፫፻፴፫"] 22 | converted = list(map(toEthiopicNS, arabic_numbers)) 23 | converted == ethiopic_numbers 24 | # True 25 | 26 | def test_ethiopic_ns(l, r): 27 | converted = list(map(toEthiopicNS, l)) 28 | assert converted == r 29 | 30 | test_ethiopic_ns(arabic_numbers, ethiopic_numbers) 31 | 32 | # * [icu::RuleBasedNumberFormat Class Reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html) 33 | # * [URBNFRuleSetTag](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/namespaceicu.html#a55dbbbdd4946251c23988013e06e695e) 34 | 35 | 36 | for n in range(formatter.getNumberOfRuleSetNames()): 37 | 
print(formatter.getRuleSetName(n)) 38 | # %armenian-lower 39 | # %armenian-upper 40 | # %cyrillic-lower 41 | # %ethiopic 42 | # %georgian 43 | # %greek-lower 44 | # %greek-upper 45 | # %hebrew 46 | # # %hebrew-item 47 | # %roman-lower 48 | # %roman-upper 49 | # %tamil 50 | # %zz-default 51 | -------------------------------------------------------------------------------- /py/arabic_reshaper_example.py: -------------------------------------------------------------------------------- 1 | import arabic_reshaper 2 | from bidi.algorithm import get_display 3 | 4 | from el_internationalisation import cp, clean_presentation_forms 5 | 6 | def rtl_hack(text: str, arabic: bool = True) -> str: 7 | """Visually reorders Arabic or Hebrew script Unicode text 8 | 9 | Visually reorders Arabic or Hebrew script Unicode text. For Arabic script text, 10 | individual Unicode characters are substituting each character for its equivalent 11 | presentation form. The modules are used to overcome lack of bidirectional algorithm 12 | and complex font rendering in some modules and terminals. 13 | 14 | It is better to solutions that utilise proper bidirectional algorithm and font 15 | rendering implementations. For matplotlib use the mplcairo backend instead. For 16 | annotating images use Pillow. Both make use of libraqm. 17 | 18 | arabic_reshaper module converts Arabic characters to Arabic Presentation Forms: 19 | pip install arabic-reshaper 20 | 21 | bidi.algorithm module converts a logically ordered string to visually ordered 22 | equivalent. 
23 | pip install python-bidi 24 | 25 | Args: 26 | text (str): _description_ 27 | 28 | Returns: 29 | str: _description_ 30 | """ 31 | return get_display(arabic_reshaper.reshape(text)) if arabic == True else get_display(text) 32 | 33 | text = 'اللغة العربية رائعة' 34 | text_h = rtl_hack(text) 35 | print(text) 36 | print(cp(text)) 37 | print(text_h) 38 | print(cp(text_h)) 39 | 40 | 41 | 42 | 43 | 44 | s1 = "لا" 45 | s1_h = rtl_hack(s1) 46 | s2 = "لأ" 47 | s2_h = rtl_hack(s2) 48 | 49 | print("\n") 50 | print(s1) 51 | print(cp(s1)) 52 | print(s1_h) 53 | print(cp(s1_h)) 54 | 55 | print("\n") 56 | print(s2) 57 | print(cp(s2)) 58 | print(s2_h) 59 | print(cp(s2_h)) 60 | 61 | 62 | s3 = "עברית חדשה" 63 | s3_h = rtl_hack(s3, arabic=False) 64 | print("\n") 65 | print(s3) 66 | print(cp(s3)) 67 | print(s3_h) 68 | print(cp(s3_h)) 69 | # print(s3_h == s3[::-1]) 70 | 71 | 72 | # Note s3[::-1] is used for reversing strings, 73 | # but for languages that use combining marks, 74 | # it is better to reverse grapheme clusters: 75 | # 76 | # from grapheme import graphemes 77 | # print(s3_h == "".join(list(graphemes(s3))[::-1])) 78 | 79 | from grapheme import graphemes 80 | def reverse_string(text: str, use_graphemes: bool = False) -> str: 81 | return "".join(list(graphemes(text))[::-1]) if use_graphemes else text[::-1] 82 | 83 | import regex as re 84 | def reverse_string_regex(text: str, use_graphemes: bool = False) -> str: 85 | return "".join(re.findall(r'\X', text)[::-1]) if use_graphemes else text[::-1] 86 | 87 | print("---") 88 | # print(s3_h == "".join(list(graphemes(s3))[::-1])) 89 | # print("\n") 90 | print(text_h == text[::-1]) 91 | print(clean_presentation_forms(text_h) == text[::-1]) 92 | # print(clean_presentation_forms(text_h) == "".join(list(graphemes(text))[::-1])) 93 | 94 | 95 | print(clean_presentation_forms(text_h) == reverse_string(text)) 96 | print(clean_presentation_forms(text_h) == reverse_string(text, use_graphemes=True)) 97 | 98 | 
print(clean_presentation_forms(text_h) == reverse_string_regex(text)) 99 | print(clean_presentation_forms(text_h) == reverse_string_regex(text, use_graphemes=True)) -------------------------------------------------------------------------------- /py/hi_IN_numbers_icu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/hi_IN_numbers_icu.png -------------------------------------------------------------------------------- /py/hi_IN_numbers_icu.py: -------------------------------------------------------------------------------- 1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag 2 | # lang = "hi-IN-u-nu-deva" 3 | # lang = "en-IN" 4 | lang = input("Enter language tag: ") 5 | LOC = Locale.createCanonical(lang) 6 | 7 | number = 123452.54 8 | formatter = LocalizedNumberFormatter(LOC) 9 | r = formatter.formatDouble(number) 10 | print(r) 11 | # १,२३,४५२.५४ 12 | 13 | rb_formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC) 14 | r2 = rb_formatter.format(number) 15 | print(r2) 16 | # एक लाख तेईस हज़ार चार सौ बावन दशमलव पाँच चार 17 | 18 | r3 = rb_formatter.parse(r2) 19 | print(Formattable.getDouble(r3)) 20 | # 123452.54 21 | 22 | -------------------------------------------------------------------------------- /py/matplotlib_kurdish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/matplotlib_kurdish.png -------------------------------------------------------------------------------- /py/matplotlib_kurdish.py: -------------------------------------------------------------------------------- 1 | # 2 | # matplotlib_kurdish.py 3 | # 4 | # This script will read in and process a Sorani Kurdish TSV file. 
5 | # 6 | # mplcairo supports a number of backends available. 7 | # 8 | # If you wish to save plot as an image, rather than display plot 9 | # use module://mplcairo.base 10 | # 11 | # Depending on your OS and system configuration a number of 12 | # backends that render to widgets are available: 13 | # * module://mplcairo.gtk (used below for non-macOS installs) 14 | # * module://mplcairo.gtk_native 15 | # * module://mplcairo.qt 16 | # * module://mplcairo.tk 17 | # * module://mplcairo.wx 18 | # * module://mplcairo.macosx (used below for macOS) 19 | 20 | import pandas as pd 21 | import locale, platform 22 | import gi 23 | import mplcairo 24 | import matplotlib as mpl 25 | if platform.system() == "Darwin": 26 | mpl.use("module://mplcairo.macosx") 27 | else: 28 | gi.require_version("Gtk", "3.0") 29 | mpl.use("module://mplcairo.gtk") 30 | # mpl.use("module://mplcairo.qt") 31 | import matplotlib.pyplot as plt 32 | import matplotlib.ticker as ticker 33 | import seaborn as sns 34 | import unicodedata as ud, regex as re 35 | 36 | # Convert non-Western Arabic digits to Western Arabic digits 37 | def convert_digits(s, sep = (",", ".")): 38 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$') 39 | tsep, dsep = sep 40 | if nd.match(s): 41 | s = s.replace(tsep, "") 42 | s = ''.join([str(ud.decimal(c, c)) for c in s]) 43 | if dsep in s: 44 | return float(s.replace(dsep, ".")) if dsep != "." 
else float(s) 45 | return int(s) 46 | return s 47 | 48 | # Specify grouping and decimal seperators using in data 49 | seps = ("\u066C", "\u066B") 50 | # convert entries to hyphen to Eastern Arabic zero, and pass to convert_digits() 51 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps) 52 | 53 | # Covert Western Arabic digits to Eastern Arabic digits for tick labels 54 | def convert_to_sorani_ns(n, p=None, scale=None): 55 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8") 56 | decimal_places = 2 57 | n = n * scale if scale else n 58 | format_string = '%0.' + str(decimal_places) + 'f' if type(n) == float else '%d' 59 | n = locale.format_string(format_string, n, grouping=True, monetary=True) 60 | n = n.replace(",", "ṯ").replace(".", "ḏ") 61 | sep = ["\u066C", "\u066B"] 62 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩") 63 | locale.setlocale(locale.LC_ALL, "") 64 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1]) 65 | 66 | # import data 67 | import pandas as pd 68 | conv = { 69 | 'سووریا': digitsconv, 70 | 'عێراق': digitsconv, 71 | 'ئێران': digitsconv, 72 | 'تورکیا': digitsconv, 73 | 'جیھانی': digitsconv 74 | } 75 | df = pd.read_table("../data/demographics.tsv", converters=conv) 76 | print(df) 77 | 78 | # get sum of each column 79 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"] 80 | total_df = df[col_list].sum(axis=0) 81 | print(total_df) 82 | 83 | 84 | 85 | 86 | fig, axes = plt.subplots(1,2) 87 | plt.rcParams.update({'font.family':'Vazirmatn'}) 88 | 89 | # axes[0] - subplot with default (LTR) layout 90 | axes[0].bar(total_df.index, total_df.values, color='royalblue', alpha=0.7) 91 | axes[0].grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7) 92 | axes[0].set_xlabel("ناوچە", size=12) 93 | axes[0].set_ylabel("ڕێژەی دانیشتووان (بە ملیۆن)", size=12) 94 | axes[0].set_title('ڕێژەی دانیشتووانی کورد', size=15) 95 | 96 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001)) 97 | 
axes[0].get_yaxis().set_major_formatter(ns_formatter) 98 | 99 | # axes[1] - subplot with RTL layout 100 | axes[1].bar(total_df.index, total_df.values, color='royalblue', alpha=0.7) 101 | axes[1].grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7) 102 | 103 | # move y axis and associated label to right of plot 104 | axes[1].yaxis.tick_right() 105 | axes[1].yaxis.set_label_position("right") 106 | # invert x-axis 107 | #plt.gca().invert_xaxis() 108 | axes[1].invert_xaxis() 109 | axes[1].set_xlabel("ناوچە", size=12) 110 | axes[1].set_ylabel("ڕێژەی دانیشتووان (بە ملیۆن)", size=12, labelpad=10) 111 | axes[1].set_title('ڕێژەی دانیشتووانی کورد', size=15) 112 | axes[1].get_yaxis().set_major_formatter(ns_formatter) 113 | 114 | # block=True required for running script in CLI when outputting canvas to widget. 115 | plt.tight_layout() 116 | plt.show(block=True) -------------------------------------------------------------------------------- /py/pandas_plot_kurdish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/pandas_plot_kurdish.png -------------------------------------------------------------------------------- /py/pandas_plot_kurdish.py: -------------------------------------------------------------------------------- 1 | # 2 | # matplotlib_kurdish.py 3 | # 4 | # This script will read in and process a Sorani Kurdish TSV file. 5 | # 6 | # mplcairo supports a number of backends available. 
7 | # 8 | # If you wish to save plot as an image, rather than display plot 9 | # use module://mplcairo.base 10 | # 11 | # Depending on your OS and system configuration a number of 12 | # backends that render to widgets are available: 13 | # * module://mplcairo.gtk (used below for non-macOS installs) 14 | # * module://mplcairo.gtk_native 15 | # * module://mplcairo.qt 16 | # * module://mplcairo.tk 17 | # * module://mplcairo.wx 18 | # * module://mplcairo.macosx (used below for macOS) 19 | 20 | import pandas as pd 21 | import locale, platform 22 | import gi 23 | import mplcairo 24 | import matplotlib as mpl 25 | if platform.system() == "Darwin": 26 | mpl.use("module://mplcairo.macosx") 27 | else: 28 | gi.require_version("Gtk", "3.0") 29 | mpl.use("module://mplcairo.gtk") 30 | # mpl.use("module://mplcairo.qt") 31 | import matplotlib.pyplot as plt 32 | import matplotlib.ticker as ticker 33 | import seaborn as sns 34 | import unicodedata as ud, regex as re 35 | 36 | # Convert non-Western Arabic digits to Western Arabic digits 37 | def convert_digits(s, sep = (",", ".")): 38 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$') 39 | tsep, dsep = sep 40 | if nd.match(s): 41 | s = s.replace(tsep, "") 42 | s = ''.join([str(ud.decimal(c, c)) for c in s]) 43 | if dsep in s: 44 | return float(s.replace(dsep, ".")) if dsep != "." else float(s) 45 | return int(s) 46 | return s 47 | 48 | # Specify grouping and decimal seperators using in data 49 | seps = ("\u066C", "\u066B") 50 | # convert entries to hyphen to Eastern Arabic zero, and pass to convert_digits() 51 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps) 52 | 53 | # Covert Western Arabic digits to Eastern Arabic digits for tick labels 54 | def convert_to_sorani_ns(n, p=None, scale=None): 55 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8") 56 | decimal_places = 2 57 | n = n * scale if scale else n 58 | format_string = '%0.' 
+ str(decimal_places) + 'f' if type(n) == float else '%d' 59 | n = locale.format_string(format_string, n, grouping=True, monetary=True) 60 | n = n.replace(",", "ṯ").replace(".", "ḏ") 61 | sep = ["\u066C", "\u066B"] 62 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩") 63 | locale.setlocale(locale.LC_ALL, "") 64 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1]) 65 | 66 | # import data 67 | import pandas as pd 68 | conv = { 69 | 'سووریا': digitsconv, 70 | 'عێراق': digitsconv, 71 | 'ئێران': digitsconv, 72 | 'تورکیا': digitsconv, 73 | 'جیھانی': digitsconv 74 | } 75 | df = pd.read_table("../data/demographics.tsv", converters=conv) 76 | print(df) 77 | 78 | # get sum of each column 79 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"] 80 | total_df = df[col_list].sum(axis=0) 81 | print(total_df) 82 | 83 | plt.figure() 84 | plt.rcParams.update({'font.family':'Vazirmatn'}) 85 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001)) 86 | 87 | plt.subplot(1, 2, 1) 88 | ax1 = total_df.plot(kind="bar", title='ڕێژەی دانیشتووانی کورد', xlabel="ناوچە", ylabel="ڕێژەی دانیشتووان (بە ملیۆن)", rot=0) 89 | ax1.get_yaxis().set_major_formatter(ns_formatter) 90 | 91 | plt.subplot(1, 2, 2) 92 | ax2 = total_df.plot(kind="bar", title='ڕێژەی دانیشتووانی کورد', xlabel="ناوچە", ylabel="ڕێژەی دانیشتووان (بە ملیۆن)", rot=0) 93 | ax2.get_yaxis().set_major_formatter(ns_formatter) 94 | # move y axis and associated label to right of plot 95 | ax2.yaxis.tick_right() 96 | ax2.yaxis.set_label_position("right") 97 | # invert x-axis 98 | #plt.gca().invert_xaxis() 99 | ax2.invert_xaxis() 100 | 101 | plt.tight_layout() 102 | plt.show(block=True) -------------------------------------------------------------------------------- /py/pyuca_test.py: -------------------------------------------------------------------------------- 1 | import pyuca 2 | test_list = ["₨", "Z", "ز", "z", "ر", "٨", "R", "﷼"] 3 | ducet_rules = "../rules/collation/allkeys_DUCET.txt" 4 | 
cldr_rules = "../rules/collation/allkeys_CLDR.txt" 5 | ducet_collator = pyuca.Collator(ducet_rules) 6 | cldr_collator = pyuca.Collator(cldr_rules) 7 | 8 | sorted_default = sorted(test_list) 9 | print(sorted_default) 10 | sorted_ducet = sorted(test_list, key=ducet_collator.sort_key) 11 | print(sorted_ducet) 12 | sorted_cldr = sorted(test_list, key=cldr_collator.sort_key) 13 | print(sorted_cldr) 14 | 15 | from icu import Locale, Collator 16 | loc = Locale.getRoot() 17 | collator = Collator.createInstance(loc) 18 | sorted_icu_root = sorted(test_list, key=collator.getSortKey) 19 | print(sorted_icu_root) 20 | 21 | print(sorted_icu_root == sorted_cldr) -------------------------------------------------------------------------------- /py/seaborn_kurdish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/seaborn_kurdish.png -------------------------------------------------------------------------------- /py/seaborn_kurdish.py: -------------------------------------------------------------------------------- 1 | # 2 | # seaborn_kurdish.py 3 | # 4 | # This script will read in and process a Sorani Kurdish TSV file. 5 | # Two plots will be generated (a LTR layout and a RTL layout). 6 | # 7 | # mplcairo supports a number of backends available. 
8 | # 9 | # If you wish to save plot as an image, rather than display plot 10 | # use module://mplcairo.base 11 | # 12 | # Depending on your OS and system configuration a number of 13 | # backends that render to widgets are available: 14 | # * module://mplcairo.gtk (used below for non-macOS installs) 15 | # * module://mplcairo.gtk_native 16 | # * module://mplcairo.qt 17 | # * module://mplcairo.tk 18 | # * module://mplcairo.wx 19 | # * module://mplcairo.macosx (used below for macOS) 20 | 21 | import pandas as pd 22 | import locale, platform 23 | import gi 24 | import mplcairo 25 | import matplotlib as mpl 26 | if platform.system() == "Darwin": 27 | mpl.use("module://mplcairo.macosx") 28 | else: 29 | gi.require_version("Gtk", "3.0") 30 | mpl.use("module://mplcairo.gtk") 31 | # mpl.use("module://mplcairo.qt") 32 | import matplotlib.pyplot as plt 33 | import matplotlib.ticker as ticker 34 | import seaborn as sns 35 | import unicodedata as ud, regex as re 36 | 37 | # Convert non-Western Arabic digits to Western Arabic digits 38 | def convert_digits(s, sep = (",", ".")): 39 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$') 40 | tsep, dsep = sep 41 | if nd.match(s): 42 | s = s.replace(tsep, "") 43 | s = ''.join([str(ud.decimal(c, c)) for c in s]) 44 | if dsep in s: 45 | return float(s.replace(dsep, ".")) if dsep != "." else float(s) 46 | return int(s) 47 | return s 48 | 49 | # Specify grouping and decimal seperators using in data 50 | seps = ("\u066C", "\u066B") 51 | # convert entries to hyphen to Eastern Arabic zero, and pass to convert_digits() 52 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps) 53 | 54 | # Covert Western Arabic digits to Eastern Arabic digits for tick labels 55 | def convert_to_sorani_ns(n, p=None, scale=None): 56 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8") 57 | decimal_places = 2 58 | n = n * scale if scale else n 59 | format_string = '%0.' 
+ str(decimal_places) + 'f' if type(n) == float else '%d' 60 | n = locale.format_string(format_string, n, grouping=True, monetary=True) 61 | n = n.replace(",", "ṯ").replace(".", "ḏ") 62 | sep = ["\u066C", "\u066B"] 63 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩") 64 | locale.setlocale(locale.LC_ALL, "") 65 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1]) 66 | 67 | # import data 68 | import pandas as pd 69 | conv = { 70 | 'سووریا': digitsconv, 71 | 'عێراق': digitsconv, 72 | 'ئێران': digitsconv, 73 | 'تورکیا': digitsconv, 74 | 'جیھانی': digitsconv 75 | } 76 | df = pd.read_table("../data/demographics.tsv", converters=conv) 77 | print(df) 78 | 79 | # get sum of each column 80 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"] 81 | total_df = df[col_list].sum(axis=0) 82 | print(total_df) 83 | 84 | # Plot data. First subplot (axes[0]) is default layout, second subplot (axes[1]) is an RTL layout 85 | sns.set_style('darkgrid') 86 | sns.set_context({"font.family": "Vazirmatn"}) 87 | fig, axes = plt.subplots(1,2) 88 | sns.barplot(x=total_df.index, y=total_df.values, ax=axes[0]) 89 | sns.barplot(x=total_df.index, y=total_df.values, ax=axes[1]) 90 | 91 | # set common labels for X and Y axes. 
92 | plt.setp(axes, xlabel="ناوچە") 93 | plt.setp(axes, ylabel="ڕێژەی دانیشتووان (بە ملیۆن)") 94 | # Set single title for all subplots 95 | fig.suptitle('ڕێژەی دانیشتووانی کورد') 96 | 97 | # Define and apply conversion to tick labels for both axes 98 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001)) 99 | axes[0].get_yaxis().set_major_formatter(ns_formatter) 100 | axes[1].get_yaxis().set_major_formatter(ns_formatter) 101 | 102 | # move y axis and associated label to right of axes[1] 103 | axes[1].yaxis.tick_right() 104 | axes[1].yaxis.set_label_position("right") 105 | # invert x-axis for axes[1] 106 | #plt.gca().invert_xaxis() 107 | axes[1].invert_xaxis() 108 | 109 | # block=True required for running script in CLI when outputting canvas to widget. 110 | plt.show(block=True) -------------------------------------------------------------------------------- /py/wordcloud_kurdish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/wordcloud_kurdish.png -------------------------------------------------------------------------------- /py/wordcloud_kurdish.py: -------------------------------------------------------------------------------- 1 | import gi, platform, os 2 | import mplcairo 3 | import matplotlib as mpl 4 | if platform.system() == "Darwin": 5 | mpl.use("module://mplcairo.macosx") 6 | else: 7 | gi.require_version("Gtk", "3.0") 8 | mpl.use("module://mplcairo.gtk") 9 | # mpl.use("module://mplcairo.qt") 10 | import matplotlib.pyplot as plt 11 | from wordcloud import WordCloud 12 | 13 | # Stopword list from klpt (Kurdish Language Processing Toolkit) 14 | # Available stopword lists: Sorani (Arabic) and Kurmanji (Latin) 15 | def get_kurdish_stopwords(dialect, script): 16 | from urllib.request import urlopen 17 | import json 18 | url = 
"https://raw.githubusercontent.com/sinaahmadi/klpt/master/klpt/data/stopwords.json" 19 | response = urlopen(url) 20 | data_json = json.loads(response.read()) 21 | return set(data_json[dialect][script]) 22 | 23 | ckb_stopwords = get_kurdish_stopwords("Sorani", "Arabic") 24 | text = """ 25 | زمانی کوردی 26 | لە ئینسایکڵۆپیدیای ئازادی ویکیپیدیاوە 27 | ئەم وتارە سەبارەت بە زمانی کوردی نووسراوە. بۆ شاعیرە کوردەکە، بڕوانە کوردی (شاعیر). بۆ وتارە ھاوشێوەکان، بڕوانە کوردی (ڕوونکردنەوە). 28 | زمانی کوردی (بە کرمانجی، بە سۆرانی: زمانی کوردی، بە کەڵهوڕی: زوان کوردی، بە لەکی: زوۆن کوردی، بە زازاکی، بە ھەورامی: زوانو کوردی) زمانێکە کە خەڵکی کورد قسەی پێدەکەن. لە ڕووی بنەماڵەوە بەشێکە لە زمانە ھیندوئەورووپایییەکان. ئەم زمانە لە زمانی کەڤناری مادی کەوتووەتەوە. زمانی کوردی لە نێوان زمانە ئێرانییەکاندا لە بواری پرژماربوونی ئاخێوەران سێیەمین زمانە و دەکەوێتە دوای زمانەکانی فارسی و پەشتۆ. 29 | شێوەزارەکانی کوردی 30 | وتاری سەرەکی: شێوەزارەکانی زمانی کوردی 31 | زمانی کوردی چەند شێوەزارێکی سەرەکی ھەیە کە جیاوازیی زۆریان ھەیە و زمانناسەکان لە سەر چۆنیەتی جیاکردنەوەی ئەم شێوەزارانە یەکدەنگ نین و زۆرێک لە زمانناسەکان باوەڕییان بە ماڵباتی زمانگەلی کوردی ھەیە. یانی کورمانجیی باکووری و گۆرانی، بە پێی یاسا و ڕێسای زمانناسی و زمانەوانییەوە، دو زمانی سەربەخۆی کوردینە، نەک دو شێوەزار. بەڵام زۆربەی ئەو کەسانەی زمانی(زمانەکانی) کوردییان دابەش کردووە، بەم چوار دەستەیە بووە 32 | کوردیی باکووری 33 | کوردیی ناوەندی 34 | کوردیی باشووری 35 | گۆرانی-زازایی 36 | ھەندێک لە زمانناسان، لوڕیش وەک شێوەزارێکی زمانی کوردی پۆلبەند دەکەن. ئەگەر چی لوڕی ژمارەیەکی زۆری وشەی کوردی تێدایە، بەڵام ھێشتاش لێکۆلینەوەیەکی ئەوتۆ لە سەر لوڕی لە بەر دەستدا نییە. 37 | ئەلفوبێی کوردی 38 | وتار سەرەکییەکان: ئەلفوبێکانی کوردی و ئەلفوبێی عەرەبیی زمانی کوردی 39 | بەھۆی ئەوەی کە کوردەکان لە ژێر دەسەڵاتی عوسمانی و ئێران بوون و ئەلفوبێی فەرمیی ئەو دوو وڵاتە ئەلفوبێی عەرەبی بوو، کوردەکانیش تا پێش سییەکان تەنیا ئەلفوبێی عەرەبییان بۆ نووسینی کوردی بەکار دەھێنا. 
لە تورکیا، لە دوای بە فەرمیکردنی ئەلفوبێی لاتینی بۆ زمانی تورکی، جەلادەت عەلی بەدرخان لە ساڵی ١٩٣٢ ئەلفوبێیەکی لاتینیی بۆ زمانی کوردی داھێنا کە ئێستا بە ناوی "ئەلفوبێی ھاوار" یان "بەدرخان" دەناسرێت. 40 | """ 41 | 42 | font_file = os.path.expanduser("~/.local/share/fonts/fontamin/TrueType/Estedad/Estedad_Regular.ttf") 43 | 44 | word_cloud = WordCloud(font_path=font_file, collocations = False, background_color = 'white', stopwords=ckb_stopwords).generate(text) 45 | plt.imshow(word_cloud, interpolation='bilinear') 46 | plt.axis("off") 47 | plt.show(block=True) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gcld3==3.0.13 2 | grapheme==0.6.0 3 | LaoNLP==0.2.dev5 4 | nltk>=3.6.4 5 | pandas==1.1.3 6 | PyICU==2.7.4 7 | pyidaungsu==0.0.9 8 | pythainlp==2.3.1 9 | python-myanmar==1.10.0 10 | pyuca==1.2 11 | regex==2020.4.4 12 | tangled-up-in-unicode==0.0.6 13 | unicodedata2==13.0.0.post2 14 | unicodedataplus==13.0.0.post2 15 | -------------------------------------------------------------------------------- /rules/collation/README.md: -------------------------------------------------------------------------------- 1 | # Collation rules 2 | 3 | Unicode 15.0.0 \ 4 | CLDR v41 5 | 6 | __Collation data:__ 7 | 8 | * [allkeys_CLDR.txt](https://github.com/unicode-org/cldr/blob/main/common/uca/allkeys_CLDR.txt) 9 | * [allkeys_DUCET.txt](https://www.unicode.org/Public/UCA/latest/allkeys.txt) 10 | * [CLDR collation rules per locale](https://github.com/unicode-org/cldr/tree/release-42-beta2/common/collation) 11 | 12 | 13 | __Other links:__ 14 | 15 | * [CLDR versions](https://cldr.unicode.org/index/downloads) 16 | * [CLDR (GitHub)](https://github.com/unicode-org/cldr) 17 | * [CLDR development version](https://cldr.unicode.org/index/downloads/dev) 18 | * [UCA data - latest](https://www.unicode.org/Public/UCA/latest/) 19 | * [UCD data and charts - 
latest](https://www.unicode.org/Public/UCD/latest/) 20 | * [Collator demo](https://icu4c-demos.unicode.org/icu-bin/collation.html) -------------------------------------------------------------------------------- /rules/collation/cldr/ckb.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 18 | 19 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /rules/collation/cldr/ckb_IQ.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /rules/collation/cldr/ckb_IR.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /rules/collation/collation_rules.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | # 3 | # Collation _rules 4 | # 5 | ############################### 6 | 7 | # Akan (ak, fat, twi, wss) 8 | ak_rules = fat_rules = twi_rules = wss_rules = ( 9 | "&E<ɛ<<<Ɛ" 10 | "&O<ɔ<<<Ɔ" 11 | ) 12 | 13 | # Dinka (din, dip, diw, dib, dks, dik) 14 | din_rules = dip_rules = diw_rules = dib_rules = dks_rules = dik_rules = ( 15 | "[normalization on]" 16 | "&A< from "" % Double ARABIC LETTER WAW 60 | 61 | reorder-after % ARABIC LETTER REH 62 | % ARABIC LETTER REH WITH SMALL V BELOW 63 | 64 | reorder-after % ARABIC LETTER WAW 65 | % Double ARABIC LETTER WAW 66 | 67 | reorder-end 68 | 69 | END LC_COLLATE 70 | 71 | LC_MONETARY 72 | copy "ckb_IQ" 73 | END LC_MONETARY 74 | 75 | LC_NUMERIC 76 | copy "ckb_IQ" 77 | END LC_NUMERIC 78 | 79 | LC_TIME 80 | copy "ckb_IQ" 81 | END LC_TIME 82 | 83 | LC_MESSAGES 84 | copy 
"ckb_IQ" 85 | END LC_MESSAGES 86 | 87 | LC_PAPER 88 | copy "ckb_IQ" 89 | END LC_PAPER 90 | 91 | LC_NAME 92 | copy "ckb_IQ" 93 | END LC_NAME 94 | 95 | LC_ADDRESS 96 | copy "ckb_IQ" 97 | END LC_ADDRESS 98 | 99 | LC_TELEPHONE 100 | copy "ckb_IQ" 101 | END LC_TELEPHONE 102 | 103 | LC_MEASUREMENT 104 | copy "ckb_IQ" 105 | END LC_MEASUREMENT -------------------------------------------------------------------------------- /rules/collation/glibc/en_SS: -------------------------------------------------------------------------------- 1 | comment_char % 2 | escape_char / 3 | 4 | % This file is part of the GNU C Library and contains locale data. 5 | % The Free Software Foundation does not claim any copyright interest 6 | % in the locale data contained in this file. The foregoing does not 7 | % affect the license of the GNU C Library as a whole. It does not 8 | % exempt you from the conditions of the license if your use would 9 | % otherwise be governed by that license. 10 | 11 | LC_IDENTIFICATION 12 | title "English locale for South Sudan" 13 | source "CLDR" 14 | address "" 15 | contact "Andjc" 16 | email "" 17 | tel "" 18 | fax "" 19 | language "English" 20 | territory "South Sudan" 21 | revision "1.0" 22 | date "2022-10-13" 23 | 24 | category "i18n:2012";LC_IDENTIFICATION 25 | category "i18n:2012";LC_CTYPE 26 | category "i18n:2012";LC_COLLATE 27 | category "i18n:2012";LC_TIME 28 | category "i18n:2012";LC_NUMERIC 29 | category "i18n:2012";LC_MONETARY 30 | category "i18n:2012";LC_MESSAGES 31 | category "i18n:2012";LC_PAPER 32 | category "i18n:2012";LC_NAME 33 | category "i18n:2012";LC_ADDRESS 34 | category "i18n:2012";LC_TELEPHONE 35 | category "i18n:2012";LC_MEASUREMENT 36 | END LC_IDENTIFICATION 37 | 38 | LC_CTYPE 39 | copy "i18n" 40 | 41 | translit_start 42 | include "translit_combining";"" 43 | translit_end 44 | END LC_CTYPE 45 | 46 | LC_COLLATE 47 | % Copy the template from ISO/IEC 14651 48 | copy "iso14651_t1" 49 | END LC_COLLATE 50 | 51 | LC_MONETARY 52 | 
int_curr_symbol "SSP " 53 | currency_symbol "" 54 | mon_decimal_point "." 55 | mon_thousands_sep "," 56 | mon_grouping 3;3 57 | positive_sign "" 58 | negative_sign "-" 59 | int_frac_digits 2 60 | frac_digits 2 61 | p_cs_precedes 1 62 | p_sep_by_space 0 63 | n_cs_precedes 1 64 | n_sep_by_space 0 65 | p_sign_posn 1 66 | n_sign_posn 1 67 | END LC_MONETARY 68 | 69 | LC_NUMERIC 70 | decimal_point "." 71 | thousands_sep "," 72 | grouping 3;3 73 | END LC_NUMERIC 74 | 75 | LC_TIME 76 | abday "Sun";"Mon";"Tue";"Wed";"Thu";"Fri";"Sat" 77 | day "Sunday";/ 78 | "Monday";/ 79 | "Tuesday";/ 80 | "Wednesday";/ 81 | "Thursday";/ 82 | "Friday";/ 83 | "Saturday" 84 | abmon "Jan";"Feb";/ 85 | "Mar";"Apr";/ 86 | "May";"Jun";/ 87 | "Jul";"Aug";/ 88 | "Sep";"Oct";/ 89 | "Nov";"Dec" 90 | mon "January";/ 91 | "February";/ 92 | "March";/ 93 | "April";/ 94 | "May";/ 95 | "June";/ 96 | "July";/ 97 | "August";/ 98 | "September";/ 99 | "October";/ 100 | "November";/ 101 | "December" 102 | d_t_fmt "%a %d %b %Y %T %Z" 103 | d_fmt "%d//%m//%y" 104 | t_fmt "%T" 105 | am_pm "am";"pm" 106 | t_fmt_ampm "%l:%M:%S %P %Z" 107 | date_fmt "%a %e %b %H:%M:%S %Z %Y" 108 | week 7;19971130;4 109 | first_weekday 2 110 | END LC_TIME 111 | 112 | LC_MESSAGES 113 | copy "en_US" 114 | END LC_MESSAGES 115 | 116 | LC_PAPER 117 | copy "i18n" 118 | END LC_PAPER 119 | 120 | LC_TELEPHONE 121 | tel_int_fmt "+%c %a %l" 122 | tel_dom_fmt "%A %l" 123 | int_select "00" % https://en.wikipedia.org/wiki/International_call_prefix, https://en.wikipedia.org/wiki/List_of_international_call_prefixes 124 | int_prefix "211" % https://en.wikipedia.org/wiki/List_of_country_calling_codes 125 | END LC_TELEPHONE 126 | 127 | LC_MEASUREMENT 128 | copy "i18n" 129 | END LC_MEASUREMENT 130 | 131 | LC_NAME 132 | copy "en_US" 133 | END LC_NAME 134 | 135 | LC_ADDRESS 136 | postal_fmt "%f%N%a%N%d%N%b%N%s %h %e %r%N%z %T%N%c%N" 137 | country_name "South Sudan" 138 | country_ab2 "SS" % https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes 139 | 
country_ab3 "SSD" 140 | country_num 728 141 | country_car "" 142 | lang_name "English" 143 | lang_ab "en" 144 | lang_term "eng" 145 | lang_lib "eng" 146 | END LC_ADDRESS 147 | 148 | % https://man7.org/linux/man-pages/man5/locale.5.html 149 | % https://metacpan.org/dist/DateTime-Locale/view/lib/DateTime/Locale/en_SS.pod 150 | % https://www.localeplanet.com/icu/en-SS/index.html 151 | % https://www.iana.org/time-zones - https://www.timeanddate.com/worldclock/south-sudan/juba - Central African Time (CAT) - Africa/Juba - UTC+2 -------------------------------------------------------------------------------- /rules/collation/icu/ckb.txt: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | // Generated using tools/cldr/cldr-to-icu/build-icu-data.xml 4 | ckb{ 5 | collations{ 6 | standard{ 7 | Sequence{ 8 | "[normalization on]" 9 | "[reorder Arab]" 10 | "&ر < ڕ" 11 | "&و < وو" 12 | } 13 | Version{"42"} 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /rules/collation/icu/ckb_IQ.txt: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 
2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | // Generated using tools/cldr/cldr-to-icu/build-icu-data.xml 4 | ckb_IQ{ 5 | collations{ 6 | standard{ 7 | Sequence{"[import ckb]"} 8 | Version{"42"} 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /rules/collation/sorani_alphabet.tsv: -------------------------------------------------------------------------------- 1 | Order Character Codepoint 2 | 1 ئ U+0626 3 | 2 ا U+0627 4 | 3 ب U+0628 5 | 4 پ U+067E 6 | 5 ت U+062A 7 | 6 ج U+062C 8 | 7 چ U+0686 9 | 8 ح U+062D 10 | 9 خ U+062E 11 | 10 د U+062F 12 | 11 ر U+0631 13 | 12 ڕ U+0695 14 | 13 ز U+0632 15 | 14 ژ U+0698 16 | 15 س U+0633 17 | 16 ش U+0634 18 | 17 ع U+0639 19 | 18 غ U+063A 20 | 19 ف U+0641 21 | 20 ڤ U+06A4 22 | 21 ق U+0642 23 | 22 ک U+06A9 24 | 23 گ U+06AF 25 | 24 ل U+0644 26 | 25 ڵ U+06B5 27 | 26 م U+0645 28 | 27 ن U+0646 29 | 28 ه U+0647 30 | 29 ە U+06D5 31 | 30 و U+0648 32 | 32 وو U+0648 U+0648 33 | 31 ۆ U+06C6 34 | 33 ی U+06CC 35 | 34 ێ U+06CE -------------------------------------------------------------------------------- /rules/collation/temp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import regex as re\n", 10 | "text = \"ရန်ကုန်ကွန်ပျူတာတက္ကသိုလ်\"" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 7, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "Number of graphemes: 14\n", 23 | "Graphemes: ['ရ', 'န်', 'ကု', 'န်', 'ကွ', 'န်', 'ပျူ', 'တ', 'ာ', 'တ', 'က္', 'က', 'သို', 'လ်']\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "# split string into extended grapheme clusters\n", 29 | "graphemes = re.findall(r'\\X', text)\n", 30 | "print(f'Number of graphemes: {len(graphemes)}\\nGraphemes: {graphemes}')" 31 | ] 32 | 
}, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 8, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "Number of syllables: 7\n", 43 | "Syllables: ['န်', 'ုန်', 'ွန်', 'ျူ', 'ာ', 'က္က', 'ိုလ်']\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "# syllable segmentation with regex\n", 49 | "pattern = r'(?:(?= 60: 183 | formatter = LocalizedNumberFormatter(loc) 184 | r = formatter.formatDouble(digit) if isinstance(digit, float) else formatter.formatInt(digit) 185 | else: 186 | formatter = NumberFormat.createInstance(loc) 187 | r = formatter.format(digit) 188 | return r 189 | 190 | -------------------------------------------------------------------------------- /snippets/data_cleaning.py: -------------------------------------------------------------------------------- 1 | #################### 2 | # 3 | # Data cleaning 4 | # © Enabling Languages 2022 5 | # Released under the MIT License. 6 | # 7 | #################### 8 | 9 | # import unicodedata as ud 10 | import unicodedataplus as ud 11 | import regex as re 12 | from icu import UnicodeString, Locale, Normalizer2, UNormalizationMode2 13 | 14 | 15 | # 16 | # Unicode normalisation 17 | # Simple wrappers for Unicode normalisation 18 | # 19 | 20 | def NFD(s, engine="ud"): 21 | if engine == "icu": 22 | normalizer = Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE) 23 | return normalizer.normalize(s) 24 | return ud.normalize('NFD', s) 25 | 26 | def NFKD(s, engine="ud"): 27 | if engine == "icu": 28 | normalizer = Normalizer2.getInstance(None, "nfkc", UNormalizationMode2.DECOMPOSE) 29 | return normalizer.normalize(s) 30 | return ud.normalize('NFKD', s) 31 | 32 | def NFC(s, engine="ud"): 33 | if engine == "icu": 34 | normalizer = Normalizer2.getInstance(None, "nfc", UNormalizationMode2.COMPOSE) 35 | return normalizer.normalize(s) 36 | return ud.normalize('NFC', s) 37 | 38 | def NFKC(s, engine="ud"): 39 | if engine == "icu": 40 | 
normalizer = Normalizer2.getInstance(None, "nfkc", UNormalizationMode2.COMPOSE) 41 | return normalizer.normalize(s) 42 | return ud.normalize('NFKC', s) 43 | 44 | # 45 | # Clean presentation forms 46 | # 47 | # For Latin and Armenian scripts, use either folding=True or folding=False (default), 48 | # while for Arabic and Hebrew scripts, use folding=False. 49 | # 50 | 51 | def has_presentation_forms(text): 52 | pattern = r'([\p{InAlphabetic_Presentation_Forms}\p{InArabic_Presentation_Forms-A}\p{InArabic_Presentation_Forms-B}]+)' 53 | return bool(re.findall(pattern, text)) 54 | 55 | def clean_presentation_forms(text, folding=False): 56 | def clean_pf(match, folding): 57 | return match.group(1).casefold() if folding else ud.normalize("NFKC", match.group(1)) 58 | pattern = r'([\p{InAlphabetic_Presentation_Forms}\p{InArabic_Presentation_Forms-A}\p{InArabic_Presentation_Forms-B}]+)' 59 | return re.sub(pattern, lambda match, folding=folding: clean_pf(match, folding), text) 60 | 61 | # PyICU Helper functions for casing and casefolding. 62 | # s is a string, l is an ICU locale object (defaulting to CLDR Root Locale) 63 | def toLower(s, l=Locale.getRoot()): 64 | return str(UnicodeString(s).toLower(l)) 65 | def toUpper(s, l=Locale.getRoot()): 66 | return str(UnicodeString(s).toUpper(l)) 67 | def toTitle(s, l=Locale.getRoot()): 68 | return str(UnicodeString(s).toTitle(l)) 69 | def toSentence(s, l=Locale.getRoot()): 70 | return(str(UnicodeString(s[0]).toUpper(l)) + str(UnicodeString(s[1:]).toLower(l))) 71 | def foldCase(s): 72 | return str(UnicodeString(s).foldCase()) 73 | 74 | # 75 | # Turkish casing implemented without module dependencies. 76 | # PyICU provides a more comprehensive solution for Turkish. 
77 | # 78 | 79 | # To lowercase 80 | def kucukharfyap(s): 81 | return ud.normalize("NFC", s).replace('İ', 'i').replace('I', 'ı').lower() 82 | 83 | # To uppercase 84 | def buyukharfyap(s): 85 | return ud.normalize("NFC", s).replace('ı', 'I').replace('i', 'İ').upper() 86 | -------------------------------------------------------------------------------- /snippets/matching.py: -------------------------------------------------------------------------------- 1 | #################### 2 | # 3 | # Unicode matching 4 | # 5 | # © Enabling Languages 2022 6 | # Released under the MIT License. 7 | # 8 | #################### 9 | 10 | import unicodedataplus as ud 11 | import regex as re 12 | 13 | def caseless_match(x, y): 14 | return x.casefold() == y.casefold() 15 | 16 | def canonical_caseless_match(x, y): 17 | return ud.normalize("NFD", ud.normalize("NFD", x).casefold()) == ud.normalize("NFD", ud.normalize("NFD", y).casefold()) 18 | 19 | def compatibility_caseless_match(x, y): 20 | return ud.normalize("NFKD", ud.normalize("NFKD", ud.normalize("NFD", x).casefold()).casefold()) == ud.normalize("NFKD", ud.normalize("NFKD", ud.normalize("NFD", y).casefold()).casefold()) 21 | 22 | def NFKC_Casefold(s): 23 | pattern = re.compile(r"\p{Default_Ignorable_Code_Point=Yes}") 24 | s = re.sub(pattern, '', s) 25 | return ud.normalize("NFC", ud.normalize('NFKC', s).casefold()) 26 | 27 | def identifier_caseless_match(x, y): 28 | return NFKC_Casefold(ud.normalize("NFD", x)) == NFKC_Casefold(ud.normalize("NFD", y)) 29 | -------------------------------------------------------------------------------- /snippets/regex_segmentation.py: -------------------------------------------------------------------------------- 1 | import regex 2 | from el_internationalisation import cp 3 | 4 | 5 | def regex_segmentation(text: str, pattern: str, sep: str = "\u200B", mode: list = ["list"]) -> list | str | None: 6 | """Tokenise string using regex, returning results as a list or string. 
7 | 8 | Args: 9 | text (str): text to be segmented 10 | pattern (str): regex pattern for segmentation 11 | sep (str, optional): seperator to use if string is returned or results are displayed to STDOUT. Defaults to "\u200B" (ZWSP - Zero Width Space). 12 | display (bool, optional): Indicates whether results should displayed on STDOUT (True) or returned (False). Defaults to False. 13 | mode (str, optional): Indicates if results should be returned as a list or string, or displayed to STDOUT. Defaults to "list". Use "string" to return results as a string. Use "display" to output to STDOUT 14 | 15 | Returns: 16 | list | str | None: Results returned as list or string (see mode argument) or as None (if display) 17 | """ 18 | result: str = regex.sub(pattern, r"\u200B\1", text) 19 | if result[0] == "\u200B": 20 | result = result[1:] 21 | result_list: list = result.split("\u200B") 22 | result_string: str = sep.join(result_list) 23 | if "display" in mode: 24 | print( 25 | f"Number of tokens: {str(len(result_list))} \nSegmentation boundaries: {result_string}") 26 | if "codepoints" in mode: 27 | for item in result_list: 28 | print(cp(item)) 29 | if ("string" not in mode) and ("list" not in mode): 30 | print("Nothing to return") 31 | return None 32 | return result_string if "string" in mode else result_list 33 | 34 | ##################### 35 | # 36 | # Examples 37 | # 38 | ##################### 39 | 40 | 41 | s = 'ရန်ကုန်ကွန်ပျူတာတက္ကသိုလ်' 42 | pattern = r'(?:(?= 0xa0 and bytenums[1] <= 0xbf and \ 65 | bytenums[2] >= 0x80 and bytenums[2] <= 0xbf and \ 66 | bytenums[3] == SURROGATE_IDENTICATOR_INT and \ 67 | bytenums[4] >= 0xb0 and bytenums[4] <= 0xbf and \ 68 | bytenums[5] >= 0x80 and bytenums[5] <= 0xbf: 69 | 70 | codepoint = ( 71 | ((bytenums[1] & 0x0f) << 16) + 72 | ((bytenums[2] & 0x3f) << 10) + 73 | ((bytenums[4] & 0x0f) << 6) + 74 | (bytenums[5] & 0x3f) + 75 | 0x10000 76 | ) 77 | return chr(codepoint), 6 78 | 79 | # No CESU-8 surrogate but probably a 3 byte UTF-8 sequence 
80 | return codecs.utf_8_decode(input[:3], errors, final) 81 | 82 | cesu8_surrogate_start = input.find(SURROGATE_IDENTICATOR_BYTE) 83 | if cesu8_surrogate_start > 0: 84 | # Decode everything until start of cesu8 surrogate pair 85 | return codecs.utf_8_decode(input[:cesu8_surrogate_start], errors, final) 86 | 87 | # No sign of CESU-8 encoding 88 | return codecs.utf_8_decode(input, errors, final) 89 | 90 | class IncrementalEncoder(codecs.BufferedIncrementalEncoder): 91 | 92 | def _buffer_encode(self, input, errors, final=False): 93 | encoded_segments = [] 94 | position = 0 95 | input_length = len(input) 96 | 97 | while position + 1 <= input_length: 98 | encoded, consumed = self._buffer_encode_step( 99 | input[position], errors, final 100 | ) 101 | 102 | if consumed == 0: 103 | break 104 | 105 | encoded_segments.append(encoded) 106 | position += consumed 107 | 108 | if final and position != len(input): 109 | raise Exception("Final encoder doesn't encode all characters") 110 | 111 | return b''.join(encoded_segments), position 112 | 113 | def _buffer_encode_step(self, char, errors, final): 114 | codepoint = ord(char) 115 | if codepoint <= 65535: 116 | return codecs.utf_8_encode(char, errors) 117 | else: 118 | seq = bytearray(6) 119 | seq[0] = 0xED 120 | seq[1] = 0xA0 | (((codepoint & 0x1F0000) >> 16) - 1) 121 | seq[2] = 0x80 | (codepoint & 0xFC00) >> 10 122 | seq[3] = 0xED 123 | seq[4] = 0xB0 | ((codepoint >> 6) & 0x3F) 124 | seq[5] = 0x80 | (codepoint & 0x3F) 125 | return bytes(seq), 1 126 | 127 | def encode(input, errors='strict'): 128 | return IncrementalEncoder(errors).encode(input, final=True), len(input) 129 | 130 | def decode(input, errors='strict'): 131 | return IncrementalDecoder(errors).decode(input, final=True), len(input) 132 | 133 | class StreamWriter(codecs.StreamWriter): 134 | encode = encode 135 | 136 | class StreamReader(codecs.StreamReader): 137 | decode = decode 138 | 139 | CESU8_CODEC_INFO = codecs.CodecInfo( 140 | name="cesu-8", 141 | encode=encode, 
142 | decode=decode, 143 | incrementalencoder=IncrementalEncoder, 144 | incrementaldecoder=IncrementalDecoder, 145 | streamreader=StreamReader, 146 | streamwriter=StreamWriter, 147 | ) 148 | 149 | def search_function(encoding): 150 | if encoding == 'cesu-8': 151 | return CESU8_CODEC_INFO 152 | else: 153 | return None 154 | 155 | codecs.register(search_function) 156 | --------------------------------------------------------------------------------