├── .gitignore
├── LICENSE
├── README.md
├── colab
│   ├── en_IN_on_colab.ipynb
│   ├── locale-colab-snippet.ipynb
│   └── locale_module_colab.ipynb
├── data
│   ├── bn_global_popl.tsv
│   ├── demographics.tsv
│   ├── din.txt
│   ├── dolar_endeksi.tsv
│   ├── fa_stats.tsv
│   ├── klpt_stopwords.json
│   ├── myanmar-regions.tsv
│   ├── myanmar_ethnic_groups.tsv
│   ├── rbbi
│   │   ├── Default.rbbi
│   │   ├── Lao.rbbi
│   │   ├── lucene
│   │   │   ├── source.md
│   │   │   └── uax29
│   │   │       ├── Default.rbbi
│   │   │       └── MyanmarSyllable.rbbi
│   │   ├── solrcene
│   │   │   ├── Hebrew.rbbi
│   │   │   ├── Khmer.rbbi
│   │   │   ├── Lao.rbbi
│   │   │   ├── Myanmar.rbbi
│   │   │   └── source.md
│   │   └── source.md
│   ├── régions_métropolitaines.tsv
│   ├── sorani_alphabet.tsv
│   ├── sorani_alphabet_wikipedia.tsv
│   ├── source.md
│   ├── türkiye'ninz-illeri.tsv
│   └── wordlists
│       └── source.md
├── docs
│   ├── DRAFT_icu_transforms.pdf
│   ├── README.md
│   └── matplotlib.md
├── notebooks
│   ├── Collation.ipynb
│   ├── Sorting_emoji.ipynb
│   ├── armenian_pandas.ipynb
│   ├── bangla_df.ipynb
│   ├── ckb_sort.ipynb
│   ├── complex_script_support_images.ipynb
│   ├── data
│   │   └── allkeys.txt
│   ├── ethiopic_numbers.ipynb
│   ├── icu_transforms.ipynb
│   ├── images
│   │   ├── sorani_plotly.png
│   │   ├── sorani_plotly2.png
│   │   └── sorani_plotly_inline.png
│   ├── img
│   │   ├── 1440px-Lake_Dukan_12.jpg
│   │   ├── ckb_IQ_collation.png
│   │   ├── khamti.jpg
│   │   ├── linux1.png
│   │   ├── macos1.png
│   │   ├── mplcairo_output.png
│   │   ├── sibe.jpg
│   │   ├── std_matplotlib_output.png
│   │   ├── tai_aiton.jpg
│   │   ├── tai_aiton_text_to_image.png
│   │   └── yolngu.jpg
│   ├── is_IS.ipynb
│   ├── kn_demographics_pandas_matplotlib.ipynb
│   ├── kn_demographics_pandas_plottly.ipynb
│   ├── matplotlib_locale.ipynb
│   ├── matplotlib_mplcairo.ipynb
│   ├── matplotlib_mplcairo2.ipynb
│   ├── matplotlib_pyicu.ipynb
│   ├── my-segmentation.ipynb
│   ├── pandas_plot_mplcairo.ipynb
│   ├── pandas_plot_plotly.ipynb
│   ├── persian_df.ipynb
│   ├── plotly.ipynb
│   ├── plotly2.ipynb
│   ├── seaborn.ipynb
│   ├── sorting_pandas.ipynb
│   ├── strings_casing_matching.ipynb
│   ├── turkish_df.ipynb
│   └── vietnamese_pandas.ipynb
├── py
│   ├── am_ET_numbers_icu.py
│   ├── am_ET_numbers_icu_1.png
│   ├── am_ET_numbers_icu_1.py
│   ├── am_ET_numbers_icu_2.png
│   ├── am_ET_numbers_icu_2.py
│   ├── arabic_reshaper_example.py
│   ├── hi_IN_numbers_icu.png
│   ├── hi_IN_numbers_icu.py
│   ├── matplotlib_kurdish.png
│   ├── matplotlib_kurdish.py
│   ├── pandas_plot_kurdish.png
│   ├── pandas_plot_kurdish.py
│   ├── pyuca_test.py
│   ├── seaborn_kurdish.png
│   ├── seaborn_kurdish.py
│   ├── wordcloud_kurdish.png
│   └── wordcloud_kurdish.py
├── requirements.txt
├── rules
│   ├── collation
│   │   ├── README.md
│   │   ├── allkeys_CLDR.txt
│   │   ├── allkeys_DUCET.txt
│   │   ├── cldr
│   │   │   ├── ckb.xml
│   │   │   ├── ckb_IQ.xml
│   │   │   ├── ckb_IR.xml
│   │   │   └── dtd
│   │   │       └── ldml.dtd
│   │   ├── collation_rules.py
│   │   ├── glibc
│   │   │   ├── ckb_IQ@academy
│   │   │   └── en_SS
│   │   ├── icu
│   │   │   ├── ckb.txt
│   │   │   └── ckb_IQ.txt
│   │   ├── sorani_alphabet.tsv
│   │   └── temp.ipynb
│   └── segmentation
│       ├── regex_patterns.md
│       └── syllables
│           ├── Khmer.rbbi
│           ├── Lao.rbbi
│           └── Myanmar.rbbi
├── snippets
│   ├── break_iterator.py
│   ├── convert_digits.py
│   ├── data_cleaning.py
│   ├── matching.py
│   ├── regex_segmentation.py
│   └── sort_key_normalise.py
└── utils
    ├── cesu8.py
    ├── el_utils.py
    └── elle.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | ### JupyterNotebooks ###
132 | # gitignore template for Jupyter Notebooks
133 | # website: http://jupyter.org/
134 |
135 | .ipynb_checkpoints
136 | */.ipynb_checkpoints/*
137 |
138 | # IPython
139 | profile_default/
140 | ipython_config.py
141 |
142 | # Remove previous ipynb_checkpoints
143 | # git rm -r .ipynb_checkpoints/
144 |
145 | ### Linux ###
146 | *~
147 |
148 | # temporary files which can be created if a process still has a handle open of a deleted file
149 | .fuse_hidden*
150 |
151 | # KDE directory preferences
152 | .directory
153 |
154 | # Linux trash folder which might appear on any partition or disk
155 | .Trash-*
156 |
157 | # .nfs files are created when an open file is removed but is still being accessed
158 | .nfs*
159 |
160 | ### macOS ###
161 | # General
162 | .DS_Store
163 | .AppleDouble
164 | .LSOverride
165 |
166 | # Icon must end with two \r
167 | Icon
168 |
169 |
170 | # Thumbnails
171 | ._*
172 |
173 | # Files that might appear in the root of a volume
174 | .DocumentRevisions-V100
175 | .fseventsd
176 | .Spotlight-V100
177 | .TemporaryItems
178 | .Trashes
179 | .VolumeIcon.icns
180 | .com.apple.timemachine.donotpresent
181 |
182 | # Directories potentially created on remote AFP share
183 | .AppleDB
184 | .AppleDesktop
185 | Network Trash Folder
186 | Temporary Items
187 | .apdisk
188 |
189 | ### VisualStudioCode ###
190 | .vscode/*
191 | !.vscode/settings.json
192 | !.vscode/tasks.json
193 | !.vscode/launch.json
194 | !.vscode/extensions.json
195 | *.code-workspace
196 |
197 | # Local History for Visual Studio Code
198 | .history/
199 |
200 | ### VisualStudioCode Patch ###
201 | # Ignore all local history of files
202 | .history
203 | .ionide
204 |
205 | ### Windows ###
206 | # Windows thumbnail cache files
207 | Thumbs.db
208 | Thumbs.db:encryptable
209 | ehthumbs.db
210 | ehthumbs_vista.db
211 |
212 | # Dump file
213 | *.stackdump
214 |
215 | # Folder config file
216 | [Dd]esktop.ini
217 |
218 | # Recycle Bin used on file shares
219 | $RECYCLE.BIN/
220 |
221 | # Windows Installer files
222 | *.cab
223 | *.msi
224 | *.msix
225 | *.msm
226 | *.msp
227 |
228 | # Windows shortcuts
229 | *.lnk
230 |
231 | # Repo specific
232 | notes/
233 | print/
234 | archive/
235 | .vscode/
236 | data/wordlists/kurdi_words.txt
237 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021-2 Enabling Languages
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python internationalisation
2 |
3 | There is limited information available on Python internationalisation and its best practices. What little exists is scattered, and most articles and tutorials on the topic focus specifically on localisation.
4 | 
5 | The EL notebooks contain notes on various aspects of Python internationalisation, and new topics will be added over time. A few illustrative snippets are included at the end of this README.
6 |
7 | Feedback is welcome.
8 |
9 | ## Python internationalisation notes
10 |
11 | * Collation
12 | 1. [Sorting](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/Collation.ipynb)
13 | 2. [Sorting emoji](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/Sorting_emoji.ipynb)
14 | 3. [Sorting pandas](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/sorting_pandas.ipynb)
15 | * Data visualisation
16 | 1. [Matplotlib, pandas plot, seaborn, wordcloud](https://github.com/enabling-languages/python-i18n/blob/main/docs/matplotlib.md)
17 | 2. [Locale specific formatting of numeric tick labels on matplotlib](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_locale.ipynb)
18 | 3. [Using PyICU to format matplotlib numeric tick labels](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_pyicu.ipynb)
19 | * Working with digits
20 | 1. [snippets](https://github.com/enabling-languages/python-i18n/blob/main/snippets/convert_digits.py)
21 |
22 | ## Google Colab notes
23 |
24 | 1. [Setting the locale of a notebook for Google Colab](https://github.com/enabling-languages/python-i18n/blob/main/colab/locale_module_colab.ipynb)
25 |
26 | ## Resources
27 |
28 | Python documentation:
29 |
30 | * [Internationalization](https://docs.python.org/3/library/i18n.html)
31 | * [Unicode HOWTO](https://docs.python.org/3/howto/unicode.html)
32 | * [Unicode Objects and Codecs](https://docs.python.org/3/c-api/unicode.html)
33 |
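34 | ## Examples
35 | 
36 | A minimal sketch of locale-aware sorting with PyICU, the approach explored in the collation notebooks; the locale and sample words here are illustrative only:
37 | 
38 | ```python
39 | import icu
40 | 
41 | # Create a collator tailored to a locale; locales without a tailoring
42 | # fall back to the CLDR root collation.
43 | collator = icu.Collator.createInstance(icu.Locale("sv_SE"))
44 | 
45 | words = ["zebra", "ångström", "apple"]
46 | # getSortKey() returns bytes that compare in collation order.
47 | print(sorted(words, key=collator.getSortKey))
48 | ```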
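49 | 
50 | Digits written in other scripts can be normalised to ASCII with the standard library alone; a rough sketch (see [snippets/convert_digits.py](https://github.com/enabling-languages/python-i18n/blob/main/snippets/convert_digits.py) for fuller handling):
51 | 
52 | ```python
53 | import unicodedata
54 | 
55 | def to_ascii_digits(text):
56 |     # Replace each Unicode decimal digit with its ASCII equivalent,
57 |     # leaving all other characters untouched.
58 |     return "".join(
59 |         str(unicodedata.decimal(ch)) if unicodedata.decimal(ch, None) is not None else ch
60 |         for ch in text
61 |     )
62 | 
63 | print(to_ascii_digits("۱۳۹۹"))  # extended Arabic-Indic digits -> "1399"
64 | ```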
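65 | 
66 | Locale-specific numeric tick labels in matplotlib can be produced with `locale.format_string` and a `FuncFormatter`; a sketch that assumes the `de_DE.UTF-8` locale is installed on the system:
67 | 
68 | ```python
69 | import locale
70 | from matplotlib.ticker import FuncFormatter
71 | 
72 | locale.setlocale(locale.LC_ALL, "de_DE.UTF-8")
73 | 
74 | # Format tick values with the locale's grouping and decimal separators,
75 | # e.g. 1234.5 -> "1.234,5" under de_DE.
76 | fmt = FuncFormatter(lambda x, pos: locale.format_string("%g", x, grouping=True))
77 | # On an existing Axes: ax.yaxis.set_major_formatter(fmt)
78 | ```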
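79 | 
80 | The rule files under `data/rbbi/` can be compiled at runtime into a custom break iterator; a sketch, assuming the rules compile cleanly (note the caveat in `Lao.rbbi` that full accuracy needs the extra processing done by Lucene's LaoBreakIterator):
81 | 
82 | ```python
83 | import icu
84 | 
85 | # Build a rule-based break iterator from an RBBI rule file.
86 | with open("data/rbbi/Lao.rbbi", encoding="utf-8") as f:
87 |     bi = icu.RuleBasedBreakIterator(f.read())
88 | 
89 | bi.setText("ສະບາຍດີ")
90 | # Iterating a break iterator yields the boundary offsets in the text.
91 | print(list(bi))
92 | ```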
--------------------------------------------------------------------------------
/colab/locale-colab-snippet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Loading locales in Colab: snippet\n",
8 | "\n",
9 | "Refer to [locale_module_colab.ipynb](locale_module_colab.ipynb)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Import locale module\n",
19 | "import locale"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "try:\n",
29 | " import google.colab\n",
30 | " IN_COLAB = True\n",
31 | "except ImportError:\n",
32 | " IN_COLAB = False\n",
33 | "if IN_COLAB:\n",
34 | " try:\n",
35 | " locale.setlocale(locale.LC_ALL, \"en_AU.UTF-8\")\n",
36 | " except locale.Error:\n",
37 | " !sudo apt-get install language-pack-en language-pack-fr language-pack-sv language-pack-de\n",
38 | " #!sudo apt autoremove\n",
39 | " import os\n",
40 | " os.kill(os.getpid(), 9)\n",
41 | "else:\n",
42 | " locale.setlocale(locale.LC_ALL, \"en_AU.UTF-8\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "print(\"IN_COLAB: \" + str(IN_COLAB))\n",
52 | "print(locale.getlocale())"
53 | ]
54 | }
55 | ],
56 | "metadata": {
57 | "interpreter": {
58 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3"
59 | },
60 | "kernelspec": {
61 | "display_name": "Python 3.8.1 ('el')",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.8.1"
76 | },
77 | "orig_nbformat": 4
78 | },
79 | "nbformat": 4,
80 | "nbformat_minor": 2
81 | }
82 |
--------------------------------------------------------------------------------
/data/demographics.tsv:
--------------------------------------------------------------------------------
1 | "---" "جیھانی" "تورکیا" "ئێران" "عێراق" "سووریا"
2 | "کرمانجی" "١٤٬٤١٩٬٠٠٠" "٧٬٩١٩٬٠٠٠" "٤٤٣٬٠٠٠" "٣٬١٨٥٬٠٠٠" "١٬٦٦١٬٠٠٠
3 | "
4 | "ئەوانەی بە تورکی دەدوێن" "٥٬٧٣٢٬٠٠٠" "٥٬٧٣٢٬٠٠٠" "-" "-" "-
5 | "
6 | "باشوور" "٣٬٣٨١٬٠٠٠" "-" "٣٬٣٨١٬٠٠٠" "-" "-
7 | "
8 | "سۆرانی" "١٬٥٧٦٬٠٠٠" "-" "٥٠٢٬٠٠٠" "٥٦٧٬٠٠٠" "-
9 | "
10 | "زازایی - دەملی" "١٬١٢٥٬٠٠٠" "١٬١٢٥٬٠٠٠" "-" "-" "-
11 | "
12 | "زازایی - ئەلڤێکا" "١٨٤٬٠٠٠" "١٧٩٬٠٠٠" "-" "-" "-
13 | "
14 | "ڕەوەند" "٩٠٬٠٠٠" "٣٨٬٠٠٠" "٢٠٬٠٠٠" "٣٣٬٠٠٠" "-
15 | "
16 | "ھەورامی" "٥٤٬٠٠٠" "-" "٢٦٬٠٠٠" "٢٨٬٠٠٠" "-
17 | "
18 | "شکاکی" "٤٩٬٠٠٠" "٢٣٬٠٠٠" "٢٦٬٠٠٠" "-" "-
19 | "
20 | "کۆی گشتی" "٢٦٬٧١٢٬٠٠٠" "١٥٬٠١٦٬٠٠٠" "٤٬٣٩٨٬٠٠٠" "٣٬٩١٦٬٠٠٠" "١٬٦٦١٬٠٠٠"
--------------------------------------------------------------------------------
/data/fa_stats.tsv:
--------------------------------------------------------------------------------
1 | سال ولادت وفات
2 | "۱۳۳۸ " "۸۶۴٬۸۴۶ " ۱۷۶٬۲۸۸
3 | "۱۳۳۹ " "۸۷۶٬۲۰۶ " ۱۷۱٬۰۴۰
4 | "۱۳۴۰ " "۹۰۲٬۲۶۰ " ۱۵۹٬۳۷۱
5 | "۱۳۴۱ " "۹۵۷٬۵۰۰ " ۱۶۵٬۴۸۸
6 | "۱۳۴۲ " "۹۲۰٬۹۶۷ " ۱۳۵٬۹۱۲
7 | "۱۳۴۳ " "۱٬۱۱۸٬۹۱۱ " ۱۴۵٬۱۷۴
8 | "۱۳۴۴ " "۱٬۱۸۸٬۳۴۶ " ۱۷۱٬۹۴۰
9 | "۱۳۴۵ " "۱٬۱۰۲٬۸۴۸ " ۱۷۸٬۹۹۱
10 | "۱۳۴۶ " "۱٬۰۱۴٬۳۲۱ " ۱۷۸٬۷۴۹
11 | "۱۳۴۷ " "۱٬۰۴۶٬۱۳۴ " ۱۷۳٬۳۵۲
12 | "۱۳۴۸ " "۱٬۱۰۷٬۹۱۰ " ۱۶۷٬۵۷۵
13 | "۱۳۴۹ " "۱٬۱۹۰٬۹۵۷ " ۱۶۳٬۸۹۶
14 | "۱۳۵۰ " "۱٬۲۳۵٬۰۲۵ " ۱۴۹٬۰۱۱
15 | "۱۳۵۱ " "۱٬۱۳۸٬۸۴۳ " ۱۵۳٬۹۲۰
16 | "۱۳۵۲ " "۱٬۱۹۹٬۷۷۷ " ۱۵۵٬۳۰۵
17 | "۱۳۵۳ " "۱٬۲۴۸٬۲۵۶ " ۱۴۹٬۸۷۵
18 | "۱۳۵۴ " "۱٬۳۳۹٬۲۶۷ " ۱۴۸٬۵۴۳
19 | "۱۳۵۵ " "۱٬۳۹۹٬۹۷۷ " ۱۵۶٬۰۱۰
20 | "۱۳۵۶ " "۱٬۴۰۶٬۲۰۴ " ۱۴۶٬۳۶۹
21 | "۱۳۵۷ " "۱٬۳۷۳٬۷۳۸ " ۱۲۷٬۸۸۳
22 | "۱۳۵۸ " "۱٬۶۸۸٬۹۴۲ " ۱۴۲٬۴۰۱
23 | "۱۳۵۹ " "۲٬۴۵۱٬۷۶۵ " ۱۶۲٬۱۷۵
24 | "۱۳۶۰ " "۲٬۴۱۹٬۹۵۱ " ۱۷۸٬۰۶۵
25 | "۱۳۶۱ " "۲٬۰۹۷٬۹۵۷ " ۲۰۰٬۶۱۴
26 | "۱۳۶۲ " "۲٬۲۰۳٬۵۶۰ " ۲۰۷٬۲۲۸
27 | "۱۳۶۳ " "۲٬۰۶۸٬۲۷۹ " ۱۸۶٬۴۴۰
28 | "۱۳۶۴ " "۲٬۰۳۱٬۹۶۹ " ۱۹۰٬۰۶۱
29 | "۱۳۶۵ " "۲٬۲۵۶٬۹۷۱ " ۱۹۹٬۵۱۱
30 | "۱۳۶۶ " "۱٬۸۳۲٬۷۲۲ " ۲۰۴٬۲۳۰
31 | "۱۳۶۷ " "۱٬۹۴۲٬۹۳۶ " ۲۳۸٬۳۹۰
32 | "۱۳۶۸ " "۱٬۷۸۹٬۸۱۷ " ۱۹۹٬۶۴۵
33 | "۱۳۶۹ " "۱٬۷۲۶٬۴۸۸ " ۲۱۷٬۶۱۵
34 | "۱۳۷۰ " "۱٬۵۹۲٬۸۹۸ " ۲۱۷٬۶۰۴
35 | "۱۳۷۱ " "۱٬۴۳۳٬۲۴۳ " ۱۸۸٬۶۴۷
36 | "۱۳۷۲ " "۱٬۳۸۸٬۰۱۷ " ۲۰۸٬۱۶۱
37 | "۱۳۷۳ " "۱٬۴۲۶٬۷۸۴ " ۲٬۵۳۸٬۰۷۸
38 | "۱۳۷۴ " "۱٬۲۰۵٬۳۷۲ " ۲٬۷۵۶٬۴۸۲
39 | "۱۳۷۵ " "۱٬۱۸۷٬۹۰۳ " ۱٬۲۴۰٬۹۷۵
40 | "۱۳۷۶ " "۱٬۱۷۹٬۲۶۰ " ۱٬۰۳۱٬۸۳۶
41 | "۱۳۷۷ " "۱٬۱۸۶٬۶۵۹ " ۵۵۱٬۳۴۵
42 | "۱۳۷۸ " "۱٬۱۷۴٬۲۷۹ " ۵۰۶٬۹۴۵
43 | "۱۳۷۹ " "۱٬۰۹۵٬۱۶۵ " ۳۸۲٬۶۷۴
44 | "۱۳۸۰ " "۱٬۱۱۰٬۸۳۶ " ۴۲۱٬۵۲۵
45 | "۱۳۸۱ " "۱٬۱۲۲٬۱۰۴ " ۳۳۷٬۲۳۷
46 | "۱۳۸۲ " "۱٬۱۷۱٬۵۷۳ " ۳۶۸٬۵۱۸
47 | "۱۳۸۳ " "۱٬۱۵۴٬۳۶۸ " ۳۵۵٬۲۱۳
48 | "۱۳۸۴ " "۱٬۲۳۹٬۴۰۸ " ۳۶۳٬۷۲۳
49 | "۱۳۸۵ " "۱٬۲۵۳٬۹۱۲ " ۴۰۸٬۵۶۶
50 | "۱۳۸۶ " "۱٬۲۸۶٬۷۱۶ " ۴۱۲٬۷۳۶
51 | "۱۳۸۷ " "۱٬۳۰۰٬۱۶۶ " ۴۱۷٬۷۹۸
52 | "۱۳۸۸ " "۱٬۳۴۸٬۵۲۶ " ۳۹۳٬۵۱۴
53 | "۱۳۸۹ " "۱٬۳۶۴٬۵۲۳ " ۴۴۰٬۵۳۸
54 | "۱۳۹۰ " "۱٬۳۸۲٬۲۲۹ " ۴۲۲٬۱۳۳
55 | "۱۳۹۱ " "۱٬۴۲۱٬۶۸۹ " ۳۶۷٬۵۳۹
56 | "۱۳۹۲ " "۱٬۴۷۱٬۷۵۸ " ۳۶۱٬۲۲۷
57 | "۱۳۹۳ " "۱٬۵۳۴٬۳۱۱ " ۴۳۶٬۸۴۰
58 | "۱۳۹۴ " "۱٬۵۷۰٬۱۸۳ " ۳۶۶٬۶۸۴
59 | "۱۳۹۵ " "۱٬۵۲۸٬۰۰۳ " ۳۶۹٬۱۵۲
60 | "۱۳۹۶ " "۱٬۴۸۷٬۸۶۱ " ۳۷۶٬۳۱۳
61 | "۱۳۹۷ " "۱٬۳۶۶٬۴۹۱ " ۳۷۷٬۰۲۴
62 | "۱۳۹۸ " "۱٬۱۹۶٬۱۳۵ " ۳۹۵٬۳۹۲
63 | "۱۳۹۹ " "۱٬۱۱۳٬۹۶۴ " ۵۰۷٬۵۱۱
--------------------------------------------------------------------------------
/data/klpt_stopwords.json:
--------------------------------------------------------------------------------
1 | {
2 | "Sorani": {
3 | "Arabic": [
4 | "ئاستی",
5 | "ئێستا",
6 | "ئێمە",
7 | "ئێوە",
8 | "ئەم",
9 | "ئەمساڵ",
10 | "ئەمه",
11 | "ئەمڕۆ",
12 | "ئەمەش",
13 | "ئەنجام",
14 | "ئەنجامدانی",
15 | "ئەو",
16 | "ئەوان",
17 | "ئەوانەی",
18 | "ئەوه",
19 | "ئەویش",
20 | "ئەوەش",
21 | "ئەوەشی",
22 | "ئەوەی",
23 | "ئەڤ",
24 | "ئەگەر",
25 | "ب",
26 | "بارەی",
27 | "باس",
28 | "باسی",
29 | "باش",
30 | "باشترین",
31 | "بدات",
32 | "بن",
33 | "به",
34 | "بواری",
35 | "بوو",
36 | "بوون",
37 | "بوونی",
38 | "بووە",
39 | "بڕی",
40 | "بکات",
41 | "بکرێت",
42 | "بکەن",
43 | "بکەین",
44 | "بۆ",
45 | "بۆیه",
46 | "بی",
47 | "بێ",
48 | "بێت",
49 | "بێجگە",
50 | "بە",
51 | "بەبێ",
52 | "بەدەست",
53 | "بەدەم",
54 | "بەر",
55 | "بەرامبەر",
56 | "بەردەم",
57 | "بەردەوام",
58 | "بەرلە",
59 | "بەرەو",
60 | "بەرەوی",
61 | "بەرەوە",
62 | "بەسەر",
63 | "بەشی",
64 | "بەشێکی",
65 | "بەلای",
66 | "بەم",
67 | "بەمەبەستی",
68 | "بەهۆی",
69 | "بەو",
70 | "بەپێی",
71 | "بەڵام",
72 | "بەڵکو",
73 | "تا",
74 | "تاوەکو",
75 | "تاکو",
76 | "تر",
77 | "تری",
78 | "تووشی",
79 | "تۆ",
80 | "تیادا",
81 | "تیایدا",
82 | "تێ",
83 | "تێدا",
84 | "تێیدا",
85 | "تەنها",
86 | "تەنیا",
87 | "تەواو",
88 | "تەواوی",
89 | "جار",
90 | "جگە",
91 | "جۆره",
92 | "جێگەی",
93 | "جێی",
94 | "خۆی",
95 | "خۆیان",
96 | "داهاتوو",
97 | "داهاتوودا",
98 | "داهاتووی",
99 | "داوای",
100 | "داوه",
101 | "در",
102 | "درێژەی",
103 | "دوا",
104 | "دواتر",
105 | "دوای",
106 | "دوێنێ",
107 | "دژی",
108 | "دی",
109 | "دیکه",
110 | "دیکەش",
111 | "دیکەی",
112 | "دێ",
113 | "دێت",
114 | "دە",
115 | "دەبن",
116 | "دەبێت",
117 | "دەبێته",
118 | "دەدات",
119 | "دەدرێت",
120 | "دەربارەی",
121 | "دەرەوەی",
122 | "دەکات",
123 | "دەکرێت",
124 | "دەکەن",
125 | "دەکەین",
126 | "دەگەڵ",
127 | "زۆر",
128 | "زۆربەی",
129 | "زۆری",
130 | "زیاتر",
131 | "ساڵ",
132 | "سبەی",
133 | "سەبارەت",
134 | "سەر",
135 | "سەرجەم",
136 | "سەرەکی",
137 | "شوێنی",
138 | "شێوەی",
139 | "شێوەیەکی",
140 | "لای",
141 | "لایەن",
142 | "لایەنه",
143 | "لایەنی",
144 | "لێ",
145 | "لە",
146 | "لەبابەت",
147 | "لەباتی",
148 | "لەبارەی",
149 | "لەبرێتی",
150 | "لەبەر",
151 | "لەبەینی",
152 | "لەدەم",
153 | "لەرێ",
154 | "لەرێگا",
155 | "لەسەر",
156 | "لەلایەن",
157 | "لەم",
158 | "لەناو",
159 | "لەنێو",
160 | "لەو",
161 | "لەپێناوی",
162 | "لەژێر",
163 | "لەگەڵ",
164 | "ماوەی",
165 | "ملیۆن",
166 | "من",
167 | "میانەی",
168 | "مەبەستی",
169 | "ناو",
170 | "ناوخۆی",
171 | "ناوی",
172 | "نییه",
173 | "نێو",
174 | "نێوان",
175 | "هات",
176 | "هاته",
177 | "هاتووە",
178 | "هاوکات",
179 | "هۆکاری",
180 | "هۆڵی",
181 | "هۆی",
182 | "هیچ",
183 | "هێڵی",
184 | "هەبێت",
185 | "هەر",
186 | "هەردوو",
187 | "هەردوولا",
188 | "هەروەها",
189 | "هەریەک",
190 | "هەفتەی",
191 | "هەمان",
192 | "هەموو",
193 | "هەندێک",
194 | "هەیە",
195 | "هەیەو",
196 | "و",
197 | "واته",
198 | "وایه",
199 | "وتی",
200 | "وەک",
201 | "وەکوو",
202 | "پاش",
203 | "پلەی",
204 | "پێ",
205 | "پێش",
206 | "پێشتر",
207 | "پێشووی",
208 | "پێویسته",
209 | "پێی",
210 | "چوونکه",
211 | "چەند",
212 | "چەندین",
213 | "ڕوو",
214 | "ڕووی",
215 | "ژمارەیەک",
216 | "ژمارەیەکی",
217 | "ژێر",
218 | "کاتێک",
219 | "کرا",
220 | "کران",
221 | "کرد",
222 | "کردبوو",
223 | "کردن",
224 | "کردنی",
225 | "کردنەوەی",
226 | "کردووه",
227 | "کردووەو",
228 | "کردەوه",
229 | "کە",
230 | "کەس",
231 | "کەم",
232 | "یا",
233 | "یان",
234 | "یێ",
235 | "یەک",
236 | "یەکێک",
237 | "یەکەم",
238 | "یەکەمی",
239 | "یەکەمین"
240 | ],
241 | "Latin": []
242 | },
243 | "Kurmanji": {
244 | "Latin": [
245 | "a",
246 | "an",
247 | "bareya",
248 | "bareyê",
249 | "barên",
250 | "basa",
251 | "be",
252 | "belê",
253 | "ber",
254 | "bereya",
255 | "berê",
256 | "berî",
257 | "bi",
258 | "bibe",
259 | "bila",
260 | "bin",
261 | "bo",
262 | "bê",
263 | "bû",
264 | "bûn",
265 | "bûye",
266 | "da",
267 | "dawî",
268 | "dawîyê",
269 | "daye",
270 | "de",
271 | "dema",
272 | "demekê",
273 | "demê",
274 | "derbarê",
275 | "derve",
276 | "dev",
277 | "di",
278 | "dibe",
279 | "digel",
280 | "dijî",
281 | "dikir",
282 | "din",
283 | "dinê",
284 | "divê",
285 | "diçe",
286 | "doh",
287 | "du",
288 | "dê",
289 | "dîsan",
290 | "e",
291 | "eger",
292 | "em",
293 | "encam",
294 | "ev",
295 | "evan",
296 | "eve",
297 | "evê",
298 | "evî",
299 | "ew",
300 | "ewa",
301 | "ewan",
302 | "ewê",
303 | "ewên",
304 | "ewî",
305 | "ez",
306 | "gelek",
307 | "gelekî",
308 | "gelê",
309 | "gerek",
310 | "giştî",
311 | "gor",
312 | "han",
313 | "heger",
314 | "hejmarek",
315 | "hem",
316 | "heman",
317 | "hember",
318 | "hemû",
319 | "hene",
320 | "her",
321 | "herdem",
322 | "herdu",
323 | "herweha",
324 | "herwiha",
325 | "herwisa",
326 | "herî",
327 | "heta",
328 | "hev",
329 | "hevdu",
330 | "heye",
331 | "hin",
332 | "hinek",
333 | "hîngê",
334 | "hûn",
335 | "in",
336 | "ji",
337 | "jiber",
338 | "jibo",
339 | "jê",
340 | "jêr",
341 | "jî",
342 | "ka",
343 | "ke",
344 | "kes",
345 | "kir",
346 | "kirîye",
347 | "ku",
348 | "kû",
349 | "layê",
350 | "le",
351 | "li",
352 | "ligel",
353 | "lê",
354 | "me",
355 | "min",
356 | "nav",
357 | "nava",
358 | "navbera",
359 | "navê",
360 | "navîn",
361 | "ne",
362 | "nêvbera",
363 | "nêzîkî",
364 | "nîne",
365 | "piştî",
366 | "pê",
367 | "pêk",
368 | "pêş",
369 | "re",
370 | "ser",
371 | "serê",
372 | "tenê",
373 | "ti",
374 | "tiştekî",
375 | "tu",
376 | "tê",
377 | "u",
378 | "van",
379 | "ve",
380 | "vir",
381 | "vê",
382 | "vî",
383 | "wan",
384 | "we",
385 | "weha",
386 | "wek",
387 | "weke",
388 | "wekî",
389 | "wiha",
390 | "wir",
391 | "wisa",
392 | "wê",
393 | "wî",
394 | "xwarê",
395 | "xwe",
396 | "ya",
397 | "yan",
398 | "ye",
399 | "yek",
400 | "yekê",
401 | "yê",
402 | "yên",
403 | "zêde",
404 | "zêdetir",
405 | "çawa",
406 | "çend",
407 | "çendê",
408 | "çendîn",
409 | "çi",
410 | "ê",
411 | "êdî",
412 | "ên",
413 | "îro",
414 | "û"
415 | ],
416 | "Arabic": []
417 | }
418 | }
--------------------------------------------------------------------------------
/data/myanmar-regions.tsv:
--------------------------------------------------------------------------------
1 | အင်္ဂလိပ်အမည် မြန်မာအမည် မြို့တော် ISO နေရာဒေသ လူဦးရေ _၂၀၁၄ ဧရိယာ အမျိုးအစား
2 | Ayeyarwady ဧရာဝတီ ပုသိမ်မြို့ MM-07 အောက်ပိုင်း ၆,၁၈၄,၈၂၉ ၃၅,၀၃၁.၈ တိုင်းဒေသကြီး
3 | Bago ပဲခူး ပဲခူးမြို့ MM-02 အောက်ပိုင်း ၄,၈၆၇,၃၇၃ ၃၉,၄၀၂.၃ တိုင်းဒေသကြီး
4 | Chin ချင်း ဟားခါးမြို့ MM-14 အနောက်ပိုင်း ၄၇၈,၈၀၁ ၃၆,၀၁၈.၈ ပြည်နယ်
5 | Kachin ကချင် မြစ်ကြီးနားမြို့ MM-11 မြောက်ပိုင်း ၁,၆၈၉,၄၄၁ ၈၉,၀၄၁.၈ ပြည်နယ်
6 | Kayah ကယားကယား လွိုင်ကော်မြို့ MM-12 အရှေ့ပိုင်း ၂၈၆,၆၂၇ ၁၁,၇၃၁.၅ ပြည်နယ်
7 | Kayin ကရင် ဘားအံမြို့ MM-13 တောင်ပိုင်း ၁,၅၇၄,၀၇၉ ၃၀,၃၈၃ ပြည်နယ်
8 | Magway မကွေး မကွေးမြို့ MM-03 အလယ်ပိုင်း ၃,၉၁၇,၀၅၅ ၄၄,၈၂၀.၆ တိုင်းဒေသကြီး
9 | Mandalay မန္တလေး မန္တလေးမြို့ MM-04 အလယ်ပိုင်း ၆,၁၆၅,၇၂၃ ၃၇,၉၄၅.၆ တိုင်းဒေသကြီး
10 | Mon မွန် မော်လမြိုင်မြို့ MM-15 တောင်ပိုင်း ၂,၀၅၄,၃၉၃ ၁၂,၂၉၆.၆ ပြည်နယ်
11 | Rakhine ရခိုင် စစ်တွေမြို့ MM-16 အနောက်ပိုင်း ၃,၁၈၈,၈၀၇ ၃၆,၇၇၈.၀ ပြည်နယ်
12 | Shan ရှမ်း တောင်ကြီးမြို့၂ MM-17 အရှေ့ပိုင်း ၅,၈၂၄,၄၃၂ ၁၅၅,၈၀၁.၃ ပြည်နယ်
13 | Sagaing စစ်ကိုင်း မုံရွာမြို့ MM-01 မြောက်ပိုင်း ၅,၃၂၅,၃၄၇ ၉၃,၇၀၄.၈ တိုင်းဒေသကြီး
14 | Tanintharyi တနင်္သာရီ ထားဝယ်မြို့ MM-05 တောင်ပိုင်း ၁,၄၀၈,၄၀၁ ၄၄,၃၄၄.၉ တိုင်းဒေသကြီး
15 | Yangon ရန်ကုန် ရန်ကုန်မြို့ MM-06 အောက်ပိုင်း ၇,၃၆၀,၇၀၃ ၁၀,၂၆၇.၇ တိုင်းဒေသကြီး
16 | Naypyidaw နေပြည်တော် နေပြည်တော် MM-18 အလယ်ပိုင်း ၁,၁၆၀,၂၄၂ ၇,၀၅၄ ပြည်ထောင်စုနယ်မြေ
--------------------------------------------------------------------------------
/data/myanmar_ethnic_groups.tsv:
--------------------------------------------------------------------------------
1 | မြန်မာတိုင်းရင်းသားများ အကြမ်းဖျင်းခန့်မှန်း အကြမ်းဖျင်းခန့်မှန်း
2 | ကချင် 1.50 ၁.၅၀
3 | ကယား 0.75 ၀.၇၅
4 | ကရင် 7.00 ၇.၀၀
5 | တရုတ် 2.50 ၂.၅၀
6 | ဗမာ 68.00 ၆၈.၀၀
7 | မွန် 2.00 ၂.၀၀
8 | ရခိုင် 1.7 ၁.၇၀
9 | ရိုဟင်ဂျာ 1.8 ၁.၈၀
10 | ရှမ်း 9.00 ၉.၀၀
11 | အခြားအုပ်စုများ 4.50 ၄.၅၀
12 | အိန္ဒိယ 1.25 ၁.၂၅
--------------------------------------------------------------------------------
/data/rbbi/Default.rbbi:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # This file is from ICU (with some small modifications, to avoid CJK dictionary break,
18 | # and status code change related to that)
19 | #
20 | # Copyright (C) 2016 and later: Unicode, Inc. and others.
21 | # License & terms of use: http://www.unicode.org/copyright.html
22 | # Copyright (C) 2002-2016, International Business Machines Corporation
23 | # and others. All Rights Reserved.
24 | #
25 | # file: word.txt
26 | #
27 | # ICU Word Break Rules
28 | # See Unicode Standard Annex #29.
29 | # These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
30 | # with additions for Emoji Sequences from https://goo.gl/cluFCn
31 | # Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
32 | #
33 | # Note: Updates to word.txt will usually need to be merged into
34 | # word_POSIX.txt also.
35 |
36 | ##############################################################################
37 | #
38 | # Character class definitions from TR 29
39 | #
40 | ##############################################################################
41 |
42 | !!chain;
43 | !!quoted_literals_only;
44 |
45 |
46 | #
47 | # Character Class Definitions.
48 | #
49 |
50 | $CR = [\p{Word_Break = CR}];
51 | $LF = [\p{Word_Break = LF}];
52 | $Newline = [\p{Word_Break = Newline} ];
53 | $Extend = [\p{Word_Break = Extend}];
54 | $ZWJ = [\p{Word_Break = ZWJ}];
55 | $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
56 | $Format = [\p{Word_Break = Format}];
57 | $Katakana = [\p{Word_Break = Katakana}];
58 | $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
59 | $ALetter = [\p{Word_Break = ALetter}];
60 | $Single_Quote = [\p{Word_Break = Single_Quote}];
61 | $Double_Quote = [\p{Word_Break = Double_Quote}];
62 | $MidNumLet = [\p{Word_Break = MidNumLet}];
63 | $MidLetter = [\p{Word_Break = MidLetter}];
64 | $MidNum = [\p{Word_Break = MidNum}];
65 | $Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
66 |
67 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
68 | $WSegSpace = [\p{Word_Break = WSegSpace}];
69 | $Extended_Pict = [:ExtPict:];
70 |
71 | $Han = [:Han:];
72 | $Hiragana = [:Hiragana:];
73 |
74 |
75 | # Dictionary character set, for triggering language-based break engines. Currently
76 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode
77 | # 5.0 or later as the definition of Complex_Context was corrected to include all
78 | # characters requiring dictionary break.
79 |
80 | $Control = [\p{Grapheme_Cluster_Break = Control}];
81 | $HangulSyllable = [\uac00-\ud7a3];
82 | $ComplexContext = [:LineBreak = Complex_Context:];
83 | $KanaKanji = [$Han $Hiragana $Katakana];
84 | $dictionaryCJK = [$Han $Hiragana $HangulSyllable];
85 | $dictionary = [$ComplexContext];
86 |
87 | # leave CJK scripts out of ALetterPlus
88 | $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
89 |
90 |
91 | #
92 | # Rules 4 Ignore Format and Extend characters,
93 | # except when they appear at the beginning of a region of text.
94 | #
95 | # TODO: check if handling of katakana in dictionary makes rules incorrect/void
96 | $KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
97 | $Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
98 | $ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
99 | $Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
100 | $Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
101 | $MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
102 | $MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
103 | $MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
104 | $NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
105 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
106 | $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
107 |
108 | $Ideographic = [\p{Ideographic}];
109 | $HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
110 | $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
111 |
112 | ## -------------------------------------------------
113 |
114 | # Rule 3 - CR x LF
115 | #
116 | $CR $LF;
117 |
118 | # Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
119 | #
120 | $ZWJ $Extended_Pict;
121 |
122 | # Rule 3d - Keep horizontal whitespace together.
123 | #
124 | $WSegSpace $WSegSpace;
125 |
126 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
127 | # of a region of Text. The rule here comes into play when the start of text
128 | # begins with a group of Format chars, or with a "word" consisting of a single
129 | # char that is not in any of the listed word break categories followed by
130 | # format char(s), or is not a CJK dictionary character.
131 | [^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
132 |
133 | $NumericEx {100};
134 | $ALetterEx {200};
135 | $HangulSyllable {200};
136 | $Hebrew_LetterEx{200};
137 | $KatakanaEx {300}; # note: these status values override those from rule 5
138 | $HiraganaEx {300}; # by virtue of being numerically larger.
139 | $IdeographicEx {400}; #
140 |
141 | $Extended_Pict ($Extend | $Format | $ZWJ)*;
142 |
143 | #
144 | # rule 5
145 | # Do not break between most letters.
146 | #
147 | ($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
148 |
149 | # rule 6 and 7
150 | ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
151 |
152 | # rule 7a
153 | $Hebrew_LetterEx $Single_QuoteEx {200};
154 |
155 | # rule 7b and 7c
156 | $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
157 |
158 | # rule 8
159 |
160 | $NumericEx $NumericEx {100};
161 |
162 | # rule 9
163 |
164 | ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
165 |
166 | # rule 10
167 |
168 | $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
169 |
170 | # rule 11 and 12
171 |
172 | $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
173 |
174 | # rule 13
175 | $KatakanaEx $KatakanaEx {300};
176 |
177 | # rule 13a/b
178 |
179 | $ALetterEx $ExtendNumLetEx {200}; # (13a)
180 | $Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
181 | $NumericEx $ExtendNumLetEx {100}; # (13a)
182 | $KatakanaEx $ExtendNumLetEx {300}; # (13a)
183 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
184 |
185 | $ExtendNumLetEx $ALetterEx {200}; # (13b)
186 | $ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
187 | $ExtendNumLetEx $NumericEx {100}; # (13b)
188 | $ExtendNumLetEx $KatakanaEx {300}; # (13b)
189 |
190 | # rules 15 - 17
191 | # Pairs of Regional Indicators stay together.
192 | # With rule chaining disabled by ^, this rule will match exactly two of them.
193 | # No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
194 | #
195 | ^$Regional_IndicatorEx $Regional_IndicatorEx;
196 |
197 | # special handling for CJK characters: chain for later dictionary segmentation
198 | $HangulSyllable $HangulSyllable {200};
199 |
200 | # Rule 999
201 | # Match a single code point if no other rule applies.
202 | .;
203 |
--------------------------------------------------------------------------------
/data/rbbi/Lao.rbbi:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Parses Lao text, with syllable as token.
18 | #
19 | # The definition of a Lao syllable is based on:
20 | #
21 | # Syllabification of Lao Script for Line Breaking
22 | # Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
23 | # Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
24 | # http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
25 | # http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
26 | #
27 | # NOTE:
28 | # There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
29 | # For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
30 | #
31 | # Syllable structure, where X is the nuclear consonant:
32 | #
33 | # +----+
34 | # | X5 |
35 | # +----+
36 | # | X4 |
37 | # +----+----+----+----+----+----+----+-----+
38 | # | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
39 | # +----+----+----+----+----+----+----+-----+
40 | # | X2 |
41 | # +----+
42 | # | X3 |
43 | # +----+
44 | #
45 | # X0 represents a vowel which occurs before the nuclear consonant.
46 | # It can always define the beginning of syllable.
47 | $X0 = [\u0EC0-\u0EC4];
48 | # X1 is a combination consonant which comes before the nuclear consonant,
49 | # but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
50 | $X1 = [\u0EAB];
51 | # X represents the nuclear consonant.
52 | $X = [\u0E81-\u0EAE\u0EDC\u0EDD];
53 | # X2 is a combination consonant which comes after the nuclear consonant,
54 | # which is placed under or next to the nuclear consonant.
55 | $X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
56 | # X3 represents a vowel which occurs under the nuclear consonant.
57 | $X3 = [\u0EB8\u0EB9];
58 | # X4 represents a vowel which occurs above the nuclear consonant.
59 | $X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
60 | # X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
61 | $X5 = [\u0EC8-\u0ECB];
62 | # X6 represents a consonant vowel, which occurs after the nuclear consonant.
63 | # It functions when the syllable doesn’t have any vowels. And it always exists with X8.
64 | $X6 = [\u0EA7\u0EAD\u0EBD];
65 | # X7 represents a final vowel.
66 | # However X7_1 always represents the end of syllable and it never exists with tone mark.
67 | $X7 = [\u0EB0\u0EB2\u0EB3];
68 | # X8 represents an alternate consonant.
69 | $X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
70 | # X9 represents alternate consonants used to pronounce foreign terms; it always exists with X10_3.
71 | $X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
72 | # X10 represents a sign mark.
73 | # It always occurs at the end of a syllable, but people usually keep it separate from the syllable.
74 | $X10 = [\u0EAF\u0EC6\u0ECC];
75 |
76 | # Section 1
77 | $X0_1 = [\u0EC0];
78 | $X4_1_2 = [\u0EB4\u0EB5];
79 | $X4_3_4 = [\u0EB6\u0EB7];
80 | $X4_6 = [\u0EBB];
81 | $X4_7 = [\u0EB1];
82 | $X6_2 = [\u0EAD];
83 | $X6_3 = [\u0EBD];
84 | $X7_1 = [\u0EB0];
85 | $X7_2 = [\u0EB2];
86 | $X10_1 = [\u0EAF];
87 | $X10_2 = [\u0EC6];
88 | $X10_3 = [\u0ECC];
89 |
90 | $Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
91 | $Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
92 | $Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
93 | $Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
94 | $Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
95 | $Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
96 | $Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
97 |
98 | $Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
99 |
100 | # Section 2
101 | $X0_2 = [\u0EC1];
102 |
103 | $Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
104 | $Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
105 | $Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
106 |
107 | $Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
108 |
109 | # Section 3
110 | $X0_3 = [\u0EC2];
111 | $X8_3 = [\u0E8D];
112 | $X8_8 = [\u0EA7];
113 |
114 | $Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
115 | $Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
116 | $Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
117 |
118 | $Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
119 |
120 | # Section 4
121 | $X0_4 = [\u0EC4];
122 | $X6_1 = [\u0EA7];
123 |
124 | $Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
125 |
126 | # Section 5
127 | $X0_5 = [\u0EC3];
128 |
129 | $Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
130 |
131 | # Section 6
132 | $Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
133 |
134 | # Section 7
135 | $X4_1_4 = [\u0EB4-\u0EB7];
136 |
137 | $Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
138 |
139 | # Section 8
140 | $X4_5 = [\u0ECD];
141 |
142 | $Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
143 |
144 | # Section 9
145 |
146 | $Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
147 | $Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
148 |
149 | $Rule9 = ($Rule9_1 | $Rule9_2);
150 |
151 | # Section 10
152 | $Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
153 |
154 | # Section 11
155 | $Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
156 |
157 | # Section 12
158 | $Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
159 |
160 | # Section 13
161 | $Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
162 |
163 | # Section 14
164 | $X7_3 = [\u0EB3];
165 |
166 | $Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
167 |
168 | $LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
169 |
170 | $WordJoin = [:Line_Break=Word_Joiner:];
171 |
172 | $LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
173 |
174 | #
175 | # default numerical definitions
176 | #
177 | $Extend = [\p{Word_Break = Extend}];
178 | $Format = [\p{Word_Break = Format}];
179 | $MidNumLet = [\p{Word_Break = MidNumLet}];
180 | $MidNum = [\p{Word_Break = MidNum}];
181 | $Numeric = [\p{Word_Break = Numeric}];
182 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
183 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
184 | $MidNumEx = $MidNum ($Extend | $Format)*;
185 | $NumericEx = $Numeric ($Extend | $Format)*;
186 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
187 |
188 | !!forward;
189 |
190 | $LaoJoinedSyllableEx {200};
191 | # default numeric rules
192 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
193 |
--------------------------------------------------------------------------------
/data/rbbi/lucene/source.md:
--------------------------------------------------------------------------------
1 | * https://gitbox.apache.org/repos/asf?p=lucene.git;a=tree;f=lucene/analysis/icu/src/data;h=e7275ffa9541dab51e4b9a62166aeef457c5c22f;hb=refs/heads/main
--------------------------------------------------------------------------------
/data/rbbi/lucene/uax29/Default.rbbi:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # This file is from ICU (with some small modifications, to avoid CJK dictionary break,
18 | # and status code change related to that)
19 | #
20 | # Copyright (C) 2016 and later: Unicode, Inc. and others.
21 | # License & terms of use: http://www.unicode.org/copyright.html
22 | # Copyright (C) 2002-2016, International Business Machines Corporation
23 | # and others. All Rights Reserved.
24 | #
25 | # file: word.txt
26 | #
27 | # ICU Word Break Rules
28 | # See Unicode Standard Annex #29.
29 | # These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
30 | # with additions for Emoji Sequences from https://goo.gl/cluFCn
31 | # Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
32 | #
33 | # Note: Updates to word.txt will usually need to be merged into
34 | # word_POSIX.txt also.
35 |
36 | ##############################################################################
37 | #
38 | # Character class definitions from TR 29
39 | #
40 | ##############################################################################
41 |
42 | !!chain;
43 | !!quoted_literals_only;
44 |
45 |
46 | #
47 | # Character Class Definitions.
48 | #
49 |
50 | $CR = [\p{Word_Break = CR}];
51 | $LF = [\p{Word_Break = LF}];
52 | $Newline = [\p{Word_Break = Newline} ];
53 | $Extend = [\p{Word_Break = Extend}];
54 | $ZWJ = [\p{Word_Break = ZWJ}];
55 | $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
56 | $Format = [\p{Word_Break = Format}];
57 | $Katakana = [\p{Word_Break = Katakana}];
58 | $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
59 | $ALetter = [\p{Word_Break = ALetter}];
60 | $Single_Quote = [\p{Word_Break = Single_Quote}];
61 | $Double_Quote = [\p{Word_Break = Double_Quote}];
62 | $MidNumLet = [\p{Word_Break = MidNumLet}];
63 | $MidLetter = [\p{Word_Break = MidLetter}];
64 | $MidNum = [\p{Word_Break = MidNum}];
65 | $Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
66 |
67 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
68 | $WSegSpace = [\p{Word_Break = WSegSpace}];
69 | $Extended_Pict = [:ExtPict:];
70 |
71 | $Han = [:Han:];
72 | $Hiragana = [:Hiragana:];
73 |
74 |
75 | # Dictionary character set, for triggering language-based break engines. Currently
76 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode
77 | # 5.0 or later as the definition of Complex_Context was corrected to include all
78 | # characters requiring dictionary break.
79 |
80 | $Control = [\p{Grapheme_Cluster_Break = Control}];
81 | $HangulSyllable = [\uac00-\ud7a3];
82 | $ComplexContext = [:LineBreak = Complex_Context:];
83 | $KanaKanji = [$Han $Hiragana $Katakana];
84 | $dictionaryCJK = [$Han $Hiragana $HangulSyllable];
85 | $dictionary = [$ComplexContext];
86 |
87 | # leave CJK scripts out of ALetterPlus
88 | $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
89 |
90 |
91 | #
92 | # Rules 4 Ignore Format and Extend characters,
93 | # except when they appear at the beginning of a region of text.
94 | #
95 | # TODO: check if handling of katakana in dictionary makes rules incorrect/void
96 | $KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
97 | $Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
98 | $ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
99 | $Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
100 | $Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
101 | $MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
102 | $MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
103 | $MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
104 | $NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
105 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
106 | $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
107 |
108 | $Ideographic = [\p{Ideographic}];
109 | $HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
110 | $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
111 |
112 | ## -------------------------------------------------
113 |
114 | # Rule 3 - CR x LF
115 | #
116 | $CR $LF;
117 |
118 | # Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
119 | #
120 | $ZWJ $Extended_Pict;
121 |
122 | # Rule 3d - Keep horizontal whitespace together.
123 | #
124 | $WSegSpace $WSegSpace;
125 |
126 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
127 | # of a region of Text. The rule here comes into play when the start of text
128 | # begins with a group of Format chars, or with a "word" consisting of a single
129 | # char that is not in any of the listed word break categories followed by
130 | # format char(s), or is not a CJK dictionary character.
131 | [^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
132 |
133 | $NumericEx {100};
134 | $ALetterEx {200};
135 | $HangulSyllable {200};
136 | $Hebrew_LetterEx{200};
137 | $KatakanaEx {300}; # note: these status values override those from rule 5
138 | $HiraganaEx {300}; # by virtue of being numerically larger.
139 | $IdeographicEx {400}; #
140 |
141 | $Extended_Pict ($Extend | $Format | $ZWJ)*;
142 |
143 | #
144 | # rule 5
145 | # Do not break between most letters.
146 | #
147 | ($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
148 |
149 | # rule 6 and 7
150 | ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
151 |
152 | # rule 7a
153 | $Hebrew_LetterEx $Single_QuoteEx {200};
154 |
155 | # rule 7b and 7c
156 | $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
157 |
158 | # rule 8
159 |
160 | $NumericEx $NumericEx {100};
161 |
162 | # rule 9
163 |
164 | ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
165 |
166 | # rule 10
167 |
168 | $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
169 |
170 | # rule 11 and 12
171 |
172 | $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
173 |
174 | # rule 13
175 | $KatakanaEx $KatakanaEx {300};
176 |
177 | # rule 13a/b
178 |
179 | $ALetterEx $ExtendNumLetEx {200}; # (13a)
180 | $Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
181 | $NumericEx $ExtendNumLetEx {100}; # (13a)
182 | $KatakanaEx $ExtendNumLetEx {300}; # (13a)
183 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
184 |
185 | $ExtendNumLetEx $ALetterEx {200}; # (13b)
186 | $ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
187 | $ExtendNumLetEx $NumericEx {100}; # (13b)
188 | $ExtendNumLetEx $KatakanaEx {300}; # (13b)
189 |
190 | # rules 15 - 17
191 | # Pairs of Regional Indicators stay together.
192 | # With rule chaining disabled by ^, this rule will match exactly two of them.
193 | # No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
194 | #
195 | ^$Regional_IndicatorEx $Regional_IndicatorEx;
196 |
197 | # special handling for CJK characters: chain for later dictionary segmentation
198 | $HangulSyllable $HangulSyllable {200};
199 |
200 | # Rule 999
201 | # Match a single code point if no other rule applies.
202 | .;
203 |
--------------------------------------------------------------------------------
/data/rbbi/lucene/uax29/MyanmarSyllable.rbbi:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | #
18 | # Parses Myanmar text, with syllable as token.
19 | #
20 |
21 | $Cons = [[:Other_Letter:]&[:Myanmar:]];
22 | $Virama = [\u1039];
23 | $Asat = [\u103A];
24 |
25 | $WordJoin = [:Line_Break=Word_Joiner:];
26 |
27 | #
28 | # default numerical definitions
29 | #
30 | $Extend = [\p{Word_Break = Extend}];
31 | $Format = [\p{Word_Break = Format}];
32 | $MidNumLet = [\p{Word_Break = MidNumLet}];
33 | $MidNum = [\p{Word_Break = MidNum}];
34 | $Numeric = [\p{Word_Break = Numeric}];
35 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
36 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
37 | $MidNumEx = $MidNum ($Extend | $Format)*;
38 | $NumericEx = $Numeric ($Extend | $Format)*;
39 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
40 |
41 | $ConsEx = $Cons ($Extend | $Format)*;
42 | $AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
43 | $MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
44 | $MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
45 |
46 | !!forward;
47 | $MyanmarJoinedSyllableEx {200};
48 |
49 | # default numeric rules
50 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
51 |
--------------------------------------------------------------------------------
/data/rbbi/solrcene/Hebrew.rbbi:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | #
18 | # This is an example of rule tailoring for Hebrew.
19 | # In this example the single-quote is added to the Extend category
20 | # The double-quote is added to the MidLetter category.
21 | #
22 | !!chain;
23 | $CR = [\p{Word_Break = CR}];
24 | $LF = [\p{Word_Break = LF}];
25 | $Newline = [\p{Word_Break = Newline}];
26 | $Extend = [\p{Word_Break = Extend}\u0027];
27 | $Format = [\p{Word_Break = Format}];
28 | $ALetter = [\p{Word_Break = ALetter}];
29 | $MidNumLet = [\p{Word_Break = MidNumLet}];
30 | $MidLetter = [\p{Word_Break = MidLetter}\u0022];
31 | $MidNum = [\p{Word_Break = MidNum}];
32 | $Numeric = [\p{Word_Break = Numeric}];
33 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
34 | $dictionary = [:LineBreak = Complex_Context:];
35 | $Control = [\p{Grapheme_Cluster_Break = Control}];
36 | $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
37 |
38 | $ALetterEx = $ALetterPlus ($Extend | $Format)*;
39 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
40 | $MidLetterEx = $MidLetter ($Extend | $Format)*;
41 | $MidNumEx = $MidNum ($Extend | $Format)*;
42 | $NumericEx = $Numeric ($Extend | $Format)*;
43 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
44 |
45 | !!forward;
46 |
47 | $CR $LF;
48 | [^$CR $LF $Newline]? ($Extend | $Format)+;
49 | $NumericEx {100};
50 | $ALetterEx {200};
51 | $ALetterEx $ALetterEx {200};
52 | $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
53 | $NumericEx $NumericEx {100};
54 | $ALetterEx $NumericEx {200};
55 | $NumericEx $ALetterEx {200};
56 | $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
57 | $ALetterEx $ExtendNumLetEx {200};
58 | $NumericEx $ExtendNumLetEx {100};
59 | $ExtendNumLetEx $ExtendNumLetEx {200};
60 | $ExtendNumLetEx $ALetterEx {200};
61 | $ExtendNumLetEx $NumericEx {100};
62 |
--------------------------------------------------------------------------------
/data/rbbi/solrcene/Khmer.rbbi:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | #
18 | # Parses Khmer text, with orthographic syllable as token.
19 | #
20 | # The definition of Khmer orthographic syllable is taken from the Unicode Standard.
21 | #
22 | # B = base character (consonant, independent vowel, etc)
23 | $KhmerBase = [\u1780-\u17B3];
24 | # R = robat
25 | $KhmerRobat = [\u17CC];
26 | # C = consonant shifter
27 | $KhmerShifter = [\u17C9\u17CA];
28 | # S = subscript consonant or independent vowel sign
29 | $KhmerSub = ([\u17D2] $KhmerBase);
30 | # V = dependent vowel sign
31 | $KhmerVowel = [\u17B4-\u17C5];
32 | # Z = zero-width joiner or non-joiner
33 | $KhmerZWC = [\u200C\u200D];
34 | # O = any other sign
35 | $KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
36 |
37 | $WordJoin = [:Line_Break=Word_Joiner:];
38 |
39 | $KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
40 |
41 | $KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
42 |
43 | #
44 | # default numerical definitions
45 | #
46 | $Extend = [\p{Word_Break = Extend}];
47 | $Format = [\p{Word_Break = Format}];
48 | $MidNumLet = [\p{Word_Break = MidNumLet}];
49 | $MidNum = [\p{Word_Break = MidNum}];
50 | $Numeric = [\p{Word_Break = Numeric}];
51 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
52 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
53 | $MidNumEx = $MidNum ($Extend | $Format)*;
54 | $NumericEx = $Numeric ($Extend | $Format)*;
55 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
56 |
57 | !!forward;
58 | $KhmerJoinedSyllableEx {200};
59 |
60 | # default numeric rules
61 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
62 |
--------------------------------------------------------------------------------
/data/rbbi/solrcene/Lao.rbbi:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Parses Lao text, with syllable as token.
18 | #
19 | # The definition of the Lao syllable is based on:
20 | #
21 | # Syllabification of Lao Script for Line Breaking
22 | # Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
23 | # Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
24 | # http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
25 | # http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
26 | #
27 | # NOTE:
28 | # There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
29 | # For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
30 | #
31 | # Syllable structure, where X is the nuclear consonant:
32 | #
33 | # +----+
34 | # | X5 |
35 | # +----+
36 | # | X4 |
37 | # +----+----+----+----+----+----+----+-----+
38 | # | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
39 | # +----+----+----+----+----+----+----+-----+
40 | # | X2 |
41 | # +----+
42 | # | X3 |
43 | # +----+
44 | #
45 | # X0 represents a vowel which occurs before the nuclear consonant.
46 | # It always marks the beginning of a syllable.
47 | $X0 = [\u0EC0-\u0EC4];
48 | # X1 is a combination consonant which comes before the nuclear consonant,
49 | # but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
50 | $X1 = [\u0EAB];
51 | # X represents the nuclear consonant.
52 | $X = [\u0E81-\u0EAE\u0EDC\u0EDD];
53 | # X2 is a combination consonant which comes after the nuclear consonant,
54 | # which is placed under or next to the nuclear consonant.
55 | $X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
56 | # X3 represents a vowel which occurs under the nuclear consonant.
57 | $X3 = [\u0EB8\u0EB9];
58 | # X4 represents a vowel which occurs above the nuclear consonant.
59 | $X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
60 | # X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
61 | $X5 = [\u0EC8-\u0ECB];
62 | # X6 represents a consonant vowel, which occurs after the nuclear consonant.
63 | # It functions when the syllable doesn’t have any vowels, and it always exists with X8.
64 | $X6 = [\u0EA7\u0EAD\u0EBD];
65 | # X7 represents a final vowel.
66 | # However X7_1 always represents the end of syllable and it never exists with tone mark.
67 | $X7 = [\u0EB0\u0EB2\u0EB3];
68 | # X8 represents an alternate consonant.
69 | $X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
70 | # X9 represents alternate consonants used to pronounce foreign terms; it always exists with X10_3.
71 | $X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
72 | # X10 represents a sign mark.
73 | # It always occurs at the end of a syllable, but people mostly keep it separate from the syllable.
74 | $X10 = [\u0EAF\u0EC6\u0ECC];
75 |
76 | # Section 1
77 | $X0_1 = [\u0EC0];
78 | $X4_1_2 = [\u0EB4\u0EB5];
79 | $X4_3_4 = [\u0EB6\u0EB7];
80 | $X4_6 = [\u0EBB];
81 | $X4_7 = [\u0EB1];
82 | $X6_2 = [\u0EAD];
83 | $X6_3 = [\u0EBD];
84 | $X7_1 = [\u0EB0];
85 | $X7_2 = [\u0EB2];
86 | $X10_1 = [\u0EAF];
87 | $X10_2 = [\u0EC6];
88 | $X10_3 = [\u0ECC];
89 |
90 | $Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
91 | $Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
92 | $Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
93 | $Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
94 | $Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
95 | $Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
96 | $Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
97 |
98 | $Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
99 |
100 | # Section 2
101 | $X0_2 = [\u0EC1];
102 |
103 | $Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
104 | $Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
105 | $Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
106 |
107 | $Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
108 |
109 | # Section 3
110 | $X0_3 = [\u0EC2];
111 | $X8_3 = [\u0E8D];
112 | $X8_8 = [\u0EA7];
113 |
114 | $Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
115 | $Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
116 | $Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
117 |
118 | $Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
119 |
120 | # Section 4
121 | $X0_4 = [\u0EC4];
122 | $X6_1 = [\u0EA7];
123 |
124 | $Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
125 |
126 | # Section 5
127 | $X0_5 = [\u0EC3];
128 |
129 | $Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
130 |
131 | # Section 6
132 | $Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
133 |
134 | # Section 7
135 | $X4_1_4 = [\u0EB4-\u0EB7];
136 |
137 | $Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
138 |
139 | # Section 8
140 | $X4_5 = [\u0ECD];
141 |
142 | $Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
143 |
144 | # Section 9
145 |
146 | $Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
147 | $Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
148 |
149 | $Rule9 = ($Rule9_1 | $Rule9_2);
150 |
151 | # Section 10
152 | $Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
153 |
154 | # Section 11
155 | $Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
156 |
157 | # Section 12
158 | $Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
159 |
160 | # Section 13
161 | $Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
162 |
163 | # Section 14
164 | $X7_3 = [\u0EB3];
165 |
166 | $Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
167 |
168 | $LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
169 |
170 | $WordJoin = [:Line_Break=Word_Joiner:];
171 |
172 | $LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
173 |
174 | #
175 | # default numerical definitions
176 | #
177 | $Extend = [\p{Word_Break = Extend}];
178 | $Format = [\p{Word_Break = Format}];
179 | $MidNumLet = [\p{Word_Break = MidNumLet}];
180 | $MidNum = [\p{Word_Break = MidNum}];
181 | $Numeric = [\p{Word_Break = Numeric}];
182 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
183 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
184 | $MidNumEx = $MidNum ($Extend | $Format)*;
185 | $NumericEx = $Numeric ($Extend | $Format)*;
186 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
187 |
188 | !!forward;
189 |
190 | $LaoJoinedSyllableEx {200};
191 | # default numeric rules
192 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
193 |
--------------------------------------------------------------------------------
/data/rbbi/solrcene/Myanmar.rbbi:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | #
18 | # Parses Myanmar text, with syllable as token.
19 | #
20 |
21 | $Cons = [[:Other_Letter:]&[:Myanmar:]];
22 | $Virama = [\u1039];
23 | $Asat = [\u103A];
24 |
25 | $WordJoin = [:Line_Break=Word_Joiner:];
26 |
27 | #
28 | # default numerical definitions
29 | #
30 | $Extend = [\p{Word_Break = Extend}];
31 | $Format = [\p{Word_Break = Format}];
32 | $MidNumLet = [\p{Word_Break = MidNumLet}];
33 | $MidNum = [\p{Word_Break = MidNum}];
34 | $Numeric = [\p{Word_Break = Numeric}];
35 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
36 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
37 | $MidNumEx = $MidNum ($Extend | $Format)*;
38 | $NumericEx = $Numeric ($Extend | $Format)*;
39 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
40 |
41 | $ConsEx = $Cons ($Extend | $Format)*;
42 | $AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
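# A syllable: a consonant, optionally followed by a virama-stacked consonant, then any number of asat-final sequences.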
43 | $MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
44 | $MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
45 |
46 | !!forward;
47 | $MyanmarJoinedSyllableEx {200};
48 |
49 | # default numeric rules
50 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
51 |
--------------------------------------------------------------------------------
/data/rbbi/solrcene/source.md:
--------------------------------------------------------------------------------
1 | * https://github.com/chrismattmann/solrcene/tree/master/modules/analysis/icu/src/data
--------------------------------------------------------------------------------
/data/rbbi/source.md:
--------------------------------------------------------------------------------
1 | # RBBI files
2 |
3 | * [icu::RuleBasedBreakIterator Class Reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedBreakIterator.html)
4 | * [Boundary Analysis](https://unicode-org.github.io/icu/userguide/boundaryanalysis/)
5 | * [Break Rules](https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html)
6 | * [Updating ICU's built-in Break Iterator rules](https://github.com/unicode-org/icu/blob/main/docs/processes/rules_update.md)
7 |
8 | ## Current Lucene
9 |
10 | * [gitbox.apache.org - lucene](https://gitbox.apache.org/repos/asf?p=lucene.git;a=tree;f=lucene/analysis/icu/src/data/uax29;h=8423b0c7713159c3dffb549f18a37c425eb96001;hb=HEAD)
11 |
12 | ## Old Lucene
13 |
14 | * [apache/lucene-solr](https://github.com/apache/lucene-solr/tree/releases/lucene-solr/4.0.0/lucene/analysis/icu/src/data/uax29)
15 |
16 |
17 | ## Misc
18 |
19 | * https://stackoverflow.com/questions/559949/the-word-break-rule-file
20 |
21 | ```txt
22 | RuleBasedBreakIterator (icu)
23 | RuleBasedCollator (icu)
24 | RuleBasedNumberFormat (icu)
25 | RuleBasedTimeZone (icu)
26 | ```
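27 |
28 | A minimal sketch of compiling one of these rule files at runtime with PyICU (the path is relative to this repository's root, and PyICU is assumed to be installed):

```python
from icu import RuleBasedBreakIterator

# Compile the break rules from source; ICU raises an error if they fail to parse.
with open("data/rbbi/solrcene/Myanmar.rbbi", encoding="utf-8") as f:
    bi = RuleBasedBreakIterator(f.read())

text = "မြန်မာစာ"
bi.setText(text)

# Iterating a break iterator yields successive boundary offsets;
# pairing them up recovers the tokens.
boundaries = [0] + list(bi)
print([text[start:end] for start, end in zip(boundaries, boundaries[1:])])
```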
27 |
28 |
--------------------------------------------------------------------------------
/data/régions_métropolitaines.tsv:
--------------------------------------------------------------------------------
1 | "Dénomination " "Chef-lieu de région " Superficie (km2) Population (2019) "Population estimée (2022) " "Densité (2019) (hab./km2) " Code Insee
2 | Occitanie Toulouse 72 724 5 933 185 6 053 548 81,6 76
3 | Grand Est Strasbourg 57 441 5 556 219 5 542 094 96,7 44
4 | Normandie Rouen 29 907 3 325 032 3 307 286 111,2 28
5 | Bretagne Rennes 27 208 3 354 854 3 402 932 123,3 53
6 | Île-de-France Paris 12 011 12 262 544 12 395 148 1020,9 11
7 | Centre-Val de Loire Orléans 39 151 2 573 180 2 564 915 65,7 24
8 | Pays de la Loire Nantes 32 082 3 806 461 3 873 096 118,6 52
9 | Provence-Alpes-Côte d'Azur Marseille 31 400 5 081 101 5 131 187 161,8 93
10 | Auvergne-Rhône-Alpes Lyon 69 711 8 042 936 8 153 233 115,4 84
11 | Hauts-de-France Lille 31 806 6 004 947 5 987 172 188,8 32
12 | Bourgogne-Franche-Comté Dijon 47 784 2 805 580 2 785 393 58,7 27
13 | Nouvelle-Aquitaine Bordeaux 84 036 6 010 289 6 081 985 71,5 75
14 | Corse Ajaccio 8 680 340 440 349 465 39,2 94
--------------------------------------------------------------------------------
/data/sorani_alphabet.tsv:
--------------------------------------------------------------------------------
1 | Order Character Codepoint
2 | 1 ئ U+0626
3 | 2 ا U+0627
4 | 3 ب U+0628
5 | 4 پ U+067E
6 | 5 ت U+062A
7 | 6 ج U+062C
8 | 7 چ U+0686
9 | 8 ح U+062D
10 | 9 خ U+062E
11 | 10 د U+062F
12 | 11 ر U+0631
13 | 12 ڕ U+0695
14 | 13 ز U+0632
15 | 14 ژ U+0698
16 | 15 س U+0633
17 | 16 ش U+0634
18 | 17 ع U+0639
19 | 18 غ U+063A
20 | 19 ف U+0641
21 | 20 ڤ U+06A4
22 | 21 ق U+0642
23 | 22 ک U+06A9
24 | 23 گ U+06AF
25 | 24 ل U+0644
26 | 25 ڵ U+06B5
27 | 26 م U+0645
28 | 27 ن U+0646
29 | 28 ه U+0647
30 | 29 ە U+06D5
31 | 30 و U+0648
32 | 31 وو U+0648 U+0648
33 | 32 ۆ U+06C6
34 | 33 ی U+06CC
35 | 34 ێ U+06CE
--------------------------------------------------------------------------------
/data/sorani_alphabet_wikipedia.tsv:
--------------------------------------------------------------------------------
1 | Order Character Codepoint
2 | 1 ئ U+0626
3 | 2 ا U+0627
4 | 3 ب U+0628
5 | 4 پ U+067E
6 | 5 ت U+062A
7 | 6 ج U+062C
8 | 7 چ U+0686
9 | 8 ح U+062D
10 | 9 خ U+062E
11 | 10 د U+062F
12 | 11 ر U+0631
13 | 12 ڕ U+0695
14 | 13 ز U+0632
15 | 14 ژ U+0698
16 | 15 س U+0633
17 | 16 ش U+0634
18 | 17 ع U+0639
19 | 18 غ U+063A
20 | 19 ف U+0641
21 | 20 ڤ U+06A4
22 | 21 ق U+0642
23 | 22 ک U+06A9
24 | 23 گ U+06AF
25 | 24 ل U+0644
26 | 25 ڵ U+06B5
27 | 26 م U+0645
28 | 27 ن U+0646
29 | 28 ه U+0647
30 | 29 ە U+06D5
31 | 30 و U+0648
32 | 32 ۆ U+06C6
33 | 31 وو U+0648 U+0648
34 | 33 ی U+06CC
35 | 34 ێ U+06CE
--------------------------------------------------------------------------------
/data/source.md:
--------------------------------------------------------------------------------
1 | # Sources
2 |
3 | ## klpt_stopwords
4 |
5 | The [stopword list](https://github.com/sinaahmadi/klpt/blob/master/klpt/data/stopwords.json) is from Sina Ahmadi's [Kurdish Language Processing Toolkit](https://github.com/sinaahmadi/klpt), which was released under an [Attribution-ShareAlike 4.0 International Public License](https://github.com/sinaahmadi/klpt/blob/master/LICENSE).
6 |
7 |
--------------------------------------------------------------------------------
/data/türkiye'ninz-illeri.tsv:
--------------------------------------------------------------------------------
1 | Ad Alan (km²) Nüfus (2019) NY kişi/km² Plaka kodu Telefon kodu Vali
2 | İstanbul 5.461 15.519.267 2.841,83 34 212, 216 Ali Yerlikaya
3 | Eskişehir 13.960 887.475 63,57 26 222 Erol Ayyıldız
4 | Bursa 10.813 3.056.120 282,63 16 224 Yakup Canbolat
5 | Yalova 798 270.976 339,56 77 226 Muammer Erol
6 | Bilecik 4.179 219.427 52,50 11 228 Bilal Şentürk
7 | İzmir 11.891 4.367.251 367,27 35 232 Yavuz Selim Köşger
8 | Manisa 13.339 1.440.611 107,99 45 236 Yaşar Karadeniz
9 | Antalya 20.177 2.511.700 124,48 07 242 Ersin Yazıcı
10 | Isparta 8.946 444.914 49,73 32 246 Ömer Seymenoğlu
11 | Burdur 7.175 270.796 37,74 15 248 Ali Arslantaş
12 | Muğla 12.654 983.142 77,69 48 252 Orhan Tavlı
13 | Aydın 8.116 1.110.972 136,88 09 256 Hüseyin Aksoy
14 | Denizli 12.134 1.037.208 85,47 20 258 Ali Fuat Atik
15 | Kocaeli 3.397 1.953.035 574,92 41 262 Seddar Yavuz
16 | Sakarya 4.824 1.029.650 213,44 54 264 Çetin Oktay Kaldırım
17 | Balıkesir 14.583 1.228.620 84,25 10 266 Hasan Şıldak
18 | Afyonkarahisar 14.016 729.483 52,04 03 272 Gökmen Çiçek
19 | Kütahya 11.634 579.257 49,79 43 274 Ali Çelik
20 | Uşak 5.555 370.509 66,69 64 276 Funda Kocabıyık
21 | Tekirdağ 6.190 1.055.412 170,50 59 282 Aziz Yıldırım
22 | Edirne 6.145 413.903 67,35 22 284 Ekrem Canalp
23 | Çanakkale 9.817 542.157 55,22 17 286 İlhami Aktaş
24 | Kırklareli 6.459 361.836 56,02 39 288 Osman Bilgin
25 | Ankara 25.632 5.639.076 220 06 312 Vasip Şahin
26 | Kırıkkale 4.791 283.017 59,07 71 318 Yunus Sezer
27 | Adana 13.844 2.237.940 161,65 01 322 Süleyman Elban
28 | Mersin 16.010 1.840.425 114,95 33 324 Ali İhsan Su
29 | Hatay 5.524 1.628.894 294,87 31 326 Rahmi Doğan
30 | Osmaniye 3.320 538.759 162,27 80 328 Erdinç Yılmaz
31 | Konya 40.838 2.232.374 54,66 42 332 Vahdettin Özkan
32 | Karaman 8.678 253.279 29,18 70 338 Mehmet Alpaslan Işık
33 | Gaziantep 6.803 2.069.364 304,18 27 342 Davut Gül
34 | Kahramanmaraş 14.520 1.154.102 79,48 46 344 Ömer Faruk Coşkun
35 | Sivas 28.164 638.956 22,68 58 346 Salih Ayhan
36 | Kilis 1.412 142.490 100,91 79 348 Recep Soytürk
37 | Kayseri 16.970 1.407.409 82,93 38 352 Şehmus Günaydın
38 | Yozgat 13.690 421.200 30,76 66 354 Ziya Polat
39 | Tokat 10.042 612.747 61,01 60 356 Ozan Balcı
40 | Amasya 5.628 337.800 60,02 05 358 Mustafa Masatlı
41 | Samsun 9.725 1.348.542 138,66 55 362 Zülkif Dağlı
42 | Çorum 12.428 530.864 42,71 19 364 Mustafa Çiftçi
43 | Kastamonu 13.064 379.405 29,04 37 366 Avni Çakır
44 | Sinop 5.717 218.243 38,17 57 368 Erol Karaömeroğlu
45 | Karabük 4.142 248.458 59,98 78 370 Fuat Gürel
46 | Zonguldak 3.342 596.053 178,35 67 372 Mustafa Tutulmaz
47 | Bolu 8.313 316.126 38,02 14 374 Ahmet Ümit
48 | Çankırı 7.542 195.789 25,95 18 376 Abdullah Ayaz
49 | Bartın 2.330 198.249 85,08 74 378 Sinan Güner
50 | Düzce 2.492 392.166 157,36 81 380 Cevdet Atay
51 | Aksaray 7.659 416.367 54,36 68 382 Hamza Aydoğdu
52 | Nevşehir 5.485 303.010 55,24 50 384 İnci Sezer Becel
53 | Kırşehir 6.584 242.938 36,89 40 386 İbrahim Akın
54 | Niğde 7.234 362.861 48,59 51 388 Yılmaz Şimşek
55 | Diyarbakır 15.168 1.756.353 115,79 21 412 Münir Karaloğlu
56 | Şanlıurfa 19.242 2.073.614 107,76 63 414 Abdullah Erin
57 | Adıyaman 7.337 626.465 85,38 02 416 Aykut Pekmez
58 | Malatya 12.259 800.165 65,27 44 422 Aydın Baruş
59 | Elazığ 9.383 591.098 62,99 23 424 Erkaya Yırık
60 | Bingöl 8.004 279.812 34,95 12 426 Kadir Ekinci
61 | Tunceli 7.582 84.660 11,16 62 428 Mehmet Ali Özkan
62 | Van 20.921 1.136.757 54,33 65 432 Mehmet Emin Bilmez
63 | Bitlis 8.294 348.115 41,97 13 434 Oktay Çağatay
64 | Muş 8.650 408.809 47,26 49 436 İlker Gündüzöz
65 | Hakkâri 7.095 280.991 39,60 30 438 İdris Akbıyık
66 | Erzurum 25.006 762.062 30,47 25 442 Okay Memiş
67 | Erzincan 11.815 234.747 19,86 24 446 Mehmet Makas
68 | Ordu 5.861 754.198 128,68 52 452 Tuncay Sonel
69 | Giresun 7.025 448.400 63,82 28 454 Enver Ünlü
70 | Gümüşhane 6.668 164.521 24,67 29 456 Kamuran Taşbilek
71 | Bayburt 3.746 84.843 22,64 69 458 Cüneyt Epcim
72 | Trabzon 4.628 808.974 174,79 61 462 İsmail Ustaoğlu
73 | Rize 3.835 343.212 89,49 53 464 Kemal Çeber
74 | Artvin 7.393 170.875 23,11 08 466 Yılmaz Doruk
75 | Ağrı 11.099 536.199 48,31 04 472 Osman Varol
76 | Kars 10.193 285.410 28 36 474 Türker Öksüz
77 | Iğdır 3.664 199.442 54,43 76 476 Hüseyin Engin Sarıibrahim
78 | Ardahan 4.934 97.319 19,72 75 478 Hüseyin Öner
79 | Mardin 8.780 838.778 95,53 47 482 Mahmut Demirtaş
80 | Siirt 5.717 330.280 57,77 56 484 Osman Hacıbektaşoğlu
81 | Şırnak 7.078 529.615 74,82 73 486 Ali Hamza Pehlivan
82 | Batman 4.477 608.659 135,95 72 488 Hulusi Şahin
--------------------------------------------------------------------------------
/data/wordlists/source.md:
--------------------------------------------------------------------------------
1 | # Data sources – wordlists
2 |
3 | * [kurdi_words.txt](https://raw.githubusercontent.com/0xdolan/kurdi/master/corpus/kurdi_words.txt) (Sorani)
--------------------------------------------------------------------------------
/docs/DRAFT_icu_transforms.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/docs/DRAFT_icu_transforms.pdf
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Python internationalisation
--------------------------------------------------------------------------------
/docs/matplotlib.md:
--------------------------------------------------------------------------------
1 | # Python and Pandas internationalisation
2 |
3 | ## Data visualisation issues for languages that need bidirectional support or complex font rendering.
4 |
5 | _Matplotlib_ is a commonly used tool for basic data visualisation in Python, and is the default plotting tool for _pandas.DataFrame.plot_. It is also used by _seaborn_ and _wordcloud_, along with other libraries and tools.
6 |
7 | The default backends for _Matplotlib_ have a number of limitations:
8 |
9 | 1. No support for the Unicode bidirectional algorithm,
10 | 2. No support for complex font rendering
11 |
12 | This places severe limits on which natural languages can be used in titles, labels, legends, and other text elements in plots.
13 |
14 | The package [mplcairo](https://github.com/matplotlib/mplcairo) provides an alternative backend for _matplotlib_ that uses [Raqm](https://github.com/HOST-Oman/libraqm) and [GNU FriBidi](https://github.com/fribidi/fribidi) for bidirectional text layout and complex rendering of OpenType features. This allows most languages to be used in plots.
15 |
16 | The key limitations for _mplcairo_ are bugs in IPython and the lack of support for _Jupyter notebooks_.
17 |
18 | Using the _mplcairo_ backend for _matplotlib_ (selected as in the sketch below), we can display plot titles, axis labels and categorical tick labels in any language we need to support.
19 |
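A minimal setup for doing so, mirroring the notebooks and scripts in this repository (the backend must be selected before `matplotlib.pyplot` is imported):

```python
import platform

import mplcairo  # registers the mplcairo backends
import matplotlib as mpl

# Select a cairo-based backend appropriate to the platform.
if platform.system() == "Darwin":
    mpl.use("module://mplcairo.macosx")
else:
    mpl.use("module://mplcairo.qt")

import matplotlib.pyplot as plt  # only import pyplot after the backend is set
```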
20 | There are two missing pieces at this point:
21 |
22 | 1. Display of numeric tick labels in a numeral system appropriate for the UI language.
23 | 2. Choice of bidirectional layout appropriate to the requirements of the data visualisation.
26 |
27 | ## Numeral systems
28 |
29 | Regarding the first issue, it is possible to use `matplotlib.ticker.FuncFormatter()` to apply a function that converts tick values to the target numeral system and applies the necessary grouping and decimal separators, as in the sketch below.
30 |
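A minimal sketch, assuming the Arabic-Indic digits (U+0660–U+0669) and Arabic thousands separator (U+066C) used in the Sorani examples below; the mapping is illustrative and should be adapted to the target numeral system:

```python
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Map ASCII digits and the comma grouping separator to Arabic-Indic equivalents.
TO_ARABIC_INDIC = str.maketrans("0123456789,", "٠١٢٣٤٥٦٧٨٩٬")

def arabic_indic(value, pos=None):
    # Group with commas first, then transliterate digits and separator.
    return f"{int(value):,}".translate(TO_ARABIC_INDIC)

fig, ax = plt.subplots()
ax.bar(["a", "b", "c"], [1_500_000, 2_300_000, 900_000])
ax.yaxis.set_major_formatter(ticker.FuncFormatter(arabic_indic))
plt.show()
```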
31 | ### RTL layout and data visualisation
32 |
33 | It isn't always necessary to change the layout of the plot. If the plot uses a cartesian coordinate system, it is best to use the default layout.
34 | The layout used, combined with user expectations, will affect how trends in the data are interpreted. User interpretation of the visualisations, combined with user experience, is a critical input into data visualisation design.
35 |
36 | If a RTL layout is required (see the sketch after this list):
37 | 
38 | 1. Use `yaxis.tick_right()` and `yaxis.set_label_position("right")` to reposition the y-axis to the right side of the plot.
39 | 2. Use `plt.gca().invert_xaxis()` to invert the x-axis. This step may not be necessary; UX is an important consideration.
40 |
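A minimal sketch of these two steps (label rendering still depends on the mplcairo backend set up above):

```python
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.bar(["یەک", "دوو", "سێ"], [3, 1, 2])

# 1. Move the y-axis to the right-hand side of the plot.
ax.yaxis.tick_right()
ax.yaxis.set_label_position("right")

# 2. Mirror the x-axis so categories run right to left (optional, per UX).
ax.invert_xaxis()

plt.show()
```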
41 | ### Examples
42 |
43 | The following Python scripts use [Sorani Kurdish data](https://github.com/enabling-languages/python-i18n/blob/main/data/demographics.tsv):
44 |
45 | * [matplotlib](https://github.com/enabling-languages/python-i18n/blob/main/py/matplotlib_kurdish.py)
46 | * [pandas.DataFrame.plot](https://github.com/enabling-languages/python-i18n/blob/main/py/pandas_plot_kurdish.py)
47 | * [seaborn](https://github.com/enabling-languages/python-i18n/blob/main/py/seaborn_kurdish.py)
48 | * [wordcloud](https://github.com/enabling-languages/python-i18n/blob/main/py/wordcloud_kurdish.py)
49 |
50 |
Fig.1 - Kurdish bar charts in both LTR and RTL layouts.
51 |
52 |
Fig.2 - Kurdish wordcloud.
53 |
--------------------------------------------------------------------------------
/notebooks/Sorting_emoji.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Sorting emoji\n",
8 | "\n",
9 | "Python's inbuild sorting algorithms sort emoji by codepoint.\n",
10 | "\n",
11 | "Codepoint order, as well as the default collation rules provided by the Unicode Collation Algorithm do not provide adequate [ordering and grouping](https://www.unicode.org/reports/tr51/#Sorting) of emoji.\n",
12 | "\n",
13 | "The Unicode Common Locale Data Repository (CLDR) provides colation rules for emoji. [Conformant emoji collation](https://www.unicode.org/reports/tr51/#Collation_Conformance) is defined in CLDR tailoring rules for the Unicode Collation Algorthim (UCA).\n",
14 | "\n",
15 | "CLDR groups emoji into broad conceptual categories in order to group related emoji together."
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Emoji only collation\n",
23 | "\n",
24 | "For the following discussion we will use the following emoji:\n",
25 | "\n",
26 | "|Character |Codepoint |Description |Category |\n",
27 | "|--------- |--------- |----------- |-------- |\n",
28 | "|🦜 |U+1F99C |Parrot |animal-bird |\n",
29 | "|🥚 |U+1F95A |Egg |food-prepared |\n",
30 | "|🐔 |U+1F414 |Chicken |animal-bird |\n",
31 | "\n",
32 | "The default python sort algorithm will order then in terms of the emoji's codepoint: U+1F414 (chicken), U+1F95A (egg), and then U+1F99C (parrot).\n",
33 | "\n",
34 | "The CLDR ordering would be to sort the two bids together (U+1F414 then U+1F99C), followed by U+1F95A."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 1,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/plain": [
45 | "['🐔', '🥚', '🦜']"
46 | ]
47 | },
48 | "execution_count": 1,
49 | "metadata": {},
50 | "output_type": "execute_result"
51 | }
52 | ],
53 | "source": [
54 | "a = ['🦜', '🥚', '🐔']\n",
55 | "sorted(a)"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "Using PyICU, it is possible to sort emoji according to CLDR's collation rules for Emoji. The `-u-co-emoji` Unicode BCP-47 extension will enable CLDR based emoji collation. When sorting just wmoji we can use the langauge subtag `und` (undetermined) as the base for the locale identifier: `und-u-co-emoji`."
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 2,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "['🐔', '🦜', '🥚']\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "from icu import Collator, Locale\n",
80 | "coll = Collator.createInstance(Locale.createCanonical(\"und-u-co-emoji\"))\n",
81 | "print(sorted(a, key=coll.getSortKey))"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "This yields a CLDR based sort using the CLDR emoji collation rules."
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "## Sorting text and emoji\n",
96 | "\n",
97 | "A more complex scenario is sorting a set of text and emoji.\n",
98 | "\n",
99 | "[UTS #35](https://unicode.org/reports/tr35/tr35-collation.html#Combining_Rules) provides a discussion of tailoring and combining rules in relation to sorting emoji and text. We'll implement the example given in UTS #35 in Python.\n",
100 | "\n",
101 | "The following characters are used:\n",
102 | "\n",
103 | "|Character |Codepoint |Description |\n",
104 | "|---------- |---------- |------------ |\n",
105 | "|😀 |U+1F600 |Grinning Face |\n",
106 | "|글 |U+AE00 |Hangul Syllable Geul |\n",
107 | "|Z |U+005A |Latin Capital Letter Z |\n",
108 | "|ü |U+00FC |Latin Small Letter U with Diaeresis |\n",
109 | "|, |U+002C |Comma |\n",
110 | "|✈️️ |U+2708 U+FE0F |Airplane |\n",
111 | "|y |U+0079 |Latin Small Letter Y |\n",
112 | "|☹️ |U+2639 U+FE0F |White Frowning Face |\n",
113 | "|a |U+0061 |Latin Small Letter A |\n",
114 | "\n",
115 | "Enabling emoji collation overrides language specific tailorings. This has no impact on text for languages that use the root collation, but will have a negative impact on languages that do require tailoring to obtain the correct collation order.\n",
116 | "\n",
117 | "The python sort algorithm will order content by codepoint:"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 11,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/plain": [
128 | "[',', 'Z', 'a', 'y', 'ü', '☹️', '✈️️', '글', '😀']"
129 | ]
130 | },
131 | "execution_count": 11,
132 | "metadata": {},
133 | "output_type": "execute_result"
134 | }
135 | ],
136 | "source": [
137 | "# List to be sorted\n",
138 | "b = ['😀', '글', 'Z', 'ü', ',', '✈️️', 'y', '☹️', 'a']\n",
139 | "\n",
140 | "#Default Python sort\n",
141 | "sorted(b)"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "The `en` locale identifier will use the CLDR root collation. Emoji are not sorted using the CLDR emoji collation rules:"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 25,
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "data": {
158 | "text/plain": [
159 | "[',', '☹️', '✈️️', '😀', 'a', 'ü', 'y', 'Z', '글']"
160 | ]
161 | },
162 | "execution_count": 25,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": [
168 | "# locale: en\n",
169 | "en_coll = Collator.createInstance(Locale.forLanguageTag(\"en\"));\n",
170 | "sorted(b, key=en_coll.getSortKey)"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "Enabling emoji collation using the `en-u-co-emoji` locale will sort the emoji based on the emoji collation rules and the remaining characters are sorted as per the root collation algorithm."
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 24,
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/plain": [
188 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']"
189 | ]
190 | },
191 | "execution_count": 24,
192 | "metadata": {},
193 | "output_type": "execute_result"
194 | }
195 | ],
196 | "source": [
197 | "# locale for en-u-co-emoji\n",
198 | "en_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"en-u-co-emoji\"));\n",
199 | "sorted(b, key=en_emoji_coll.getSortKey)"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "`en-u-co-emoji\"`will yield the same result as `und-u-co-emoji`, i.e. sort emoji according to the CLDR emoji collation order and sort other characters according to the root collation algorithm."
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 23,
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "data": {
216 | "text/plain": [
217 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']"
218 | ]
219 | },
220 | "execution_count": 23,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "# locale for und-u-co-emoji\n",
227 | "und_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"und-u-co-emoji\"));\n",
228 | "sorted(b, key=und_emoji_coll.getSortKey)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "The `da` locale has tailored collation rules to order text in the sequence required for Danish:"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 22,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/plain": [
246 | "[',', '☹️', '✈️️', '😀', 'a', 'y', 'ü', 'Z', '글']"
247 | ]
248 | },
249 | "execution_count": 22,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "# locale for da\n",
256 | "da_coll = Collator.createInstance(Locale.forLanguageTag(\"da\"));\n",
257 | "sorted(b, key=da_coll.getSortKey)"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "Adding emoji collation support overrides the Danish language tailorings. Look at the order of __ü__ in the list for the `da` and `da-u-co-emoji` locales."
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 20,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/plain": [
275 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']"
276 | ]
277 | },
278 | "execution_count": 20,
279 | "metadata": {},
280 | "output_type": "execute_result"
281 | }
282 | ],
283 | "source": [
284 | "# locale for da-u-co-emoji\n",
285 | "da_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"da-u-co-emoji\"));\n",
286 | "sorted(b, key=da_emoji_coll.getSortKey)"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "To overcome this, it is possible to combine the collation rules for the `da` and `da_and_emoji_rules`. We can do this by:\n",
294 | "\n",
295 | "1. Initiating collator instances for each locale, and retrieve the rules\n",
296 | "2. Concatenate the rule sets\n",
297 | "3. Initiate a collator instance using `RuleBasedCollator`\n",
298 | "\n",
299 | "This will order emoji according to the emoji collation rules and order Latin script text according to Danish collation rules."
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 19,
305 | "metadata": {},
306 | "outputs": [
307 | {
308 | "data": {
309 | "text/plain": [
310 | "[',', '😀', '☹️', '✈️️', 'a', 'y', 'ü', 'Z', '글']"
311 | ]
312 | },
313 | "execution_count": 19,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "# Combinded rules\n",
320 | "from icu import RuleBasedCollator\n",
321 | "#da_and_emoji_rules = Collator.createInstance(Locale.forLanguageTag('da')).getRules() + Collator.createInstance(Locale.forLanguageTag('und-u-co-emoji')).getRules()\n",
322 | "da_rules = Collator.createInstance(Locale.forLanguageTag('da')).getRules()\n",
323 | "emoji_rules = Collator.createInstance(Locale.forLanguageTag('und-u-co-emoji')).getRules()\n",
324 | "da_and_emoji_rules = da_rules + emoji_rules\n",
325 | "combined_coll = RuleBasedCollator(da_and_emoji_rules)\n",
326 | "sorted(b, key=combined_coll.getSortKey)"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {},
332 | "source": [
333 | "The same approach is needed for other languages that are not supported by the CLDR root collation algorithm and require tailored rules."
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "## Resources\n",
341 | "\n",
342 | "* [Emoji ordering chart](https://www.unicode.org/emoji/charts/emoji-ordering.html)\n",
343 | "* [CLDR Root collation rules](https://github.com/unicode-org/cldr/blob/353527cdabf1e8870d261beb3c908de6deb1915b/common/collation/root.xml#L951)"
344 | ]
345 | }
346 | ],
347 | "metadata": {
348 | "interpreter": {
349 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3"
350 | },
351 | "kernelspec": {
352 | "display_name": "Python 3.8.1 64-bit ('el': venv)",
353 | "language": "python",
354 | "name": "python3"
355 | },
356 | "language_info": {
357 | "codemirror_mode": {
358 | "name": "ipython",
359 | "version": 3
360 | },
361 | "file_extension": ".py",
362 | "mimetype": "text/x-python",
363 | "name": "python",
364 | "nbconvert_exporter": "python",
365 | "pygments_lexer": "ipython3",
366 | "version": "3.8.1"
367 | },
368 | "orig_nbformat": 4
369 | },
370 | "nbformat": 4,
371 | "nbformat_minor": 2
372 | }
373 |
--------------------------------------------------------------------------------
/notebooks/ethiopic_numbers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Working with Ethiopic numbers\n",
8 | "\n",
9 | "CLDR sets the default number system for languages written in the Ethiopic script to the Arabic (Latin) Number System. The Ethiopic number system is marked as an alternative (traditional) numbering system, and is not used by default.\n",
10 | "\n",
11 | "CLDR defines decimal and algorithmic [number systems](https://github.com/unicode-org/cldr/blob/main/common/supplemental/numberingSystems.xml). The Ethiopic number system is an algorithmic alphabetic numeral system.\n",
12 | "\n",
13 | "For a description of the number system refer to [Ethiopic number system](http://www.geez.org/Numerals/) for more details. A list of [sample numbers](http://www.geez.org/Numerals/NumberSamples.html) is available.\n",
14 | "\n",
15 | "ICU provides a number of classes used for [formatting numbers](https://unicode-org.github.io/icu/userguide/format_parse/numbers/), but the class needed to format Ethiopic numbers is the [RuleBasedNumberFormat](https://unicode-org.github.io/icu/userguide/format_parse/numbers/rbnf.html) class.\n",
16 | "\n",
17 | "Refer to the ICU4C API [RuleBasedNumberFormat class reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html). The RBNF rule set for [Ethiopic](https://github.com/unicode-org/cldr/blob/6c8ad511801043124d6ce25e0388412fe9b7b2f4/common/rbnf/root.xml#L246) is defined in the CLDR root locale.\n",
18 | "\n",
19 | "The most common use for the `RuleBasedNumberFormat` class is to format numbers as ordinals or as words in the target locale. It is also the nechanism for formating and parsing algorithmic number systems.\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Spelling out numbers in Amharic\n",
27 | "\n",
28 | "1. Create a locale instance\n",
29 | "2. create a number formatter instance using `RuleBasedNumberFormat` class\n",
30 | "3. Format the number\n",
31 | "\n",
32 | "We start by importing the necessary classes from PyICU:"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 15,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from icu import Locale, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "- [Locale](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Locale.html) – methods for initiating and working with ICU's locale objects.\n",
49 | "- [Formattable](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Formattable.html) – a wrapper that converts between numeric types, strings and date objects. It's primary use is in formatting.\n",
50 | "- [RulebasedNumberFormat](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html) – formats numbers according to a set of rules. The rules maybe inbuilt set of rules, or custom rules.\n",
51 | "- [URBNFRuleSetTag](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/namespaceicu.html#a55dbbbdd4946251c23988013e06e695e) – tags for predefined rule sets to use with `RulebasedNumberFormat`."
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "First, create a Locale instance, and a formatter instance. There are a number of methods for building a Locale instance. To keep things simple, we'll just pass a locale identifier directly to the class."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 16,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "loc = Locale('am_ET')\n",
68 | "formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, loc)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "We can control what rule sets are used. The following rule sets are available:"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 17,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "name": "stdout",
85 | "output_type": "stream",
86 | "text": [
87 | "%spellout-numbering-year\n",
88 | "%spellout-numbering\n",
89 | "%spellout-cardinal\n",
90 | "%spellout-ordinal\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "for n in range(formatter.getNumberOfRuleSetNames()):\n",
96 | " print(formatter.getRuleSetName(n))"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 26,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "name": "stdout",
106 | "output_type": "stream",
107 | "text": [
108 | "%spellout-numbering\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "print(formatter.getDefaultRuleSetName())"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "The `%spellout-numbering` is the default for Amharic, but `%spellout-numbering-year`, `%spellout-cardinal`, and `%spellout-ordinal` are alternative rule sets available. Use the [setDefaultRuleSet](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html#aa0fbc19602d99cfcb550e2c11cb9ca91) method, if required."
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "`RuleBasedNumberFormat` can be used in a number of ways, refer to the API documentation. IN this particular case we want to create a formatter that uses the Amharic spellout rule set. We passed the relevant rule set identifer and the required locale to create a formatter instance.\n",
128 | "\n",
129 | "The same Python code can be used for any locale that have spellout [rule sets](https://github.com/unicode-org/icu/tree/main/icu4c/source/data/rbnf).\n",
130 | "\n",
131 | "To convert the number to its word representation, use the `format` method."
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 18,
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "አስር ሁለት ሺ ሦስት መቶ አራት አስር አምስት\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "number = 12345\n",
149 | "r = formatter.format(number)\n",
150 | "print(r)"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "You can use the `parse` method to convert the word representation back into a formated number:"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 19,
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "data": {
167 | "text/plain": [
168 | ""
169 | ]
170 | },
171 | "execution_count": 19,
172 | "metadata": {},
173 | "output_type": "execute_result"
174 | }
175 | ],
176 | "source": [
177 | "rreverse = formatter.parse(r)\n",
178 | "rreverse"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "This returns a Formattable object, which you can either render as a formated string, or convert to an interger or float, as required."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 20,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "name": "stdout",
195 | "output_type": "stream",
196 | "text": [
197 | "12,345\n"
198 | ]
199 | }
200 | ],
201 | "source": [
202 | "rreverse_string = str(rreverse)\n",
203 | "print(rreverse_string)"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "Available methods are:\n",
211 | "\n",
212 | "- getDouble – returns a floating point number\n",
213 | "- getInt64 – returns an integer\n"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 21,
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "name": "stdout",
223 | "output_type": "stream",
224 | "text": [
225 | "12345\n",
226 | "12345.0\n"
227 | ]
228 | }
229 | ],
230 | "source": [
231 | "ireverse = rreverse.getInt64()\n",
232 | "print(ireverse)\n",
233 | "\n",
234 | "dreverse = rreverse.getDouble()\n",
235 | "print(dreverse)"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "## Working with the Ethiopic numeral system\n",
243 | "\n",
244 | "Creating a formatter for Ethiopic numbers is a two step process, we need to create a formatter passing a rule set identifier for number systems and a locale, then we need to set the actual rule set needed. Locales may support multiple rule sets. \n",
245 | "\n",
246 | "1. Create a locale instance\n",
247 | "2. Create a formatter instance\n",
248 | "3. Set the rule set required\n",
249 | "\n",
250 | "We'll reuse the existing Locale instance."
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 22,
256 | "metadata": {},
257 | "outputs": [
258 | {
259 | "name": "stdout",
260 | "output_type": "stream",
261 | "text": [
262 | "%armenian-lower\n",
263 | "%armenian-upper\n",
264 | "%cyrillic-lower\n",
265 | "%ethiopic\n",
266 | "%georgian\n",
267 | "%greek-lower\n",
268 | "%greek-upper\n",
269 | "%hebrew\n",
270 | "%hebrew-item\n",
271 | "%roman-lower\n",
272 | "%roman-upper\n",
273 | "%tamil\n",
274 | "%zz-default\n"
275 | ]
276 | }
277 | ],
278 | "source": [
279 | "eformatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, loc)\n",
280 | "\n",
281 | "for n in range(eformatter.getNumberOfRuleSetNames()):\n",
282 | " print(eformatter.getRuleSetName(n))\n"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 | "The public name of the rule set we need is `%ethiopic`, so we set this as our default rule set:"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 23,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "eformatter.setDefaultRuleSet('%ethiopic')"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "Then format the number as above:"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 24,
311 | "metadata": {},
312 | "outputs": [
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "፳፫፻፵፩\n"
318 | ]
319 | }
320 | ],
321 | "source": [
322 | "number = 2341\n",
323 | "r = eformatter.format(number)\n",
324 | "print(r)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "And likewise, we can parse the ethiopic digits back to the Arabic (Latin) number system:"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 25,
337 | "metadata": {},
338 | "outputs": [
339 | {
340 | "name": "stdout",
341 | "output_type": "stream",
342 | "text": [
343 | "2,341\n",
344 | "2341\n",
345 | "2341.0\n"
346 | ]
347 | }
348 | ],
349 | "source": [
350 | "rreverse = eformatter.parse(r)\n",
351 | "print(str(rreverse))\n",
352 | "print(rreverse.getInt64())\n",
353 | "print(rreverse.getDouble())"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "## Further information\n",
361 | "\n",
362 | "- Unicode Locale Data Markup Language (LDML) [Part 3: Numbers](https://www.unicode.org/reports/tr35/tr35-numbers.html#unicode-locale-data-markup-language-ldmlpart-3-numbers)\n",
363 | " - [Number Systems](http://www.unicode.org/reports/tr35/tr35-numbers.html#Numbering_Systems)\n",
364 | " - [Rule-Based Number Formatting](https://www.unicode.org/reports/tr35/tr35-numbers.html#Rule-Based_Number_Formatting)"
365 | ]
366 | }
367 | ],
368 | "metadata": {
369 | "kernelspec": {
370 | "display_name": "athinkra",
371 | "language": "python",
372 | "name": "python3"
373 | },
374 | "language_info": {
375 | "codemirror_mode": {
376 | "name": "ipython",
377 | "version": 3
378 | },
379 | "file_extension": ".py",
380 | "mimetype": "text/x-python",
381 | "name": "python",
382 | "nbconvert_exporter": "python",
383 | "pygments_lexer": "ipython3",
384 | "version": "3.11.0"
385 | },
386 | "orig_nbformat": 4
387 | },
388 | "nbformat": 4,
389 | "nbformat_minor": 2
390 | }
391 |
--------------------------------------------------------------------------------
/notebooks/images/sorani_plotly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly.png
--------------------------------------------------------------------------------
/notebooks/images/sorani_plotly2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly2.png
--------------------------------------------------------------------------------
/notebooks/images/sorani_plotly_inline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly_inline.png
--------------------------------------------------------------------------------
/notebooks/img/1440px-Lake_Dukan_12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/1440px-Lake_Dukan_12.jpg
--------------------------------------------------------------------------------
/notebooks/img/ckb_IQ_collation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/ckb_IQ_collation.png
--------------------------------------------------------------------------------
/notebooks/img/khamti.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/khamti.jpg
--------------------------------------------------------------------------------
/notebooks/img/linux1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/linux1.png
--------------------------------------------------------------------------------
/notebooks/img/macos1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/macos1.png
--------------------------------------------------------------------------------
/notebooks/img/mplcairo_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/mplcairo_output.png
--------------------------------------------------------------------------------
/notebooks/img/sibe.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/sibe.jpg
--------------------------------------------------------------------------------
/notebooks/img/std_matplotlib_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/std_matplotlib_output.png
--------------------------------------------------------------------------------
/notebooks/img/tai_aiton.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/tai_aiton.jpg
--------------------------------------------------------------------------------
/notebooks/img/tai_aiton_text_to_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/tai_aiton_text_to_image.png
--------------------------------------------------------------------------------
/notebooks/img/yolngu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/yolngu.jpg
--------------------------------------------------------------------------------
/notebooks/pandas_plot_mplcairo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Sorani Kurdish data using Pandas plot\n",
8 | "\n",
9 | "Enabling `mplcairo`, with `raqm`, as the backend for `matplotlib` will allow us to reuse the [Kurdish matplotlib example](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_mplcairo.ipynb) with Pandas `plot`.\n",
10 | "\n",
11 | "__Please note:__ This notebook will run on MacOS, but tends to be buggy on other platforms. The _mplcairo_ package does not currently support Jupyter. It is better to use _mplcairo_ in a script, rather than a notebook. See [pandas_plot_kurdish.py](../py/pandas_plot_kurdish.py)."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Setup"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 3,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import pandas as pd\n",
28 | "import locale, platform\n",
29 | "import mplcairo\n",
30 | "import matplotlib as mpl\n",
31 | "if platform.system() == \"Darwin\":\n",
32 | " mpl.use(\"module://mplcairo.macosx\")\n",
33 | "else:\n",
34 | " mpl.use(\"module://mplcairo.qt\")\n",
35 | "import matplotlib.pyplot as plt\n",
36 | "import matplotlib.ticker as ticker\n",
37 | "import unicodedata as ud, regex as re"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## Helper functions"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 4,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "def convert_digits(s, sep = (\",\", \".\")):\n",
54 | " nd = re.compile(r'^-?\\p{Nd}[,.\\u066B\\u066C\\u0020\\u2009\\u202F\\p{Nd}]*$')\n",
55 | " tsep, dsep = sep\n",
56 | " if nd.match(s):\n",
57 | " s = s.replace(tsep, \"\")\n",
58 | " s = ''.join([str(ud.decimal(c, c)) for c in s])\n",
59 | " if dsep in s:\n",
60 | " return float(s.replace(dsep, \".\")) if dsep != \".\" else float(s)\n",
61 | " return int(s)\n",
62 | " return s\n",
63 | "\n",
64 | "seps = (\"\\u066C\", \"\\u066B\")\n",
65 | "digitsconv = lambda x: convert_digits(x.replace(\"-\", \"٠\"), sep = seps)"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "## Process data and plot data"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "data": {
82 | "text/html": [
83 | "\n",
84 | "\n",
97 | "
\n",
98 | " \n",
99 | " \n",
100 | " | \n",
101 | " --- | \n",
102 | " جیھانی | \n",
103 | " تورکیا | \n",
104 | " ئێران | \n",
105 | " عێراق | \n",
106 | " سووریا | \n",
107 | "
\n",
108 | " \n",
109 | " \n",
110 | " \n",
111 | " 0 | \n",
112 | " کرمانجی | \n",
113 | " 14419000 | \n",
114 | " 7919000 | \n",
115 | " 443000 | \n",
116 | " 3185000 | \n",
117 | " 1661000 | \n",
118 | "
\n",
119 | " \n",
120 | " 1 | \n",
121 | " ئەوانەی بە تورکی دەدوێن | \n",
122 | " 5732000 | \n",
123 | " 5732000 | \n",
124 | " 0 | \n",
125 | " 0 | \n",
126 | " 0 | \n",
127 | "
\n",
128 | " \n",
129 | " 2 | \n",
130 | " باشوور | \n",
131 | " 3381000 | \n",
132 | " 0 | \n",
133 | " 3381000 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | "
\n",
137 | " \n",
138 | " 3 | \n",
139 | " سۆرانی | \n",
140 | " 1576000 | \n",
141 | " 0 | \n",
142 | " 502000 | \n",
143 | " 567000 | \n",
144 | " 0 | \n",
145 | "
\n",
146 | " \n",
147 | " 4 | \n",
148 | " زازایی - دەملی | \n",
149 | " 1125000 | \n",
150 | " 1125000 | \n",
151 | " 0 | \n",
152 | " 0 | \n",
153 | " 0 | \n",
154 | "
\n",
155 | " \n",
156 | " 5 | \n",
157 | " زازایی - ئەلڤێکا | \n",
158 | " 184000 | \n",
159 | " 179000 | \n",
160 | " 0 | \n",
161 | " 0 | \n",
162 | " 0 | \n",
163 | "
\n",
164 | " \n",
165 | " 6 | \n",
166 | " ڕەوەند | \n",
167 | " 90000 | \n",
168 | " 38000 | \n",
169 | " 20000 | \n",
170 | " 33000 | \n",
171 | " 0 | \n",
172 | "
\n",
173 | " \n",
174 | " 7 | \n",
175 | " ھەورامی | \n",
176 | " 54000 | \n",
177 | " 0 | \n",
178 | " 26000 | \n",
179 | " 28000 | \n",
180 | " 0 | \n",
181 | "
\n",
182 | " \n",
183 | " 8 | \n",
184 | " شکاکی | \n",
185 | " 49000 | \n",
186 | " 23000 | \n",
187 | " 26000 | \n",
188 | " 0 | \n",
189 | " 0 | \n",
190 | "
\n",
191 | " \n",
192 | " 9 | \n",
193 | " کۆی گشتی | \n",
194 | " 26712000 | \n",
195 | " 15016000 | \n",
196 | " 4398000 | \n",
197 | " 3916000 | \n",
198 | " 1661000 | \n",
199 | "
\n",
200 | " \n",
201 | "
\n",
202 | "
"
203 | ],
204 | "text/plain": [
205 | " --- جیھانی تورکیا ئێران عێراق سووریا\n",
206 | "0 کرمانجی 14419000 7919000 443000 3185000 1661000\n",
207 | "1 ئەوانەی بە تورکی دەدوێن 5732000 5732000 0 0 0\n",
208 | "2 باشوور 3381000 0 3381000 0 0\n",
209 | "3 سۆرانی 1576000 0 502000 567000 0\n",
210 | "4 زازایی - دەملی 1125000 1125000 0 0 0\n",
211 | "5 زازایی - ئەلڤێکا 184000 179000 0 0 0\n",
212 | "6 ڕەوەند 90000 38000 20000 33000 0\n",
213 | "7 ھەورامی 54000 0 26000 28000 0\n",
214 | "8 شکاکی 49000 23000 26000 0 0\n",
215 | "9 کۆی گشتی 26712000 15016000 4398000 3916000 1661000"
216 | ]
217 | },
218 | "execution_count": 5,
219 | "metadata": {},
220 | "output_type": "execute_result"
221 | }
222 | ],
223 | "source": [
224 | "import pandas as pd\n",
225 | "conv = {\n",
226 | " 'سووریا': digitsconv,\n",
227 | " 'عێراق': digitsconv,\n",
228 | " 'ئێران': digitsconv,\n",
229 | " 'تورکیا': digitsconv,\n",
230 | " 'جیھانی': digitsconv\n",
231 | "}\n",
232 | "df = pd.read_table(\"../data/demographics.tsv\", converters=conv)\n",
233 | "df"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 6,
239 | "metadata": {},
240 | "outputs": [
241 | {
242 | "name": "stdout",
243 | "output_type": "stream",
244 | "text": [
245 | "تورکیا 30032000\n",
246 | "ئێران 8796000\n",
247 | "عێراق 7729000\n",
248 | "سووریا 3322000\n",
249 | "dtype: int64\n"
250 | ]
251 | }
252 | ],
253 | "source": [
254 | "col_list=[\"تورکیا\" ,\"ئێران\" ,\"عێراق\" ,\"سووریا\"]\n",
255 | "\n",
256 | "total_df = df[col_list].sum(axis=0)\n",
257 | "print(total_df)"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "Using the indices and values of the `total_df` series:"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 20,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "def convert_to_arab_ns(n, p=None, decimal=2, sep_in=[\"\", \".\"], sep_out=[\"\\u066C\", \"\\u066B\"], scale=None):\n",
274 | " locale.setlocale(locale.LC_ALL, \"en_US.UTF-8\")\n",
275 | " decimal_places = decimal\n",
276 | " if sep_in == [\"\", \".\"]:\n",
277 | " n = n * scale if scale else n\n",
278 | " format_string = '%0.' + str(decimal_places) + 'f' if type(n) == float else '%d'\n",
279 | " n = locale.format_string(format_string, n, grouping=True, monetary=True)\n",
280 | " n = n.replace(\",\", \"ṯ\").replace(\".\", \"ḏ\")\n",
281 | "    if sep_in[0] in [\" \", \",\", \"٬\", \"\\u2009\"]:\n",
282 | "        n = re.sub(r'[\\u0020,٬\\u2009]', \"ṯ\", n)\n",
283 | "    elif sep_in[0] == \".\":\n",
284 | "        n = n.replace(\".\", \"ṯ\")\n",
285 | "    if sep_in[1] in [\",\", \".\", \"٫\"]:\n",
286 | "        n = re.sub(r'[,.٫]', \"ḏ\", n)\n",
288 | " sep = sep_out\n",
289 | " t = n.maketrans(\"0123456789\", \"٠١٢٣٤٥٦٧٨٩\")\n",
290 | " locale.setlocale(locale.LC_ALL, \"\")\n",
291 | " return n.translate(t).replace(\"ṯ\", sep[0] ).replace(\"ḏ\", sep[1])"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 23,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "\n",
301 | "# ax = total_df.plot(kind=\"bar\", title='ڕێژەی دانیشتووانی کورد', xlabel=\"ناوچە\", ylabel=\"ڕێژەی دانیشتووان\" ,rot=0)\n",
302 | "# DEFAULT_NUMERAL_SYSTEM = \"arab\"\n",
303 | "# ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_arab_ns(x, p, scale=0.000001))\n",
304 | "# ax.get_yaxis().set_major_formatter(ns_formatter)\n",
305 | "# plt.show()"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "plt.rcParams.update({'font.family': 'Vazirmatn'})\n",
315 | "ax = total_df.plot(kind=\"bar\", title='ڕێژەی دانیشتووانی کورد', xlabel=\"ناوچە\", ylabel=\"ڕێژەی دانیشتووان (بە ملیۆن)\", rot=0)\n",
316 | "ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_arab_ns(x, p, scale=0.000001))\n",
317 | "ax.get_yaxis().set_major_formatter(ns_formatter)\n",
318 | "plt.show()"
334 | ]
335 | }
336 | ],
337 | "metadata": {
338 | "interpreter": {
339 | "hash": "05c935ee2b4ff45f26d355be2499c84aedc5a4939bfa2f7a9b7f00dda4a86ade"
340 | },
341 | "kernelspec": {
342 | "display_name": "Python 3.10.1 ('el-test')",
343 | "language": "python",
344 | "name": "python3"
345 | },
346 | "language_info": {
347 | "codemirror_mode": {
348 | "name": "ipython",
349 | "version": 3
350 | },
351 | "file_extension": ".py",
352 | "mimetype": "text/x-python",
353 | "name": "python",
354 | "nbconvert_exporter": "python",
355 | "pygments_lexer": "ipython3",
356 | "version": "3.10.1"
357 | },
358 | "orig_nbformat": 4
359 | },
360 | "nbformat": 4,
361 | "nbformat_minor": 2
362 | }
363 |
--------------------------------------------------------------------------------
/notebooks/persian_df.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Pandas internationalisation: Persian (فارسی) data example\n",
8 | "\n",
9 | "An example of reading in Persian data in Pandas.\n",
10 | "\n",
11 | "The file `fa_stats.tsv` is a tab-delimited file in Persian. Column 1 contains a four-digit year in the Solar Hijri (Persian) calendar. Columns 2 and 3 contain integers using Eastern Arabic-Indic digits, with the Arabic thousands separator.\n",
12 | "\n",
13 | "A set of conversion functions is used with `pd.read_table()` to convert the data to a format that can be used in Pandas.\n",
14 | "\n",
15 | "Column 1 is converted to the Gregorian calendar, using a combination of the `convert_digits()` function and PyICU's `icu.Calendar` and `icu.GregorianCalendar` classes. After the dataframe is available, we use `pandas.Series.dt.year` to reduce the datetime objects in the column to a four-digit year.\n",
16 | "\n",
17 | "The `convert_digits()` function is used to convert the Eastern Arabic-Indic digits in columns 2 and 3 to Western Arabic digits that can be manipulated by Pandas."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import unicodedataplus as ud, regex as re, pandas as pd\n",
27 | "from icu import Locale, Calendar, GregorianCalendar\n",
28 | "\n",
29 | "def convert_digits(s, sep = (\",\", \".\")):\n",
30 | " nd = re.compile(r'^-?\\p{Nd}[,.\\u066B\\u066C\\u0020\\u2009\\u202F\\p{Nd}]*$')\n",
31 | " tsep, dsep = sep\n",
32 | " if nd.match(s):\n",
33 | " s = s.replace(tsep, \"\")\n",
34 | " s = ''.join([str(ud.decimal(c, c)) for c in s])\n",
35 | " if dsep in s:\n",
36 | " return float(s.replace(dsep, \".\")) if dsep != \".\" else float(s)\n",
37 | " return int(s)\n",
38 | " return s\n",
39 | "\n",
40 | "loc = \"fa_IR\"\n",
41 | "in_c = Calendar.createInstance(Locale(loc + \"@calendar=persian\"))\n",
42 | "out_c = GregorianCalendar(Locale(loc + \"@calendar=gregorian\"))\n",
43 | "\n",
44 | "def convert_persian_year(y, in_c, out_c):\n",
45 | " y = convert_digits(y.strip())\n",
46 | " in_c.set(Calendar.YEAR, y)\n",
47 | " out_c.setTime(in_c.getTime())\n",
48 | " return out_c.get(Calendar.YEAR)\n",
49 | "\n",
50 | "seps = (\"\\u066C\", \"\\u066B\")\n",
51 | "digitf = lambda x: convert_digits(x.strip(), sep = seps)\n",
52 | "datef = lambda x: convert_persian_year(x, in_c=in_c, out_c=out_c)\n"
53 | ]
54 | },
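55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "As a quick sanity check on the calendar conversion: a Gregorian year is approximately the Solar Hijri year plus 621, so ۱۳۳۸ (1338) becomes 1959, which matches the first row of the dataframe below."
60 | ]
61 | },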
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "data": {
109 | "text/plain": [
110 | " سال ولادت وفات\n",
111 | "0 1959 864846 176288\n",
112 | "1 1960 876206 171040\n",
113 | "2 1961 902260 159371"
114 | ]
115 | },
116 | "execution_count": 2,
117 | "metadata": {},
118 | "output_type": "execute_result"
119 | }
120 | ],
121 | "source": [
122 | "conv = {\"سال\": datef ,\"ولادت\": digitf, \"وفات\": digitf}\n",
123 | "df = pd.read_table(\"../data/fa_stats.tsv\", converters=conv, parse_dates=['سال'])\n",
124 | "df[\"سال\"] = df[\"سال\"].dt.year\n",
125 | "df.head(3)"
126 | ]
127 | }
128 | ],
129 | "metadata": {
130 | "interpreter": {
131 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3"
132 | },
133 | "kernelspec": {
134 | "display_name": "Python 3",
135 | "language": "python",
136 | "name": "python3"
137 | },
138 | "language_info": {
139 | "codemirror_mode": {
140 | "name": "ipython",
141 | "version": 3
142 | },
143 | "file_extension": ".py",
144 | "mimetype": "text/x-python",
145 | "name": "python",
146 | "nbconvert_exporter": "python",
147 | "pygments_lexer": "ipython3",
148 | "version": "3.8.1"
149 | }
150 | },
151 | "nbformat": 4,
152 | "nbformat_minor": 2
153 | }
154 |
--------------------------------------------------------------------------------
/notebooks/strings_casing_matching.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Python string operations: casing and matching"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "|Operation |Python |Pandas |PyICU |\n",
15 | "|----------- |-------- |------- |------ |\n",
16 | "|Lowercasing |[str.lower()](https://docs.python.org/3/library/stdtypes.html#str.lower) |[pandas.Series.str.lower()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.lower.html?highlight=lower#pandas-series-str-lower) |icu.UnicodeString.toLower() |\n",
17 | "|Uppercasing |[str.upper()](https://docs.python.org/3/library/stdtypes.html#str.upper) |[pandas.Series.str.upper()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.upper.html#pandas-series-str-upper) |icu.UnicodeString.toUpper() |\n",
18 | "|Titlecasing |[str.title()](https://docs.python.org/3/library/stdtypes.html#str.title) |[pandas.Series.str.title()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.title.html#pandas-series-str-title) |icu.UnicodeString.toTitle() |\n",
19 | "|Casefolding |[str.casefold()](https://docs.python.org/3/library/stdtypes.html#str.casefold) |[pandas.Series.str.casefold()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.casefold.html#pandas-series-str-casefold) |icu.UnicodeString.foldCase() |\n",
20 | "\n",
21 | "The operations [str.capitalize()](https://docs.python.org/3/library/stdtypes.html#str.capitalize)/[pandas.Series.str.capitalize()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.capitalize.html#pandas-series-str-capitalize) and [str.swapcase()](https://docs.python.org/3/library/stdtypes.html#str.swapcase)/[pandas.Series.str.swapcase()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.swapcase.html#pandas-series-str-swapcase), although string operations, aren't necessarily casing operations.\n",
22 | "\n",
23 | "N.B. we will not explore the differences between an [object and `StringDtype`](https://pandas.pydata.org/docs/user_guide/text.html#behavior-differences) in Pandas."
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 4,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from el_internationalisation import cp, cpnames, udata"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Python casing operations\n",
40 | "\n",
41 | "Unicode contains a set of special casing mappings. These are divided into unconditional and conditional mappings. All casing operations should support unconditional special mappings by default.\n",
42 | "\n",
43 | "Python's casing operations are language-insensitive, that is, language is not taken into account when casing operations occur. The current locale has no impact on casing operations, therefore language-sensitive mappings are unsupported.\n",
44 | "\n",
45 | "Unconditional mappings:\n",
46 | "\n",
47 | " * Eszett (ß) casing \n",
48 | " * Preserving canonical equivalence of I WITH DOT ABOVE (İ)\n",
49 | " * Ligatures (Latin and Armenian script)\n",
50 | "  * When a lowercase character has no corresponding uppercase precomposed character\n",
51 | "  * Greek letters with hupogegramménē (ὑπογεγραμμένη) or prosgráphō (προσγράφω) have special uppercase equivalents\n",
52 | "  * Some Greek letters with hupogegramménē (ὑπογεγραμμένη) have no titlecase form\n",
53 | "\n",
54 | "Conditional mappings:\n",
55 | " 1. Language-Insensitive Mappings\n",
56 | " * Final form of Greek sigma\n",
57 | " 2. Language-Sensitive Mappings\n",
58 | " * Lithuanian retains the dot in a lowercase i/j when followed by accents\n",
59 | "    * For Turkish and Azeri, I and dotless ı form a case pair, as do İ (I with dot above) and i (a PyICU sketch follows in the next cell)\n",
60 | "\n",
61 | "See [Special Casings](https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt), which forms part of the Unicode Character database (UCD)."
62 | ]
63 | },
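64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# A sketch of language-sensitive casing using PyICU: with a Turkish locale,\n",
71 | "# I and dotless ı, and İ and i, are treated as case pairs.\n",
72 | "from icu import UnicodeString, Locale\n",
73 | "print(str(UnicodeString(\"istanbul\").toUpper(Locale(\"tr_TR\"))))  # İSTANBUL\n",
74 | "print(str(UnicodeString(\"ISTANBUL\").toLower(Locale(\"tr_TR\"))))  # ıstanbul"
75 | ]
76 | },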
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### Unconditional mappings\n",
69 | "\n",
70 | "Python lowercasing and uppercasing support the unconditional mappings of Unicode's special mappings.\n",
71 | "\n",
72 | "|Character |Lowercase |Titlecase |Uppercase |Notes |\n",
73 | "|---------- |---------- |---------- |---------- |------ |\n",
74 | "|ß (U+00DF) |ß |Ss (U+0053 U+0073) |SS (U+0053 U+0053) |uppercases to SS |\n",
75 | "|İ (U+0130) |i̇ (U+0069 U+0307) |İ (U+0130) |İ (U+0130) |lowercasing preserves canonical equivalence via U+0307 |\n",
76 | "\n",
75 | "#### Latin script"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 20,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "name": "stdout",
85 | "output_type": "stream",
86 | "text": [
87 | "ß (00DF) ⇒ SS (0053 0053)\n",
88 | "Titlecase: should not appear word initial.\n",
89 | "i̇ (0069 0307) ⇐ İ (0130)\n",
90 | "Titlecase: İ (0049 0307)\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "# ß\n",
96 | "ESZETT = \"ß\"\n",
97 | "print(f'{ESZETT} ({cp(ESZETT)}) ⇒ {ESZETT.upper()} ({cp(ESZETT.upper())})')\n",
98 | "print(\"Titlecase: should not appear word initial.\")\n",
99 | "\n",
100 | "# I WITH DOT ABOVE\n",
101 | "IDOT = \"\\u0130\"\n",
102 | "print(f'{IDOT.lower()} ({cp(IDOT.lower())}) ⇐ {IDOT} ({cp(IDOT)})')\n",
103 | "print(f'Titlecase: {\"i̇\".title()} ({cp(\"i̇\".title())})')"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "Note that Python titlecasing does not resolve back to the precomposed U+0130. This is part of a wider issue with Python titlecasing: unlike uppercasing and lowercasing, titlecasing does not adhere to the Unicode specification.\n",
111 | "\n",
112 | "If we take the name of the Turkish city İstanbul:"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 26,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "İstanbul: 0130 0073 0074 0061 006E 0062 0075 006C\n",
125 | "i̇stanbul: 0069 0307 0073 0074 0061 006E 0062 0075 006C\n",
126 | "Titlecase: İStanbul (0049 0307 0053 0074 0061 006E 0062 0075 006C)\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "print(f'İstanbul: {cp(\"İstanbul\")}')\n",
132 | "istanbul = \"İstanbul\".lower()\n",
133 | "print(f'{istanbul}: {cp(istanbul)}')\n",
134 | "istanbul_title = istanbul.title()\n",
135 | "print(f'Titlecase: {istanbul_title} ({cp(istanbul_title)})')"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "The first three characters in the titlecased string are U+0049 U+0307 U+0053. Python titlecases the first alphabetic character after a non-alphabetic character, and combining diacritics are not considered alphabetic characters:"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 27,
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "data": {
152 | "text/plain": [
153 | "False"
154 | ]
155 | },
156 | "execution_count": 27,
157 | "metadata": {},
158 | "output_type": "execute_result"
159 | }
160 | ],
161 | "source": [
162 | "istanbul.isalpha()"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "So __i__ is uppercased to __I__, U+0307 is treated as a non-alphabetic character, and the titlecasing operation titlecases the __s__, giving us İStanbul as the titlecased version of the string.\n",
170 | "\n",
171 | "It is important to note that the Unicode definition also excludes marks, such as combining diacritics, but Unicode titlecasing does not apply an alphabetic mask in the way Python's `str.title()` does."
172 | ]
173 | }
174 | ],
175 | "metadata": {
176 | "kernelspec": {
177 | "display_name": "Python 3.8.1 ('el')",
178 | "language": "python",
179 | "name": "python3"
180 | },
181 | "language_info": {
182 | "codemirror_mode": {
183 | "name": "ipython",
184 | "version": 3
185 | },
186 | "file_extension": ".py",
187 | "mimetype": "text/x-python",
188 | "name": "python",
189 | "nbconvert_exporter": "python",
190 | "pygments_lexer": "ipython3",
191 | "version": "3.8.1"
192 | },
193 | "orig_nbformat": 4,
194 | "vscode": {
195 | "interpreter": {
196 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3"
197 | }
198 | }
199 | },
200 | "nbformat": 4,
201 | "nbformat_minor": 2
202 | }
203 |
--------------------------------------------------------------------------------
/py/am_ET_numbers_icu.py:
--------------------------------------------------------------------------------
1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag
2 | # lang = "hi-IN-u-nu-deva"
3 | # lang = "en-IN"
4 | lang = input("Enter language tag: ")
5 | LOC = Locale.createCanonical(lang)
6 |
7 | number = 123452.54
8 | formatter = LocalizedNumberFormatter(LOC)
9 | r = formatter.formatDouble(number)
10 | print(r)
11 | # १,२३,४५२.५४
12 |
13 | rb_formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC)
14 | r2 = rb_formatter.format(number)
15 | print(r2)
16 | # एक लाख तेईस हज़ार चार सौ बावन दशमलव पाँच चार
17 |
18 | r3 = rb_formatter.parse(r2)
19 | print(Formattable.getDouble(r3))
20 | # 123452.54
21 |
22 |
--------------------------------------------------------------------------------
/py/am_ET_numbers_icu_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/am_ET_numbers_icu_1.png
--------------------------------------------------------------------------------
/py/am_ET_numbers_icu_1.py:
--------------------------------------------------------------------------------
1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag
2 | lang = "am-ET-u-nu-ethi"
3 | LOC = Locale.createCanonical(lang)
4 | number = 123452
5 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC)
6 |
7 | #
8 | # Spellout (in Amharic)
9 | #
10 | r = formatter.format(number)
11 | print(r)
12 | # መቶ ሁለት አስር ሦስት ሺ አራት መቶ አምስት አስር ሁለት
13 |
14 | #
15 | # Convert back
16 | #
17 | n = formatter.parse(r)
18 | print(n)
19 | # 123,452
20 | print(Formattable.getInt64(n))
21 | # 123452
22 |
23 |
24 |
--------------------------------------------------------------------------------
/py/am_ET_numbers_icu_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/am_ET_numbers_icu_2.png
--------------------------------------------------------------------------------
/py/am_ET_numbers_icu_2.py:
--------------------------------------------------------------------------------
1 | from icu import Locale, RuleBasedNumberFormat, URBNFRuleSetTag
2 | lang = "am-ET-u-nu-ethi"
3 | LOC = Locale.createCanonical(lang)
4 | number = 123452
5 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, LOC)
6 | formatter.setDefaultRuleSet('%ethiopic')
7 | r = formatter.format(number)
8 | print(r)
9 | # ፲፪፼፴፬፻፶፪
10 |
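11 | # Reading the result: ፲፪ (12) × ፼ (10,000) + ፴፬ (34) × ፻ (100) + ፶፪ (52) = 123452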
11 | # http://www.geez.org/Numerals/NumberSamples.html
12 |
13 |
14 | def toEthiopicNS(n):
15 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, Locale("am_ET"))
16 | formatter.setDefaultRuleSet('%ethiopic')
17 | return formatter.format(n)
18 |
19 | import pytest
20 | arabic_numbers = [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 100010000, 100100000, 100200000, 100110000, 1, 11, 111, 1111, 11111, 111111, 1111111, 11111111, 111111111, 1111111111, 11111111111, 111111111111, 1111111111111, 1, 12, 123, 1234, 12345, 7654321, 17654321, 51615131, 15161513, 10101011, 101, 1001, 1010, 1011, 1100, 1101, 1111, 10001, 10010, 10100, 10101, 10110, 10111, 100001, 100010, 100011, 100100, 101010, 1000001, 1000101, 1000100, 1010000, 1010001, 1100001, 1010101, 101010101, 100010000, 100010100, 101010100, 3, 30, 33, 303, 3003, 3030, 3033, 3300, 3303, 3333, 30003, 30303, 300003, 303030, 3000003, 3000303, 3030003, 3300003, 3030303, 303030303, 333333333]
21 | ethiopic_numbers = ["፩", "፲", "፻", "፲፻", "፼", "፲፼", "፻፼", "፲፻፼", "፼፼", "፲፼፼", "፻፼፼", "፲፻፼፼", "፼፼፼", "፼፩፼", "፼፲፼", "፼፳፼", "፼፲፩፼", "፩", "፲፩", "፻፲፩", "፲፩፻፲፩", "፼፲፩፻፲፩", "፲፩፼፲፩፻፲፩", "፻፲፩፼፲፩፻፲፩", "፲፩፻፲፩፼፲፩፻፲፩", "፼፲፩፻፲፩፼፲፩፻፲፩", "፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፲፩፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፼፲፩፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፩", "፲፪", "፻፳፫", "፲፪፻፴፬", "፼፳፫፻፵፭", "፯፻፷፭፼፵፫፻፳፩", "፲፯፻፷፭፼፵፫፻፳፩", "፶፩፻፷፩፼፶፩፻፴፩", "፲፭፻፲፮፼፲፭፻፲፫", "፲፻፲፼፲፻፲፩", "፻፩", "፲፻፩", "፲፻፲", "፲፻፲፩", "፲፩፻", "፲፩፻፩", "፲፩፻፲፩", "፼፩", "፼፲", "፼፻", "፼፻፩", "፼፻፲", "፼፻፲፩", "፲፼፩", "፲፼፲", "፲፼፲፩", "፲፼፻", "፲፼፲፻፲", "፻፼፩", "፻፼፻፩", "፻፼፻", "፻፩፼", "፻፩፼፩", "፻፲፼፩", "፻፩፼፻፩", "፼፻፩፼፻፩", "፼፩፼", "፼፩፼፻", "፼፻፩፼፻", "፫", "፴", "፴፫", "፫፻፫", "፴፻፫", "፴፻፴", "፴፻፴፫", "፴፫፻", "፴፫፻፫", "፴፫፻፴፫", "፫፼፫", "፫፼፫፻፫", "፴፼፫", "፴፼፴፻፴", "፫፻፼፫", "፫፻፼፫፻፫", "፫፻፫፼፫", "፫፻፴፼፫", "፫፻፫፼፫፻፫", "፫፼፫፻፫፼፫፻፫", "፫፼፴፫፻፴፫፼፴፫፻፴፫"]
22 | converted = list(map(toEthiopicNS, arabic_numbers))
23 | converted == ethiopic_numbers
24 | # True
25 |
26 | def test_ethiopic_ns(l, r):
27 | converted = list(map(toEthiopicNS, l))
28 | assert converted == r
29 |
30 | test_ethiopic_ns(arabic_numbers, ethiopic_numbers)
31 |
32 | # * [icu::RuleBasedNumberFormat Class Reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html)
33 | # * [URBNFRuleSetTag](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/namespaceicu.html#a55dbbbdd4946251c23988013e06e695e)
34 |
35 |
36 | for n in range(formatter.getNumberOfRuleSetNames()):
37 | print(formatter.getRuleSetName(n))
38 | # %armenian-lower
39 | # %armenian-upper
40 | # %cyrillic-lower
41 | # %ethiopic
42 | # %georgian
43 | # %greek-lower
44 | # %greek-upper
45 | # %hebrew
46 | # %hebrew-item
47 | # %roman-lower
48 | # %roman-upper
49 | # %tamil
50 | # %zz-default
51 |
--------------------------------------------------------------------------------
/py/arabic_reshaper_example.py:
--------------------------------------------------------------------------------
1 | import arabic_reshaper
2 | from bidi.algorithm import get_display
3 |
4 | from el_internationalisation import cp, clean_presentation_forms
5 |
6 | def rtl_hack(text: str, arabic: bool = True) -> str:
7 | """Visually reorders Arabic or Hebrew script Unicode text
8 |
9 |     Visually reorders Arabic or Hebrew script Unicode text. For Arabic script
10 |     text, each character is substituted with its equivalent presentation form.
11 |     These modules are used to overcome the lack of bidirectional algorithm and
12 |     complex font rendering support in some libraries and terminals.
13 | 
14 |     It is better to use solutions that utilise proper bidirectional algorithm
15 |     and font rendering implementations. For matplotlib use the mplcairo backend
16 |     instead. For annotating images use Pillow. Both make use of libraqm.
17 |
18 | arabic_reshaper module converts Arabic characters to Arabic Presentation Forms:
19 | pip install arabic-reshaper
20 |
21 | bidi.algorithm module converts a logically ordered string to visually ordered
22 | equivalent.
23 | pip install python-bidi
24 |
25 |     Args:
26 |         text (str): logically ordered input string
27 |         arabic (bool): reshape into Arabic presentation forms if True; use False for Hebrew
28 |     Returns:
29 |         str: visually ordered string
30 | """
31 |     return get_display(arabic_reshaper.reshape(text)) if arabic else get_display(text)
32 |
33 | text = 'اللغة العربية رائعة'
34 | text_h = rtl_hack(text)
35 | print(text)
36 | print(cp(text))
37 | print(text_h)
38 | print(cp(text_h))
39 |
40 |
41 |
42 |
43 |
44 | s1 = "لا"
45 | s1_h = rtl_hack(s1)
46 | s2 = "لأ"
47 | s2_h = rtl_hack(s2)
48 |
49 | print("\n")
50 | print(s1)
51 | print(cp(s1))
52 | print(s1_h)
53 | print(cp(s1_h))
54 |
55 | print("\n")
56 | print(s2)
57 | print(cp(s2))
58 | print(s2_h)
59 | print(cp(s2_h))
60 |
61 |
62 | s3 = "עברית חדשה"
63 | s3_h = rtl_hack(s3, arabic=False)
64 | print("\n")
65 | print(s3)
66 | print(cp(s3))
67 | print(s3_h)
68 | print(cp(s3_h))
69 | # print(s3_h == s3[::-1])
70 |
71 |
72 | # Note s3[::-1] is used for reversing strings,
73 | # but for languages that use combining marks,
74 | # it is better to reverse grapheme clusters:
75 | #
76 | # from grapheme import graphemes
77 | # print(s3_h == "".join(list(graphemes(s3))[::-1]))
78 |
79 | from grapheme import graphemes
80 | def reverse_string(text: str, use_graphemes: bool = False) -> str:
81 | return "".join(list(graphemes(text))[::-1]) if use_graphemes else text[::-1]
82 |
83 | import regex as re
84 | def reverse_string_regex(text: str, use_graphemes: bool = False) -> str:
85 | return "".join(re.findall(r'\X', text)[::-1]) if use_graphemes else text[::-1]
86 |
87 | print("---")
88 | # print(s3_h == "".join(list(graphemes(s3))[::-1]))
89 | # print("\n")
90 | print(text_h == text[::-1])
91 | print(clean_presentation_forms(text_h) == text[::-1])
92 | # print(clean_presentation_forms(text_h) == "".join(list(graphemes(text))[::-1]))
93 |
94 |
95 | print(clean_presentation_forms(text_h) == reverse_string(text))
96 | print(clean_presentation_forms(text_h) == reverse_string(text, use_graphemes=True))
97 |
98 | print(clean_presentation_forms(text_h) == reverse_string_regex(text))
99 | print(clean_presentation_forms(text_h) == reverse_string_regex(text, use_graphemes=True))
--------------------------------------------------------------------------------
/py/hi_IN_numbers_icu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/hi_IN_numbers_icu.png
--------------------------------------------------------------------------------
/py/hi_IN_numbers_icu.py:
--------------------------------------------------------------------------------
1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag
2 | # lang = "hi-IN-u-nu-deva"
3 | # lang = "en-IN"
4 | lang = input("Enter language tag: ")
5 | LOC = Locale.createCanonical(lang)
6 |
7 | number = 123452.54
8 | formatter = LocalizedNumberFormatter(LOC)
9 | r = formatter.formatDouble(number)
10 | print(r)
11 | # १,२३,४५२.५४
12 |
13 | rb_formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC)
14 | r2 = rb_formatter.format(number)
15 | print(r2)
16 | # एक लाख तेईस हज़ार चार सौ बावन दशमलव पाँच चार
17 |
18 | r3 = rb_formatter.parse(r2)
19 | print(Formattable.getDouble(r3))
20 | # 123452.54
21 |
22 |
--------------------------------------------------------------------------------
/py/matplotlib_kurdish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/matplotlib_kurdish.png
--------------------------------------------------------------------------------
/py/matplotlib_kurdish.py:
--------------------------------------------------------------------------------
1 | #
2 | # matplotlib_kurdish.py
3 | #
4 | # This script will read in and process a Sorani Kurdish TSV file.
5 | #
6 | # mplcairo supports a number of backends.
7 | #
8 | # If you wish to save plot as an image, rather than display plot
9 | # use module://mplcairo.base
10 | #
11 | # Depending on your OS and system configuration a number of
12 | # backends that render to widgets are available:
13 | # * module://mplcairo.gtk (used below for non-macOS installs)
14 | # * module://mplcairo.gtk_native
15 | # * module://mplcairo.qt
16 | # * module://mplcairo.tk
17 | # * module://mplcairo.wx
18 | # * module://mplcairo.macosx (used below for macOS)
19 |
20 | import pandas as pd
21 | import locale, platform
22 | import gi
23 | import mplcairo
24 | import matplotlib as mpl
25 | if platform.system() == "Darwin":
26 | mpl.use("module://mplcairo.macosx")
27 | else:
28 | gi.require_version("Gtk", "3.0")
29 | mpl.use("module://mplcairo.gtk")
30 | # mpl.use("module://mplcairo.qt")
31 | import matplotlib.pyplot as plt
32 | import matplotlib.ticker as ticker
33 | import seaborn as sns
34 | import unicodedata as ud, regex as re
35 |
36 | # Convert non-Western Arabic digits to Western Arabic digits
37 | def convert_digits(s, sep = (",", ".")):
38 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$')
39 | tsep, dsep = sep
40 | if nd.match(s):
41 | s = s.replace(tsep, "")
42 | s = ''.join([str(ud.decimal(c, c)) for c in s])
43 | if dsep in s:
44 | return float(s.replace(dsep, ".")) if dsep != "." else float(s)
45 | return int(s)
46 | return s
47 |
48 | # Specify the grouping and decimal separators used in the data
49 | seps = ("\u066C", "\u066B")
50 | # convert hyphen entries to Eastern Arabic zero, then pass to convert_digits()
51 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps)
52 |
53 | # Convert Western Arabic digits to Eastern Arabic digits for tick labels
54 | def convert_to_sorani_ns(n, p=None, scale=None):
55 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
56 | decimal_places = 2
57 | n = n * scale if scale else n
58 | format_string = '%0.' + str(decimal_places) + 'f' if type(n) == float else '%d'
59 | n = locale.format_string(format_string, n, grouping=True, monetary=True)
60 | n = n.replace(",", "ṯ").replace(".", "ḏ")
61 | sep = ["\u066C", "\u066B"]
62 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩")
63 | locale.setlocale(locale.LC_ALL, "")
64 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1])
65 |
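66 | # For example (assuming the en_US.UTF-8 locale is available on the system):
67 | #   convert_to_sorani_ns(26712000, scale=0.000001) -> '٢٦٫٧١'
68 |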
66 | # import data
67 | import pandas as pd
68 | conv = {
69 | 'سووریا': digitsconv,
70 | 'عێراق': digitsconv,
71 | 'ئێران': digitsconv,
72 | 'تورکیا': digitsconv,
73 | 'جیھانی': digitsconv
74 | }
75 | df = pd.read_table("../data/demographics.tsv", converters=conv)
76 | print(df)
77 |
78 | # get sum of each column
79 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"]
80 | total_df = df[col_list].sum(axis=0)
81 | print(total_df)
82 |
83 |
84 |
85 |
86 | fig, axes = plt.subplots(1,2)
87 | plt.rcParams.update({'font.family':'Vazirmatn'})
88 |
89 | # axes[0] - subplot with default (LTR) layout
90 | axes[0].bar(total_df.index, total_df.values, color='royalblue', alpha=0.7)
91 | axes[0].grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7)
92 | axes[0].set_xlabel("ناوچە", size=12)
93 | axes[0].set_ylabel("ڕێژەی دانیشتووان (بە ملیۆن)", size=12)
94 | axes[0].set_title('ڕێژەی دانیشتووانی کورد', size=15)
95 |
96 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001))
97 | axes[0].get_yaxis().set_major_formatter(ns_formatter)
98 |
99 | # axes[1] - subplot with RTL layout
100 | axes[1].bar(total_df.index, total_df.values, color='royalblue', alpha=0.7)
101 | axes[1].grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7)
102 |
103 | # move y axis and associated label to right of plot
104 | axes[1].yaxis.tick_right()
105 | axes[1].yaxis.set_label_position("right")
106 | # invert x-axis
107 | #plt.gca().invert_xaxis()
108 | axes[1].invert_xaxis()
109 | axes[1].set_xlabel("ناوچە", size=12)
110 | axes[1].set_ylabel("ڕێژەی دانیشتووان (بە ملیۆن)", size=12, labelpad=10)
111 | axes[1].set_title('ڕێژەی دانیشتووانی کورد', size=15)
112 | axes[1].get_yaxis().set_major_formatter(ns_formatter)
113 |
114 | # block=True required for running script in CLI when outputting canvas to widget.
115 | plt.tight_layout()
116 | plt.show(block=True)
--------------------------------------------------------------------------------
/py/pandas_plot_kurdish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/pandas_plot_kurdish.png
--------------------------------------------------------------------------------
/py/pandas_plot_kurdish.py:
--------------------------------------------------------------------------------
1 | #
2 | # pandas_plot_kurdish.py
3 | #
4 | # This script will read in and process a Sorani Kurdish TSV file.
5 | #
6 | # mplcairo supports a number of backends.
7 | #
8 | # If you wish to save plot as an image, rather than display plot
9 | # use module://mplcairo.base
10 | #
11 | # Depending on your OS and system configuration a number of
12 | # backends that render to widgets are available:
13 | # * module://mplcairo.gtk (used below for non-macOS installs)
14 | # * module://mplcairo.gtk_native
15 | # * module://mplcairo.qt
16 | # * module://mplcairo.tk
17 | # * module://mplcairo.wx
18 | # * module://mplcairo.macosx (used below for macOS)
19 |
20 | import pandas as pd
21 | import locale, platform
22 | import gi
23 | import mplcairo
24 | import matplotlib as mpl
25 | if platform.system() == "Darwin":
26 | mpl.use("module://mplcairo.macosx")
27 | else:
28 | gi.require_version("Gtk", "3.0")
29 | mpl.use("module://mplcairo.gtk")
30 | # mpl.use("module://mplcairo.qt")
31 | import matplotlib.pyplot as plt
32 | import matplotlib.ticker as ticker
33 | import seaborn as sns
34 | import unicodedata as ud, regex as re
35 |
36 | # Convert non-Western Arabic digits to Western Arabic digits
37 | def convert_digits(s, sep = (",", ".")):
38 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$')
39 | tsep, dsep = sep
40 | if nd.match(s):
41 | s = s.replace(tsep, "")
42 | s = ''.join([str(ud.decimal(c, c)) for c in s])
43 | if dsep in s:
44 | return float(s.replace(dsep, ".")) if dsep != "." else float(s)
45 | return int(s)
46 | return s
47 |
48 | # Specify the grouping and decimal separators used in the data
49 | seps = ("\u066C", "\u066B")
50 | # convert hyphen entries to Eastern Arabic zero, then pass to convert_digits()
51 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps)
52 |
53 | # Convert Western Arabic digits to Eastern Arabic digits for tick labels
54 | def convert_to_sorani_ns(n, p=None, scale=None):
55 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
56 | decimal_places = 2
57 | n = n * scale if scale else n
58 | format_string = '%0.' + str(decimal_places) + 'f' if type(n) == float else '%d'
59 | n = locale.format_string(format_string, n, grouping=True, monetary=True)
60 | n = n.replace(",", "ṯ").replace(".", "ḏ")
61 | sep = ["\u066C", "\u066B"]
62 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩")
63 | locale.setlocale(locale.LC_ALL, "")
64 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1])
65 |
66 | # import data
67 | import pandas as pd
68 | conv = {
69 | 'سووریا': digitsconv,
70 | 'عێراق': digitsconv,
71 | 'ئێران': digitsconv,
72 | 'تورکیا': digitsconv,
73 | 'جیھانی': digitsconv
74 | }
75 | df = pd.read_table("../data/demographics.tsv", converters=conv)
76 | print(df)
77 |
78 | # get sum of each column
79 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"]
80 | total_df = df[col_list].sum(axis=0)
81 | print(total_df)
82 |
83 | plt.figure()
84 | plt.rcParams.update({'font.family':'Vazirmatn'})
85 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001))
86 |
87 | plt.subplot(1, 2, 1)
88 | ax1 = total_df.plot(kind="bar", title='ڕێژەی دانیشتووانی کورد', xlabel="ناوچە", ylabel="ڕێژەی دانیشتووان (بە ملیۆن)", rot=0)
89 | ax1.get_yaxis().set_major_formatter(ns_formatter)
90 |
91 | plt.subplot(1, 2, 2)
92 | ax2 = total_df.plot(kind="bar", title='ڕێژەی دانیشتووانی کورد', xlabel="ناوچە", ylabel="ڕێژەی دانیشتووان (بە ملیۆن)", rot=0)
93 | ax2.get_yaxis().set_major_formatter(ns_formatter)
94 | # move y axis and associated label to right of plot
95 | ax2.yaxis.tick_right()
96 | ax2.yaxis.set_label_position("right")
97 | # invert x-axis
98 | #plt.gca().invert_xaxis()
99 | ax2.invert_xaxis()
100 |
101 | plt.tight_layout()
102 | plt.show(block=True)
--------------------------------------------------------------------------------
/py/pyuca_test.py:
--------------------------------------------------------------------------------
1 | import pyuca
2 | test_list = ["₨", "Z", "ز", "z", "ر", "٨", "R", "﷼"]
3 | ducet_rules = "../rules/collation/allkeys_DUCET.txt"
4 | cldr_rules = "../rules/collation/allkeys_CLDR.txt"
5 | ducet_collator = pyuca.Collator(ducet_rules)
6 | cldr_collator = pyuca.Collator(cldr_rules)
7 |
8 | sorted_default = sorted(test_list)
9 | print(sorted_default)
10 | sorted_ducet = sorted(test_list, key=ducet_collator.sort_key)
11 | print(sorted_ducet)
12 | sorted_cldr = sorted(test_list, key=cldr_collator.sort_key)
13 | print(sorted_cldr)
14 |
15 | from icu import Locale, Collator
16 | loc = Locale.getRoot()
17 | collator = Collator.createInstance(loc)
18 | sorted_icu_root = sorted(test_list, key=collator.getSortKey)
19 | print(sorted_icu_root)
20 |
21 | print(sorted_icu_root == sorted_cldr)
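22 |
23 | # Note: allkeys_CLDR.txt is the DUCET as tailored for the CLDR root collation,
24 | # which is what ICU's root-locale collator implements, so the final
25 | # comparison is expected to print True.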
--------------------------------------------------------------------------------
/py/seaborn_kurdish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/seaborn_kurdish.png
--------------------------------------------------------------------------------
/py/seaborn_kurdish.py:
--------------------------------------------------------------------------------
1 | #
2 | # seaborn_kurdish.py
3 | #
4 | # This script will read in and process a Sorani Kurdish TSV file.
5 | # Two plots will be generated (a LTR layout and a RTL layout).
6 | #
7 | # mplcairo supports a number of backends.
8 | #
9 | # If you wish to save plot as an image, rather than display plot
10 | # use module://mplcairo.base
11 | #
12 | # Depending on your OS and system configuration a number of
13 | # backends that render to widgets are available:
14 | # * module://mplcairo.gtk (used below for non-macOS installs)
15 | # * module://mplcairo.gtk_native
16 | # * module://mplcairo.qt
17 | # * module://mplcairo.tk
18 | # * module://mplcairo.wx
19 | # * module://mplcairo.macosx (used below for macOS)
20 |
21 | import pandas as pd
22 | import locale, platform
23 | import gi
24 | import mplcairo
25 | import matplotlib as mpl
26 | if platform.system() == "Darwin":
27 | mpl.use("module://mplcairo.macosx")
28 | else:
29 | gi.require_version("Gtk", "3.0")
30 | mpl.use("module://mplcairo.gtk")
31 | # mpl.use("module://mplcairo.qt")
32 | import matplotlib.pyplot as plt
33 | import matplotlib.ticker as ticker
34 | import seaborn as sns
35 | import unicodedata as ud, regex as re
36 |
37 | # Convert non-Western Arabic digits to Western Arabic digits
38 | def convert_digits(s, sep = (",", ".")):
39 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$')
40 | tsep, dsep = sep
41 | if nd.match(s):
42 | s = s.replace(tsep, "")
43 | s = ''.join([str(ud.decimal(c, c)) for c in s])
44 | if dsep in s:
45 | return float(s.replace(dsep, ".")) if dsep != "." else float(s)
46 | return int(s)
47 | return s
48 |
49 | # Specify the grouping and decimal separators used in the data
50 | seps = ("\u066C", "\u066B")
51 | # convert hyphen entries to Eastern Arabic zero, then pass to convert_digits()
52 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps)
53 |
54 | # Convert Western Arabic digits to Eastern Arabic digits for tick labels
55 | def convert_to_sorani_ns(n, p=None, scale=None):
56 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
57 | decimal_places = 2
58 | n = n * scale if scale else n
59 | format_string = '%0.' + str(decimal_places) + 'f' if type(n) == float else '%d'
60 | n = locale.format_string(format_string, n, grouping=True, monetary=True)
61 | n = n.replace(",", "ṯ").replace(".", "ḏ")
62 | sep = ["\u066C", "\u066B"]
63 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩")
64 | locale.setlocale(locale.LC_ALL, "")
65 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1])
66 |
67 | # import data
68 | import pandas as pd
69 | conv = {
70 | 'سووریا': digitsconv,
71 | 'عێراق': digitsconv,
72 | 'ئێران': digitsconv,
73 | 'تورکیا': digitsconv,
74 | 'جیھانی': digitsconv
75 | }
76 | df = pd.read_table("../data/demographics.tsv", converters=conv)
77 | print(df)
78 |
79 | # get sum of each column
80 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"]
81 | total_df = df[col_list].sum(axis=0)
82 | print(total_df)
83 |
84 | # Plot data. First subplot (axes[0]) is default layout, second subplot (axes[1]) is an RTL layout
85 | sns.set_style('darkgrid')
86 | sns.set_context({"font.family": "Vazirmatn"})
87 | fig, axes = plt.subplots(1,2)
88 | sns.barplot(x=total_df.index, y=total_df.values, ax=axes[0])
89 | sns.barplot(x=total_df.index, y=total_df.values, ax=axes[1])
90 |
91 | # set common labels for X and Y axes.
92 | plt.setp(axes, xlabel="ناوچە")
93 | plt.setp(axes, ylabel="ڕێژەی دانیشتووان (بە ملیۆن)")
94 | # Set single title for all subplots
95 | fig.suptitle('ڕێژەی دانیشتووانی کورد')
96 |
97 | # Define and apply conversion to tick labels for both axes
98 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001))
99 | axes[0].get_yaxis().set_major_formatter(ns_formatter)
100 | axes[1].get_yaxis().set_major_formatter(ns_formatter)
101 |
102 | # move y axis and associated label to right of axes[1]
103 | axes[1].yaxis.tick_right()
104 | axes[1].yaxis.set_label_position("right")
105 | # invert x-axis for axes[1]
106 | #plt.gca().invert_xaxis()
107 | axes[1].invert_xaxis()
108 |
109 | # block=True required for running script in CLI when outputting canvas to widget.
110 | plt.show(block=True)
--------------------------------------------------------------------------------
/py/wordcloud_kurdish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/wordcloud_kurdish.png
--------------------------------------------------------------------------------
/py/wordcloud_kurdish.py:
--------------------------------------------------------------------------------
1 | import gi, platform, os
2 | import mplcairo
3 | import matplotlib as mpl
4 | if platform.system() == "Darwin":
5 | mpl.use("module://mplcairo.macosx")
6 | else:
7 | gi.require_version("Gtk", "3.0")
8 | mpl.use("module://mplcairo.gtk")
9 | # mpl.use("module://mplcairo.qt")
10 | import matplotlib.pyplot as plt
11 | from wordcloud import WordCloud
12 |
13 | # Stopword list from klpt (Kurdish Language Processing Toolkit)
14 | # Available stopword lists: Sorani (Arabic) and Kurmanji (Latin)
15 | def get_kurdish_stopwords(dialect, script):
16 | from urllib.request import urlopen
17 | import json
18 | url = "https://raw.githubusercontent.com/sinaahmadi/klpt/master/klpt/data/stopwords.json"
19 | response = urlopen(url)
20 | data_json = json.loads(response.read())
21 | return set(data_json[dialect][script])
22 |
23 | ckb_stopwords = get_kurdish_stopwords("Sorani", "Arabic")
24 | text = """
25 | زمانی کوردی
26 | لە ئینسایکڵۆپیدیای ئازادی ویکیپیدیاوە
27 | ئەم وتارە سەبارەت بە زمانی کوردی نووسراوە. بۆ شاعیرە کوردەکە، بڕوانە کوردی (شاعیر). بۆ وتارە ھاوشێوەکان، بڕوانە کوردی (ڕوونکردنەوە).
28 | زمانی کوردی (بە کرمانجی، بە سۆرانی: زمانی کوردی، بە کەڵهوڕی: زوان کوردی، بە لەکی: زوۆن کوردی، بە زازاکی، بە ھەورامی: زوانو کوردی) زمانێکە کە خەڵکی کورد قسەی پێدەکەن. لە ڕووی بنەماڵەوە بەشێکە لە زمانە ھیندوئەورووپایییەکان. ئەم زمانە لە زمانی کەڤناری مادی کەوتووەتەوە. زمانی کوردی لە نێوان زمانە ئێرانییەکاندا لە بواری پرژماربوونی ئاخێوەران سێیەمین زمانە و دەکەوێتە دوای زمانەکانی فارسی و پەشتۆ.
29 | شێوەزارەکانی کوردی
30 | وتاری سەرەکی: شێوەزارەکانی زمانی کوردی
31 | زمانی کوردی چەند شێوەزارێکی سەرەکی ھەیە کە جیاوازیی زۆریان ھەیە و زمانناسەکان لە سەر چۆنیەتی جیاکردنەوەی ئەم شێوەزارانە یەکدەنگ نین و زۆرێک لە زمانناسەکان باوەڕییان بە ماڵباتی زمانگەلی کوردی ھەیە. یانی کورمانجیی باکووری و گۆرانی، بە پێی یاسا و ڕێسای زمانناسی و زمانەوانییەوە، دو زمانی سەربەخۆی کوردینە، نەک دو شێوەزار. بەڵام زۆربەی ئەو کەسانەی زمانی(زمانەکانی) کوردییان دابەش کردووە، بەم چوار دەستەیە بووە
32 | کوردیی باکووری
33 | کوردیی ناوەندی
34 | کوردیی باشووری
35 | گۆرانی-زازایی
36 | ھەندێک لە زمانناسان، لوڕیش وەک شێوەزارێکی زمانی کوردی پۆلبەند دەکەن. ئەگەر چی لوڕی ژمارەیەکی زۆری وشەی کوردی تێدایە، بەڵام ھێشتاش لێکۆلینەوەیەکی ئەوتۆ لە سەر لوڕی لە بەر دەستدا نییە.
37 | ئەلفوبێی کوردی
38 | وتار سەرەکییەکان: ئەلفوبێکانی کوردی و ئەلفوبێی عەرەبیی زمانی کوردی
39 | بەھۆی ئەوەی کە کوردەکان لە ژێر دەسەڵاتی عوسمانی و ئێران بوون و ئەلفوبێی فەرمیی ئەو دوو وڵاتە ئەلفوبێی عەرەبی بوو، کوردەکانیش تا پێش سییەکان تەنیا ئەلفوبێی عەرەبییان بۆ نووسینی کوردی بەکار دەھێنا. لە تورکیا، لە دوای بە فەرمیکردنی ئەلفوبێی لاتینی بۆ زمانی تورکی، جەلادەت عەلی بەدرخان لە ساڵی ١٩٣٢ ئەلفوبێیەکی لاتینیی بۆ زمانی کوردی داھێنا کە ئێستا بە ناوی "ئەلفوبێی ھاوار" یان "بەدرخان" دەناسرێت.
40 | """
41 |
42 | font_file = os.path.expanduser("~/.local/share/fonts/fontamin/TrueType/Estedad/Estedad_Regular.ttf")
43 |
44 | word_cloud = WordCloud(font_path=font_file, collocations = False, background_color = 'white', stopwords=ckb_stopwords).generate(text)
45 | plt.imshow(word_cloud, interpolation='bilinear')
46 | plt.axis("off")
47 | plt.show(block=True)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gcld3==3.0.13
2 | grapheme==0.6.0
3 | LaoNLP==0.2.dev5
4 | nltk>=3.6.4
5 | pandas==1.1.3
6 | PyICU==2.7.4
7 | pyidaungsu==0.0.9
8 | pythainlp==2.3.1
9 | python-myanmar==1.10.0
10 | pyuca==1.2
11 | regex==2020.4.4
12 | tangled-up-in-unicode==0.0.6
13 | unicodedata2==13.0.0.post2
14 | unicodedataplus==13.0.0.post2
15 |
--------------------------------------------------------------------------------
/rules/collation/README.md:
--------------------------------------------------------------------------------
1 | # Collation rules
2 |
3 | Unicode 15.0.0 \
4 | CLDR v41
5 |
6 | __Collation data:__
7 |
8 | * [allkeys_CLDR.txt](https://github.com/unicode-org/cldr/blob/main/common/uca/allkeys_CLDR.txt)
9 | * [allkeys_DUCET.txt](https://www.unicode.org/Public/UCA/latest/allkeys.txt)
10 | * [CLDR collation rules per locale](https://github.com/unicode-org/cldr/tree/release-42-beta2/common/collation)
11 |
12 |
13 | __Other links:__
14 |
15 | * [CLDR versions](https://cldr.unicode.org/index/downloads)
16 | * [CLDR (GitHub)](https://github.com/unicode-org/cldr)
17 | * [CLDR development version](https://cldr.unicode.org/index/downloads/dev)
18 | * [UCA data - latest](https://www.unicode.org/Public/UCA/latest/)
19 | * [UCD data and charts - latest](https://www.unicode.org/Public/UCD/latest/)
20 | * [Collator demo](https://icu4c-demos.unicode.org/icu-bin/collation.html)
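21 |
22 | __Example:__
23 |
24 | A minimal sketch of using one of these data files with [pyuca](https://github.com/jtauber/pyuca) (see `py/pyuca_test.py` in this repository):
25 |
26 | ```python
27 | import pyuca
28 | collator = pyuca.Collator("allkeys_CLDR.txt")
29 | print(sorted(["ز", "ر", "z", "R"], key=collator.sort_key))
30 | ```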
--------------------------------------------------------------------------------
/rules/collation/cldr/ckb.xml:
--------------------------------------------------------------------------------
[File content lost in extraction (XML tags stripped). ckb.xml holds the CLDR collation tailoring for Central Kurdish; the equivalent rules appear in rules/collation/icu/ckb.txt.]
--------------------------------------------------------------------------------
/rules/collation/cldr/ckb_IQ.xml:
--------------------------------------------------------------------------------
[File content lost in extraction (XML tags stripped). ckb_IQ.xml holds the CLDR collation tailoring for ckb_IQ; the equivalent ICU rules ("[import ckb]") appear in rules/collation/icu/ckb_IQ.txt.]
--------------------------------------------------------------------------------
/rules/collation/cldr/ckb_IR.xml:
--------------------------------------------------------------------------------
[File content lost in extraction (XML tags stripped). ckb_IR.xml holds the CLDR collation tailoring for ckb_IR.]
--------------------------------------------------------------------------------
/rules/collation/collation_rules.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #
3 | # Collation _rules
4 | #
5 | ###############################
6 |
7 | # Akan (ak, fat, twi, wss)
8 | ak_rules = fat_rules = twi_rules = wss_rules = (
9 | "&E<ɛ<<<Ɛ"
10 | "&O<ɔ<<<Ɔ"
11 | )
12 |
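13 | # Example (sketch): rule strings like these can be compiled with PyICU, e.g.
14 | #   from icu import RuleBasedCollator
15 | #   collator = RuleBasedCollator(ak_rules)
16 | #   sorted(["ɔ", "e", "o", "ɛ"], key=collator.getSortKey)
17 |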
13 | # Dinka (din, dip, diw, dib, dks, dik)
14 | din_rules = dip_rules = diw_rules = dib_rules = dks_rules = dik_rules = (
15 | "[normalization on]"
16 | "&A<
[Remainder of collation_rules.py and the start of /rules/collation/glibc/ckb_IQ@academy were lost in extraction.]
--------------------------------------------------------------------------------
/rules/collation/glibc/ckb_IQ@academy:
--------------------------------------------------------------------------------
59 | from "" % Double ARABIC LETTER WAW
60 |
61 | reorder-after % ARABIC LETTER REH
62 | % ARABIC LETTER REH WITH SMALL V BELOW
63 |
64 | reorder-after % ARABIC LETTER WAW
65 | % Double ARABIC LETTER WAW
66 |
67 | reorder-end
68 |
69 | END LC_COLLATE
70 |
71 | LC_MONETARY
72 | copy "ckb_IQ"
73 | END LC_MONETARY
74 |
75 | LC_NUMERIC
76 | copy "ckb_IQ"
77 | END LC_NUMERIC
78 |
79 | LC_TIME
80 | copy "ckb_IQ"
81 | END LC_TIME
82 |
83 | LC_MESSAGES
84 | copy "ckb_IQ"
85 | END LC_MESSAGES
86 |
87 | LC_PAPER
88 | copy "ckb_IQ"
89 | END LC_PAPER
90 |
91 | LC_NAME
92 | copy "ckb_IQ"
93 | END LC_NAME
94 |
95 | LC_ADDRESS
96 | copy "ckb_IQ"
97 | END LC_ADDRESS
98 |
99 | LC_TELEPHONE
100 | copy "ckb_IQ"
101 | END LC_TELEPHONE
102 |
103 | LC_MEASUREMENT
104 | copy "ckb_IQ"
105 | END LC_MEASUREMENT
--------------------------------------------------------------------------------
/rules/collation/glibc/en_SS:
--------------------------------------------------------------------------------
1 | comment_char %
2 | escape_char /
3 |
4 | % This file is part of the GNU C Library and contains locale data.
5 | % The Free Software Foundation does not claim any copyright interest
6 | % in the locale data contained in this file. The foregoing does not
7 | % affect the license of the GNU C Library as a whole. It does not
8 | % exempt you from the conditions of the license if your use would
9 | % otherwise be governed by that license.
10 |
11 | LC_IDENTIFICATION
12 | title "English locale for South Sudan"
13 | source "CLDR"
14 | address ""
15 | contact "Andjc"
16 | email ""
17 | tel ""
18 | fax ""
19 | language "English"
20 | territory "South Sudan"
21 | revision "1.0"
22 | date "2022-10-13"
23 |
24 | category "i18n:2012";LC_IDENTIFICATION
25 | category "i18n:2012";LC_CTYPE
26 | category "i18n:2012";LC_COLLATE
27 | category "i18n:2012";LC_TIME
28 | category "i18n:2012";LC_NUMERIC
29 | category "i18n:2012";LC_MONETARY
30 | category "i18n:2012";LC_MESSAGES
31 | category "i18n:2012";LC_PAPER
32 | category "i18n:2012";LC_NAME
33 | category "i18n:2012";LC_ADDRESS
34 | category "i18n:2012";LC_TELEPHONE
35 | category "i18n:2012";LC_MEASUREMENT
36 | END LC_IDENTIFICATION
37 |
38 | LC_CTYPE
39 | copy "i18n"
40 |
41 | translit_start
42 | include "translit_combining";""
43 | translit_end
44 | END LC_CTYPE
45 |
46 | LC_COLLATE
47 | % Copy the template from ISO/IEC 14651
48 | copy "iso14651_t1"
49 | END LC_COLLATE
50 |
51 | LC_MONETARY
52 | int_curr_symbol "SSP "
53 | currency_symbol ""
54 | mon_decimal_point "."
55 | mon_thousands_sep ","
56 | mon_grouping 3;3
57 | positive_sign ""
58 | negative_sign "-"
59 | int_frac_digits 2
60 | frac_digits 2
61 | p_cs_precedes 1
62 | p_sep_by_space 0
63 | n_cs_precedes 1
64 | n_sep_by_space 0
65 | p_sign_posn 1
66 | n_sign_posn 1
67 | END LC_MONETARY
68 |
69 | LC_NUMERIC
70 | decimal_point "."
71 | thousands_sep ","
72 | grouping 3;3
73 | END LC_NUMERIC
74 |
75 | LC_TIME
76 | abday "Sun";"Mon";"Tue";"Wed";"Thu";"Fri";"Sat"
77 | day "Sunday";/
78 | "Monday";/
79 | "Tuesday";/
80 | "Wednesday";/
81 | "Thursday";/
82 | "Friday";/
83 | "Saturday"
84 | abmon "Jan";"Feb";/
85 | "Mar";"Apr";/
86 | "May";"Jun";/
87 | "Jul";"Aug";/
88 | "Sep";"Oct";/
89 | "Nov";"Dec"
90 | mon "January";/
91 | "February";/
92 | "March";/
93 | "April";/
94 | "May";/
95 | "June";/
96 | "July";/
97 | "August";/
98 | "September";/
99 | "October";/
100 | "November";/
101 | "December"
102 | d_t_fmt "%a %d %b %Y %T %Z"
103 | d_fmt "%d//%m//%y"
104 | t_fmt "%T"
105 | am_pm "am";"pm"
106 | t_fmt_ampm "%l:%M:%S %P %Z"
107 | date_fmt "%a %e %b %H:%M:%S %Z %Y"
108 | week 7;19971130;4
109 | first_weekday 2
110 | END LC_TIME
111 |
112 | LC_MESSAGES
113 | copy "en_US"
114 | END LC_MESSAGES
115 |
116 | LC_PAPER
117 | copy "i18n"
118 | END LC_PAPER
119 |
120 | LC_TELEPHONE
121 | tel_int_fmt "+%c %a %l"
122 | tel_dom_fmt "%A %l"
123 | int_select "00" % https://en.wikipedia.org/wiki/International_call_prefix, https://en.wikipedia.org/wiki/List_of_international_call_prefixes
124 | int_prefix "211" % https://en.wikipedia.org/wiki/List_of_country_calling_codes
125 | END LC_TELEPHONE
126 |
127 | LC_MEASUREMENT
128 | copy "i18n"
129 | END LC_MEASUREMENT
130 |
131 | LC_NAME
132 | copy "en_US"
133 | END LC_NAME
134 |
135 | LC_ADDRESS
136 | postal_fmt "%f%N%a%N%d%N%b%N%s %h %e %r%N%z %T%N%c%N"
137 | country_name "South Sudan"
138 | country_ab2 "SS" % https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
139 | country_ab3 "SSD"
140 | country_num 728
141 | country_car ""
142 | lang_name "English"
143 | lang_ab "en"
144 | lang_term "eng"
145 | lang_lib "eng"
146 | END LC_ADDRESS
147 |
148 | % https://man7.org/linux/man-pages/man5/locale.5.html
149 | % https://metacpan.org/dist/DateTime-Locale/view/lib/DateTime/Locale/en_SS.pod
150 | % https://www.localeplanet.com/icu/en-SS/index.html
151 | % https://www.iana.org/time-zones - https://www.timeanddate.com/worldclock/south-sudan/juba - Central African Time (CAT) - Africa/Juba - UTC+2
--------------------------------------------------------------------------------
/rules/collation/icu/ckb.txt:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | // Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
4 | ckb{
5 | collations{
6 | standard{
7 | Sequence{
8 | "[normalization on]"
9 | "[reorder Arab]"
10 | "&ر < ڕ"
11 | "&و < وو"
12 | }
13 | Version{"42"}
14 | }
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/rules/collation/icu/ckb_IQ.txt:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | // Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
4 | ckb_IQ{
5 | collations{
6 | standard{
7 | Sequence{"[import ckb]"}
8 | Version{"42"}
9 | }
10 | }
11 | }
--------------------------------------------------------------------------------
/rules/collation/sorani_alphabet.tsv:
--------------------------------------------------------------------------------
1 | Order Character Codepoint
2 | 1 ئ U+0626
3 | 2 ا U+0627
4 | 3 ب U+0628
5 | 4 پ U+067E
6 | 5 ت U+062A
7 | 6 ج U+062C
8 | 7 چ U+0686
9 | 8 ح U+062D
10 | 9 خ U+062E
11 | 10 د U+062F
12 | 11 ر U+0631
13 | 12 ڕ U+0695
14 | 13 ز U+0632
15 | 14 ژ U+0698
16 | 15 س U+0633
17 | 16 ش U+0634
18 | 17 ع U+0639
19 | 18 غ U+063A
20 | 19 ف U+0641
21 | 20 ڤ U+06A4
22 | 21 ق U+0642
23 | 22 ک U+06A9
24 | 23 گ U+06AF
25 | 24 ل U+0644
26 | 25 ڵ U+06B5
27 | 26 م U+0645
28 | 27 ن U+0646
29 | 28 ه U+0647
30 | 29 ە U+06D5
31 | 30 و U+0648
32 | 31	ۆ	U+06C6
33 | 32	وو	U+0648 U+0648
34 | 33 ی U+06CC
35 | 34 ێ U+06CE
--------------------------------------------------------------------------------
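The table above can double as a lightweight sort key. A minimal sketch with pandas, assuming the repository-relative path; note that a per-character lookup cannot handle the two-character وو entry:

```python
import pandas as pd

df = pd.read_csv("rules/collation/sorani_alphabet.tsv", sep="\t")
order = dict(zip(df["Character"], df["Order"]))

# Crude single-letter sort by alphabet position (unknown characters sort last).
letters = ["گ", "ئ", "ڕ", "ب"]
print(sorted(letters, key=lambda ch: order.get(ch, len(order) + 1)))
```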
/rules/collation/temp.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import regex as re\n",
10 | "text = \"ရန်ကုန်ကွန်ပျူတာတက္ကသိုလ်\""
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 7,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "Number of graphemes: 14\n",
23 | "Graphemes: ['ရ', 'န်', 'ကု', 'န်', 'ကွ', 'န်', 'ပျူ', 'တ', 'ာ', 'တ', 'က္', 'က', 'သို', 'လ်']\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "# split string into extended grapheme clusters\n",
29 | "graphemes = re.findall(r'\\X', text)\n",
30 | "print(f'Number of graphemes: {len(graphemes)}\\nGraphemes: {graphemes}')"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 8,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "Number of syllables: 7\n",
43 | "Syllables: ['န်', 'ုန်', 'ွန်', 'ျူ', 'ာ', 'က္က', 'ိုလ်']\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "# syllable segmentation with regex\n",
49 | "pattern = r'(?:(?= 60:
183 | formatter = LocalizedNumberFormatter(loc)
184 | r = formatter.formatDouble(digit) if isinstance(digit, float) else formatter.formatInt(digit)
185 | else:
186 | formatter = NumberFormat.createInstance(loc)
187 | r = formatter.format(digit)
188 | return r
189 |
190 |
--------------------------------------------------------------------------------
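The convert_digits fragment above branches on the ICU version: builds at 60 or later can use the NumberFormatter API (LocalizedNumberFormatter), while older builds fall back to NumberFormat. A minimal sketch of the fallback path only; the locale tag is illustrative:

```python
from icu import Locale, NumberFormat

# Pre-ICU-60 style locale-aware digit formatting.
formatter = NumberFormat.createInstance(Locale("hi_IN@numbers=deva"))
print(formatter.format(1234567))  # Devanagari digits with Indian-style grouping
```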
/snippets/data_cleaning.py:
--------------------------------------------------------------------------------
1 | ####################
2 | #
3 | # Data cleaning
4 | # © Enabling Languages 2022
5 | # Released under the MIT License.
6 | #
7 | ####################
8 |
9 | # import unicodedata as ud
10 | import unicodedataplus as ud
11 | import regex as re
12 | from icu import UnicodeString, Locale, Normalizer2, UNormalizationMode2
13 |
14 |
15 | #
16 | # Unicode normalisation
17 | # Simple wrappers for Unicode normalisation
18 | #
19 |
20 | def NFD(s, engine="ud"):
21 | if engine == "icu":
22 | normalizer = Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE)  # the "nfc" instance in DECOMPOSE mode performs NFD
23 | return normalizer.normalize(s)
24 | return ud.normalize('NFD', s)
25 |
26 | def NFKD(s, engine="ud"):
27 | if engine == "icu":
28 | normalizer = Normalizer2.getInstance(None, "nfkc", UNormalizationMode2.DECOMPOSE)
29 | return normalizer.normalize(s)
30 | return ud.normalize('NFKD', s)
31 |
32 | def NFC(s, engine="ud"):
33 | if engine == "icu":
34 | normalizer = Normalizer2.getInstance(None, "nfc", UNormalizationMode2.COMPOSE)
35 | return normalizer.normalize(s)
36 | return ud.normalize('NFC', s)
37 |
38 | def NFKC(s, engine="ud"):
39 | if engine == "icu":
40 | normalizer = Normalizer2.getInstance(None, "nfkc", UNormalizationMode2.COMPOSE)
41 | return normalizer.normalize(s)
42 | return ud.normalize('NFKC', s)
43 |
44 | #
45 | # Clean presentation forms
46 | #
47 | # For Latin and Armenian scripts, use either folding=True or folding=False (default),
48 | # while for Arabic and Hebrew scripts, use folding=False.
49 | #
50 |
51 | def has_presentation_forms(text):
52 | pattern = r'([\p{InAlphabetic_Presentation_Forms}\p{InArabic_Presentation_Forms-A}\p{InArabic_Presentation_Forms-B}]+)'
53 | return bool(re.findall(pattern, text))
54 |
55 | def clean_presentation_forms(text, folding=False):
56 | def clean_pf(match, folding):
57 | return match.group(1).casefold() if folding else ud.normalize("NFKC", match.group(1))
58 | pattern = r'([\p{InAlphabetic_Presentation_Forms}\p{InArabic_Presentation_Forms-A}\p{InArabic_Presentation_Forms-B}]+)'
59 | return re.sub(pattern, lambda match, folding=folding: clean_pf(match, folding), text)
60 |
61 | # PyICU Helper functions for casing and casefolding.
62 | # s is a string, l is an ICU locale object (defaulting to CLDR Root Locale)
63 | def toLower(s, l=Locale.getRoot()):
64 | return str(UnicodeString(s).toLower(l))
65 | def toUpper(s, l=Locale.getRoot()):
66 | return str(UnicodeString(s).toUpper(l))
67 | def toTitle(s, l=Locale.getRoot()):
68 | return str(UnicodeString(s).toTitle(l))
69 | def toSentence(s, l=Locale.getRoot()):
70 | return str(UnicodeString(s[:1]).toUpper(l)) + str(UnicodeString(s[1:]).toLower(l))
71 | def foldCase(s):
72 | return str(UnicodeString(s).foldCase())
73 |
74 | #
75 | # Turkish casing implemented without module dependencies.
76 | # PyICU provides a more comprehensive solution for Turkish.
77 | #
78 |
79 | # To lowercase
80 | def kucukharfyap(s):
81 | return ud.normalize("NFC", s).replace('İ', 'i').replace('I', 'ı').lower()
82 |
83 | # To uppercase
84 | def buyukharfyap(s):
85 | return ud.normalize("NFC", s).replace('ı', 'I').replace('i', 'İ').upper()
86 |
--------------------------------------------------------------------------------
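The Turkish helpers above can be sanity-checked against PyICU's locale-aware casing. A minimal sketch, assuming data_cleaning.py is importable from the current directory:

```python
from icu import Locale, UnicodeString
from data_cleaning import buyukharfyap, kucukharfyap  # assumed import path

print(kucukharfyap("DİYARBAKIR"))  # diyarbakır
print(buyukharfyap("istanbul"))    # İSTANBUL
# PyICU equivalent, for comparison:
print(str(UnicodeString("istanbul").toUpper(Locale("tr_TR"))))  # İSTANBUL
```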
/snippets/matching.py:
--------------------------------------------------------------------------------
1 | ####################
2 | #
3 | # Unicode matching
4 | #
5 | # © Enabling Languages 2022
6 | # Released under the MIT License.
7 | #
8 | ####################
9 |
10 | import unicodedataplus as ud
11 | import regex as re
12 |
13 | def caseless_match(x, y):
14 | return x.casefold() == y.casefold()
15 |
16 | def canonical_caseless_match(x, y):
17 | return ud.normalize("NFD", ud.normalize("NFD", x).casefold()) == ud.normalize("NFD", ud.normalize("NFD", y).casefold())
18 |
19 | def compatibility_caseless_match(x, y):
20 | return ud.normalize("NFKD", ud.normalize("NFKD", ud.normalize("NFD", x).casefold()).casefold()) == ud.normalize("NFKD", ud.normalize("NFKD", ud.normalize("NFD", y).casefold()).casefold())
21 |
22 | def NFKC_Casefold(s):
23 | pattern = re.compile(r"\p{Default_Ignorable_Code_Point=Yes}")
24 | s = re.sub(pattern, '', s)
25 | return ud.normalize("NFC", ud.normalize('NFKC', s).casefold())
26 |
27 | def identifier_caseless_match(x, y):
28 | return NFKC_Casefold(ud.normalize("NFD", x)) == NFKC_Casefold(ud.normalize("NFD", y))
29 |
--------------------------------------------------------------------------------
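A quick demonstration of why the layered matching functions above matter: plain string equality misses both case folding and canonical equivalence. A minimal sketch, assuming matching.py is importable from the current directory:

```python
from matching import caseless_match, canonical_caseless_match  # assumed import path

print("é" == "e\u0301")                          # False: same text, different code points
print(canonical_caseless_match("é", "e\u0301"))  # True after NFD + casefold
print(caseless_match("straße", "STRASSE"))       # True: ß casefolds to ss
```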
/snippets/regex_segmentation.py:
--------------------------------------------------------------------------------
1 | import regex
2 | from el_internationalisation import cp
3 |
4 |
5 | def regex_segmentation(text: str, pattern: str, sep: str = "\u200B", mode: list = ["list"]) -> list | str | None:
6 | """Tokenise string using regex, returning results as a list or string.
7 |
8 | Args:
9 | text (str): text to be segmented
10 | pattern (str): regex pattern for segmentation
11 |         sep (str, optional): separator to use if a string is returned or results are displayed on STDOUT. Defaults to "\u200B" (ZWSP - Zero Width Space).
12 |         mode (list, optional): indicates how results are handled. Defaults to ["list"]. Include "string" to return results as a string,
13 |             "display" to print the segmented string to STDOUT, and "codepoints" to print the codepoints of each token.
14 |
15 | Returns:
16 | list | str | None: Results returned as list or string (see mode argument) or as None (if display)
17 | """
18 | result: str = regex.sub(pattern, r"\u200B\1", text)
19 |     if result.startswith("\u200B"):
20 | result = result[1:]
21 | result_list: list = result.split("\u200B")
22 | result_string: str = sep.join(result_list)
23 | if "display" in mode:
24 | print(
25 | f"Number of tokens: {str(len(result_list))} \nSegmentation boundaries: {result_string}")
26 | if "codepoints" in mode:
27 | for item in result_list:
28 | print(cp(item))
29 | if ("string" not in mode) and ("list" not in mode):
30 | print("Nothing to return")
31 | return None
32 | return result_string if "string" in mode else result_list
33 |
34 | #####################
35 | #
36 | # Examples
37 | #
38 | #####################
39 |
40 |
41 | s = 'ရန်ကုန်ကွန်ပျူတာတက္ကသိုလ်'
42 | pattern = r'(?:(?<…
--------------------------------------------------------------------------------
/utils/cesu8.py:
--------------------------------------------------------------------------------
64 | bytenums[1] >= 0xa0 and bytenums[1] <= 0xbf and \
65 | bytenums[2] >= 0x80 and bytenums[2] <= 0xbf and \
66 | bytenums[3] == SURROGATE_IDENTICATOR_INT and \
67 | bytenums[4] >= 0xb0 and bytenums[4] <= 0xbf and \
68 | bytenums[5] >= 0x80 and bytenums[5] <= 0xbf:
69 |
70 | codepoint = (
71 | ((bytenums[1] & 0x0f) << 16) +
72 | ((bytenums[2] & 0x3f) << 10) +
73 | ((bytenums[4] & 0x0f) << 6) +
74 | (bytenums[5] & 0x3f) +
75 | 0x10000
76 | )
77 | return chr(codepoint), 6
78 |
79 | # No CESU-8 surrogate but probably a 3 byte UTF-8 sequence
80 | return codecs.utf_8_decode(input[:3], errors, final)
81 |
82 | cesu8_surrogate_start = input.find(SURROGATE_IDENTICATOR_BYTE)
83 | if cesu8_surrogate_start > 0:
84 | # Decode everything until start of cesu8 surrogate pair
85 | return codecs.utf_8_decode(input[:cesu8_surrogate_start], errors, final)
86 |
87 | # No sign of CESU-8 encoding
88 | return codecs.utf_8_decode(input, errors, final)
89 |
90 | class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
91 |
92 | def _buffer_encode(self, input, errors, final=False):
93 | encoded_segments = []
94 | position = 0
95 | input_length = len(input)
96 |
97 |         while position < input_length:
98 | encoded, consumed = self._buffer_encode_step(
99 | input[position], errors, final
100 | )
101 |
102 | if consumed == 0:
103 | break
104 |
105 | encoded_segments.append(encoded)
106 | position += consumed
107 |
108 | if final and position != len(input):
109 | raise Exception("Final encoder doesn't encode all characters")
110 |
111 | return b''.join(encoded_segments), position
112 |
113 | def _buffer_encode_step(self, char, errors, final):
114 | codepoint = ord(char)
115 | if codepoint <= 65535:
116 | return codecs.utf_8_encode(char, errors)
117 | else:
118 | seq = bytearray(6)
119 | seq[0] = 0xED
120 | seq[1] = 0xA0 | (((codepoint & 0x1F0000) >> 16) - 1)
121 | seq[2] = 0x80 | (codepoint & 0xFC00) >> 10
122 | seq[3] = 0xED
123 | seq[4] = 0xB0 | ((codepoint >> 6) & 0x3F)
124 | seq[5] = 0x80 | (codepoint & 0x3F)
125 | return bytes(seq), 1
126 |
127 | def encode(input, errors='strict'):
128 | return IncrementalEncoder(errors).encode(input, final=True), len(input)
129 |
130 | def decode(input, errors='strict'):
131 | return IncrementalDecoder(errors).decode(input, final=True), len(input)
132 |
133 | class StreamWriter(codecs.StreamWriter):
134 | encode = encode
135 |
136 | class StreamReader(codecs.StreamReader):
137 | decode = decode
138 |
139 | CESU8_CODEC_INFO = codecs.CodecInfo(
140 | name="cesu-8",
141 | encode=encode,
142 | decode=decode,
143 | incrementalencoder=IncrementalEncoder,
144 | incrementaldecoder=IncrementalDecoder,
145 | streamreader=StreamReader,
146 | streamwriter=StreamWriter,
147 | )
148 |
149 | def search_function(encoding):
150 | if encoding == 'cesu-8':
151 | return CESU8_CODEC_INFO
152 | else:
153 | return None
154 |
155 | codecs.register(search_function)
156 |
--------------------------------------------------------------------------------
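Importing the module registers the codec, after which "cesu-8" behaves like any other text encoding. A minimal usage sketch, assuming the file above is importable as cesu8:

```python
import cesu8  # noqa: F401 - registering the "cesu-8" codec is the import's side effect

s = "a\U0001F600b"  # includes a supplementary-plane character (😀)
data = s.encode("cesu-8")
print(data.hex(" "))               # U+1F600 becomes two 3-byte surrogate sequences
print(data.decode("cesu-8") == s)  # True: round-trips
```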