├── .gitignore ├── LICENSE ├── README.md ├── colab ├── en_IN_on_colab.ipynb ├── locale-colab-snippet.ipynb └── locale_module_colab.ipynb ├── data ├── bn_global_popl.tsv ├── demographics.tsv ├── din.txt ├── dolar_endeksi.tsv ├── fa_stats.tsv ├── klpt_stopwords.json ├── myanmar-regions.tsv ├── myanmar_ethnic_groups.tsv ├── rbbi │ ├── Default.rbbi │ ├── Lao.rbbi │ ├── lucene │ │ ├── source.md │ │ └── uax29 │ │ │ ├── Default.rbbi │ │ │ └── MyanmarSyllable.rbbi │ ├── solrcene │ │ ├── Hebrew.rbbi │ │ ├── Khmer.rbbi │ │ ├── Lao.rbbi │ │ ├── Myanmar.rbbi │ │ └── source.md │ └── source.md ├── régions_métropolitaines.tsv ├── sorani_alphabet.tsv ├── sorani_alphabet_wikipedia.tsv ├── source.md ├── türkiye'ninz-illeri.tsv └── wordlists │ └── source.md ├── docs ├── DRAFT_icu_transforms.pdf ├── README.md └── matplotlib.md ├── notebooks ├── Collation.ipynb ├── Sorting_emoji.ipynb ├── armenian_pandas.ipynb ├── bangla_df.ipynb ├── ckb_sort.ipynb ├── complex_script_support_images.ipynb ├── data │ └── allkeys.txt ├── ethiopic_numbers.ipynb ├── icu_transforms.ipynb ├── images │ ├── sorani_plotly.png │ ├── sorani_plotly2.png │ └── sorani_plotly_inline.png ├── img │ ├── 1440px-Lake_Dukan_12.jpg │ ├── ckb_IQ_collation.png │ ├── khamti.jpg │ ├── linux1.png │ ├── macos1.png │ ├── mplcairo_output.png │ ├── sibe.jpg │ ├── std_matplotlib_output.png │ ├── tai_aiton.jpg │ ├── tai_aiton_text_to_image.png │ └── yolngu.jpg ├── is_IS.ipynb ├── kn_demographics_pandas_matplotlib.ipynb ├── kn_demographics_pandas_plottly.ipynb ├── matplotlib_locale.ipynb ├── matplotlib_mplcairo.ipynb ├── matplotlib_mplcairo2.ipynb ├── matplotlib_pyicu.ipynb ├── my-segmentation.ipynb ├── pandas_plot_mplcairo.ipynb ├── pandas_plot_plotly.ipynb ├── persian_df.ipynb ├── plotly.ipynb ├── plotly2.ipynb ├── seaborn.ipynb ├── sorting_pandas.ipynb ├── strings_casing_matching.ipynb ├── turkish_df.ipynb └── vietnamese_pandas.ipynb ├── py ├── am_ET_numbers_icu.py ├── am_ET_numbers_icu_1.png ├── am_ET_numbers_icu_1.py ├── 
am_ET_numbers_icu_2.png ├── am_ET_numbers_icu_2.py ├── arabic_reshaper_example.py ├── hi_IN_numbers_icu.png ├── hi_IN_numbers_icu.py ├── matplotlib_kurdish.png ├── matplotlib_kurdish.py ├── pandas_plot_kurdish.png ├── pandas_plot_kurdish.py ├── pyuca_test.py ├── seaborn_kurdish.png ├── seaborn_kurdish.py ├── wordcloud_kurdish.png └── wordcloud_kurdish.py ├── requirements.txt ├── rules ├── collation │ ├── README.md │ ├── allkeys_CLDR.txt │ ├── allkeys_DUCET.txt │ ├── cldr │ │ ├── ckb.xml │ │ ├── ckb_IQ.xml │ │ ├── ckb_IR.xml │ │ └── dtd │ │ │ └── ldml.dtd │ ├── collation_rules.py │ ├── glibc │ │ ├── ckb_IQ@academy │ │ └── en_SS │ ├── icu │ │ ├── ckb.txt │ │ └── ckb_IQ.txt │ ├── sorani_alphabet.tsv │ └── temp.ipynb └── segmentation │ ├── regex_patterns.md │ └── syllables │ ├── Khmer.rbbi │ ├── Lao.rbbi │ └── Myanmar.rbbi ├── snippets ├── break_iterator.py ├── convert_digits.py ├── data_cleaning.py ├── matching.py ├── regex_segmentation.py └── sort_key_normalise.py └── utils ├── cesu8.py ├── el_utils.py └── elle.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | ### JupyterNotebooks ### 132 | # gitignore template for Jupyter Notebooks 133 | # website: http://jupyter.org/ 134 | 135 | .ipynb_checkpoints 136 | */.ipynb_checkpoints/* 137 | 138 | # IPython 139 | profile_default/ 140 | ipython_config.py 141 | 142 | # Remove previous ipynb_checkpoints 143 | # git rm -r .ipynb_checkpoints/ 144 | 145 | ### Linux ### 146 | *~ 147 | 148 | # temporary files which can be created if a process still has a handle open of a deleted file 149 | .fuse_hidden* 150 | 151 | # KDE directory preferences 152 | .directory 153 | 154 | # Linux trash folder which might appear on any partition or disk 155 | .Trash-* 156 | 157 | # .nfs files are created when an open file is removed but is still being accessed 158 | .nfs* 159 | 160 | ### macOS ### 161 | # General 162 | .DS_Store 163 | .AppleDouble 164 | .LSOverride 165 | 166 | # Icon must end with two \r 167 | Icon 168 | 169 | 170 | # Thumbnails 171 | ._* 172 | 173 | # Files that might appear in the root of a volume 174 | .DocumentRevisions-V100 175 | .fseventsd 176 | .Spotlight-V100 177 | .TemporaryItems 178 | .Trashes 179 | .VolumeIcon.icns 180 | .com.apple.timemachine.donotpresent 181 | 182 | # Directories potentially created on remote AFP share 183 | .AppleDB 184 | .AppleDesktop 185 | Network Trash Folder 186 | Temporary Items 187 | .apdisk 188 | 189 | ### VisualStudioCode ### 190 | .vscode/* 191 | 
!.vscode/settings.json 192 | !.vscode/tasks.json 193 | !.vscode/launch.json 194 | !.vscode/extensions.json 195 | *.code-workspace 196 | 197 | # Local History for Visual Studio Code 198 | .history/ 199 | 200 | ### VisualStudioCode Patch ### 201 | # Ignore all local history of files 202 | .history 203 | .ionide 204 | 205 | ### Windows ### 206 | # Windows thumbnail cache files 207 | Thumbs.db 208 | Thumbs.db:encryptable 209 | ehthumbs.db 210 | ehthumbs_vista.db 211 | 212 | # Dump file 213 | *.stackdump 214 | 215 | # Folder config file 216 | [Dd]esktop.ini 217 | 218 | # Recycle Bin used on file shares 219 | $RECYCLE.BIN/ 220 | 221 | # Windows Installer files 222 | *.cab 223 | *.msi 224 | *.msix 225 | *.msm 226 | *.msp 227 | 228 | # Windows shortcuts 229 | *.lnk 230 | 231 | # Repo specfic 232 | notes/ 233 | print/ 234 | archive/ 235 | .vscode/ 236 | data/wordlists/kurdi_words.txt 237 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-2 Enabling Languages 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python internationalisation 2 | 3 | There is limited information on Python internationalisation or Python internationalisation best practices. What little that's available is scattered, and most available articles and tutorials on Python internationalisation are specifically on localisation. 4 | 5 | The EL notebooks contain notes on various aspects of Python internationalisation, and new topics will be added over time. 6 | 7 | Feedback is welcome. 8 | 9 | ## Python internationalisation notes 10 | 11 | * Collation 12 | 1. [Sorting](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/Collation.ipynb) 13 | 2. [Sorting emoji](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/Sorting_emoji.ipynb) 14 | 3. [Sorting pandas](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/sorting_pandas.ipynb) 15 | * Data visualisation 16 | 1. [Matplotlib, pandas plot, seaborn, wordcloud](https://github.com/enabling-languages/python-i18n/blob/main/docs/matplotlib.md) 17 | 2. [Locale specific formatting of numeric tick labels on matplotlib](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_locale.ipynb) 18 | 3. [Using PyICU to format matplotlib numeric tick labels](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_pyicu.ipynb) 19 | * Working with digits 20 | 1. [snippets](https://github.com/enabling-languages/python-i18n/blob/main/snippets/convert_digits.py) 21 | 22 | ## Google Colab notes 23 | 24 | 1. 
[Setting the locale of a notebook for Google Colab](https://github.com/enabling-languages/python-i18n/blob/main/colab/locale_module_colab.ipynb) 25 | 26 | ## Resources 27 | 28 | Python documentation: 29 | 30 | * [Internationalization](https://docs.python.org/3/library/i18n.html) 31 | * [Unicode HOWTO](https://docs.python.org/3/howto/unicode.html) 32 | * [Unicode Objects and Codecs](https://docs.python.org/3/c-api/unicode.html) 33 | -------------------------------------------------------------------------------- /colab/locale-colab-snippet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Loading locales in Colab: snippet\n", 8 | "\n", 9 | "Refer to [locale_module_colab.ipynb](locale_module_colab.ipynb)." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Import locale module\n", 19 | "import locale" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "try:\n", 29 | " import google.colab\n", 30 | " IN_COLAB = True\n", 31 | "except ImportError:\n", 32 | " IN_COLAB = False\n", 33 | "if IN_COLAB:\n", 34 | " try:\n", 35 | " locale.setlocale(locale.LC_ALL, \"en_AU.UTF-8\")\n", 36 | " except locale.Error:\n", 37 | " !sudo apt-get install language-pack-en language-pack-fr language-pack-sv language-pack-de\n", 38 | " #!sudo apt autoremove\n", 39 | " import os\n", 40 | " os.kill(os.getpid(), 9)\n", 41 | "else:\n", 42 | " locale.setlocale(locale.LC_ALL, \"en_AU.UTF-8\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "print(\"IN_COLAB: \" + str(IN_COLAB))\n", 52 | "print(locale.getlocale())" 53 | ] 54 | } 55 | ], 56 | "metadata": { 57 | "interpreter": { 58 | "hash": 
"bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3" 59 | }, 60 | "kernelspec": { 61 | "display_name": "Python 3.8.1 ('el')", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.8.1" 76 | }, 77 | "orig_nbformat": 4 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 2 81 | } 82 | -------------------------------------------------------------------------------- /data/demographics.tsv: -------------------------------------------------------------------------------- 1 | "---" "جیھانی" "تورکیا" "ئێران" "عێراق" "سووریا" 2 | "کرمانجی" "١٤٬٤١٩٬٠٠٠" "٧٬٩١٩٬٠٠٠" "٤٤٣٬٠٠٠" "٣٬١٨٥٬٠٠٠" "١٬٦٦١٬٠٠٠ 3 | " 4 | "ئەوانەی بە تورکی دەدوێن" "٥٬٧٣٢٬٠٠٠" "٥٬٧٣٢٬٠٠٠" "-" "-" "- 5 | " 6 | "باشوور" "٣٬٣٨١٬٠٠٠" "-" "٣٬٣٨١٬٠٠٠" "-" "- 7 | " 8 | "سۆرانی" "١٬٥٧٦٬٠٠٠" "-" "٥٠٢٬٠٠٠" "٥٦٧٬٠٠٠" "- 9 | " 10 | "زازایی - دەملی" "١٬١٢٥٬٠٠٠" "١٬١٢٥٬٠٠٠" "-" "-" "- 11 | " 12 | "زازایی - ئەلڤێکا" "١٨٤٬٠٠٠" "١٧٩٬٠٠٠" "-" "-" "- 13 | " 14 | "ڕەوەند" "٩٠٬٠٠٠" "٣٨٬٠٠٠" "٢٠٬٠٠٠" "٣٣٬٠٠٠" "- 15 | " 16 | "ھەورامی" "٥٤٬٠٠٠" "-" "٢٦٬٠٠٠" "٢٨٬٠٠٠" "- 17 | " 18 | "شکاکی" "٤٩٬٠٠٠" "٢٣٬٠٠٠" "٢٦٬٠٠٠" "-" "- 19 | " 20 | "کۆی گشتی" "٢٦٬٧١٢٬٠٠٠" "١٥٬٠١٦٬٠٠٠" "٤٬٣٩٨٬٠٠٠" "٣٬٩١٦٬٠٠٠" "١٬٦٦١٬٠٠٠" -------------------------------------------------------------------------------- /data/fa_stats.tsv: -------------------------------------------------------------------------------- 1 | سال ولادت وفات 2 | "۱۳۳۸ " "۸۶۴٬۸۴۶ " ۱۷۶٬۲۸۸ 3 | "۱۳۳۹ " "۸۷۶٬۲۰۶ " ۱۷۱٬۰۴۰ 4 | "۱۳۴۰ " "۹۰۲٬۲۶۰ " ۱۵۹٬۳۷۱ 5 | "۱۳۴۱ " "۹۵۷٬۵۰۰ " ۱۶۵٬۴۸۸ 6 | "۱۳۴۲ " "۹۲۰٬۹۶۷ " ۱۳۵٬۹۱۲ 7 | "۱۳۴۳ " "۱٬۱۱۸٬۹۱۱ " ۱۴۵٬۱۷۴ 8 | "۱۳۴۴ " "۱٬۱۸۸٬۳۴۶ " ۱۷۱٬۹۴۰ 9 | "۱۳۴۵ " "۱٬۱۰۲٬۸۴۸ " ۱۷۸٬۹۹۱ 10 | "۱۳۴۶ " "۱٬۰۱۴٬۳۲۱ " ۱۷۸٬۷۴۹ 11 | "۱۳۴۷ " "۱٬۰۴۶٬۱۳۴ " ۱۷۳٬۳۵۲ 12 | "۱۳۴۸ " "۱٬۱۰۷٬۹۱۰ 
" ۱۶۷٬۵۷۵ 13 | "۱۳۴۹ " "۱٬۱۹۰٬۹۵۷ " ۱۶۳٬۸۹۶ 14 | "۱۳۵۰ " "۱٬۲۳۵٬۰۲۵ " ۱۴۹٬۰۱۱ 15 | "۱۳۵۱ " "۱٬۱۳۸٬۸۴۳ " ۱۵۳٬۹۲۰ 16 | "۱۳۵۲ " "۱٬۱۹۹٬۷۷۷ " ۱۵۵٬۳۰۵ 17 | "۱۳۵۳ " "۱٬۲۴۸٬۲۵۶ " ۱۴۹٬۸۷۵ 18 | "۱۳۵۴ " "۱٬۳۳۹٬۲۶۷ " ۱۴۸٬۵۴۳ 19 | "۱۳۵۵ " "۱٬۳۹۹٬۹۷۷ " ۱۵۶٬۰۱۰ 20 | "۱۳۵۶ " "۱٬۴۰۶٬۲۰۴ " ۱۴۶٬۳۶۹ 21 | "۱۳۵۷ " "۱٬۳۷۳٬۷۳۸ " ۱۲۷٬۸۸۳ 22 | "۱۳۵۸ " "۱٬۶۸۸٬۹۴۲ " ۱۴۲٬۴۰۱ 23 | "۱۳۵۹ " "۲٬۴۵۱٬۷۶۵ " ۱۶۲٬۱۷۵ 24 | "۱۳۶۰ " "۲٬۴۱۹٬۹۵۱ " ۱۷۸٬۰۶۵ 25 | "۱۳۶۱ " "۲٬۰۹۷٬۹۵۷ " ۲۰۰٬۶۱۴ 26 | "۱۳۶۲ " "۲٬۲۰۳٬۵۶۰ " ۲۰۷٬۲۲۸ 27 | "۱۳۶۳ " "۲٬۰۶۸٬۲۷۹ " ۱۸۶٬۴۴۰ 28 | "۱۳۶۴ " "۲٬۰۳۱٬۹۶۹ " ۱۹۰٬۰۶۱ 29 | "۱۳۶۵ " "۲٬۲۵۶٬۹۷۱ " ۱۹۹٬۵۱۱ 30 | "۱۳۶۶ " "۱٬۸۳۲٬۷۲۲ " ۲۰۴٬۲۳۰ 31 | "۱۳۶۷ " "۱٬۹۴۲٬۹۳۶ " ۲۳۸٬۳۹۰ 32 | "۱۳۶۸ " "۱٬۷۸۹٬۸۱۷ " ۱۹۹٬۶۴۵ 33 | "۱۳۶۹ " "۱٬۷۲۶٬۴۸۸ " ۲۱۷٬۶۱۵ 34 | "۱۳۷۰ " "۱٬۵۹۲٬۸۹۸ " ۲۱۷٬۶۰۴ 35 | "۱۳۷۱ " "۱٬۴۳۳٬۲۴۳ " ۱۸۸٬۶۴۷ 36 | "۱۳۷۲ " "۱٬۳۸۸٬۰۱۷ " ۲۰۸٬۱۶۱ 37 | "۱۳۷۳ " "۱٬۴۲۶٬۷۸۴ " ۲٬۵۳۸٬۰۷۸ 38 | "۱۳۷۴ " "۱٬۲۰۵٬۳۷۲ " ۲٬۷۵۶٬۴۸۲ 39 | "۱۳۷۵ " "۱٬۱۸۷٬۹۰۳ " ۱٬۲۴۰٬۹۷۵ 40 | "۱۳۷۶ " "۱٬۱۷۹٬۲۶۰ " ۱٬۰۳۱٬۸۳۶ 41 | "۱۳۷۷ " "۱٬۱۸۶٬۶۵۹ " ۵۵۱٬۳۴۵ 42 | "۱۳۷۸ " "۱٬۱۷۴٬۲۷۹ " ۵۰۶٬۹۴۵ 43 | "۱۳۷۹ " "۱٬۰۹۵٬۱۶۵ " ۳۸۲٬۶۷۴ 44 | "۱۳۸۰ " "۱٬۱۱۰٬۸۳۶ " ۴۲۱٬۵۲۵ 45 | "۱۳۸۱ " "۱٬۱۲۲٬۱۰۴ " ۳۳۷٬۲۳۷ 46 | "۱۳۸۲ " "۱٬۱۷۱٬۵۷۳ " ۳۶۸٬۵۱۸ 47 | "۱۳۸۳ " "۱٬۱۵۴٬۳۶۸ " ۳۵۵٬۲۱۳ 48 | "۱۳۸۴ " "۱٬۲۳۹٬۴۰۸ " ۳۶۳٬۷۲۳ 49 | "۱۳۸۵ " "۱٬۲۵۳٬۹۱۲ " ۴۰۸٬۵۶۶ 50 | "۱۳۸۶ " "۱٬۲۸۶٬۷۱۶ " ۴۱۲٬۷۳۶ 51 | "۱۳۸۷ " "۱٬۳۰۰٬۱۶۶ " ۴۱۷٬۷۹۸ 52 | "۱۳۸۸ " "۱٬۳۴۸٬۵۲۶ " ۳۹۳٬۵۱۴ 53 | "۱۳۸۹ " "۱٬۳۶۴٬۵۲۳ " ۴۴۰٬۵۳۸ 54 | "۱۳۹۰ " "۱٬۳۸۲٬۲۲۹ " ۴۲۲٬۱۳۳ 55 | "۱۳۹۱ " "۱٬۴۲۱٬۶۸۹ " ۳۶۷٬۵۳۹ 56 | "۱۳۹۲ " "۱٬۴۷۱٬۷۵۸ " ۳۶۱٬۲۲۷ 57 | "۱۳۹۳ " "۱٬۵۳۴٬۳۱۱ " ۴۳۶٬۸۴۰ 58 | "۱۳۹۴ " "۱٬۵۷۰٬۱۸۳ " ۳۶۶٬۶۸۴ 59 | "۱۳۹۵ " "۱٬۵۲۸٬۰۰۳ " ۳۶۹٬۱۵۲ 60 | "۱۳۹۶ " "۱٬۴۸۷٬۸۶۱ " ۳۷۶٬۳۱۳ 61 | "۱۳۹۷ " "۱٬۳۶۶٬۴۹۱ " ۳۷۷٬۰۲۴ 62 | "۱۳۹۸ " "۱٬۱۹۶٬۱۳۵ " ۳۹۵٬۳۹۲ 63 | "۱۳۹۹ " "۱٬۱۱۳٬۹۶۴ " ۵۰۷٬۵۱۱ -------------------------------------------------------------------------------- /data/klpt_stopwords.json: -------------------------------------------------------------------------------- 1 | { 2 | "Sorani": { 3 | "Arabic": [ 4 | "ئاستی", 5 | 
"ئێستا", 6 | "ئێمە", 7 | "ئێوە", 8 | "ئەم", 9 | "ئەمساڵ", 10 | "ئەمه", 11 | "ئەمڕۆ", 12 | "ئەمەش", 13 | "ئەنجام", 14 | "ئەنجامدانی", 15 | "ئەو", 16 | "ئەوان", 17 | "ئەوانەی", 18 | "ئەوه", 19 | "ئەویش", 20 | "ئەوەش", 21 | "ئەوەشی", 22 | "ئەوەی", 23 | "ئەڤ", 24 | "ئەگەر", 25 | "ب", 26 | "بارەی", 27 | "باس", 28 | "باسی", 29 | "باش", 30 | "باشترین", 31 | "بدات", 32 | "بن", 33 | "به", 34 | "بواری", 35 | "بوو", 36 | "بوون", 37 | "بوونی", 38 | "بووە", 39 | "بڕی", 40 | "بکات", 41 | "بکرێت", 42 | "بکەن", 43 | "بکەین", 44 | "بۆ", 45 | "بۆیه", 46 | "بی", 47 | "بێ", 48 | "بێت", 49 | "بێجگە", 50 | "بە", 51 | "بەبێ", 52 | "بەدەست", 53 | "بەدەم", 54 | "بەر", 55 | "بەرامبەر", 56 | "بەردەم", 57 | "بەردەوام", 58 | "بەرلە", 59 | "بەرەو", 60 | "بەرەوی", 61 | "بەرەوە", 62 | "بەسەر", 63 | "بەشی", 64 | "بەشێکی", 65 | "بەلای", 66 | "بەم", 67 | "بەمەبەستی", 68 | "بەهۆی", 69 | "بەو", 70 | "بەپێی", 71 | "بەڵام", 72 | "بەڵکو", 73 | "تا", 74 | "تاوەکو", 75 | "تاکو", 76 | "تر", 77 | "تری", 78 | "تووشی", 79 | "تۆ", 80 | "تیادا", 81 | "تیایدا", 82 | "تێ", 83 | "تێدا", 84 | "تێیدا", 85 | "تەنها", 86 | "تەنیا", 87 | "تەواو", 88 | "تەواوی", 89 | "جار", 90 | "جگە", 91 | "جۆره", 92 | "جێگەی", 93 | "جێی", 94 | "خۆی", 95 | "خۆیان", 96 | "داهاتوو", 97 | "داهاتوودا", 98 | "داهاتووی", 99 | "داوای", 100 | "داوه", 101 | "در", 102 | "درێژەی", 103 | "دوا", 104 | "دواتر", 105 | "دوای", 106 | "دوێنێ", 107 | "دژی", 108 | "دی", 109 | "دیکه", 110 | "دیکەش", 111 | "دیکەی", 112 | "دێ", 113 | "دێت", 114 | "دە", 115 | "دەبن", 116 | "دەبێت", 117 | "دەبێته", 118 | "دەدات", 119 | "دەدرێت", 120 | "دەربارەی", 121 | "دەرەوەی", 122 | "دەکات", 123 | "دەکرێت", 124 | "دەکەن", 125 | "دەکەین", 126 | "دەگەڵ", 127 | "زۆر", 128 | "زۆربەی", 129 | "زۆری", 130 | "زیاتر", 131 | "ساڵ", 132 | "سبەی", 133 | "سەبارەت", 134 | "سەر", 135 | "سەرجەم", 136 | "سەرەکی", 137 | "شوێنی", 138 | "شێوەی", 139 | "شێوەیەکی", 140 | "لای", 141 | "لایەن", 142 | "لایەنه", 143 | "لایەنی", 144 | "لێ", 145 | "لە", 146 | "لەبابەت", 147 | "لەباتی", 148 | "لەبارەی", 
149 | "لەبرێتی", 150 | "لەبەر", 151 | "لەبەینی", 152 | "لەدەم", 153 | "لەرێ", 154 | "لەرێگا", 155 | "لەسەر", 156 | "لەلایەن", 157 | "لەم", 158 | "لەناو", 159 | "لەنێو", 160 | "لەو", 161 | "لەپێناوی", 162 | "لەژێر", 163 | "لەگەڵ", 164 | "ماوەی", 165 | "ملیۆن", 166 | "من", 167 | "میانەی", 168 | "مەبەستی", 169 | "ناو", 170 | "ناوخۆی", 171 | "ناوی", 172 | "نییه", 173 | "نێو", 174 | "نێوان", 175 | "هات", 176 | "هاته", 177 | "هاتووە", 178 | "هاوکات", 179 | "هۆکاری", 180 | "هۆڵی", 181 | "هۆی", 182 | "هیچ", 183 | "هێڵی", 184 | "هەبێت", 185 | "هەر", 186 | "هەردوو", 187 | "هەردوولا", 188 | "هەروەها", 189 | "هەریەک", 190 | "هەفتەی", 191 | "هەمان", 192 | "هەموو", 193 | "هەندێک", 194 | "هەیە", 195 | "هەیەو", 196 | "و", 197 | "واته", 198 | "وایه", 199 | "وتی", 200 | "وەک", 201 | "وەکوو", 202 | "پاش", 203 | "پلەی", 204 | "پێ", 205 | "پێش", 206 | "پێشتر", 207 | "پێشووی", 208 | "پێویسته", 209 | "پێی", 210 | "چوونکه", 211 | "چەند", 212 | "چەندین", 213 | "ڕوو", 214 | "ڕووی", 215 | "ژمارەیەک", 216 | "ژمارەیەکی", 217 | "ژێر", 218 | "کاتێک", 219 | "کرا", 220 | "کران", 221 | "کرد", 222 | "کردبوو", 223 | "کردن", 224 | "کردنی", 225 | "کردنەوەی", 226 | "کردووه", 227 | "کردووەو", 228 | "کردەوه", 229 | "کە", 230 | "کەس", 231 | "کەم", 232 | "یا", 233 | "یان", 234 | "یێ", 235 | "یەک", 236 | "یەکێک", 237 | "یەکەم", 238 | "یەکەمی", 239 | "یەکەمین" 240 | ], 241 | "Latin": [] 242 | }, 243 | "Kurmanji": { 244 | "Latin": [ 245 | "a", 246 | "an", 247 | "bareya", 248 | "bareyê", 249 | "barên", 250 | "basa", 251 | "be", 252 | "belê", 253 | "ber", 254 | "bereya", 255 | "berê", 256 | "berî", 257 | "bi", 258 | "bibe", 259 | "bila", 260 | "bin", 261 | "bo", 262 | "bê", 263 | "bû", 264 | "bûn", 265 | "bûye", 266 | "da", 267 | "dawî", 268 | "dawîyê", 269 | "daye", 270 | "de", 271 | "dema", 272 | "demekê", 273 | "demê", 274 | "derbarê", 275 | "derve", 276 | "dev", 277 | "di", 278 | "dibe", 279 | "digel", 280 | "dijî", 281 | "dikir", 282 | "din", 283 | "dinê", 284 | "divê", 285 | "diçe", 286 | "doh", 287 | 
"du", 288 | "dê", 289 | "dîsan", 290 | "e", 291 | "eger", 292 | "em", 293 | "encam", 294 | "ev", 295 | "evan", 296 | "eve", 297 | "evê", 298 | "evî", 299 | "ew", 300 | "ewa", 301 | "ewan", 302 | "ewê", 303 | "ewên", 304 | "ewî", 305 | "ez", 306 | "gelek", 307 | "gelekî", 308 | "gelê", 309 | "gerek", 310 | "giştî", 311 | "gor", 312 | "han", 313 | "heger", 314 | "hejmarek", 315 | "hem", 316 | "heman", 317 | "hember", 318 | "hemû", 319 | "hene", 320 | "her", 321 | "herdem", 322 | "herdu", 323 | "herweha", 324 | "herwiha", 325 | "herwisa", 326 | "herî", 327 | "heta", 328 | "hev", 329 | "hevdu", 330 | "heye", 331 | "hin", 332 | "hinek", 333 | "hîngê", 334 | "hûn", 335 | "in", 336 | "ji", 337 | "jiber", 338 | "jibo", 339 | "jê", 340 | "jêr", 341 | "jî", 342 | "ka", 343 | "ke", 344 | "kes", 345 | "kir", 346 | "kirîye", 347 | "ku", 348 | "kû", 349 | "layê", 350 | "le", 351 | "li", 352 | "ligel", 353 | "lê", 354 | "me", 355 | "min", 356 | "nav", 357 | "nava", 358 | "navbera", 359 | "navê", 360 | "navîn", 361 | "ne", 362 | "nêvbera", 363 | "nêzîkî", 364 | "nîne", 365 | "piştî", 366 | "pê", 367 | "pêk", 368 | "pêş", 369 | "re", 370 | "ser", 371 | "serê", 372 | "tenê", 373 | "ti", 374 | "tiştekî", 375 | "tu", 376 | "tê", 377 | "u", 378 | "van", 379 | "ve", 380 | "vir", 381 | "vê", 382 | "vî", 383 | "wan", 384 | "we", 385 | "weha", 386 | "wek", 387 | "weke", 388 | "wekî", 389 | "wiha", 390 | "wir", 391 | "wisa", 392 | "wê", 393 | "wî", 394 | "xwarê", 395 | "xwe", 396 | "ya", 397 | "yan", 398 | "ye", 399 | "yek", 400 | "yekê", 401 | "yê", 402 | "yên", 403 | "zêde", 404 | "zêdetir", 405 | "çawa", 406 | "çend", 407 | "çendê", 408 | "çendîn", 409 | "çi", 410 | "ê", 411 | "êdî", 412 | "ên", 413 | "îro", 414 | "û" 415 | ], 416 | "Arabic": [] 417 | } 418 | } -------------------------------------------------------------------------------- /data/myanmar-regions.tsv: -------------------------------------------------------------------------------- 1 | အင်္ဂလိပ်အမည် မြန်မာအမည် မြို့တော် 
ISO နေရာဒေသ လူဦးရေ _၂၀၁၄ ဧရိယာ အမျိုးအစား 2 | Ayeyarwady ဧရာဝတီ ပုသိမ်မြို့ MM-07 အောက်ပိုင်း ၆,၁၈၄,၈၂၉ ၃၅,၀၃၁.၈ တိုင်းဒေသကြီး 3 | Bago ပဲခူး ပဲခူးမြို့ MM-02 အောက်ပိုင်း ၄,၈၆၇,၃၇၃ ၃၉,၄၀၂.၃ တိုင်းဒေသကြီး 4 | Chin ချင်း ဟားခါးမြို့ MM-14 အနောက်ပိုင်း ၄၇၈,၈၀၁ ၃၆,၀၁၈.၈ ပြည်နယ် 5 | Kachin ကချင် မြစ်ကြီးနားမြို့ MM-11 မြောက်ပိုင်း ၁,၆၈၉,၄၄၁ ၈၉,၀၄၁.၈ ပြည်နယ် 6 | Kayah ကယားကယား လွိုင်ကော်မြို့ MM-12 အရှေ့ပိုင်း ၂၈၆,၆၂၇ ၁၁,၇၃၁.၅ ပြည်နယ် 7 | Kayin ကရင် ဘားအံမြို့ MM-13 တောင်ပိုင်း ၁,၅၇၄,၀၇၉ ၃၀,၃၈၃ ပြည်နယ် 8 | Magway မကွေး မကွေးမြို့ MM-03 အလယ်ပိုင်း ၃,၉၁၇,၀၅၅ ၄၄,၈၂၀.၆ တိုင်းဒေသကြီး 9 | Mandalay မန္တလေး မန္တလေးမြို့ MM-04 အလယ်ပိုင်း ၆,၁၆၅,၇၂၃ ၃၇,၉၄၅.၆ တိုင်းဒေသကြီး 10 | Mon မွန် မော်လမြိုင်မြို့ MM-15 တောင်ပိုင်း ၂,၀၅၄,၃၉၃ ၁၂,၂၉၆.၆ ပြည်နယ် 11 | Rakhine ရခိုင် စစ်တွေမြို့ MM-16 အနောက်ပိုင်း ၃,၁၈၈,၈၀၇ ၃၆,၇၇၈.၀ ပြည်နယ် 12 | Shan ရှမ်း တောင်ကြီးမြို့၂ MM-17 အရှေ့ပိုင်း ၅,၈၂၄,၄၃၂ ၁၅၅,၈၀၁.၃ ပြည်နယ် 13 | Sagaing စစ်ကိုင်း မုံရွာမြို့ MM-01 မြောက်ပိုင်း ၅,၃၂၅,၃၄၇ ၉၃,၇၀၄.၈ တိုင်းဒေသကြီး 14 | Tanintharyi တနင်္သာရီ ထားဝယ်မြို့ MM-05 တောင်ပိုင်း ၁,၄၀၈,၄၀၁ ၄၄,၃၄၄.၉ တိုင်းဒေသကြီး 15 | Yangon ရန်ကုန် ရန်ကုန်မြို့ MM-06 အောက်ပိုင်း ၇,၃၆၀,၇၀၃ ၁၀,၂၆၇.၇ တိုင်းဒေသကြီး 16 | Naypyidaw နေပြည်တော် နေပြည်တော် MM-18 အလယ်ပိုင်း ၁,၁၆၀,၂၄၂ ၇,၀၅၄ ပြည်ထောင်စုနယ်မြေ -------------------------------------------------------------------------------- /data/myanmar_ethnic_groups.tsv: -------------------------------------------------------------------------------- 1 | မြန်မာတိုင်းရင်းသားများ အကြမ်းဖျင်းခန့်မှန်း အကြမ်းဖျင်းခန့်မှန်း 2 | ကချင် 1.50 ၁.၅၀ 3 | ကယား 0.75 ၀.၇၅ 4 | ကရင် 7.00 ၇.၀၀ 5 | တရုတ် 2.50 ၂.၅၀ 6 | ဗမာ 68.00 ၆၈.၀၀ 7 | မွန် 2.00 ၂.၀၀ 8 | ရခိုင် 1.7 ၁.၇၀ 9 | ရိုဟင်ဂျာ 1.8 ၁.၈၀ 10 | ရှမ်း 9.00 ၉.၀၀ 11 | အခြားအုပ်စုများ 4.50 ၄.၅၀ 12 | အိန္ဒိယ 1.25 ၁.၂၅ -------------------------------------------------------------------------------- /data/rbbi/Default.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under 
one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # This file is from ICU (with some small modifications, to avoid CJK dictionary break, 18 | # and status code change related to that) 19 | # 20 | # Copyright (C) 2016 and later: Unicode, Inc. and others. 21 | # License & terms of use: http://www.unicode.org/copyright.html 22 | # Copyright (C) 2002-2016, International Business Machines Corporation 23 | # and others. All Rights Reserved. 24 | # 25 | # file: word.txt 26 | # 27 | # ICU Word Break Rules 28 | # See Unicode Standard Annex #29. 29 | # These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 30 | # with additions for Emoji Sequences from https://goo.gl/cluFCn 31 | # Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html 32 | # 33 | # Note: Updates to word.txt will usually need to be merged into 34 | # word_POSIX.txt also. 35 | 36 | ############################################################################## 37 | # 38 | # Character class definitions from TR 29 39 | # 40 | ############################################################################## 41 | 42 | !!chain; 43 | !!quoted_literals_only; 44 | 45 | 46 | # 47 | # Character Class Definitions. 
48 | # 49 | 50 | $CR = [\p{Word_Break = CR}]; 51 | $LF = [\p{Word_Break = LF}]; 52 | $Newline = [\p{Word_Break = Newline} ]; 53 | $Extend = [\p{Word_Break = Extend}]; 54 | $ZWJ = [\p{Word_Break = ZWJ}]; 55 | $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; 56 | $Format = [\p{Word_Break = Format}]; 57 | $Katakana = [\p{Word_Break = Katakana}]; 58 | $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; 59 | $ALetter = [\p{Word_Break = ALetter}]; 60 | $Single_Quote = [\p{Word_Break = Single_Quote}]; 61 | $Double_Quote = [\p{Word_Break = Double_Quote}]; 62 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 63 | $MidLetter = [\p{Word_Break = MidLetter}]; 64 | $MidNum = [\p{Word_Break = MidNum}]; 65 | $Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; 66 | 67 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 68 | $WSegSpace = [\p{Word_Break = WSegSpace}]; 69 | $Extended_Pict = [:ExtPict:]; 70 | 71 | $Han = [:Han:]; 72 | $Hiragana = [:Hiragana:]; 73 | 74 | 75 | # Dictionary character set, for triggering language-based break engines. Currently 76 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode 77 | # 5.0 or later as the definition of Complex_Context was corrected to include all 78 | # characters requiring dictionary break. 79 | 80 | $Control = [\p{Grapheme_Cluster_Break = Control}]; 81 | $HangulSyllable = [\uac00-\ud7a3]; 82 | $ComplexContext = [:LineBreak = Complex_Context:]; 83 | $KanaKanji = [$Han $Hiragana $Katakana]; 84 | $dictionaryCJK = [$Han $Hiragana $HangulSyllable]; 85 | $dictionary = [$ComplexContext]; 86 | 87 | # leave CJK scripts out of ALetterPlus 88 | $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; 89 | 90 | 91 | # 92 | # Rules 4 Ignore Format and Extend characters, 93 | # except when they appear at the beginning of a region of text. 
94 | # 95 | # TODO: check if handling of katakana in dictionary makes rules incorrect/void 96 | $KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*; 97 | $Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*; 98 | $ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*; 99 | $Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*; 100 | $Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*; 101 | $MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*; 102 | $MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*; 103 | $MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*; 104 | $NumericEx = $Numeric ($Extend | $Format | $ZWJ)*; 105 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*; 106 | $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*; 107 | 108 | $Ideographic = [\p{Ideographic}]; 109 | $HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*; 110 | $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; 111 | 112 | ## ------------------------------------------------- 113 | 114 | # Rule 3 - CR x LF 115 | # 116 | $CR $LF; 117 | 118 | # Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed. 119 | # 120 | $ZWJ $Extended_Pict; 121 | 122 | # Rule 3d - Keep horizontal whitespace together. 123 | # 124 | $WSegSpace $WSegSpace; 125 | 126 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning 127 | # of a region of Text. The rule here comes into play when the start of text 128 | # begins with a group of Format chars, or with a "word" consisting of a single 129 | # char that is not in any of the listed word break categories followed by 130 | # format char(s), or is not a CJK dictionary character. 131 | [^$CR $LF $Newline]? 
($Extend | $Format | $ZWJ)+; 132 | 133 | $NumericEx {100}; 134 | $ALetterEx {200}; 135 | $HangulSyllable {200}; 136 | $Hebrew_LetterEx{200}; 137 | $KatakanaEx {300}; # note: these status values override those from rule 5 138 | $HiraganaEx {300}; # by virtue of being numerically larger. 139 | $IdeographicEx {400}; # 140 | 141 | $Extended_Pict ($Extend | $Format | $ZWJ)*; 142 | 143 | # 144 | # rule 5 145 | # Do not break between most letters. 146 | # 147 | ($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200}; 148 | 149 | # rule 6 and 7 150 | ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200}; 151 | 152 | # rule 7a 153 | $Hebrew_LetterEx $Single_QuoteEx {200}; 154 | 155 | # rule 7b and 7c 156 | $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200}; 157 | 158 | # rule 8 159 | 160 | $NumericEx $NumericEx {100}; 161 | 162 | # rule 9 163 | 164 | ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200}; 165 | 166 | # rule 10 167 | 168 | $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; 169 | 170 | # rule 11 and 12 171 | 172 | $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; 173 | 174 | # rule 13 175 | $KatakanaEx $KatakanaEx {300}; 176 | 177 | # rule 13a/b 178 | 179 | $ALetterEx $ExtendNumLetEx {200}; # (13a) 180 | $Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a) 181 | $NumericEx $ExtendNumLetEx {100}; # (13a) 182 | $KatakanaEx $ExtendNumLetEx {300}; # (13a) 183 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) 184 | 185 | $ExtendNumLetEx $ALetterEx {200}; # (13b) 186 | $ExtendNumLetEx $Hebrew_Letter {200}; # (13b) 187 | $ExtendNumLetEx $NumericEx {100}; # (13b) 188 | $ExtendNumLetEx $KatakanaEx {300}; # (13b) 189 | 190 | # rules 15 - 17 191 | # Pairs of Regional Indicators stay together. 192 | # With rule chaining disabled by ^, this rule will match exactly two of them. 193 | # No other rule begins with a Regional_Indicator, so chaining cannot extend the match. 
194 | # 195 | ^$Regional_IndicatorEx $Regional_IndicatorEx; 196 | 197 | # special handling for CJK characters: chain for later dictionary segmentation 198 | $HangulSyllable $HangulSyllable {200}; 199 | 200 | # Rule 999 201 | # Match a single code point if no other rule applies. 202 | .; 203 | -------------------------------------------------------------------------------- /data/rbbi/Lao.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Parses Lao text, with syllable as token. 18 | # 19 | # The definition of Lao syllable is based on: 20 | # 21 | # Syllabification of Lao Script for Line Breaking 22 | # Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, 23 | # Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP 24 | # http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf 25 | # http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf 26 | # 27 | # NOTE: 28 | # There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper. 
29 | # For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work. 30 | # 31 | # Syllable structure, where X is the nuclear consonant: 32 | # 33 | # +----+ 34 | # | X5 | 35 | # +----+ 36 | # | X4 | 37 | # +----+----+----+----+----+----+----+-----+ 38 | # | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 | 39 | # +----+----+----+----+----+----+----+-----+ 40 | # | X2 | 41 | # +----+ 42 | # | X3 | 43 | # +----+ 44 | # 45 | # X0 represents a vowel which occurs before the nuclear consonant. 46 | # It can always define the beginning of syllable. 47 | $X0 = [\u0EC0-\u0EC4]; 48 | # X1 is a combination consonant which comes before the nuclear consonant, 49 | # but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ} 50 | $X1 = [\u0EAB]; 51 | # X represents the nuclear consonant. 52 | $X = [\u0E81-\u0EAE\u0EDC\u0EDD]; 53 | # X2 is a combination consonant which comes after the nuclear consonant, 54 | # which is placed under or next to the nuclear consonant. 55 | $X2 = [\u0EBC\u0EA3\u0EA7\u0EA5]; 56 | # X3 represents a vowel which occurs under the nuclear consonant. 57 | $X3 = [\u0EB8\u0EB9]; 58 | # X4 represents a vowel which occurs above the nuclear consonant. 59 | $X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1]; 60 | # X5 represents a tone mark which occurs above the nuclear consonant or upper vowel. 61 | $X5 = [\u0EC8-\u0ECB]; 62 | # X6 represents a consonant vowel, which occurs after the nuclear consonant. 63 | # It functions when the syllable doesn’t have any vowels. And it always exists with X8. 64 | $X6 = [\u0EA7\u0EAD\u0EBD]; 65 | # X7 represents a final vowel. 66 | # However X7_1 always represents the end of syllable and it never exists with tone mark. 67 | $X7 = [\u0EB0\u0EB2\u0EB3]; 68 | # X8 represents an alternate consonant. 69 | $X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7]; 70 | # X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3. 
71 | $X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5]; 72 | # X10 represents a sign mark. 73 | # It always occurs at the end of a syllable, but mostly people keep it separate from syllable. 74 | $X10 = [\u0EAF\u0EC6\u0ECC]; 75 | 76 | # Section 1 77 | $X0_1 = [\u0EC0]; 78 | $X4_1_2 = [\u0EB4\u0EB5]; 79 | $X4_3_4 = [\u0EB6\u0EB7]; 80 | $X4_6 = [\u0EBB]; 81 | $X4_7 = [\u0EB1]; 82 | $X6_2 = [\u0EAD]; 83 | $X6_3 = [\u0EBD]; 84 | $X7_1 = [\u0EB0]; 85 | $X7_2 = [\u0EB2]; 86 | $X10_1 = [\u0EAF]; 87 | $X10_2 = [\u0EC6]; 88 | $X10_3 = [\u0ECC]; 89 | 90 | $Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 91 | $Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 92 | $Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 93 | $Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1; 94 | $Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 95 | $Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 96 | $Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 97 | 98 | $Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7); 99 | 100 | # Section 2 101 | $X0_2 = [\u0EC1]; 102 | 103 | $Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 104 | $Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1; 105 | $Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 106 | 107 | $Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3); 108 | 109 | # Section 3 110 | $X0_3 = [\u0EC2]; 111 | $X8_3 = [\u0E8D]; 112 | $X8_8 = [\u0EA7]; 113 | 114 | $Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 115 | $Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1; 116 | $Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? 
($X8_3 | $X8_8); 117 | 118 | $Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3); 119 | 120 | # Section 4 121 | $X0_4 = [\u0EC4]; 122 | $X6_1 = [\u0EA7]; 123 | 124 | $Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 125 | 126 | # Section 5 127 | $X0_5 = [\u0EC3]; 128 | 129 | $Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 130 | 131 | # Section 6 132 | $Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 133 | 134 | # Section 7 135 | $X4_1_4 = [\u0EB4-\u0EB7]; 136 | 137 | $Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 138 | 139 | # Section 8 140 | $X4_5 = [\u0ECD]; 141 | 142 | $Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 143 | 144 | # Section 9 145 | 146 | $Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 147 | $Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1; 148 | 149 | $Rule9 = ($Rule9_1 | $Rule9_2); 150 | 151 | # Section 10 152 | $Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 153 | 154 | # Section 11 155 | $Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 156 | 157 | # Section 12 158 | $Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1; 159 | 160 | # Section 13 161 | $Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 162 | 163 | # Section 14 164 | $X7_3 = [\u0EB3]; 165 | 166 | $Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? 
($X10_1)?; 167 | 168 | $LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14); 169 | 170 | $WordJoin = [:Line_Break=Word_Joiner:]; 171 | 172 | $LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*; 173 | 174 | # 175 | # default numerical definitions 176 | # 177 | $Extend = [\p{Word_Break = Extend}]; 178 | $Format = [\p{Word_Break = Format}]; 179 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 180 | $MidNum = [\p{Word_Break = MidNum}]; 181 | $Numeric = [\p{Word_Break = Numeric}]; 182 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 183 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 184 | $MidNumEx = $MidNum ($Extend | $Format)*; 185 | $NumericEx = $Numeric ($Extend | $Format)*; 186 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 187 | 188 | !!forward; 189 | 190 | $LaoJoinedSyllableEx {200}; 191 | # default numeric rules 192 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; 193 | -------------------------------------------------------------------------------- /data/rbbi/lucene/source.md: -------------------------------------------------------------------------------- 1 | * https://gitbox.apache.org/repos/asf?p=lucene.git;a=tree;f=lucene/analysis/icu/src/data;h=e7275ffa9541dab51e4b9a62166aeef457c5c22f;hb=refs/heads/main -------------------------------------------------------------------------------- /data/rbbi/lucene/uax29/Default.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # This file is from ICU (with some small modifications, to avoid CJK dictionary break, 18 | # and status code change related to that) 19 | # 20 | # Copyright (C) 2016 and later: Unicode, Inc. and others. 21 | # License & terms of use: http://www.unicode.org/copyright.html 22 | # Copyright (C) 2002-2016, International Business Machines Corporation 23 | # and others. All Rights Reserved. 24 | # 25 | # file: word.txt 26 | # 27 | # ICU Word Break Rules 28 | # See Unicode Standard Annex #29. 29 | # These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 30 | # with additions for Emoji Sequences from https://goo.gl/cluFCn 31 | # Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html 32 | # 33 | # Note: Updates to word.txt will usually need to be merged into 34 | # word_POSIX.txt also. 35 | 36 | ############################################################################## 37 | # 38 | # Character class definitions from TR 29 39 | # 40 | ############################################################################## 41 | 42 | !!chain; 43 | !!quoted_literals_only; 44 | 45 | 46 | # 47 | # Character Class Definitions. 
48 | # 49 | 50 | $CR = [\p{Word_Break = CR}]; 51 | $LF = [\p{Word_Break = LF}]; 52 | $Newline = [\p{Word_Break = Newline} ]; 53 | $Extend = [\p{Word_Break = Extend}]; 54 | $ZWJ = [\p{Word_Break = ZWJ}]; 55 | $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; 56 | $Format = [\p{Word_Break = Format}]; 57 | $Katakana = [\p{Word_Break = Katakana}]; 58 | $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; 59 | $ALetter = [\p{Word_Break = ALetter}]; 60 | $Single_Quote = [\p{Word_Break = Single_Quote}]; 61 | $Double_Quote = [\p{Word_Break = Double_Quote}]; 62 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 63 | $MidLetter = [\p{Word_Break = MidLetter}]; 64 | $MidNum = [\p{Word_Break = MidNum}]; 65 | $Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; 66 | 67 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 68 | $WSegSpace = [\p{Word_Break = WSegSpace}]; 69 | $Extended_Pict = [:ExtPict:]; 70 | 71 | $Han = [:Han:]; 72 | $Hiragana = [:Hiragana:]; 73 | 74 | 75 | # Dictionary character set, for triggering language-based break engines. Currently 76 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode 77 | # 5.0 or later as the definition of Complex_Context was corrected to include all 78 | # characters requiring dictionary break. 79 | 80 | $Control = [\p{Grapheme_Cluster_Break = Control}]; 81 | $HangulSyllable = [\uac00-\ud7a3]; 82 | $ComplexContext = [:LineBreak = Complex_Context:]; 83 | $KanaKanji = [$Han $Hiragana $Katakana]; 84 | $dictionaryCJK = [$Han $Hiragana $HangulSyllable]; 85 | $dictionary = [$ComplexContext]; 86 | 87 | # leave CJK scripts out of ALetterPlus 88 | $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; 89 | 90 | 91 | # 92 | # Rules 4 Ignore Format and Extend characters, 93 | # except when they appear at the beginning of a region of text. 
94 | # 95 | # TODO: check if handling of katakana in dictionary makes rules incorrect/void 96 | $KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*; 97 | $Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*; 98 | $ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*; 99 | $Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*; 100 | $Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*; 101 | $MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*; 102 | $MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*; 103 | $MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*; 104 | $NumericEx = $Numeric ($Extend | $Format | $ZWJ)*; 105 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*; 106 | $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*; 107 | 108 | $Ideographic = [\p{Ideographic}]; 109 | $HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*; 110 | $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; 111 | 112 | ## ------------------------------------------------- 113 | 114 | # Rule 3 - CR x LF 115 | # 116 | $CR $LF; 117 | 118 | # Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed. 119 | # 120 | $ZWJ $Extended_Pict; 121 | 122 | # Rule 3d - Keep horizontal whitespace together. 123 | # 124 | $WSegSpace $WSegSpace; 125 | 126 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning 127 | # of a region of Text. The rule here comes into play when the start of text 128 | # begins with a group of Format chars, or with a "word" consisting of a single 129 | # char that is not in any of the listed word break categories followed by 130 | # format char(s), or is not a CJK dictionary character. 131 | [^$CR $LF $Newline]? 
($Extend | $Format | $ZWJ)+; 132 | 133 | $NumericEx {100}; 134 | $ALetterEx {200}; 135 | $HangulSyllable {200}; 136 | $Hebrew_LetterEx{200}; 137 | $KatakanaEx {300}; # note: these status values override those from rule 5 138 | $HiraganaEx {300}; # by virtue of being numerically larger. 139 | $IdeographicEx {400}; # 140 | 141 | $Extended_Pict ($Extend | $Format | $ZWJ)*; 142 | 143 | # 144 | # rule 5 145 | # Do not break between most letters. 146 | # 147 | ($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200}; 148 | 149 | # rule 6 and 7 150 | ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200}; 151 | 152 | # rule 7a 153 | $Hebrew_LetterEx $Single_QuoteEx {200}; 154 | 155 | # rule 7b and 7c 156 | $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200}; 157 | 158 | # rule 8 159 | 160 | $NumericEx $NumericEx {100}; 161 | 162 | # rule 9 163 | 164 | ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200}; 165 | 166 | # rule 10 167 | 168 | $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; 169 | 170 | # rule 11 and 12 171 | 172 | $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; 173 | 174 | # rule 13 175 | $KatakanaEx $KatakanaEx {300}; 176 | 177 | # rule 13a/b 178 | 179 | $ALetterEx $ExtendNumLetEx {200}; # (13a) 180 | $Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a) 181 | $NumericEx $ExtendNumLetEx {100}; # (13a) 182 | $KatakanaEx $ExtendNumLetEx {300}; # (13a) 183 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) 184 | 185 | $ExtendNumLetEx $ALetterEx {200}; # (13b) 186 | $ExtendNumLetEx $Hebrew_Letter {200}; # (13b) 187 | $ExtendNumLetEx $NumericEx {100}; # (13b) 188 | $ExtendNumLetEx $KatakanaEx {300}; # (13b) 189 | 190 | # rules 15 - 17 191 | # Pairs of Regional Indicators stay together. 192 | # With rule chaining disabled by ^, this rule will match exactly two of them. 193 | # No other rule begins with a Regional_Indicator, so chaining cannot extend the match. 
194 | # 195 | ^$Regional_IndicatorEx $Regional_IndicatorEx; 196 | 197 | # special handling for CJK characters: chain for later dictionary segmentation 198 | $HangulSyllable $HangulSyllable {200}; 199 | 200 | # Rule 999 201 | # Match a single code point if no other rule applies. 202 | .; 203 | -------------------------------------------------------------------------------- /data/rbbi/lucene/uax29/MyanmarSyllable.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # Parses Myanmar text, with syllable as token. 
19 | # 20 | 21 | $Cons = [[:Other_Letter:]&[:Myanmar:]]; 22 | $Virama = [\u1039]; 23 | $Asat = [\u103A]; 24 | 25 | $WordJoin = [:Line_Break=Word_Joiner:]; 26 | 27 | # 28 | # default numerical definitions 29 | # 30 | $Extend = [\p{Word_Break = Extend}]; 31 | $Format = [\p{Word_Break = Format}]; 32 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 33 | $MidNum = [\p{Word_Break = MidNum}]; 34 | $Numeric = [\p{Word_Break = Numeric}]; 35 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 36 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 37 | $MidNumEx = $MidNum ($Extend | $Format)*; 38 | $NumericEx = $Numeric ($Extend | $Format)*; 39 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 40 | 41 | $ConsEx = $Cons ($Extend | $Format)*; 42 | $AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*; 43 | $MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*; 44 | $MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*; 45 | 46 | !!forward; 47 | $MyanmarJoinedSyllableEx {200}; 48 | 49 | # default numeric rules 50 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; 51 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/Hebrew.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # This is an example of rule tailoring for Hebrew. 19 | # In this example the single-quote is added to the Extend category 20 | # The double-quote is added to the MidLetter category. 21 | # 22 | !!chain; 23 | $CR = [\p{Word_Break = CR}]; 24 | $LF = [\p{Word_Break = LF}]; 25 | $Newline = [\p{Word_Break = Newline}]; 26 | $Extend = [\p{Word_Break = Extend}\u0027]; 27 | $Format = [\p{Word_Break = Format}]; 28 | $ALetter = [\p{Word_Break = ALetter}]; 29 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 30 | $MidLetter = [\p{Word_Break = MidLetter}\u0022]; 31 | $MidNum = [\p{Word_Break = MidNum}]; 32 | $Numeric = [\p{Word_Break = Numeric}]; 33 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 34 | $dictionary = [:LineBreak = Complex_Context:]; 35 | $Control = [\p{Grapheme_Cluster_Break = Control}]; 36 | $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; 37 | 38 | $ALetterEx = $ALetterPlus ($Extend | $Format)*; 39 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 40 | $MidLetterEx = $MidLetter ($Extend | $Format)*; 41 | $MidNumEx = $MidNum ($Extend | $Format)*; 42 | $NumericEx = $Numeric ($Extend | $Format)*; 43 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 44 | 45 | !!forward; 46 | 47 | $CR $LF; 48 | [^$CR $LF $Newline]? 
($Extend | $Format)+; 49 | $NumericEx {100}; 50 | $ALetterEx {200}; 51 | $ALetterEx $ALetterEx {200}; 52 | $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; 53 | $NumericEx $NumericEx {100}; 54 | $ALetterEx $NumericEx {200}; 55 | $NumericEx $ALetterEx {200}; 56 | $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; 57 | $ALetterEx $ExtendNumLetEx {200}; 58 | $NumericEx $ExtendNumLetEx {100}; 59 | $ExtendNumLetEx $ExtendNumLetEx {200}; 60 | $ExtendNumLetEx $ALetterEx {200}; 61 | $ExtendNumLetEx $NumericEx {100}; 62 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/Khmer.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # Parses Khmer text, with orthographic syllable as token. 19 | # 20 | # The definition of Khmer orthographic syllable is taken from the Unicode Standard. 
21 | # 22 | # B = base character (consonant, independent vowel, etc) 23 | $KhmerBase = [\u1780-\u17B3]; 24 | # R = robat 25 | $KhmerRobat = [\u17CC]; 26 | # C = consonant shifter 27 | $KhmerShifter = [\u17C9\u17CA]; 28 | # S = subscript consonant or independent vowel sign 29 | $KhmerSub = ([\u17D2] $KhmerBase); 30 | # V = dependent vowel sign 31 | $KhmerVowel = [\u17B4-\u17C5]; 32 | # Z = zero-width joiner or non-joiner 33 | $KhmerZWC = [\u200C\u200D]; 34 | # O = any other sign 35 | $KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; 36 | 37 | $WordJoin = [:Line_Break=Word_Joiner:]; 38 | 39 | $KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?; 40 | 41 | $KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*; 42 | 43 | # 44 | # default numerical definitions 45 | # 46 | $Extend = [\p{Word_Break = Extend}]; 47 | $Format = [\p{Word_Break = Format}]; 48 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 49 | $MidNum = [\p{Word_Break = MidNum}]; 50 | $Numeric = [\p{Word_Break = Numeric}]; 51 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 52 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 53 | $MidNumEx = $MidNum ($Extend | $Format)*; 54 | $NumericEx = $Numeric ($Extend | $Format)*; 55 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 56 | 57 | !!forward; 58 | $KhmerJoinedSyllableEx {200}; 59 | 60 | # default numeric rules 61 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; 62 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/Lao.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Parses Lao text, with syllable as token. 18 | # 19 | # The definition of Lao syllable is based on: 20 | # 21 | # Syllabification of Lao Script for Line Breaking 22 | # Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, 23 | # Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP 24 | # http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf 25 | # http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf 26 | # 27 | # NOTE: 28 | # There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper. 29 | # For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work. 30 | # 31 | # Syllable structure, where X is the nuclear consonant: 32 | # 33 | # +----+ 34 | # | X5 | 35 | # +----+ 36 | # | X4 | 37 | # +----+----+----+----+----+----+----+-----+ 38 | # | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 | 39 | # +----+----+----+----+----+----+----+-----+ 40 | # | X2 | 41 | # +----+ 42 | # | X3 | 43 | # +----+ 44 | # 45 | # X0 represents a vowel which occurs before the nuclear consonant. 46 | # It can always define the beginning of syllable. 
47 | $X0 = [\u0EC0-\u0EC4]; 48 | # X1 is a combination consonant which comes before the nuclear consonant, 49 | # but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ} 50 | $X1 = [\u0EAB]; 51 | # X represents the nuclear consonant. 52 | $X = [\u0E81-\u0EAE\u0EDC\u0EDD]; 53 | # X2 is a combination consonant which comes after the nuclear consonant, 54 | # which is placed under or next to the nuclear consonant. 55 | $X2 = [\u0EBC\u0EA3\u0EA7\u0EA5]; 56 | # X3 represents a vowel which occurs under the nuclear consonant. 57 | $X3 = [\u0EB8\u0EB9]; 58 | # X4 represents a vowel which occurs above the nuclear consonant. 59 | $X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1]; 60 | # X5 represents a tone mark which occurs above the nuclear consonant or upper vowel. 61 | $X5 = [\u0EC8-\u0ECB]; 62 | # X6 represents a consonant vowel, which occurs after the nuclear consonant. 63 | # It functions when the syllable doesn’t have any vowels. And it always exists with X8. 64 | $X6 = [\u0EA7\u0EAD\u0EBD]; 65 | # X7 represents a final vowel. 66 | # However X7_1 always represents the end of syllable and it never exists with tone mark. 67 | $X7 = [\u0EB0\u0EB2\u0EB3]; 68 | # X8 represents an alternate consonant. 69 | $X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7]; 70 | # X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3. 71 | $X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5]; 72 | # X10 represents a sign mark. 73 | # It always occurs at the end of a syllable, but mostly people keep it separate from syllable. 74 | $X10 = [\u0EAF\u0EC6\u0ECC]; 75 | 76 | # Section 1 77 | $X0_1 = [\u0EC0]; 78 | $X4_1_2 = [\u0EB4\u0EB5]; 79 | $X4_3_4 = [\u0EB6\u0EB7]; 80 | $X4_6 = [\u0EBB]; 81 | $X4_7 = [\u0EB1]; 82 | $X6_2 = [\u0EAD]; 83 | $X6_3 = [\u0EBD]; 84 | $X7_1 = [\u0EB0]; 85 | $X7_2 = [\u0EB2]; 86 | $X10_1 = [\u0EAF]; 87 | $X10_2 = [\u0EC6]; 88 | $X10_3 = [\u0ECC]; 89 | 90 | $Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? 
($X10_1)?; 91 | $Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 92 | $Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 93 | $Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1; 94 | $Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 95 | $Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 96 | $Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 97 | 98 | $Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7); 99 | 100 | # Section 2 101 | $X0_2 = [\u0EC1]; 102 | 103 | $Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 104 | $Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1; 105 | $Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 106 | 107 | $Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3); 108 | 109 | # Section 3 110 | $X0_3 = [\u0EC2]; 111 | $X8_3 = [\u0E8D]; 112 | $X8_8 = [\u0EA7]; 113 | 114 | $Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 115 | $Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1; 116 | $Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8); 117 | 118 | $Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3); 119 | 120 | # Section 4 121 | $X0_4 = [\u0EC4]; 122 | $X6_1 = [\u0EA7]; 123 | 124 | $Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 125 | 126 | # Section 5 127 | $X0_5 = [\u0EC3]; 128 | 129 | $Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 130 | 131 | # Section 6 132 | $Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 133 | 134 | # Section 7 135 | $X4_1_4 = [\u0EB4-\u0EB7]; 136 | 137 | $Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 138 | 139 | # Section 8 140 | $X4_5 = [\u0ECD]; 141 | 142 | $Rule8 = ($X1)? 
$X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 143 | 144 | # Section 9 145 | 146 | $Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 147 | $Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1; 148 | 149 | $Rule9 = ($Rule9_1 | $Rule9_2); 150 | 151 | # Section 10 152 | $Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 153 | 154 | # Section 11 155 | $Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 156 | 157 | # Section 12 158 | $Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1; 159 | 160 | # Section 13 161 | $Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 162 | 163 | # Section 14 164 | $X7_3 = [\u0EB3]; 165 | 166 | $Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 167 | 168 | $LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14); 169 | 170 | $WordJoin = [:Line_Break=Word_Joiner:]; 171 | 172 | $LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*; 173 | 174 | # 175 | # default numerical definitions 176 | # 177 | $Extend = [\p{Word_Break = Extend}]; 178 | $Format = [\p{Word_Break = Format}]; 179 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 180 | $MidNum = [\p{Word_Break = MidNum}]; 181 | $Numeric = [\p{Word_Break = Numeric}]; 182 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 183 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 184 | $MidNumEx = $MidNum ($Extend | $Format)*; 185 | $NumericEx = $Numeric ($Extend | $Format)*; 186 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 187 | 188 | !!forward; 189 | 190 | $LaoJoinedSyllableEx {200}; 191 | # default numeric rules 192 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? 
$NumericEx $ExtendNumLetEx?)* {100}; 193 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/Myanmar.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # Parses Myanmar text, with syllable as token. 19 | # 20 | 21 | $Cons = [[:Other_Letter:]&[:Myanmar:]]; 22 | $Virama = [\u1039]; 23 | $Asat = [\u103A]; 24 | 25 | $WordJoin = [:Line_Break=Word_Joiner:]; 26 | 27 | # 28 | # default numerical definitions 29 | # 30 | $Extend = [\p{Word_Break = Extend}]; 31 | $Format = [\p{Word_Break = Format}]; 32 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 33 | $MidNum = [\p{Word_Break = MidNum}]; 34 | $Numeric = [\p{Word_Break = Numeric}]; 35 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 36 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 37 | $MidNumEx = $MidNum ($Extend | $Format)*; 38 | $NumericEx = $Numeric ($Extend | $Format)*; 39 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 40 | 41 | $ConsEx = $Cons ($Extend | $Format)*; 42 | $AsatEx = $Cons $Asat ($Virama $ConsEx)? 
($Extend | $Format)*; 43 | $MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*; 44 | $MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*; 45 | 46 | !!forward; 47 | $MyanmarJoinedSyllableEx {200}; 48 | 49 | # default numeric rules 50 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; 51 | -------------------------------------------------------------------------------- /data/rbbi/solrcene/source.md: -------------------------------------------------------------------------------- 1 | * https://github.com/chrismattmann/solrcene/tree/master/modules/analysis/icu/src/data -------------------------------------------------------------------------------- /data/rbbi/source.md: -------------------------------------------------------------------------------- 1 | # RBBI files 2 | 3 | * [icu::RuleBasedBreakIterator Class Reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedBreakIterator.html) 4 | * [Boundary Analysis](https://unicode-org.github.io/icu/userguide/boundaryanalysis/) 5 | * [Break Rules](https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html) 6 | * [Updating ICU's built-in Break Iterator rules](https://github.com/unicode-org/icu/blob/main/docs/processes/rules_update.md) 7 | 8 | ## Current Lucene 9 | 10 | * [gitbox.apache.org - lucene](https://gitbox.apache.org/repos/asf?p=lucene.git;a=tree;f=lucene/analysis/icu/src/data/uax29;h=8423b0c7713159c3dffb549f18a37c425eb96001;hb=HEAD) 11 | 12 | ## Old Lucene 13 | 14 | * [apache/lucene-solr](https://github.com/apache/lucene-solr/tree/releases/lucene-solr/4.0.0/lucene/analysis/icu/src/data/uax29) 15 | 16 | 17 | ## Misc 18 | 19 | * https://stackoverflow.com/questions/559949/the-word-break-rule-file 20 | 21 | ```txt 22 | RuleBasedBreakIterator (icu) 23 | RuleBasedCollator (icu) 24 | RuleBasedNumberFormat (icu) 25 | RuleBasedTimeZone (icu) 26 | ``` 27 | 28 | 
-------------------------------------------------------------------------------- /data/régions_métropolitaines.tsv: -------------------------------------------------------------------------------- 1 | "Dénomination " "Chef-lieu de région " Superficie (km2) Population (2019) "Population estimée (2022) " "Densité (2019) (hab./km2) " Code Insee 2 | Occitanie Toulouse 72 724 5 933 185 6 053 548 81,6 76 3 | Grand Est Strasbourg 57 441 5 556 219 5 542 094 96,7 44 4 | Normandie Rouen 29 907 3 325 032 3 307 286 111,2 28 5 | Bretagne Rennes 27 208 3 354 854 3 402 932 123,3 53 6 | Île-de-France Paris 12 011 12 262 544 12 395 148 1020,9 11 7 | Centre-Val de Loire Orléans 39 151 2 573 180 2 564 915 65,7 24 8 | Pays de la Loire Nantes 32 082 3 806 461 3 873 096 118,6 52 9 | Provence-Alpes-Côte d'Azur Marseille 31 400 5 081 101 5 131 187 161,8 93 10 | Auvergne-Rhône-Alpes Lyon 69 711 8 042 936 8 153 233 115,4 84 11 | Hauts-de-France Lille 31 806 6 004 947 5 987 172 188,8 32 12 | Bourgogne-Franche-Comté Dijon 47 784 2 805 580 2 785 393 58,7 27 13 | Nouvelle-Aquitaine Bordeaux 84 036 6 010 289 6 081 985 71,5 75 14 | Corse Ajaccio 8 680 340 440 349 465 39,2 94 -------------------------------------------------------------------------------- /data/sorani_alphabet.tsv: -------------------------------------------------------------------------------- 1 | Order Character Codepoint 2 | 1 ئ U+0626 3 | 2 ا U+0627 4 | 3 ب U+0628 5 | 4 پ U+067E 6 | 5 ت U+062A 7 | 6 ج U+062C 8 | 7 چ U+0686 9 | 8 ح U+062D 10 | 9 خ U+062E 11 | 10 د U+062F 12 | 11 ر U+0631 13 | 12 ڕ U+0695 14 | 13 ز U+0632 15 | 14 ژ U+0698 16 | 15 س U+0633 17 | 16 ش U+0634 18 | 17 ع U+0639 19 | 18 غ U+063A 20 | 19 ف U+0641 21 | 20 ڤ U+06A4 22 | 21 ق U+0642 23 | 22 ک U+06A9 24 | 23 گ U+06AF 25 | 24 ل U+0644 26 | 25 ڵ U+06B5 27 | 26 م U+0645 28 | 27 ن U+0646 29 | 28 ه U+0647 30 | 29 ە U+06D5 31 | 30 و U+0648 32 | 31 وو U+0648 U+0648 33 | 32 ۆ U+06C6 34 | 33 ی U+06CC 35 | 34 ێ U+06CE 
-------------------------------------------------------------------------------- /data/sorani_alphabet_wikipedia.tsv: -------------------------------------------------------------------------------- 1 | Order Character Codepoint 2 | 1 ئ U+0626 3 | 2 ا U+0627 4 | 3 ب U+0628 5 | 4 پ U+067E 6 | 5 ت U+062A 7 | 6 ج U+062C 8 | 7 چ U+0686 9 | 8 ح U+062D 10 | 9 خ U+062E 11 | 10 د U+062F 12 | 11 ر U+0631 13 | 12 ڕ U+0695 14 | 13 ز U+0632 15 | 14 ژ U+0698 16 | 15 س U+0633 17 | 16 ش U+0634 18 | 17 ع U+0639 19 | 18 غ U+063A 20 | 19 ف U+0641 21 | 20 ڤ U+06A4 22 | 21 ق U+0642 23 | 22 ک U+06A9 24 | 23 گ U+06AF 25 | 24 ل U+0644 26 | 25 ڵ U+06B5 27 | 26 م U+0645 28 | 27 ن U+0646 29 | 28 ه U+0647 30 | 29 ە U+06D5 31 | 30 و U+0648 32 | 32 ۆ U+06C6 33 | 31 وو U+0648 U+0648 34 | 33 ی U+06CC 35 | 34 ێ U+06CE -------------------------------------------------------------------------------- /data/source.md: -------------------------------------------------------------------------------- 1 | # Sources 2 | 3 | ## klpt_stopwords 4 | 5 | The [stopword list](https://github.com/sinaahmadi/klpt/blob/master/klpt/data/stopwords.json) is from Sina Ahmadi's [Kurdish Language Processing Toolkit](https://github.com/sinaahmadi/klpt), which was released under an [Attribution-ShareAlike 4.0 International Public License](https://github.com/sinaahmadi/klpt/blob/master/LICENSE). 
6 | 7 | -------------------------------------------------------------------------------- /data/türkiye'ninz-illeri.tsv: -------------------------------------------------------------------------------- 1 | Ad Alan (km²) Nüfus (2019) NY kişi/km² Plaka kodu Telefon kodu Vali 2 | İstanbul 5.461 15.519.267 2.841,83 34 212, 216 Ali Yerlikaya 3 | Eskişehir 13.960 887.475 63,57 26 222 Erol Ayyıldız 4 | Bursa 10.813 3.056.120 282,63 16 224 Yakup Canbolat 5 | Yalova 798 270.976 339,56 77 226 Muammer Erol 6 | Bilecik 4.179 219.427 52,50 11 228 Bilal Şentürk 7 | İzmir 11.891 4.367.251 367,27 35 232 Yavuz Selim Köşger 8 | Manisa 13.339 1.440.611 107,99 45 236 Yaşar Karadeniz 9 | Antalya 20.177 2.511.700 124,48 07 242 Ersin Yazıcı 10 | Isparta 8.946 444.914 49,73 32 246 Ömer Seymenoğlu 11 | Burdur 7.175 270.796 37,74 15 248 Ali Arslantaş 12 | Muğla 12.654 983.142 77,69 48 252 Orhan Tavlı 13 | Aydın 8.116 1.110.972 136,88 09 256 Hüseyin Aksoy 14 | Denizli 12.134 1.037.208 85,47 20 258 Ali Fuat Atik 15 | Kocaeli 3.397 1.953.035 574,92 41 262 Seddar Yavuz 16 | Sakarya 4.824 1.029.650 213,44 54 264 Çetin Oktay Kaldırım 17 | Balıkesir 14.583 1.228.620 84,25 10 266 Hasan Şıldak 18 | Afyonkarahisar 14.016 729.483 52,04 03 272 Gökmen Çiçek 19 | Kütahya 11.634 579.257 49,79 43 274 Ali Çelik 20 | Uşak 5.555 370.509 66,69 64 276 Funda Kocabıyık 21 | Tekirdağ 6.190 1.055.412 170,50 59 282 Aziz Yıldırım 22 | Edirne 6.145 413.903 67,35 22 284 Ekrem Canalp 23 | Çanakkale 9.817 542.157 55,22 17 286 İlhami Aktaş 24 | Kırklareli 6.459 361.836 56,02 39 288 Osman Bilgin 25 | Ankara 25.632 5.639.076 220 06 312 Vasip Şahin 26 | Kırıkkale 4.791 283.017 59,07 71 318 Yunus Sezer 27 | Adana 13.844 2.237.940 161,65 01 322 Süleyman Elban 28 | Mersin 16.010 1.840.425 114,95 33 324 Ali İhsan Su 29 | Hatay 5.524 1.628.894 294,87 31 326 Rahmi Doğan 30 | Osmaniye 3.320 538.759 162,27 80 328 Erdinç Yılmaz 31 | Konya 40.838 2.232.374 54,66 42 332 Vahdettin Özkan 32 | Karaman 8.678 253.279 29,18 70 338 Mehmet 
Alpaslan Işık 33 | Gaziantep 6.803 2.069.364 304,18 27 342 Davut Gül 34 | Kahramanmaraş 14.520 1.154.102 79,48 46 344 Ömer Faruk Coşkun 35 | Sivas 28.164 638.956 22,68 58 346 Salih Ayhan 36 | Kilis 1.412 142.490 100,91 79 348 Recep Soytürk 37 | Kayseri 16.970 1.407.409 82,93 38 352 Şehmus Günaydın 38 | Yozgat 13.690 421.200 30,76 66 354 Ziya Polat 39 | Tokat 10.042 612.747 61,01 60 356 Ozan Balcı 40 | Amasya 5.628 337.800 60,02 05 358 Mustafa Masatlı 41 | Samsun 9.725 1.348.542 138,66 55 362 Zülkif Dağlı 42 | Çorum 12.428 530.864 42,71 19 364 Mustafa Çiftçi 43 | Kastamonu 13.064 379.405 29,04 37 366 Avni Çakır 44 | Sinop 5.717 218.243 38,17 57 368 Erol Karaömeroğlu 45 | Karabük 4.142 248.458 59,98 78 370 Fuat Gürel 46 | Zonguldak 3.342 596.053 178,35 67 372 Mustafa Tutulmaz 47 | Bolu 8.313 316.126 38,02 14 374 Ahmet Ümit 48 | Çankırı 7.542 195.789 25,95 18 376 Abdullah Ayaz 49 | Bartın 2.330 198.249 85,08 74 378 Sinan Güner 50 | Düzce 2.492 392.166 157,36 81 380 Cevdet Atay 51 | Aksaray 7.659 416.367 54,36 68 382 Hamza Aydoğdu 52 | Nevşehir 5.485 303.010 55,24 50 384 İnci Sezer Becel 53 | Kırşehir 6.584 242.938 36,89 40 386 İbrahim Akın 54 | Niğde 7.234 362.861 48,59 51 388 Yılmaz Şimşek 55 | Diyarbakır 15.168 1.756.353 115,79 21 412 Münir Karaloğlu 56 | Şanlıurfa 19.242 2.073.614 107,76 63 414 Abdullah Erin 57 | Adıyaman 7.337 626.465 85,38 02 416 Aykut Pekmez 58 | Malatya 12.259 800.165 65,27 44 422 Aydın Baruş 59 | Elazığ 9.383 591.098 62,99 23 424 Erkaya Yırık 60 | Bingöl 8.004 279.812 34,95 12 426 Kadir Ekinci 61 | Tunceli 7.582 84.660 11,16 62 428 Mehmet Ali Özkan 62 | Van 20.921 1.136.757 54,33 65 432 Mehmet Emin Bilmez 63 | Bitlis 8.294 348.115 41,97 13 434 Oktay Çağatay 64 | Muş 8.650 408.809 47,26 49 436 İlker Gündüzöz 65 | Hakkâri 7.095 280.991 39,60 30 438 İdris Akbıyık 66 | Erzurum 25.006 762.062 30,47 25 442 Okay Memiş 67 | Erzincan 11.815 234.747 19,86 24 446 Mehmet Makas 68 | Ordu 5.861 754.198 128,68 52 452 Tuncay Sonel 69 | Giresun 7.025 448.400 
63,82 28 454 Enver Ünlü 70 | Gümüşhane 6.668 164.521 24,67 29 456 Kamuran Taşbilek 71 | Bayburt 3.746 84.843 22,64 69 458 Cüneyt Epcim 72 | Trabzon 4.628 808.974 174,79 61 462 İsmail Ustaoğlu 73 | Rize 3.835 343.212 89,49 53 464 Kemal Çeber 74 | Artvin 7.393 170.875 23,11 08 466 Yılmaz Doruk 75 | Ağrı 11.099 536.199 48,31 04 472 Osman Varol 76 | Kars 10.193 285.410 28 36 474 Türker Öksüz 77 | Iğdır 3.664 199.442 54,43 76 476 Hüseyin Engin Sarıibrahim 78 | Ardahan 4.934 97.319 19,72 75 478 Hüseyin Öner 79 | Mardin 8.780 838.778 95,53 47 482 Mahmut Demirtaş 80 | Siirt 5.717 330.280 57,77 56 484 Osman Hacıbektaşoğlu 81 | Şırnak 7.078 529.615 74,82 73 486 Ali Hamza Pehlivan 82 | Batman 4.477 608.659 135,95 72 488 Hulusi Şahin -------------------------------------------------------------------------------- /data/wordlists/source.md: -------------------------------------------------------------------------------- 1 | # Data sources – wordlists 2 | 3 | * [kurdi_words.txt](https://raw.githubusercontent.com/0xdolan/kurdi/master/corpus/kurdi_words.txt) (Sorani) -------------------------------------------------------------------------------- /docs/DRAFT_icu_transforms.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/docs/DRAFT_icu_transforms.pdf -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Python internationalisation -------------------------------------------------------------------------------- /docs/matplotlib.md: -------------------------------------------------------------------------------- 1 | appropriate # Python and Pandas internationalisation 2 | 3 | ## Data visualisation issues for languages that need bidirectional support or complex font rendering. 
4 | 5 | _Matplotlib_ is a commonly used tool for basic data visualisation in Python, and is the default plotting tool with _pandas.Dataframe.plot_. It is also used by _seaborn_ and _wordcloud_, along with other libraries and tools. 6 | 7 | The default backends for _Matplotlib_ have a number of limitations: 8 | 9 | 1. No support for the Unicode bidirectional algorithm, 10 | 2. No support for complex font rendering 11 | 12 | This places severe limits on what natural languages can be used in titles, labels, legends, and other text elements in plots. 13 | 14 | The package [mplcairo](https://github.com/matplotlib/mplcairo) provides an alternative backend for _matplotlib_ that uses [Raqm](https://github.com/HOST-Oman/libraqm) and [GNU FriBidi](https://github.com/fribidi/fribidi) for bidirectional text layout and complex rendering of OpenType features. This allows most languages to be supported in plots. 15 | 16 | The key limitations for _mplcairo_ are bugs in IPython and the lack of support for _Jupyter notebooks_. 17 | 18 | Using the _mplcairo_ backend for _matplotlib_ we can display plot titles, axes labels and categorical tick labels in any language we need to support. 19 | 20 | There are two missing pieces at this point: 21 | 22 | 1. Display of numeric tick labels in a numeral system appropriate for the UI language. 23 | 2. Choice of bidirectional layout appropriate to the requirements of the 24 | data visualisation. 25 | 26 | 27 | ## Numeral systems 28 | 29 | Regarding the first issue, it is possible to use `matplotlib.ticker.FuncFormatter()` to apply a function to convert to the target numeral system, and apply necessary grouping and decimal separators. 30 | 31 | ### RTL layout and data visualisation 32 | 33 | It isn't always necessary to change the layout of the plot. If the plot is using a cartesian coordinate system, it is best to use the default layout.
34 | The layout used, combined with user expectations, will impact the interpretation of trends in data visualisations. User interpretation of the visualisations, combined with user experience, are critical inputs into a data visualisation design. 35 | 36 | If an RTL layout is required: 37 | 38 | 1. Use `yaxis.tick_right()` and `yaxis.set_label_position("right")` to reposition the y-axis to the right side of the plot 39 | 2. Use `plt.gca().invert_xaxis()` to invert the x-axis. This step may not be necessary. UX is an important consideration. 40 | 41 | ### Examples 42 | 43 | The following Python scripts use [Sorani Kurdish data](https://github.com/enabling-languages/python-i18n/blob/main/data/demographics.tsv): 44 | 45 | * [matplotlib](https://github.com/enabling-languages/python-i18n/blob/main/py/matplotlib_kurdish.py) 46 | * [pandas.Dataframe.plot](https://github.com/enabling-languages/python-i18n/blob/main/py/pandas_plot_kurdish.py) 47 | * [seaborn](https://github.com/enabling-languages/python-i18n/blob/main/py/seaborn_kurdish.py) 48 | * [wordcloud](https://github.com/enabling-languages/python-i18n/blob/main/py/wordcloud_kurdish.py) 49 | 50 |
Kurdish plot using Seaborn
Fig.1 - Kurdish bar charts in both LTR and RTL layouts.
51 | 52 |
Kurdish wordcloud
Fig.2 - Kurdish wordcloud.
53 | -------------------------------------------------------------------------------- /notebooks/Sorting_emoji.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sorting emoji\n", 8 | "\n", 9 | "Python's inbuild sorting algorithms sort emoji by codepoint.\n", 10 | "\n", 11 | "Codepoint order, as well as the default collation rules provided by the Unicode Collation Algorithm do not provide adequate [ordering and grouping](https://www.unicode.org/reports/tr51/#Sorting) of emoji.\n", 12 | "\n", 13 | "The Unicode Common Locale Data Repository (CLDR) provides colation rules for emoji. [Conformant emoji collation](https://www.unicode.org/reports/tr51/#Collation_Conformance) is defined in CLDR tailoring rules for the Unicode Collation Algorthim (UCA).\n", 14 | "\n", 15 | "CLDR groups emoji into broad conceptual categories in order to group related emoji together." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Emoji only collation\n", 23 | "\n", 24 | "For the following discussion we will use the following emoji:\n", 25 | "\n", 26 | "|Character |Codepoint |Description |Category |\n", 27 | "|--------- |--------- |----------- |-------- |\n", 28 | "|🦜 |U+1F99C |Parrot |animal-bird |\n", 29 | "|🥚 |U+1F95A |Egg |food-prepared |\n", 30 | "|🐔 |U+1F414 |Chicken |animal-bird |\n", 31 | "\n", 32 | "The default python sort algorithm will order then in terms of the emoji's codepoint: U+1F414 (chicken), U+1F95A (egg), and then U+1F99C (parrot).\n", 33 | "\n", 34 | "The CLDR ordering would be to sort the two bids together (U+1F414 then U+1F99C), followed by U+1F95A." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "['🐔', '🥚', '🦜']" 46 | ] 47 | }, 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "a = ['🦜', '🥚', '🐔']\n", 55 | "sorted(a)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "Using PyICU, it is possible to sort emoji according to CLDR's collation rules for Emoji. The `-u-co-emoji` Unicode BCP-47 extension will enable CLDR based emoji collation. When sorting just wmoji we can use the langauge subtag `und` (undetermined) as the base for the locale identifier: `und-u-co-emoji`." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "['🐔', '🦜', '🥚']\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "from icu import Collator, Locale\n", 80 | "coll = Collator.createInstance(Locale.createCanonical(\"und-u-co-emoji\"))\n", 81 | "print(sorted(a, key=coll.getSortKey))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "This yields a CLDR based sort using the CLDR emoji collation rules." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Sorting text and emoji\n", 96 | "\n", 97 | "A more complex scenario is sorting a set of text and emoji.\n", 98 | "\n", 99 | "[UTS #35](https://unicode.org/reports/tr35/tr35-collation.html#Combining_Rules) provides a discussion of tailoring and combining rules in relation to sorting emoji and text. 
We'll implement the example given in UTS #35 in Python.\n", 100 | "\n", 101 | "The following characters are used:\n", 102 | "\n", 103 | "|Character |Codepoint |Description |\n", 104 | "|---------- |---------- |------------ |\n", 105 | "|😀 |U+1F600 |Grinning Face |\n", 106 | "|글 |U+AE00 |Hangul Syllable Geul |\n", 107 | "|Z |U+005A |Latin Capital Letter Z |\n", 108 | "|ü |U+00FC |Latin Small Letter U with Diaeresis |\n", 109 | "|, |U+002C |Comma |\n", 110 | "|✈️️ |U+2708 U+FE0F |Airplane |\n", 111 | "|y |U+0079 |Latin Small Letter Y |\n", 112 | "|☹️ |U+2639 U+FE0F |White Frowning Face |\n", 113 | "|a |U+0061 |Latin Small Letter A |\n", 114 | "\n", 115 | "Enabling emoji collation overrides language specific tailorings. This has no impact on text for languages that use the root collation, but will have a negative impact on languages that do require tailoring to obtain the correct collation order.\n", 116 | "\n", 117 | "The python sort algorithm will order content by codepoint:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 11, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "[',', 'Z', 'a', 'y', 'ü', '☹️', '✈️️', '글', '😀']" 129 | ] 130 | }, 131 | "execution_count": 11, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "# List to be sorted\n", 138 | "b = ['😀', '글', 'Z', 'ü', ',', '✈️️', 'y', '☹️', 'a']\n", 139 | "\n", 140 | "#Default Python sort\n", 141 | "sorted(b)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "The `en` locale identifier will use the CLDR root collation. 
Emoji are not sorted using the CLDR emoji collation rules:" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 25, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "[',', '☹️', '✈️️', '😀', 'a', 'ü', 'y', 'Z', '글']" 160 | ] 161 | }, 162 | "execution_count": 25, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# locale: en\n", 169 | "en_coll = Collator.createInstance(Locale.forLanguageTag(\"en\"));\n", 170 | "sorted(b, key=en_coll.getSortKey)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Enabling emoji collation using the `en-u-co-emoji` locale will sort the emoji based on the emoji collation rules and the remaining characters are sorted as per the root collation algorithm." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 24, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']" 189 | ] 190 | }, 191 | "execution_count": 24, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "# locale for en-u-co-emoji\n", 198 | "en_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"en-u-co-emoji\"));\n", 199 | "sorted(b, key=en_emoji_coll.getSortKey)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "`en-u-co-emoji\"`will yield the same result as `und-u-co-emoji`, i.e. sort emoji according to the CLDR emoji collation order and sort other characters according to the root collation algorithm." 
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 23, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']" 218 | ] 219 | }, 220 | "execution_count": 23, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "# locale for und-u-co-emoji\n", 227 | "und_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"und-u-co-emoji\"));\n", 228 | "sorted(b, key=und_emoji_coll.getSortKey)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "The `da` locale has tailored collation rules to order text in the sequence required for Danish:" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 22, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "[',', '☹️', '✈️️', '😀', 'a', 'y', 'ü', 'Z', '글']" 247 | ] 248 | }, 249 | "execution_count": 22, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "# locale for da\n", 256 | "da_coll = Collator.createInstance(Locale.forLanguageTag(\"da\"));\n", 257 | "sorted(b, key=da_coll.getSortKey)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Adding emoji collation support overrides the Danish language tailorings. Look at the order of __ü__ in the list for the `da` and `da-u-co-emoji` locales." 
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 20, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "[',', '😀', '☹️', '✈️️', 'a', 'ü', 'y', 'Z', '글']" 276 | ] 277 | }, 278 | "execution_count": 20, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "# locale for da-u-co-emoji\n", 285 | "da_emoji_coll = Collator.createInstance(Locale.forLanguageTag(\"da-u-co-emoji\"));\n", 286 | "sorted(b, key=da_emoji_coll.getSortKey)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "To overcome this, it is possible to combine the collation rules for the `da` and `da_and_emoji_rules`. We can do this by:\n", 294 | "\n", 295 | "1. Initiating collator instances for each locale, and retrieve the rules\n", 296 | "2. Concatenate the rule sets\n", 297 | "3. Initiate a collator instance using `RuleBasedCollator`\n", 298 | "\n", 299 | "This will order emoji according to the emoji collation rules and order Latin script text according to Danish collation rules." 
300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 19, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "[',', '😀', '☹️', '✈️️', 'a', 'y', 'ü', 'Z', '글']" 311 | ] 312 | }, 313 | "execution_count": 19, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "# Combinded rules\n", 320 | "from icu import RuleBasedCollator\n", 321 | "#da_and_emoji_rules = Collator.createInstance(Locale.forLanguageTag('da')).getRules() + Collator.createInstance(Locale.forLanguageTag('und-u-co-emoji')).getRules()\n", 322 | "da_rules = Collator.createInstance(Locale.forLanguageTag('da')).getRules()\n", 323 | "emoji_rules = Collator.createInstance(Locale.forLanguageTag('und-u-co-emoji')).getRules()\n", 324 | "da_and_emoji_rules = da_rules + emoji_rules\n", 325 | "combined_coll = RuleBasedCollator(da_and_emoji_rules)\n", 326 | "sorted(b, key=combined_coll.getSortKey)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "The same approach is needed for other languages that are not supported by the CLDR root collation algorithm and require tailored rules." 
334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## Resources\n", 341 | "\n", 342 | "* [Emoji ordering chart](https://www.unicode.org/emoji/charts/emoji-ordering.html)\n", 343 | "* [CLDR Root collation rules](https://github.com/unicode-org/cldr/blob/353527cdabf1e8870d261beb3c908de6deb1915b/common/collation/root.xml#L951)" 344 | ] 345 | } 346 | ], 347 | "metadata": { 348 | "interpreter": { 349 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3" 350 | }, 351 | "kernelspec": { 352 | "display_name": "Python 3.8.1 64-bit ('el': venv)", 353 | "language": "python", 354 | "name": "python3" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 3 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython3", 366 | "version": "3.8.1" 367 | }, 368 | "orig_nbformat": 4 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 2 372 | } 373 | -------------------------------------------------------------------------------- /notebooks/ethiopic_numbers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Working with Ethiopic numbers\n", 8 | "\n", 9 | "CLDR sets the default number system for languages written in the Ethiopic script to the Arabic (Latin) Number System. The Ethiopic number system is marked as an alternative (traditional) numbering system, and is not used by default.\n", 10 | "\n", 11 | "CLDR defines decimal and algorithmic [number systems](https://github.com/unicode-org/cldr/blob/main/common/supplemental/numberingSystems.xml). 
The Ethiopic number system is an algorithmic alphabetic numeral system.\n", 12 | "\n", 13 | "For a description of the number system refer to [Ethiopic number system](http://www.geez.org/Numerals/) for more details. A list of [sample numbers](http://www.geez.org/Numerals/NumberSamples.html) is available.\n", 14 | "\n", 15 | "ICU provides a number of classes used for [formatting numbers](https://unicode-org.github.io/icu/userguide/format_parse/numbers/), but the class needed to format Ethiopic numbers is the [RuleBasedNumberFormat](https://unicode-org.github.io/icu/userguide/format_parse/numbers/rbnf.html) class.\n", 16 | "\n", 17 | "Refer to the ICU4C API [RuleBasedNumberFormat class reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html). The RBNF rule set for [Ethiopic](https://github.com/unicode-org/cldr/blob/6c8ad511801043124d6ce25e0388412fe9b7b2f4/common/rbnf/root.xml#L246) is defined in the CLDR root locale.\n", 18 | "\n", 19 | "The most common use for the `RuleBasedNumberFormat` class is to format numbers as ordinals or as words in the target locale. It is also the nechanism for formating and parsing algorithmic number systems.\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Spelling out numbers in Amharic\n", 27 | "\n", 28 | "1. Create a locale instance\n", 29 | "2. create a number formatter instance using `RuleBasedNumberFormat` class\n", 30 | "3. 
Format the number\n", 31 | "\n", 32 | "We start by importing the necessary classes from PyICU:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 15, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from icu import Locale, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "- [Locale](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Locale.html) – methods for initiating and working with ICU's locale objects.\n", 49 | "- [Formattable](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Formattable.html) – a wrapper that converts between numeric types, strings and date objects. It's primary use is in formatting.\n", 50 | "- [RulebasedNumberFormat](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html) – formats numbers according to a set of rules. The rules maybe inbuilt set of rules, or custom rules.\n", 51 | "- [URBNFRuleSetTag](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/namespaceicu.html#a55dbbbdd4946251c23988013e06e695e) – tags for predefined rule sets to use with `RulebasedNumberFormat`." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "First, create a Locale instance, and a formatter instance. There are a number of methods for building a Locale instance. To keep things simple, we'll just pass a locale identifier directly to the class." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 16, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "loc = Locale('am_ET')\n", 68 | "formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, loc)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "We can control what rule sets are used. 
The following rule sets are available:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 17, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "%spellout-numbering-year\n", 88 | "%spellout-numbering\n", 89 | "%spellout-cardinal\n", 90 | "%spellout-ordinal\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "for n in range(formatter.getNumberOfRuleSetNames()):\n", 96 | " print(formatter.getRuleSetName(n))" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 26, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "%spellout-numbering\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "print(formatter.getDefaultRuleSetName())" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "The `%spellout-numbering` is the default for Amharic, but `%spellout-numbering-year`, `%spellout-cardinal`, and `%spellout-ordinal` are alternative rule sets available. Use the [setDefaultRuleSet](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html#aa0fbc19602d99cfcb550e2c11cb9ca91) method, if required." 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "`RuleBasedNumberFormat` can be used in a number of ways, refer to the API documentation. In this particular case we want to create a formatter that uses the Amharic spellout rule set. We passed the relevant rule set identifier and the required locale to create a formatter instance.\n", 128 | "\n", 129 | "The same Python code can be used for any locale that has spellout [rule sets](https://github.com/unicode-org/icu/tree/main/icu4c/source/data/rbnf).\n", 130 | "\n", 131 | "To convert the number to its word representation, use the `format` method." 
132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 18, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "አስር ሁለት ሺ ሦስት መቶ አራት አስር አምስት\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "number = 12345\n", 149 | "r = formatter.format(number)\n", 150 | "print(r)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "You can use the `parse` method to convert the word representation back into a formatted number:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 19, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "" 169 | ] 170 | }, 171 | "execution_count": 19, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "rreverse = formatter.parse(r)\n", 178 | "rreverse" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "This returns a Formattable object, which you can either render as a formatted string, or convert to an integer or float, as required." 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 20, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "12,345\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "rreverse_string = str(rreverse)\n", 203 | "print(rreverse_string)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "Available methods are:\n", 211 | "\n", 212 | "- getDouble – returns a floating point number\n", 213 | "- getInt64 – returns an integer\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 21, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "12345\n", 226 | "12345.0\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "ireverse = rreverse.getInt64()\n", 232 | "print(ireverse)\n", 233 | "\n", 234 | "dreverse = rreverse.getDouble()\n", 235 | "print(dreverse)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Working with the Ethiopic numeral system\n", 243 | "\n", 244 | "Creating a formatter for Ethiopic numbers is a two step process, we need to create a formatter passing a rule set identifier for number systems and a locale, then we need to set the actual rule set needed. Locales may support multiple rule sets. \n", 245 | "\n", 246 | "1. Create a locale instance\n", 247 | "2. Create a formatter instance\n", 248 | "3. Set the rule set required\n", 249 | "\n", 250 | "We'll reuse the existing Locale instance." 
251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 22, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "%armenian-lower\n", 263 | "%armenian-upper\n", 264 | "%cyrillic-lower\n", 265 | "%ethiopic\n", 266 | "%georgian\n", 267 | "%greek-lower\n", 268 | "%greek-upper\n", 269 | "%hebrew\n", 270 | "%hebrew-item\n", 271 | "%roman-lower\n", 272 | "%roman-upper\n", 273 | "%tamil\n", 274 | "%zz-default\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "eformatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, loc)\n", 280 | "\n", 281 | "for n in range(eformatter.getNumberOfRuleSetNames()):\n", 282 | " print(eformatter.getRuleSetName(n))\n" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "The public name of the rule set we need is `%ethiopic`, so we set this as our default rule set:" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 23, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "eformatter.setDefaultRuleSet('%ethiopic')" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Then format the number as above:" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 24, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "፳፫፻፵፩\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "number = 2341\n", 323 | "r = eformatter.format(number)\n", 324 | "print(r)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "And likewise, we can parse the ethiopic digits back to the Arabic (Latin) number system:" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 25, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "name": "stdout", 341 | "output_type": 
"stream", 342 | "text": [ 343 | "2,341\n", 344 | "2341\n", 345 | "2341.0\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "rreverse = eformatter.parse(r)\n", 351 | "print(str(rreverse))\n", 352 | "print(rreverse.getInt64())\n", 353 | "print(rreverse.getDouble())" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Further information\n", 361 | "\n", 362 | "- Unicode Locale Data Markup Language (LDML) [Part 3: Numbers](https://www.unicode.org/reports/tr35/tr35-numbers.html#unicode-locale-data-markup-language-ldmlpart-3-numbers)\n", 363 | " - [Number Systems](http://www.unicode.org/reports/tr35/tr35-numbers.html#Numbering_Systems)\n", 364 | " - [Rule-Based Number Formatting](https://www.unicode.org/reports/tr35/tr35-numbers.html#Rule-Based_Number_Formatting)" 365 | ] 366 | } 367 | ], 368 | "metadata": { 369 | "kernelspec": { 370 | "display_name": "athinkra", 371 | "language": "python", 372 | "name": "python3" 373 | }, 374 | "language_info": { 375 | "codemirror_mode": { 376 | "name": "ipython", 377 | "version": 3 378 | }, 379 | "file_extension": ".py", 380 | "mimetype": "text/x-python", 381 | "name": "python", 382 | "nbconvert_exporter": "python", 383 | "pygments_lexer": "ipython3", 384 | "version": "3.11.0" 385 | }, 386 | "orig_nbformat": 4 387 | }, 388 | "nbformat": 4, 389 | "nbformat_minor": 2 390 | } 391 | -------------------------------------------------------------------------------- /notebooks/images/sorani_plotly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly.png -------------------------------------------------------------------------------- /notebooks/images/sorani_plotly2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly2.png -------------------------------------------------------------------------------- /notebooks/images/sorani_plotly_inline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/images/sorani_plotly_inline.png -------------------------------------------------------------------------------- /notebooks/img/1440px-Lake_Dukan_12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/1440px-Lake_Dukan_12.jpg -------------------------------------------------------------------------------- /notebooks/img/ckb_IQ_collation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/ckb_IQ_collation.png -------------------------------------------------------------------------------- /notebooks/img/khamti.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/khamti.jpg -------------------------------------------------------------------------------- /notebooks/img/linux1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/linux1.png -------------------------------------------------------------------------------- /notebooks/img/macos1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/macos1.png -------------------------------------------------------------------------------- /notebooks/img/mplcairo_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/mplcairo_output.png -------------------------------------------------------------------------------- /notebooks/img/sibe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/sibe.jpg -------------------------------------------------------------------------------- /notebooks/img/std_matplotlib_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/std_matplotlib_output.png -------------------------------------------------------------------------------- /notebooks/img/tai_aiton.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/tai_aiton.jpg -------------------------------------------------------------------------------- /notebooks/img/tai_aiton_text_to_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/tai_aiton_text_to_image.png -------------------------------------------------------------------------------- 
/notebooks/img/yolngu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/notebooks/img/yolngu.jpg -------------------------------------------------------------------------------- /notebooks/pandas_plot_mplcairo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sorani Kurdish data using Pandas plot\n", 8 | "\n", 9 | "Enabling `mplcairo`, with `raqm`, as the backend for `matplotlib` will allow us to reuse the [Kurdish matplotlib example](https://github.com/enabling-languages/python-i18n/blob/main/notebooks/matplotlib_mplcairo.ipynb) with Pandas `plot`.\n", 10 | "\n", 11 | "__Please note:__ This notebook will run on MacOS, but tends to be buggy on other platforms. The _mplcairo_ package does not currently support Jupyter. It is better to use _mplcairo_ in a script, rather than a notebook. See [pandas_plot_kurdish.py](../py/pandas_plot_kurdish.py)." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Setup" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "import locale, platform\n", 29 | "import mplcairo\n", 30 | "import matplotlib as mpl\n", 31 | "if platform.system() == \"Darwin\":\n", 32 | " mpl.use(\"module://mplcairo.macosx\")\n", 33 | "else:\n", 34 | " mpl.use(\"module://mplcairo.qt\")\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import matplotlib.ticker as ticker\n", 37 | "import unicodedata as ud, regex as re" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Helper functions" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "def convert_digits(s, sep = (\",\", \".\")):\n", 54 | " nd = re.compile(r'^-?\\p{Nd}[,.\\u066B\\u066C\\u0020\\u2009\\u202F\\p{Nd}]*$')\n", 55 | " tsep, dsep = sep\n", 56 | " if nd.match(s):\n", 57 | " s = s.replace(tsep, \"\")\n", 58 | " s = ''.join([str(ud.decimal(c, c)) for c in s])\n", 59 | " if dsep in s:\n", 60 | " return float(s.replace(dsep, \".\")) if dsep != \".\" else float(s)\n", 61 | " return int(s)\n", 62 | " return s\n", 63 | "\n", 64 | "seps = (\"\\u066C\", \"\\u066B\")\n", 65 | "digitsconv = lambda x: convert_digits(x.replace(\"-\", \"٠\"), sep = seps)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Process data and plot data" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/html": [ 83 | "
\n", 84 | "\n", 97 | "\n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
---جیھانیتورکیائێرانعێراقسووریا
0کرمانجی14419000791900044300031850001661000
1ئەوانەی بە تورکی دەدوێن57320005732000000
2باشوور33810000338100000
3سۆرانی157600005020005670000
4زازایی - دەملی11250001125000000
5زازایی - ئەلڤێکا184000179000000
6ڕەوەند900003800020000330000
7ھەورامی54000026000280000
8شکاکی49000230002600000
9کۆی گشتی2671200015016000439800039160001661000
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " --- جیھانی تورکیا ئێران عێراق سووریا\n", 206 | "0 کرمانجی 14419000 7919000 443000 3185000 1661000\n", 207 | "1 ئەوانەی بە تورکی دەدوێن 5732000 5732000 0 0 0\n", 208 | "2 باشوور 3381000 0 3381000 0 0\n", 209 | "3 سۆرانی 1576000 0 502000 567000 0\n", 210 | "4 زازایی - دەملی 1125000 1125000 0 0 0\n", 211 | "5 زازایی - ئەلڤێکا 184000 179000 0 0 0\n", 212 | "6 ڕەوەند 90000 38000 20000 33000 0\n", 213 | "7 ھەورامی 54000 0 26000 28000 0\n", 214 | "8 شکاکی 49000 23000 26000 0 0\n", 215 | "9 کۆی گشتی 26712000 15016000 4398000 3916000 1661000" 216 | ] 217 | }, 218 | "execution_count": 5, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "import pandas as pd\n", 225 | "conv = {\n", 226 | " 'سووریا': digitsconv,\n", 227 | " 'عێراق': digitsconv,\n", 228 | " 'ئێران': digitsconv,\n", 229 | " 'تورکیا': digitsconv,\n", 230 | " 'جیھانی': digitsconv\n", 231 | "}\n", 232 | "df = pd.read_table(\"../data/demographics.tsv\", converters=conv)\n", 233 | "df" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 6, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "تورکیا 30032000\n", 246 | "ئێران 8796000\n", 247 | "عێراق 7729000\n", 248 | "سووریا 3322000\n", 249 | "dtype: int64\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "col_list=[\"تورکیا\" ,\"ئێران\" ,\"عێراق\" ,\"سووریا\"]\n", 255 | "\n", 256 | "total_df = df[col_list].sum(axis=0)\n", 257 | "print(total_df)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Using indicies and values of the `total_df` series:" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 20, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "def convert_to_arab_ns(n, p=None, decimal=2, sep_in=[\"\", \".\"], sep_out=[\"\\u066C\", \"\\u066B\"], scale=None):\n", 274 | " 
locale.setlocale(locale.LC_ALL, \"en_US.UTF-8\")\n", 275 | " decimal_places = decimal\n", 276 | " if sep_in == [\"\", \".\"]:\n", 277 | " n = n * scale if scale else n\n", 278 | " format_string = '%0.' + str(decimal_places) + 'f' if type(n) == float else '%d'\n", 279 | " n = locale.format_string(format_string, n, grouping=True, monetary=True)\n", 280 | " n = n.replace(\",\", \"ṯ\").replace(\".\", \"ḏ\")\n", 281 | " #n = str(n)\n", 282 | " if sep_in[0] in [\" \", \",\", \"٬\", \"\\u2009\"]:\n", 283 | " n = n.replace(r'[\\u0020,٬\\u2009]', \"ṯ\")\n", 284 | " elif sep_in[0] == \".\":\n", 285 | " n = n.replace(\".\", \"ṯ\")\n", 286 | " if sep_in[1] in [\",\", \".\", \"٫\"]:\n", 287 | " n = n.replace(r'[,.٫]', \"ḏ\")\n", 288 | " sep = sep_out\n", 289 | " t = n.maketrans(\"0123456789\", \"٠١٢٣٤٥٦٧٨٩\")\n", 290 | " locale.setlocale(locale.LC_ALL, \"\")\n", 291 | " return n.translate(t).replace(\"ṯ\", sep[0] ).replace(\"ḏ\", sep[1])" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 23, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "\n", 301 | "# ax = total_df.plot(kind=\"bar\", title='ڕێژەی دانیشتووانی کورد', xlabel=\"ناوچە\", ylabel=\"ڕێژەی دانیشتووان\" ,rot=0)\n", 302 | "# DEFAULT_NUMERAL_SYSYEM = \"arab\"\n", 303 | "# ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_arab_ns(x, p, scale=0.000001))\n", 304 | "# ax.get_yaxis().set_major_formatter(ns_formatter)\n", 305 | "# plt.show()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# fig = px.bar(x=total_df.index, y=total_df.values)\n", 315 | "fig = total_df.plot(kind=\"bar\", title='ڕێژەی دانیشتووانی کورد', xlabel=\"ناوچە\", ylabel=\"ڕێژەی دانیشتووان\" ,rot=0)\n", 316 | "\n", 317 | "fig.update_layout(\n", 318 | " title={\n", 319 | " 'text': 'ڕێژەی دانیشتووانی کورد',\n", 320 | " 'y':0.95,\n", 321 | " 'x':0.5,\n", 322 | " 'xanchor': 'center',\n", 323 | " 
'yanchor': 'top'},\n", 324 | " xaxis_title=\"ناوچە\",\n", 325 | " yaxis_title=\"ڕێژەی دانیشتووان\",\n", 326 | " font=dict(\n", 327 | " family=\"Vazirmatn\",\n", 328 | " size=14,\n", 329 | " color=\"Grey\"\n", 330 | " )\n", 331 | ")\n", 332 | "\n", 333 | "fig.show()" 334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "interpreter": { 339 | "hash": "05c935ee2b4ff45f26d355be2499c84aedc5a4939bfa2f7a9b7f00dda4a86ade" 340 | }, 341 | "kernelspec": { 342 | "display_name": "Python 3.10.1 ('el-test')", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 3 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.10.1" 357 | }, 358 | "orig_nbformat": 4 359 | }, 360 | "nbformat": 4, 361 | "nbformat_minor": 2 362 | } 363 | -------------------------------------------------------------------------------- /notebooks/persian_df.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas internationalisation: Persian (فارسی) data example\n", 8 | "\n", 9 | "An example of reading in Persian data in Pandas.\n", 10 | "\n", 11 | "The file `fa_stats.tsv` is a tab delimited file in Persian. Column 1 contains a four digit year based on the Islamic calendar. Columns 2 and 3 contain integers using Eastern Arabic-Indic digits, using the Arabic thousands seperator.\n", 12 | "\n", 13 | "A set of conversion functions are used with `pd.read_table()` to convert the data to a format that cen be used in Pandas.\n", 14 | "\n", 15 | "Column 1 is converted to the Gregorian Calendar, using a combination of the `convert_digits()` function and PyICU's `icu.Calendar` and `icu.GregorianCalendar` modules. 
After the dataframe is available, we use `pandas.Series.dt.year` to convert the datetime objects in the column to Four digit year display.\n", 16 | "\n", 17 | "The `convert_digits()` function is used to convert the Eastern Arabic-Indic digits in columns 2 and 3 to Arabic digits that can be manipulated by Pandas." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import unicodedataplus as ud, regex as re, pandas as pd\n", 27 | "from icu import Locale, Calendar, GregorianCalendar\n", 28 | "\n", 29 | "def convert_digits(s, sep = (\",\", \".\")):\n", 30 | " nd = re.compile(r'^-?\\p{Nd}[,.\\u066B\\u066C\\u0020\\u2009\\u202F\\p{Nd}]*$')\n", 31 | " tsep, dsep = sep\n", 32 | " if nd.match(s):\n", 33 | " s = s.replace(tsep, \"\")\n", 34 | " s = ''.join([str(ud.decimal(c, c)) for c in s])\n", 35 | " if dsep in s:\n", 36 | " return float(s.replace(dsep, \".\")) if dsep != \".\" else float(s)\n", 37 | " return int(s)\n", 38 | " return s\n", 39 | "\n", 40 | "loc = \"fa_IR\"\n", 41 | "in_c = Calendar.createInstance(Locale(loc + \"@calendar=persian\"))\n", 42 | "out_c = GregorianCalendar(Locale(loc + \"@calendar=gregorian\"))\n", 43 | "\n", 44 | "def convert_islamic_year(y, in_c, out_c):\n", 45 | " y = convert_digits(y.strip())\n", 46 | " in_c.set(Calendar.YEAR, y)\n", 47 | " out_c.setTime(in_c.getTime())\n", 48 | " return out_c.get(Calendar.YEAR)\n", 49 | "\n", 50 | "seps = (\"\\u066C\", \"\\u066B\")\n", 51 | "digitf = lambda x: convert_digits(x.strip(), sep = seps)\n", 52 | "datef = lambda x: convert_islamic_year(x, in_c=in_c, out_c=out_c)\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
سالولادتوفات
01959864846176288
11960876206171040
21961902260159371
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " سال ولادت وفات\n", 111 | "0 1959 864846 176288\n", 112 | "1 1960 876206 171040\n", 113 | "2 1961 902260 159371" 114 | ] 115 | }, 116 | "execution_count": 2, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "conv = {\"سال\": datef ,\"ولادت\": digitf, \"وفات\": digitf}\n", 123 | "df = pd.read_table(\"../data/csv/fa_stats.tsv\", converters=conv, parse_dates=['سال'])\n", 124 | "df[\"سال\"] = df[\"سال\"].dt.year\n", 125 | "df.head(3)" 126 | ] 127 | } 128 | ], 129 | "metadata": { 130 | "interpreter": { 131 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3" 132 | }, 133 | "kernelspec": { 134 | "display_name": "Python 3", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.8.1" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 2 153 | } 154 | -------------------------------------------------------------------------------- /notebooks/strings_casing_matching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python string operations: casing and matching" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "|Operation |Python |Pandas |PyICU |\n", 15 | "|----------- |-------- |------- |------ |\n", 16 | "|Lowercasing |[str.lower()](https://docs.python.org/3/library/stdtypes.html#str.lower) |[pandas.Series.str.lower()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.lower.html?highlight=lower#pandas-series-str-lower) |icu.UnicodeString.toLower() |\n", 17 | 
"|Uppercasing |[str.upper()](https://docs.python.org/3/library/stdtypes.html#str.upper) |[pandas.Series.str.upper()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.upper.html#pandas-series-str-upper) |icu.UnicodeString.toUpper() |\n", 18 | "|Titlecasing |[str.title()](https://docs.python.org/3/library/stdtypes.html#str.title) |[pandas.Series.str.title](pandas.Series.str.title) |icu.UnicodeString.toTitle() |\n", 19 | "|Casefolding |[str.casefold()](https://docs.python.org/3/library/stdtypes.html#str.casefold) |[pandas.Series.str.casefold()]() |icu.UnicodeString.CaseFold() |\n", 20 | "\n", 21 | "The operations [str.capitalize()](https://docs.python.org/3/library/stdtypes.html#str.capitalize)/[pandas.Series.str.capitalize()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.capitalize.html#pandas-series-str-capitalize) and [str.swapcase()](https://docs.python.org/3/library/stdtypes.html#str.swapcase)/[pandas.Series.str.swapcase()](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.swapcase.html#pandas-series-str-swapcase), although string operations, aren't necessarily casing operations.\n", 22 | "\n", 23 | "N.B. we will not explore the differences between an [object and `StringDtype`](https://pandas.pydata.org/docs/user_guide/text.html#behavior-differences) in Pandas." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from el_internationalisation import cp, cpnames, udata" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Python casing operations\n", 40 | "\n", 41 | "Unicode contains a set of special casing mappings. These are divided intto unconditional and conditional mappings. 
All casing operations should support unconditional special mappings by default.\n", 42 | "\n", 43 | "Python's casing operations are language insensitive, that is language is not taken into account when casing operations occur. The current locale has no impact on casing operations, therefore language sensitive mappings are unsupported.\n", 44 | "\n", 45 | "Unconditional mappings:\n", 46 | "\n", 47 | " * Eszett (ß) casing \n", 48 | " * Preserving canonical equivalence of I WITH DOT ABOVE (İ)\n", 49 | " * Ligatures (Latin and Armenian script)\n", 50 | " * When a lowercase character has no corresponding uppercase precomposed character\n", 51 | " * Greek letters with hupogegramménē (ὑπογεγραμμένη) or prosgráphō (προσγράφω) have special uppercase equivalents.\n", 52 | " * Some Greek letters with hupogegramménē (ὑπογεγραμμένη) have no titlecase\n", 53 | "\n", 54 | "Conditional mappings:\n", 55 | " 1. Language-Insensitive Mappings\n", 56 | " * Final form of Greek sigma\n", 57 | " 2. Language-Sensitive Mappings\n", 58 | " * Lithuanian retains the dot in a lowercase i/j when followed by accents\n", 59 | " * For Turkish and Azeri, I and i-dotless; I-dot and i are case pairs\n", 60 | "\n", 61 | "See [Special Casings](https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt), which forms part of the Unicode Character database (UCD)." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Unconditional mappings\n", 69 | "\n", 70 | "Python lowercasing and uppercasing support the unconditional mappings of Unicode's special mappings.\n", 71 | "\n", 72 | "|Character |Lowercase |Titlecase |Uppercase |Notes |\n", 73 | "|---------- |---------- |---------- |---------- |------ |\n", 74 | "\n", 75 | "#### Latin script" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 20, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "ß (00DF) ⇒ SS (0053 0053)\n", 88 | "Titlecase: should not appear word initial.\n", 89 | "i̇ (0069 0307) ⇐ İ (0130)\n", 90 | "Titlecase: İ (0049 0307)\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "# ß\n", 96 | "ESZETT = \"ß\"\n", 97 | "print(f'{ESZETT} ({cp(ESZETT)}) ⇒ {ESZETT.upper()} ({cp(ESZETT.upper())})')\n", 98 | "print(\"Titlecase: should not appear word initial.\")\n", 99 | "\n", 100 | "# I WITH DOT ABOVE\n", 101 | "IDOT = \"\\u0130\"\n", 102 | "print(f'{IDOT.lower()} ({cp(IDOT.lower())}) ⇐ {IDOT} ({cp(IDOT)})')\n", 103 | "print(f'Titlecase: {\"i̇\".title()} ({cp(\"i̇\".title())})')" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Note that Python titlecasing does not resolve back to the precomosed U+0130, but this is part of a wider issue with Python titlecasing, unlike uppercasing and lowercasing, titlecasing does not adhere to the Unicode specification\n", 111 | "\n", 112 | "If we take the name of the Turkish city İstanbul:" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 26, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "İstanbul: 0130 0073 0074 0061 006E 0062 0075 006C\n", 125 | "i̇stanbul: 0069 0307 0073 0074 0061 006E 0062 0075 006C\n", 126 | "Titlecase: İStanbul (0049 0307 
0053 0074 0061 006E 0062 0075 006C)\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print(f'İstanbul: {cp(\"İstanbul\")}')\n", 132 | "istanbul = \"İstanbul\".lower()\n", 133 | "print(f'{istanbul}: {cp(istanbul)}')\n", 134 | "istanbul_title = istanbul.title()\n", 135 | "print(f'Titlecase: {istanbul_title} ({cp(istanbul_title)})')" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "The first three characters in the titlecased string are U+0049 U+0307 U+0053. Python titlecases the first alphabetic character after a non-alphabetic character. Combining diacritics are not considered alphabetic characaters:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 27, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "False" 154 | ] 155 | }, 156 | "execution_count": 27, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "istanbul.isalpha()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "So __i__ is uppercased to __I__, U+0307 is treated as a non-alphabetic character and the titlecasing operation titlecases the __s__, giving us İStanbul as the titlecased version of the string.\n", 170 | "\n", 171 | "It is important to note that the Unicode definition also excludes marks, like combining diacrtics, but Unicode titlecasing does not apply an alphabetic mask to titlecasing." 
172 | ] 173 | } 174 | ], 175 | "metadata": { 176 | "kernelspec": { 177 | "display_name": "Python 3.8.1 ('el')", 178 | "language": "python", 179 | "name": "python3" 180 | }, 181 | "language_info": { 182 | "codemirror_mode": { 183 | "name": "ipython", 184 | "version": 3 185 | }, 186 | "file_extension": ".py", 187 | "mimetype": "text/x-python", 188 | "name": "python", 189 | "nbconvert_exporter": "python", 190 | "pygments_lexer": "ipython3", 191 | "version": "3.8.1" 192 | }, 193 | "orig_nbformat": 4, 194 | "vscode": { 195 | "interpreter": { 196 | "hash": "bb12d0de9674b66c629d2bafada2ec4f6e6dba6d129e54dea4badc21502d54d3" 197 | } 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 2 202 | } 203 | -------------------------------------------------------------------------------- /py/am_ET_numbers_icu.py: -------------------------------------------------------------------------------- 1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag 2 | # lang = "hi-IN-u-nu-deva" 3 | # lang = "en-IN" 4 | lang = input("Enter language tag: ") 5 | LOC = Locale.createCanonical(lang) 6 | 7 | number = 123452.54 8 | formatter = LocalizedNumberFormatter(LOC) 9 | r = formatter.formatDouble(number) 10 | print(r) 11 | # १,२३,४५२.५४ 12 | 13 | rb_formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC) 14 | r2 = rb_formatter.format(number) 15 | print(r2) 16 | # एक लाख तेईस हज़ार चार सौ बावन दशमलव पाँच चार 17 | 18 | r3 = rb_formatter.parse(r2) 19 | print(Formattable.getDouble(r3)) 20 | # 123452.54 21 | 22 | -------------------------------------------------------------------------------- /py/am_ET_numbers_icu_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/am_ET_numbers_icu_1.png -------------------------------------------------------------------------------- /py/am_ET_numbers_icu_1.py: 
-------------------------------------------------------------------------------- 1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag 2 | lang = "am-ET-u-nu-ethi" 3 | LOC = Locale.createCanonical(lang) 4 | number = 123452 5 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC) 6 | 7 | # 8 | # Spellout (in Amharic) 9 | # 10 | r = formatter.format(number) 11 | print(r) 12 | # መቶ ሁለት አስር ሦስት ሺ አራት መቶ አምስት አስር ሁለት 13 | 14 | # 15 | # Convert back 16 | # 17 | n = formatter.parse(r) 18 | print(n) 19 | # 123,452 20 | print(Formattable.getInt64(n)) 21 | # 123452 22 | 23 | 24 | -------------------------------------------------------------------------------- /py/am_ET_numbers_icu_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/am_ET_numbers_icu_2.png -------------------------------------------------------------------------------- /py/am_ET_numbers_icu_2.py: -------------------------------------------------------------------------------- 1 | from icu import Locale, RuleBasedNumberFormat, URBNFRuleSetTag 2 | lang = "am-ET-u-nu-ethi" 3 | LOC = Locale.createCanonical(lang) 4 | number = 123452 5 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, LOC) 6 | formatter.setDefaultRuleSet('%ethiopic') 7 | r = formatter.format(number) 8 | print(r) 9 | # ፲፪፼፴፬፻፶፪ 10 | 11 | # http://www.geez.org/Numerals/NumberSamples.html 12 | 13 | 14 | def toEthiopicNS(n): 15 | formatter = RuleBasedNumberFormat(URBNFRuleSetTag.NUMBERING_SYSTEM, Locale("am_ET")) 16 | formatter.setDefaultRuleSet('%ethiopic') 17 | return formatter.format(n) 18 | 19 | import pytest 20 | arabic_numbers = [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 100010000, 100100000, 100200000, 100110000, 1, 11, 111, 1111, 11111, 111111, 
1111111, 11111111, 111111111, 1111111111, 11111111111, 111111111111, 1111111111111, 1, 12, 123, 1234, 12345, 7654321, 17654321, 51615131, 15161513, 10101011, 101, 1001, 1010, 1011, 1100, 1101, 1111, 10001, 10010, 10100, 10101, 10110, 10111, 100001, 100010, 100011, 100100, 101010, 1000001, 1000101, 1000100, 1010000, 1010001, 1100001, 1010101, 101010101, 100010000, 100010100, 101010100, 3, 30, 33, 303, 3003, 3030, 3033, 3300, 3303, 3333, 30003, 30303, 300003, 303030, 3000003, 3000303, 3030003, 3300003, 3030303, 303030303, 333333333] 21 | ethiopic_numbers = ["a፩", "፲", "፻", "፲፻", "፼", "፲፼", "፻፼", "፲፻፼", "፼፼", "፲፼፼", "፻፼፼", "፲፻፼፼", "፼፼፼", "፼፩፼", "፼፲፼", "፼፳፼", "፼፲፩፼", "፩", "፲፩", "፻፲፩", "፲፩፻፲፩", "፼፲፩፻፲፩", "፲፩፼፲፩፻፲፩", "፻፲፩፼፲፩፻፲፩", "፲፩፻፲፩፼፲፩፻፲፩", "፼፲፩፻፲፩፼፲፩፻፲፩", "፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፲፩፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፼፲፩፻፲፩፼፲፩፻፲፩፼፲፩፻፲፩", "፩", "፲፪", "፻፳፫", "፲፪፻፴፬", "፼፳፫፻፵፭", "፯፻፷፭፼፵፫፻፳፩", "፲፯፻፷፭፼፵፫፻፳፩", "፶፩፻፷፩፼፶፩፻፴፩", "፲፭፻፲፮፼፲፭፻፲፫", "፲፻፲፼፲፻፲፩", "፻፩", "፲፻፩", "፲፻፲", "፲፻፲፩", "፲፩፻", "፲፩፻፩", "፲፩፻፲፩", "፼፩", "፼፲", "፼፻", "፼፻፩", "፼፻፲", "፼፻፲፩", "፲፼፩", "፲፼፲", "፲፼፲፩", "፲፼፻", "፲፼፲፻፲", "፻፼፩", "፻፼፻፩", "፻፼፻", "፻፩፼", "፻፩፼፩", "፻፲፼፩", "፻፩፼፻፩", "፼፻፩፼፻፩", "፼፩፼", "፼፩፼፻", "፼፻፩፼፻", "፫", "፴", "፴፫", "፫፻፫", "፴፻፫", "፴፻፴", "፴፻፴፫", "፴፫፻", "፴፫፻፫", "፴፫፻፴፫", "፫፼፫", "፫፼፫፻፫", "፴፼፫", "፴፼፴፻፴", "፫፻፼፫", "፫፻፼፫፻፫", "፫፻፫፼፫", "፫፻፴፼፫", "፫፻፫፼፫፻፫", "፫፼፫፻፫፼፫፻፫", "፫፼፴፫፻፴፫፼፴፫፻፴፫"] 22 | converted = list(map(toEthiopicNS, arabic_numbers)) 23 | converted == ethiopic_numbers 24 | # True 25 | 26 | def test_ethiopic_ns(l, r): 27 | converted = list(map(toEthiopicNS, l)) 28 | assert converted == r 29 | 30 | test_ethiopic_ns(arabic_numbers, ethiopic_numbers) 31 | 32 | # * [icu::RuleBasedNumberFormat Class Reference](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1RuleBasedNumberFormat.html) 33 | # * [URBNFRuleSetTag](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/namespaceicu.html#a55dbbbdd4946251c23988013e06e695e) 34 | 35 | 36 | for n in range(formatter.getNumberOfRuleSetNames()): 37 | 
print(formatter.getRuleSetName(n)) 38 | # %armenian-lower 39 | # %armenian-upper 40 | # %cyrillic-lower 41 | # %ethiopic 42 | # %georgian 43 | # %greek-lower 44 | # %greek-upper 45 | # %hebrew 46 | # # %hebrew-item 47 | # %roman-lower 48 | # %roman-upper 49 | # %tamil 50 | # %zz-default 51 | -------------------------------------------------------------------------------- /py/arabic_reshaper_example.py: -------------------------------------------------------------------------------- 1 | import arabic_reshaper 2 | from bidi.algorithm import get_display 3 | 4 | from el_internationalisation import cp, clean_presentation_forms 5 | 6 | def rtl_hack(text: str, arabic: bool = True) -> str: 7 | """Visually reorders Arabic or Hebrew script Unicode text 8 | 9 | Visually reorders Arabic or Hebrew script Unicode text. For Arabic script text, 10 | individual Unicode characters are substituting each character for its equivalent 11 | presentation form. The modules are used to overcome lack of bidirectional algorithm 12 | and complex font rendering in some modules and terminals. 13 | 14 | It is better to solutions that utilise proper bidirectional algorithm and font 15 | rendering implementations. For matplotlib use the mplcairo backend instead. For 16 | annotating images use Pillow. Both make use of libraqm. 17 | 18 | arabic_reshaper module converts Arabic characters to Arabic Presentation Forms: 19 | pip install arabic-reshaper 20 | 21 | bidi.algorithm module converts a logically ordered string to visually ordered 22 | equivalent. 
23 | pip install python-bidi 24 | 25 | Args: 26 | text (str): _description_ 27 | 28 | Returns: 29 | str: _description_ 30 | """ 31 | return get_display(arabic_reshaper.reshape(text)) if arabic == True else get_display(text) 32 | 33 | text = 'اللغة العربية رائعة' 34 | text_h = rtl_hack(text) 35 | print(text) 36 | print(cp(text)) 37 | print(text_h) 38 | print(cp(text_h)) 39 | 40 | 41 | 42 | 43 | 44 | s1 = "لا" 45 | s1_h = rtl_hack(s1) 46 | s2 = "لأ" 47 | s2_h = rtl_hack(s2) 48 | 49 | print("\n") 50 | print(s1) 51 | print(cp(s1)) 52 | print(s1_h) 53 | print(cp(s1_h)) 54 | 55 | print("\n") 56 | print(s2) 57 | print(cp(s2)) 58 | print(s2_h) 59 | print(cp(s2_h)) 60 | 61 | 62 | s3 = "עברית חדשה" 63 | s3_h = rtl_hack(s3, arabic=False) 64 | print("\n") 65 | print(s3) 66 | print(cp(s3)) 67 | print(s3_h) 68 | print(cp(s3_h)) 69 | # print(s3_h == s3[::-1]) 70 | 71 | 72 | # Note s3[::-1] is used for reversing strings, 73 | # but for languages that use combining marks, 74 | # it is better to reverse grapheme clusters: 75 | # 76 | # from grapheme import graphemes 77 | # print(s3_h == "".join(list(graphemes(s3))[::-1])) 78 | 79 | from grapheme import graphemes 80 | def reverse_string(text: str, use_graphemes: bool = False) -> str: 81 | return "".join(list(graphemes(text))[::-1]) if use_graphemes else text[::-1] 82 | 83 | import regex as re 84 | def reverse_string_regex(text: str, use_graphemes: bool = False) -> str: 85 | return "".join(re.findall(r'\X', text)[::-1]) if use_graphemes else text[::-1] 86 | 87 | print("---") 88 | # print(s3_h == "".join(list(graphemes(s3))[::-1])) 89 | # print("\n") 90 | print(text_h == text[::-1]) 91 | print(clean_presentation_forms(text_h) == text[::-1]) 92 | # print(clean_presentation_forms(text_h) == "".join(list(graphemes(text))[::-1])) 93 | 94 | 95 | print(clean_presentation_forms(text_h) == reverse_string(text)) 96 | print(clean_presentation_forms(text_h) == reverse_string(text, use_graphemes=True)) 97 | 98 | 
print(clean_presentation_forms(text_h) == reverse_string_regex(text)) 99 | print(clean_presentation_forms(text_h) == reverse_string_regex(text, use_graphemes=True)) -------------------------------------------------------------------------------- /py/hi_IN_numbers_icu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/hi_IN_numbers_icu.png -------------------------------------------------------------------------------- /py/hi_IN_numbers_icu.py: -------------------------------------------------------------------------------- 1 | from icu import Locale, LocalizedNumberFormatter, Formattable, RuleBasedNumberFormat, URBNFRuleSetTag 2 | # lang = "hi-IN-u-nu-deva" 3 | # lang = "en-IN" 4 | lang = input("Enter language tag: ") 5 | LOC = Locale.createCanonical(lang) 6 | 7 | number = 123452.54 8 | formatter = LocalizedNumberFormatter(LOC) 9 | r = formatter.formatDouble(number) 10 | print(r) 11 | # १,२३,४५२.५४ 12 | 13 | rb_formatter = RuleBasedNumberFormat(URBNFRuleSetTag.SPELLOUT, LOC) 14 | r2 = rb_formatter.format(number) 15 | print(r2) 16 | # एक लाख तेईस हज़ार चार सौ बावन दशमलव पाँच चार 17 | 18 | r3 = rb_formatter.parse(r2) 19 | print(Formattable.getDouble(r3)) 20 | # 123452.54 21 | 22 | -------------------------------------------------------------------------------- /py/matplotlib_kurdish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/matplotlib_kurdish.png -------------------------------------------------------------------------------- /py/matplotlib_kurdish.py: -------------------------------------------------------------------------------- 1 | # 2 | # matplotlib_kurdish.py 3 | # 4 | # This script will read in and process a Sorani Kurdish TSV file. 
5 | # 6 | # mplcairo supports a number of backends available. 7 | # 8 | # If you wish to save plot as an image, rather than display plot 9 | # use module://mplcairo.base 10 | # 11 | # Depending on your OS and system configuration a number of 12 | # backends that render to widgets are available: 13 | # * module://mplcairo.gtk (used below for non-macOS installs) 14 | # * module://mplcairo.gtk_native 15 | # * module://mplcairo.qt 16 | # * module://mplcairo.tk 17 | # * module://mplcairo.wx 18 | # * module://mplcairo.macosx (used below for macOS) 19 | 20 | import pandas as pd 21 | import locale, platform 22 | import gi 23 | import mplcairo 24 | import matplotlib as mpl 25 | if platform.system() == "Darwin": 26 | mpl.use("module://mplcairo.macosx") 27 | else: 28 | gi.require_version("Gtk", "3.0") 29 | mpl.use("module://mplcairo.gtk") 30 | # mpl.use("module://mplcairo.qt") 31 | import matplotlib.pyplot as plt 32 | import matplotlib.ticker as ticker 33 | import seaborn as sns 34 | import unicodedata as ud, regex as re 35 | 36 | # Convert non-Western Arabic digits to Western Arabic digits 37 | def convert_digits(s, sep = (",", ".")): 38 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$') 39 | tsep, dsep = sep 40 | if nd.match(s): 41 | s = s.replace(tsep, "") 42 | s = ''.join([str(ud.decimal(c, c)) for c in s]) 43 | if dsep in s: 44 | return float(s.replace(dsep, ".")) if dsep != "." 
else float(s) 45 | return int(s) 46 | return s 47 | 48 | # Specify grouping and decimal seperators using in data 49 | seps = ("\u066C", "\u066B") 50 | # convert entries to hyphen to Eastern Arabic zero, and pass to convert_digits() 51 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps) 52 | 53 | # Covert Western Arabic digits to Eastern Arabic digits for tick labels 54 | def convert_to_sorani_ns(n, p=None, scale=None): 55 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8") 56 | decimal_places = 2 57 | n = n * scale if scale else n 58 | format_string = '%0.' + str(decimal_places) + 'f' if type(n) == float else '%d' 59 | n = locale.format_string(format_string, n, grouping=True, monetary=True) 60 | n = n.replace(",", "ṯ").replace(".", "ḏ") 61 | sep = ["\u066C", "\u066B"] 62 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩") 63 | locale.setlocale(locale.LC_ALL, "") 64 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1]) 65 | 66 | # import data 67 | import pandas as pd 68 | conv = { 69 | 'سووریا': digitsconv, 70 | 'عێراق': digitsconv, 71 | 'ئێران': digitsconv, 72 | 'تورکیا': digitsconv, 73 | 'جیھانی': digitsconv 74 | } 75 | df = pd.read_table("../data/demographics.tsv", converters=conv) 76 | print(df) 77 | 78 | # get sum of each column 79 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"] 80 | total_df = df[col_list].sum(axis=0) 81 | print(total_df) 82 | 83 | 84 | 85 | 86 | fig, axes = plt.subplots(1,2) 87 | plt.rcParams.update({'font.family':'Vazirmatn'}) 88 | 89 | # axes[0] - subplot with default (LTR) layout 90 | axes[0].bar(total_df.index, total_df.values, color='royalblue', alpha=0.7) 91 | axes[0].grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7) 92 | axes[0].set_xlabel("ناوچە", size=12) 93 | axes[0].set_ylabel("ڕێژەی دانیشتووان (بە ملیۆن)", size=12) 94 | axes[0].set_title('ڕێژەی دانیشتووانی کورد', size=15) 95 | 96 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001)) 97 | 
axes[0].get_yaxis().set_major_formatter(ns_formatter) 98 | 99 | # axes[1] - subplot with RTL layout 100 | axes[1].bar(total_df.index, total_df.values, color='royalblue', alpha=0.7) 101 | axes[1].grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7) 102 | 103 | # move y axis and associated label to right of plot 104 | axes[1].yaxis.tick_right() 105 | axes[1].yaxis.set_label_position("right") 106 | # invert x-axis 107 | #plt.gca().invert_xaxis() 108 | axes[1].invert_xaxis() 109 | axes[1].set_xlabel("ناوچە", size=12) 110 | axes[1].set_ylabel("ڕێژەی دانیشتووان (بە ملیۆن)", size=12, labelpad=10) 111 | axes[1].set_title('ڕێژەی دانیشتووانی کورد', size=15) 112 | axes[1].get_yaxis().set_major_formatter(ns_formatter) 113 | 114 | # block=True required for running script in CLI when outputting canvas to widget. 115 | plt.tight_layout() 116 | plt.show(block=True) -------------------------------------------------------------------------------- /py/pandas_plot_kurdish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/pandas_plot_kurdish.png -------------------------------------------------------------------------------- /py/pandas_plot_kurdish.py: -------------------------------------------------------------------------------- 1 | # 2 | # matplotlib_kurdish.py 3 | # 4 | # This script will read in and process a Sorani Kurdish TSV file. 5 | # 6 | # mplcairo supports a number of backends available. 
7 | # 8 | # If you wish to save plot as an image, rather than display plot 9 | # use module://mplcairo.base 10 | # 11 | # Depending on your OS and system configuration a number of 12 | # backends that render to widgets are available: 13 | # * module://mplcairo.gtk (used below for non-macOS installs) 14 | # * module://mplcairo.gtk_native 15 | # * module://mplcairo.qt 16 | # * module://mplcairo.tk 17 | # * module://mplcairo.wx 18 | # * module://mplcairo.macosx (used below for macOS) 19 | 20 | import pandas as pd 21 | import locale, platform 22 | import gi 23 | import mplcairo 24 | import matplotlib as mpl 25 | if platform.system() == "Darwin": 26 | mpl.use("module://mplcairo.macosx") 27 | else: 28 | gi.require_version("Gtk", "3.0") 29 | mpl.use("module://mplcairo.gtk") 30 | # mpl.use("module://mplcairo.qt") 31 | import matplotlib.pyplot as plt 32 | import matplotlib.ticker as ticker 33 | import seaborn as sns 34 | import unicodedata as ud, regex as re 35 | 36 | # Convert non-Western Arabic digits to Western Arabic digits 37 | def convert_digits(s, sep = (",", ".")): 38 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$') 39 | tsep, dsep = sep 40 | if nd.match(s): 41 | s = s.replace(tsep, "") 42 | s = ''.join([str(ud.decimal(c, c)) for c in s]) 43 | if dsep in s: 44 | return float(s.replace(dsep, ".")) if dsep != "." else float(s) 45 | return int(s) 46 | return s 47 | 48 | # Specify grouping and decimal seperators using in data 49 | seps = ("\u066C", "\u066B") 50 | # convert entries to hyphen to Eastern Arabic zero, and pass to convert_digits() 51 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps) 52 | 53 | # Covert Western Arabic digits to Eastern Arabic digits for tick labels 54 | def convert_to_sorani_ns(n, p=None, scale=None): 55 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8") 56 | decimal_places = 2 57 | n = n * scale if scale else n 58 | format_string = '%0.' 
+ str(decimal_places) + 'f' if type(n) == float else '%d' 59 | n = locale.format_string(format_string, n, grouping=True, monetary=True) 60 | n = n.replace(",", "ṯ").replace(".", "ḏ") 61 | sep = ["\u066C", "\u066B"] 62 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩") 63 | locale.setlocale(locale.LC_ALL, "") 64 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1]) 65 | 66 | # import data 67 | import pandas as pd 68 | conv = { 69 | 'سووریا': digitsconv, 70 | 'عێراق': digitsconv, 71 | 'ئێران': digitsconv, 72 | 'تورکیا': digitsconv, 73 | 'جیھانی': digitsconv 74 | } 75 | df = pd.read_table("../data/demographics.tsv", converters=conv) 76 | print(df) 77 | 78 | # get sum of each column 79 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"] 80 | total_df = df[col_list].sum(axis=0) 81 | print(total_df) 82 | 83 | plt.figure() 84 | plt.rcParams.update({'font.family':'Vazirmatn'}) 85 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001)) 86 | 87 | plt.subplot(1, 2, 1) 88 | ax1 = total_df.plot(kind="bar", title='ڕێژەی دانیشتووانی کورد', xlabel="ناوچە", ylabel="ڕێژەی دانیشتووان (بە ملیۆن)", rot=0) 89 | ax1.get_yaxis().set_major_formatter(ns_formatter) 90 | 91 | plt.subplot(1, 2, 2) 92 | ax2 = total_df.plot(kind="bar", title='ڕێژەی دانیشتووانی کورد', xlabel="ناوچە", ylabel="ڕێژەی دانیشتووان (بە ملیۆن)", rot=0) 93 | ax2.get_yaxis().set_major_formatter(ns_formatter) 94 | # move y axis and associated label to right of plot 95 | ax2.yaxis.tick_right() 96 | ax2.yaxis.set_label_position("right") 97 | # invert x-axis 98 | #plt.gca().invert_xaxis() 99 | ax2.invert_xaxis() 100 | 101 | plt.tight_layout() 102 | plt.show(block=True) -------------------------------------------------------------------------------- /py/pyuca_test.py: -------------------------------------------------------------------------------- 1 | import pyuca 2 | test_list = ["₨", "Z", "ز", "z", "ر", "٨", "R", "﷼"] 3 | ducet_rules = "../rules/collation/allkeys_DUCET.txt" 4 | 
cldr_rules = "../rules/collation/allkeys_CLDR.txt" 5 | ducet_collator = pyuca.Collator(ducet_rules) 6 | cldr_collator = pyuca.Collator(cldr_rules) 7 | 8 | sorted_default = sorted(test_list) 9 | print(sorted_default) 10 | sorted_ducet = sorted(test_list, key=ducet_collator.sort_key) 11 | print(sorted_ducet) 12 | sorted_cldr = sorted(test_list, key=cldr_collator.sort_key) 13 | print(sorted_cldr) 14 | 15 | from icu import Locale, Collator 16 | loc = Locale.getRoot() 17 | collator = Collator.createInstance(loc) 18 | sorted_icu_root = sorted(test_list, key=collator.getSortKey) 19 | print(sorted_icu_root) 20 | 21 | print(sorted_icu_root == sorted_cldr) -------------------------------------------------------------------------------- /py/seaborn_kurdish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/seaborn_kurdish.png -------------------------------------------------------------------------------- /py/seaborn_kurdish.py: -------------------------------------------------------------------------------- 1 | # 2 | # seaborn_kurdish.py 3 | # 4 | # This script will read in and process a Sorani Kurdish TSV file. 5 | # Two plots will be generated (a LTR layout and a RTL layout). 6 | # 7 | # mplcairo supports a number of backends available. 
8 | # 9 | # If you wish to save plot as an image, rather than display plot 10 | # use module://mplcairo.base 11 | # 12 | # Depending on your OS and system configuration a number of 13 | # backends that render to widgets are available: 14 | # * module://mplcairo.gtk (used below for non-macOS installs) 15 | # * module://mplcairo.gtk_native 16 | # * module://mplcairo.qt 17 | # * module://mplcairo.tk 18 | # * module://mplcairo.wx 19 | # * module://mplcairo.macosx (used below for macOS) 20 | 21 | import pandas as pd 22 | import locale, platform 23 | import gi 24 | import mplcairo 25 | import matplotlib as mpl 26 | if platform.system() == "Darwin": 27 | mpl.use("module://mplcairo.macosx") 28 | else: 29 | gi.require_version("Gtk", "3.0") 30 | mpl.use("module://mplcairo.gtk") 31 | # mpl.use("module://mplcairo.qt") 32 | import matplotlib.pyplot as plt 33 | import matplotlib.ticker as ticker 34 | import seaborn as sns 35 | import unicodedata as ud, regex as re 36 | 37 | # Convert non-Western Arabic digits to Western Arabic digits 38 | def convert_digits(s, sep = (",", ".")): 39 | nd = re.compile(r'^-?\p{Nd}[,.\u066B\u066C\u0020\u2009\u202F\p{Nd}]*$') 40 | tsep, dsep = sep 41 | if nd.match(s): 42 | s = s.replace(tsep, "") 43 | s = ''.join([str(ud.decimal(c, c)) for c in s]) 44 | if dsep in s: 45 | return float(s.replace(dsep, ".")) if dsep != "." else float(s) 46 | return int(s) 47 | return s 48 | 49 | # Specify grouping and decimal seperators using in data 50 | seps = ("\u066C", "\u066B") 51 | # convert entries to hyphen to Eastern Arabic zero, and pass to convert_digits() 52 | digitsconv = lambda x: convert_digits(x.replace("-", "٠"), sep = seps) 53 | 54 | # Covert Western Arabic digits to Eastern Arabic digits for tick labels 55 | def convert_to_sorani_ns(n, p=None, scale=None): 56 | locale.setlocale(locale.LC_ALL, "en_US.UTF-8") 57 | decimal_places = 2 58 | n = n * scale if scale else n 59 | format_string = '%0.' 
+ str(decimal_places) + 'f' if type(n) == float else '%d' 60 | n = locale.format_string(format_string, n, grouping=True, monetary=True) 61 | n = n.replace(",", "ṯ").replace(".", "ḏ") 62 | sep = ["\u066C", "\u066B"] 63 | t = n.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩") 64 | locale.setlocale(locale.LC_ALL, "") 65 | return n.translate(t).replace("ṯ", sep[0] ).replace("ḏ", sep[1]) 66 | 67 | # import data 68 | import pandas as pd 69 | conv = { 70 | 'سووریا': digitsconv, 71 | 'عێراق': digitsconv, 72 | 'ئێران': digitsconv, 73 | 'تورکیا': digitsconv, 74 | 'جیھانی': digitsconv 75 | } 76 | df = pd.read_table("../data/demographics.tsv", converters=conv) 77 | print(df) 78 | 79 | # get sum of each column 80 | col_list=["تورکیا" ,"ئێران" ,"عێراق" ,"سووریا"] 81 | total_df = df[col_list].sum(axis=0) 82 | print(total_df) 83 | 84 | # Plot data. First subplot (axes[0]) is default layout, second subplot (axes[1]) is an RTL layout 85 | sns.set_style('darkgrid') 86 | sns.set_context({"font.family": "Vazirmatn"}) 87 | fig, axes = plt.subplots(1,2) 88 | sns.barplot(x=total_df.index, y=total_df.values, ax=axes[0]) 89 | sns.barplot(x=total_df.index, y=total_df.values, ax=axes[1]) 90 | 91 | # set common labels for X and Y axes. 
92 | plt.setp(axes, xlabel="ناوچە") 93 | plt.setp(axes, ylabel="ڕێژەی دانیشتووان (بە ملیۆن)") 94 | # Set single title for all subplots 95 | fig.suptitle('ڕێژەی دانیشتووانی کورد') 96 | 97 | # Define and apply conversion to tick labels for both axes 98 | ns_formatter = ticker.FuncFormatter(lambda x, p: convert_to_sorani_ns(x, p, scale=0.000001)) 99 | axes[0].get_yaxis().set_major_formatter(ns_formatter) 100 | axes[1].get_yaxis().set_major_formatter(ns_formatter) 101 | 102 | # move y axis and associated label to right of axes[1] 103 | axes[1].yaxis.tick_right() 104 | axes[1].yaxis.set_label_position("right") 105 | # invert x-axis for axes[1] 106 | #plt.gca().invert_xaxis() 107 | axes[1].invert_xaxis() 108 | 109 | # block=True required for running script in CLI when outputting canvas to widget. 110 | plt.show(block=True) -------------------------------------------------------------------------------- /py/wordcloud_kurdish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enabling-languages/python-i18n/5580da3caf9f1cf60293bfe7f7726c8538e40c45/py/wordcloud_kurdish.png -------------------------------------------------------------------------------- /py/wordcloud_kurdish.py: -------------------------------------------------------------------------------- 1 | import gi, platform, os 2 | import mplcairo 3 | import matplotlib as mpl 4 | if platform.system() == "Darwin": 5 | mpl.use("module://mplcairo.macosx") 6 | else: 7 | gi.require_version("Gtk", "3.0") 8 | mpl.use("module://mplcairo.gtk") 9 | # mpl.use("module://mplcairo.qt") 10 | import matplotlib.pyplot as plt 11 | from wordcloud import WordCloud 12 | 13 | # Stopword list from klpt (Kurdish Language Processing Toolkit) 14 | # Available stopword lists: Sorani (Arabic) and Kurmanji (Latin) 15 | def get_kurdish_stopwords(dialect, script): 16 | from urllib.request import urlopen 17 | import json 18 | url = 
"https://raw.githubusercontent.com/sinaahmadi/klpt/master/klpt/data/stopwords.json" 19 | response = urlopen(url) 20 | data_json = json.loads(response.read()) 21 | return set(data_json[dialect][script]) 22 | 23 | ckb_stopwords = get_kurdish_stopwords("Sorani", "Arabic") 24 | text = """ 25 | زمانی کوردی 26 | لە ئینسایکڵۆپیدیای ئازادی ویکیپیدیاوە 27 | ئەم وتارە سەبارەت بە زمانی کوردی نووسراوە. بۆ شاعیرە کوردەکە، بڕوانە کوردی (شاعیر). بۆ وتارە ھاوشێوەکان، بڕوانە کوردی (ڕوونکردنەوە). 28 | زمانی کوردی (بە کرمانجی، بە سۆرانی: زمانی کوردی، بە کەڵهوڕی: زوان کوردی، بە لەکی: زوۆن کوردی، بە زازاکی، بە ھەورامی: زوانو کوردی) زمانێکە کە خەڵکی کورد قسەی پێدەکەن. لە ڕووی بنەماڵەوە بەشێکە لە زمانە ھیندوئەورووپایییەکان. ئەم زمانە لە زمانی کەڤناری مادی کەوتووەتەوە. زمانی کوردی لە نێوان زمانە ئێرانییەکاندا لە بواری پرژماربوونی ئاخێوەران سێیەمین زمانە و دەکەوێتە دوای زمانەکانی فارسی و پەشتۆ. 29 | شێوەزارەکانی کوردی 30 | وتاری سەرەکی: شێوەزارەکانی زمانی کوردی 31 | زمانی کوردی چەند شێوەزارێکی سەرەکی ھەیە کە جیاوازیی زۆریان ھەیە و زمانناسەکان لە سەر چۆنیەتی جیاکردنەوەی ئەم شێوەزارانە یەکدەنگ نین و زۆرێک لە زمانناسەکان باوەڕییان بە ماڵباتی زمانگەلی کوردی ھەیە. یانی کورمانجیی باکووری و گۆرانی، بە پێی یاسا و ڕێسای زمانناسی و زمانەوانییەوە، دو زمانی سەربەخۆی کوردینە، نەک دو شێوەزار. بەڵام زۆربەی ئەو کەسانەی زمانی(زمانەکانی) کوردییان دابەش کردووە، بەم چوار دەستەیە بووە 32 | کوردیی باکووری 33 | کوردیی ناوەندی 34 | کوردیی باشووری 35 | گۆرانی-زازایی 36 | ھەندێک لە زمانناسان، لوڕیش وەک شێوەزارێکی زمانی کوردی پۆلبەند دەکەن. ئەگەر چی لوڕی ژمارەیەکی زۆری وشەی کوردی تێدایە، بەڵام ھێشتاش لێکۆلینەوەیەکی ئەوتۆ لە سەر لوڕی لە بەر دەستدا نییە. 37 | ئەلفوبێی کوردی 38 | وتار سەرەکییەکان: ئەلفوبێکانی کوردی و ئەلفوبێی عەرەبیی زمانی کوردی 39 | بەھۆی ئەوەی کە کوردەکان لە ژێر دەسەڵاتی عوسمانی و ئێران بوون و ئەلفوبێی فەرمیی ئەو دوو وڵاتە ئەلفوبێی عەرەبی بوو، کوردەکانیش تا پێش سییەکان تەنیا ئەلفوبێی عەرەبییان بۆ نووسینی کوردی بەکار دەھێنا. 
لە تورکیا، لە دوای بە فەرمیکردنی ئەلفوبێی لاتینی بۆ زمانی تورکی، جەلادەت عەلی بەدرخان لە ساڵی ١٩٣٢ ئەلفوبێیەکی لاتینیی بۆ زمانی کوردی داھێنا کە ئێستا بە ناوی "ئەلفوبێی ھاوار" یان "بەدرخان" دەناسرێت. 40 | """ 41 | 42 | font_file = os.path.expanduser("~/.local/share/fonts/fontamin/TrueType/Estedad/Estedad_Regular.ttf") 43 | 44 | word_cloud = WordCloud(font_path=font_file, collocations = False, background_color = 'white', stopwords=ckb_stopwords).generate(text) 45 | plt.imshow(word_cloud, interpolation='bilinear') 46 | plt.axis("off") 47 | plt.show(block=True) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gcld3==3.0.13 2 | grapheme==0.6.0 3 | LaoNLP==0.2.dev5 4 | nltk>=3.6.4 5 | pandas==1.1.3 6 | PyICU==2.7.4 7 | pyidaungsu==0.0.9 8 | pythainlp==2.3.1 9 | python-myanmar==1.10.0 10 | pyuca==1.2 11 | regex==2020.4.4 12 | tangled-up-in-unicode==0.0.6 13 | unicodedata2==13.0.0.post2 14 | unicodedataplus==13.0.0.post2 15 | -------------------------------------------------------------------------------- /rules/collation/README.md: -------------------------------------------------------------------------------- 1 | # Collation rules 2 | 3 | Unicode 15.0.0 \ 4 | CLDR v41 5 | 6 | __Collation data:__ 7 | 8 | * [allkeys_CLDR.txt](https://github.com/unicode-org/cldr/blob/main/common/uca/allkeys_CLDR.txt) 9 | * [allkeys_DUCET.txt](https://www.unicode.org/Public/UCA/latest/allkeys.txt) 10 | * [CLDR collation rules per locale](https://github.com/unicode-org/cldr/tree/release-42-beta2/common/collation) 11 | 12 | 13 | __Other links:__ 14 | 15 | * [CLDR versions](https://cldr.unicode.org/index/downloads) 16 | * [CLDR (GitHub)](https://github.com/unicode-org/cldr) 17 | * [CLDR development version](https://cldr.unicode.org/index/downloads/dev) 18 | * [UCA data - latest](https://www.unicode.org/Public/UCA/latest/) 19 | * [UCD data and charts - 
latest](https://www.unicode.org/Public/UCD/latest/) 20 | * [Collator demo](https://icu4c-demos.unicode.org/icu-bin/collation.html) -------------------------------------------------------------------------------- /rules/collation/cldr/ckb.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 18 | 19 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /rules/collation/cldr/ckb_IQ.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /rules/collation/cldr/ckb_IR.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /rules/collation/collation_rules.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | # 3 | # Collation _rules 4 | # 5 | ############################### 6 | 7 | # Akan (ak, fat, twi, wss) 8 | ak_rules = fat_rules = twi_rules = wss_rules = ( 9 | "&E<ɛ<<<Ɛ" 10 | "&O<ɔ<<<Ɔ" 11 | ) 12 | 13 | # Dinka (din, dip, diw, dib, dks, dik) 14 | din_rules = dip_rules = diw_rules = dib_rules = dks_rules = dik_rules = ( 15 | "[normalization on]" 16 | "&A< from "" % Double ARABIC LETTER WAW 60 | 61 | reorder-after % ARABIC LETTER REH 62 | % ARABIC LETTER REH WITH SMALL V BELOW 63 | 64 | reorder-after % ARABIC LETTER WAW 65 | % Double ARABIC LETTER WAW 66 | 67 | reorder-end 68 | 69 | END LC_COLLATE 70 | 71 | LC_MONETARY 72 | copy "ckb_IQ" 73 | END LC_MONETARY 74 | 75 | LC_NUMERIC 76 | copy "ckb_IQ" 77 | END LC_NUMERIC 78 | 79 | LC_TIME 80 | copy "ckb_IQ" 81 | END LC_TIME 82 | 83 | LC_MESSAGES 84 | copy 
"ckb_IQ" 85 | END LC_MESSAGES 86 | 87 | LC_PAPER 88 | copy "ckb_IQ" 89 | END LC_PAPER 90 | 91 | LC_NAME 92 | copy "ckb_IQ" 93 | END LC_NAME 94 | 95 | LC_ADDRESS 96 | copy "ckb_IQ" 97 | END LC_ADDRESS 98 | 99 | LC_TELEPHONE 100 | copy "ckb_IQ" 101 | END LC_TELEPHONE 102 | 103 | LC_MEASUREMENT 104 | copy "ckb_IQ" 105 | END LC_MEASUREMENT -------------------------------------------------------------------------------- /rules/collation/glibc/en_SS: -------------------------------------------------------------------------------- 1 | comment_char % 2 | escape_char / 3 | 4 | % This file is part of the GNU C Library and contains locale data. 5 | % The Free Software Foundation does not claim any copyright interest 6 | % in the locale data contained in this file. The foregoing does not 7 | % affect the license of the GNU C Library as a whole. It does not 8 | % exempt you from the conditions of the license if your use would 9 | % otherwise be governed by that license. 10 | 11 | LC_IDENTIFICATION 12 | title "English locale for South Sudan" 13 | source "CLDR" 14 | address "" 15 | contact "Andjc" 16 | email "" 17 | tel "" 18 | fax "" 19 | language "English" 20 | territory "South Sudan" 21 | revision "1.0" 22 | date "2022-10-13" 23 | 24 | category "i18n:2012";LC_IDENTIFICATION 25 | category "i18n:2012";LC_CTYPE 26 | category "i18n:2012";LC_COLLATE 27 | category "i18n:2012";LC_TIME 28 | category "i18n:2012";LC_NUMERIC 29 | category "i18n:2012";LC_MONETARY 30 | category "i18n:2012";LC_MESSAGES 31 | category "i18n:2012";LC_PAPER 32 | category "i18n:2012";LC_NAME 33 | category "i18n:2012";LC_ADDRESS 34 | category "i18n:2012";LC_TELEPHONE 35 | category "i18n:2012";LC_MEASUREMENT 36 | END LC_IDENTIFICATION 37 | 38 | LC_CTYPE 39 | copy "i18n" 40 | 41 | translit_start 42 | include "translit_combining";"" 43 | translit_end 44 | END LC_CTYPE 45 | 46 | LC_COLLATE 47 | % Copy the template from ISO/IEC 14651 48 | copy "iso14651_t1" 49 | END LC_COLLATE 50 | 51 | LC_MONETARY 52 | 
int_curr_symbol "SSP " 53 | currency_symbol "" 54 | mon_decimal_point "." 55 | mon_thousands_sep "," 56 | mon_grouping 3;3 57 | positive_sign "" 58 | negative_sign "-" 59 | int_frac_digits 2 60 | frac_digits 2 61 | p_cs_precedes 1 62 | p_sep_by_space 0 63 | n_cs_precedes 1 64 | n_sep_by_space 0 65 | p_sign_posn 1 66 | n_sign_posn 1 67 | END LC_MONETARY 68 | 69 | LC_NUMERIC 70 | decimal_point "." 71 | thousands_sep "," 72 | grouping 3;3 73 | END LC_NUMERIC 74 | 75 | LC_TIME 76 | abday "Sun";"Mon";"Tue";"Wed";"Thu";"Fri";"Sat" 77 | day "Sunday";/ 78 | "Monday";/ 79 | "Tuesday";/ 80 | "Wednesday";/ 81 | "Thursday";/ 82 | "Friday";/ 83 | "Saturday" 84 | abmon "Jan";"Feb";/ 85 | "Mar";"Apr";/ 86 | "May";"Jun";/ 87 | "Jul";"Aug";/ 88 | "Sep";"Oct";/ 89 | "Nov";"Dec" 90 | mon "January";/ 91 | "February";/ 92 | "March";/ 93 | "April";/ 94 | "May";/ 95 | "June";/ 96 | "July";/ 97 | "August";/ 98 | "September";/ 99 | "October";/ 100 | "November";/ 101 | "December" 102 | d_t_fmt "%a %d %b %Y %T %Z" 103 | d_fmt "%d//%m//%y" 104 | t_fmt "%T" 105 | am_pm "am";"pm" 106 | t_fmt_ampm "%l:%M:%S %P %Z" 107 | date_fmt "%a %e %b %H:%M:%S %Z %Y" 108 | week 7;19971130;4 109 | first_weekday 2 110 | END LC_TIME 111 | 112 | LC_MESSAGES 113 | copy "en_US" 114 | END LC_MESSAGES 115 | 116 | LC_PAPER 117 | copy "i18n" 118 | END LC_PAPER 119 | 120 | LC_TELEPHONE 121 | tel_int_fmt "+%c %a %l" 122 | tel_dom_fmt "%A %l" 123 | int_select "00" % https://en.wikipedia.org/wiki/International_call_prefix, https://en.wikipedia.org/wiki/List_of_international_call_prefixes 124 | int_prefix "211" % https://en.wikipedia.org/wiki/List_of_country_calling_codes 125 | END LC_TELEPHONE 126 | 127 | LC_MEASUREMENT 128 | copy "i18n" 129 | END LC_MEASUREMENT 130 | 131 | LC_NAME 132 | copy "en_US" 133 | END LC_NAME 134 | 135 | LC_ADDRESS 136 | postal_fmt "%f%N%a%N%d%N%b%N%s %h %e %r%N%z %T%N%c%N" 137 | country_name "South Sudan" 138 | country_ab2 "SS" % https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes 139 | 
country_ab3 "SSD" 140 | country_num 728 141 | country_car "" 142 | lang_name "English" 143 | lang_ab "en" 144 | lang_term "eng" 145 | lang_lib "eng" 146 | END LC_ADDRESS 147 | 148 | % https://man7.org/linux/man-pages/man5/locale.5.html 149 | % https://metacpan.org/dist/DateTime-Locale/view/lib/DateTime/Locale/en_SS.pod 150 | % https://www.localeplanet.com/icu/en-SS/index.html 151 | % https://www.iana.org/time-zones - https://www.timeanddate.com/worldclock/south-sudan/juba - Central African Time (CAT) - Africa/Juba - UTC+2 -------------------------------------------------------------------------------- /rules/collation/icu/ckb.txt: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | // Generated using tools/cldr/cldr-to-icu/build-icu-data.xml 4 | ckb{ 5 | collations{ 6 | standard{ 7 | Sequence{ 8 | "[normalization on]" 9 | "[reorder Arab]" 10 | "&ر < ڕ" 11 | "&و < وو" 12 | } 13 | Version{"42"} 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /rules/collation/icu/ckb_IQ.txt: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 
2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | // Generated using tools/cldr/cldr-to-icu/build-icu-data.xml 4 | ckb_IQ{ 5 | collations{ 6 | standard{ 7 | Sequence{"[import ckb]"} 8 | Version{"42"} 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /rules/collation/sorani_alphabet.tsv: -------------------------------------------------------------------------------- 1 | Order Character Codepoint 2 | 1 ئ U+0626 3 | 2 ا U+0627 4 | 3 ب U+0628 5 | 4 پ U+067E 6 | 5 ت U+062A 7 | 6 ج U+062C 8 | 7 چ U+0686 9 | 8 ح U+062D 10 | 9 خ U+062E 11 | 10 د U+062F 12 | 11 ر U+0631 13 | 12 ڕ U+0695 14 | 13 ز U+0632 15 | 14 ژ U+0698 16 | 15 س U+0633 17 | 16 ش U+0634 18 | 17 ع U+0639 19 | 18 غ U+063A 20 | 19 ف U+0641 21 | 20 ڤ U+06A4 22 | 21 ق U+0642 23 | 22 ک U+06A9 24 | 23 گ U+06AF 25 | 24 ل U+0644 26 | 25 ڵ U+06B5 27 | 26 م U+0645 28 | 27 ن U+0646 29 | 28 ه U+0647 30 | 29 ە U+06D5 31 | 30 و U+0648 32 | 32 وو U+0648 U+0648 33 | 31 ۆ U+06C6 34 | 33 ی U+06CC 35 | 34 ێ U+06CE -------------------------------------------------------------------------------- /rules/collation/temp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import regex as re\n", 10 | "text = \"ရန်ကုန်ကွန်ပျူတာတက္ကသိုလ်\"" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 7, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "Number of graphemes: 14\n", 23 | "Graphemes: ['ရ', 'န်', 'ကု', 'န်', 'ကွ', 'န်', 'ပျူ', 'တ', 'ာ', 'တ', 'က္', 'က', 'သို', 'လ်']\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "# split string into extended grapheme clusters\n", 29 | "graphemes = re.findall(r'\\X', text)\n", 30 | "print(f'Number of graphemes: {len(graphemes)}\\nGraphemes: {graphemes}')" 31 | ] 32 | 
}, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 8, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "Number of syllables: 7\n", 43 | "Syllables: ['န်', 'ုန်', 'ွန်', 'ျူ', 'ာ', 'က္က', 'ိုလ်']\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "# syllable segmentation with regex\n", 49 | "pattern = r'(?:(?= 60: 183 | formatter = LocalizedNumberFormatter(loc) 184 | r = formatter.formatDouble(digit) if isinstance(digit, float) else formatter.formatInt(digit) 185 | else: 186 | formatter = NumberFormat.createInstance(loc) 187 | r = formatter.format(digit) 188 | return r 189 | 190 | -------------------------------------------------------------------------------- /snippets/data_cleaning.py: -------------------------------------------------------------------------------- 1 | #################### 2 | # 3 | # Data cleaning 4 | # © Enabling Languages 2022 5 | # Released under the MIT License. 6 | # 7 | #################### 8 | 9 | # import unicodedata as ud 10 | import unicodedataplus as ud 11 | import regex as re 12 | from icu import UnicodeString, Locale, Normalizer2, UNormalizationMode2 13 | 14 | 15 | # 16 | # Unicode normalisation 17 | # Simple wrappers for Unicode normalisation 18 | # 19 | 20 | def NFD(s, engine="ud"): 21 | if engine == "icu": 22 | normalizer = Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE) 23 | return normalizer.normalize(s) 24 | return ud.normalize('NFD', s) 25 | 26 | def NFKD(s, engine="ud"): 27 | if engine == "icu": 28 | normalizer = Normalizer2.getInstance(None, "nfkc", UNormalizationMode2.DECOMPOSE) 29 | return normalizer.normalize(s) 30 | return ud.normalize('NFKD', s) 31 | 32 | def NFC(s, engine="ud"): 33 | if engine == "icu": 34 | normalizer = Normalizer2.getInstance(None, "nfc", UNormalizationMode2.COMPOSE) 35 | return normalizer.normalize(s) 36 | return ud.normalize('NFC', s) 37 | 38 | def NFKC(s, engine="ud"): 39 | if engine == "icu": 40 | 
normalizer = Normalizer2.getInstance(None, "nfkc", UNormalizationMode2.COMPOSE) 41 | return normalizer.normalize(s) 42 | return ud.normalize('NFKC', s) 43 | 44 | # 45 | # Clean presentation forms 46 | # 47 | # For Latin and Armenian scripts, use either folding=True or folding=False (default), 48 | # while for Arabic and Hebrew scripts, use folding=False. 49 | # 50 | 51 | def has_presentation_forms(text): 52 | pattern = r'([\p{InAlphabetic_Presentation_Forms}\p{InArabic_Presentation_Forms-A}\p{InArabic_Presentation_Forms-B}]+)' 53 | return bool(re.findall(pattern, text)) 54 | 55 | def clean_presentation_forms(text, folding=False): 56 | def clean_pf(match, folding): 57 | return match.group(1).casefold() if folding else ud.normalize("NFKC", match.group(1)) 58 | pattern = r'([\p{InAlphabetic_Presentation_Forms}\p{InArabic_Presentation_Forms-A}\p{InArabic_Presentation_Forms-B}]+)' 59 | return re.sub(pattern, lambda match, folding=folding: clean_pf(match, folding), text) 60 | 61 | # PyICU Helper functions for casing and casefolding. 62 | # s is a string, l is an ICU locale object (defaulting to CLDR Root Locale) 63 | def toLower(s, l=Locale.getRoot()): 64 | return str(UnicodeString(s).toLower(l)) 65 | def toUpper(s, l=Locale.getRoot()): 66 | return str(UnicodeString(s).toUpper(l)) 67 | def toTitle(s, l=Locale.getRoot()): 68 | return str(UnicodeString(s).toTitle(l)) 69 | def toSentence(s, l=Locale.getRoot()): 70 | return(str(UnicodeString(s[0]).toUpper(l)) + str(UnicodeString(s[1:]).toLower(l))) 71 | def foldCase(s): 72 | return str(UnicodeString(s).foldCase()) 73 | 74 | # 75 | # Turkish casing implemented without module dependencies. 76 | # PyICU provides a more comprehensive solution for Turkish. 
77 | # 78 | 79 | # To lowercase 80 | def kucukharfyap(s): 81 | return ud.normalize("NFC", s).replace('İ', 'i').replace('I', 'ı').lower() 82 | 83 | # To uppercase 84 | def buyukharfyap(s): 85 | return ud.normalize("NFC", s).replace('ı', 'I').replace('i', 'İ').upper() 86 | -------------------------------------------------------------------------------- /snippets/matching.py: -------------------------------------------------------------------------------- 1 | #################### 2 | # 3 | # Unicode matching 4 | # 5 | # © Enabling Languages 2022 6 | # Released under the MIT License. 7 | # 8 | #################### 9 | 10 | import unicodedataplus as ud 11 | import regex as re 12 | 13 | def caseless_match(x, y): 14 | return x.casefold() == y.casefold() 15 | 16 | def canonical_caseless_match(x, y): 17 | return ud.normalize("NFD", ud.normalize("NFD", x).casefold()) == ud.normalize("NFD", ud.normalize("NFD", y).casefold()) 18 | 19 | def compatibility_caseless_match(x, y): 20 | return ud.normalize("NFKD", ud.normalize("NFKD", ud.normalize("NFD", x).casefold()).casefold()) == ud.normalize("NFKD", ud.normalize("NFKD", ud.normalize("NFD", y).casefold()).casefold()) 21 | 22 | def NFKC_Casefold(s): 23 | pattern = re.compile(r"\p{Default_Ignorable_Code_Point=Yes}") 24 | s = re.sub(pattern, '', s) 25 | return ud.normalize("NFC", ud.normalize('NFKC', s).casefold()) 26 | 27 | def identifier_caseless_match(x, y): 28 | return NFKC_Casefold(ud.normalize("NFD", x)) == NFKC_Casefold(ud.normalize("NFD", y)) 29 | -------------------------------------------------------------------------------- /snippets/regex_segmentation.py: -------------------------------------------------------------------------------- 1 | import regex 2 | from el_internationalisation import cp 3 | 4 | 5 | def regex_segmentation(text: str, pattern: str, sep: str = "\u200B", mode: list = ["list"]) -> list | str | None: 6 | """Tokenise string using regex, returning results as a list or string. 
7 | 8 | Args: 9 | text (str): text to be segmented 10 | pattern (str): regex pattern for segmentation 11 | sep (str, optional): seperator to use if string is returned or results are displayed to STDOUT. Defaults to "\u200B" (ZWSP - Zero Width Space). 12 | display (bool, optional): Indicates whether results should displayed on STDOUT (True) or returned (False). Defaults to False. 13 | mode (str, optional): Indicates if results should be returned as a list or string, or displayed to STDOUT. Defaults to "list". Use "string" to return results as a string. Use "display" to output to STDOUT 14 | 15 | Returns: 16 | list | str | None: Results returned as list or string (see mode argument) or as None (if display) 17 | """ 18 | result: str = regex.sub(pattern, r"\u200B\1", text) 19 | if result[0] == "\u200B": 20 | result = result[1:] 21 | result_list: list = result.split("\u200B") 22 | result_string: str = sep.join(result_list) 23 | if "display" in mode: 24 | print( 25 | f"Number of tokens: {str(len(result_list))} \nSegmentation boundaries: {result_string}") 26 | if "codepoints" in mode: 27 | for item in result_list: 28 | print(cp(item)) 29 | if ("string" not in mode) and ("list" not in mode): 30 | print("Nothing to return") 31 | return None 32 | return result_string if "string" in mode else result_list 33 | 34 | ##################### 35 | # 36 | # Examples 37 | # 38 | ##################### 39 | 40 | 41 | s = 'ရန်ကုန်ကွန်ပျူတာတက္ကသိုလ်' 42 | pattern = r'(?:(?= 0xa0 and bytenums[1] <= 0xbf and \ 65 | bytenums[2] >= 0x80 and bytenums[2] <= 0xbf and \ 66 | bytenums[3] == SURROGATE_IDENTICATOR_INT and \ 67 | bytenums[4] >= 0xb0 and bytenums[4] <= 0xbf and \ 68 | bytenums[5] >= 0x80 and bytenums[5] <= 0xbf: 69 | 70 | codepoint = ( 71 | ((bytenums[1] & 0x0f) << 16) + 72 | ((bytenums[2] & 0x3f) << 10) + 73 | ((bytenums[4] & 0x0f) << 6) + 74 | (bytenums[5] & 0x3f) + 75 | 0x10000 76 | ) 77 | return chr(codepoint), 6 78 | 79 | # No CESU-8 surrogate but probably a 3 byte UTF-8 sequence 
80 | return codecs.utf_8_decode(input[:3], errors, final) 81 | 82 | cesu8_surrogate_start = input.find(SURROGATE_IDENTICATOR_BYTE) 83 | if cesu8_surrogate_start > 0: 84 | # Decode everything until start of cesu8 surrogate pair 85 | return codecs.utf_8_decode(input[:cesu8_surrogate_start], errors, final) 86 | 87 | # No sign of CESU-8 encoding 88 | return codecs.utf_8_decode(input, errors, final) 89 | 90 | class IncrementalEncoder(codecs.BufferedIncrementalEncoder): 91 | 92 | def _buffer_encode(self, input, errors, final=False): 93 | encoded_segments = [] 94 | position = 0 95 | input_length = len(input) 96 | 97 | while position + 1 <= input_length: 98 | encoded, consumed = self._buffer_encode_step( 99 | input[position], errors, final 100 | ) 101 | 102 | if consumed == 0: 103 | break 104 | 105 | encoded_segments.append(encoded) 106 | position += consumed 107 | 108 | if final and position != len(input): 109 | raise Exception("Final encoder doesn't encode all characters") 110 | 111 | return b''.join(encoded_segments), position 112 | 113 | def _buffer_encode_step(self, char, errors, final): 114 | codepoint = ord(char) 115 | if codepoint <= 65535: 116 | return codecs.utf_8_encode(char, errors) 117 | else: 118 | seq = bytearray(6) 119 | seq[0] = 0xED 120 | seq[1] = 0xA0 | (((codepoint & 0x1F0000) >> 16) - 1) 121 | seq[2] = 0x80 | (codepoint & 0xFC00) >> 10 122 | seq[3] = 0xED 123 | seq[4] = 0xB0 | ((codepoint >> 6) & 0x3F) 124 | seq[5] = 0x80 | (codepoint & 0x3F) 125 | return bytes(seq), 1 126 | 127 | def encode(input, errors='strict'): 128 | return IncrementalEncoder(errors).encode(input, final=True), len(input) 129 | 130 | def decode(input, errors='strict'): 131 | return IncrementalDecoder(errors).decode(input, final=True), len(input) 132 | 133 | class StreamWriter(codecs.StreamWriter): 134 | encode = encode 135 | 136 | class StreamReader(codecs.StreamReader): 137 | decode = decode 138 | 139 | CESU8_CODEC_INFO = codecs.CodecInfo( 140 | name="cesu-8", 141 | encode=encode, 
142 | decode=decode, 143 | incrementalencoder=IncrementalEncoder, 144 | incrementaldecoder=IncrementalDecoder, 145 | streamreader=StreamReader, 146 | streamwriter=StreamWriter, 147 | ) 148 | 149 | def search_function(encoding): 150 | if encoding == 'cesu-8': 151 | return CESU8_CODEC_INFO 152 | else: 153 | return None 154 | 155 | codecs.register(search_function) 156 | --------------------------------------------------------------------------------