├── .gitignore ├── LICENSE ├── README.md ├── bin └── README.md ├── data ├── classification │ └── README.md ├── dictionaries │ └── README.md ├── lm │ └── README.md ├── ner │ └── README.md └── normalization │ ├── README.md │ └── document.txt ├── documentation.yml ├── downloader.py ├── examples ├── __init__.py ├── classification │ ├── __init__.py │ ├── classification_example_base.py │ ├── news_title_category_finder.py │ ├── simple_classification.py │ └── train_classifier.py ├── core │ ├── __init__.py │ └── histogram.py ├── morphology │ ├── __init__.py │ ├── add_dictionary_item.py │ ├── change_stem.py │ ├── diacritic_analysis.py │ ├── find_pos.py │ ├── informal_words_analysis.py │ ├── sentence_disambiguation.py │ ├── stem_and_lemmatize.py │ ├── word_analysis.py │ └── word_generation.py ├── normalization │ ├── __init__.py │ ├── document_correction.py │ ├── noisy_text_normalization.py │ └── spell_checking.py └── tokenization │ ├── sentence_boundary_detection.py │ └── turkish_tokenization.py ├── main.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .env 3 | .mypy_cache/ 4 | .vscode/ 5 | bin/*.jar 6 | data/classification/news* 7 | data/lm/lm.2gram.slm 8 | data/normalization/ascii-map 9 | data/normalization/lookup-from-graph 10 | data/normalization/split -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Python Examples 2 | 3 | > Zemberek Turkish NLP examples written in Python using the JPype package. 4 | 5 | Zemberek is a Java-based natural language processing (NLP) tool created for the Turkish language. This repository contains the Python implementations of the [official Zemberek examples](https://github.com/ahmetaa/zemberek-nlp/tree/master/examples/src/main/java/zemberek/examples) for learning purposes. 
6 | 7 | ## Table of Contents 8 | 9 | | Folder | Description | 10 | | ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 11 | | classification | fastText examples | 12 | | core | histogram | 13 | | morphology | stemming, lemmatization, diacritics analysis, POS tag analysis, morphological analysis, word generation, sentence disambiguation, informal word analysis, adding dictionary items | 14 | | named-entitiy-recognition | on hold | 15 | | normalization | document correction, noisy text normalization, spell checking | 16 | | tokenization | sentence boundary detection, turkish tokenization | 17 | 18 | ## Requirements 19 | 20 | 1. Python 3.6+ 21 | 22 | ## Getting Started 23 | 24 | 1. Clone this library and `cd` into it. 25 | ```shell 26 | $ git clone https://github.com/ozturkberkay/Zemberek-Python-Examples.git 27 | $ cd Zemberek-Python-Examples 28 | ``` 29 | 30 | 2. Install the required packages. Using `virtualenv` is highly encouraged! 31 | 32 | ```shell 33 | $ python -m pip install --upgrade pip virtualenv 34 | $ python -m virtualenv .env 35 | $ # Windows: .env\Scripts\activate 36 | $ source .env/bin/activate 37 | $ python -m pip install -r requirements.txt 38 | ``` 39 | 40 | 3. Download the required Zemberek files: 41 | 42 | ```shell 43 | $ python -m downloader 44 | ``` 45 | 46 | Optionally, you can manually download all the data and version `0.17.1` of Zemberek distribution from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/0B9TrB39LQKZWSjNKdVcwWUxxUm8?usp=sharing) and put the files in the corresponding folders: 47 | 48 | . 
49 | +-- bin 50 | | +-- zemberek-full.jar 51 | +-- data 52 | | +-- classification 53 | | +-- news-title-category-set 54 | | +-- news-title-category-set.lemmas 55 | | +-- news-title-category-set.tokenized 56 | | +-- dictionaries 57 | | +-- lm 58 | | +-- lm.2gram.slm 59 | | +-- ner 60 | | +-- normalization 61 | | +-- ascii-map 62 | | +-- lookup-from-graph 63 | | +-- split 64 | 65 | ## Usage 66 | 67 | 1. Run `python -m main category.example args`. 68 | 69 | ```shell 70 | $ python -m main classification.simple_classification "Fenerbahçe bu maçı galibiyet ile sonlandırdı." 71 | ... 72 | 73 | News classification example. Trains a new model if there is no model 74 | available. 75 | 76 | Args: 77 | sentence (str): Sentence to classify. 78 | 79 | Sentence: Fenerbahçe bu maçı galibiyet ile sonlandırdı. 80 | 81 | Item 1: __label__spor 82 | Score 1: -0.009194993413984776 83 | 84 | Item 2: __label__magazin 85 | Score 2: -6.12613582611084 86 | 87 | Item 3: __label__kültür_sanat 88 | Score 3: -6.226541996002197 89 | ``` 90 | 91 | ## Known Bugs 92 | 93 | - During the model training, `fastText` will print errors. It still works, just ignore them. 94 | 95 | ## Changelog 96 | 97 | - 2020-12-05 98 | - Automatic downloader for Zemberek files. 99 | - Simple CLI entry-point to run the examples with custom data. 100 | - JPype1 v1.2.0 upgrade. This should fix some memory leak issues. 101 | - Code quality improvements. 102 | - Fixes for broken links. 103 | - 2019-10-29 104 | - Zemberek v0.17.1 upgrade. 105 | - JPype1 v0.7.0 upgrade. 106 | - Code style changes. 107 | - Bug-fixes. 108 | - License is now the same with Zemberek (Apache v2.0). 109 | - 2018-12-01 110 | - Classification, morphology, normalization and tokenization examples. 
111 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek JAR Distribution 2 | > Download the latest version of Zemberek from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/0B9TrB39LQKZWX1RSang3M1VkYjQ?usp=sharing) and put the `zemberek-full.jar` file in this folder. Or use `zemberek_downloader.py` to automatically download the files. -------------------------------------------------------------------------------- /data/classification/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Classification Data 2 | > Download the classification data files from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/1JBPExAeRctAXL2oGW2U6CbqfwIJ84BG7?usp=sharing) and put the contents in this folder. Or use `zemberek_downloader.py` to automatically download the files. -------------------------------------------------------------------------------- /data/dictionaries/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Dictionary Data 2 | 3 | > You can utilize [the official wiki](https://github.com/ahmetaa/zemberek-nlp/wiki/Text-Dictionary-Rules) to create your own lexicon and save it in this folder. Or use `zemberek_downloader.py` to automatically download the files. 4 | 5 | ## Usage 6 | 7 | 1. Create your dictionary inside this folder. Read [the related wiki article](https://github.com/ahmetaa/zemberek-nlp/wiki/Text-Dictionary-Rules) for more information. 8 | 9 | 2. Create a `TurkishMorphology` object using your custom dictionary Read [the related wiki article](https://github.com/ahmetaa/zemberek-nlp/tree/master/morphology#creating-turkishmorphology-object) for more information. 
10 | 11 | ```python 12 | lexicon = RootLexicon.builder().addDefaultLexicon().addTextDictionaries(Paths.get('PATH')).build() 13 | morphology = TurkishMorphology.create(lexicon) 14 | ``` 15 | -------------------------------------------------------------------------------- /data/lm/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Language Model Data 2 | > Download the language model data files from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/1jb4ei8rbBRfBmK1WrhdjqyEkVZmieZpI?usp=sharing) and put the contents in this folder. Or use `zemberek_downloader.py` to automatically download the files. -------------------------------------------------------------------------------- /data/ner/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Named Entity Recognition Data 2 | > Development of NER examples are on hold. Once they are completed, you will need to put your own training and test datasets named `ner-train` and `ner-test` inside this folder. -------------------------------------------------------------------------------- /data/normalization/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Normalization Data 2 | > Download the normalization data files from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/1jNT6BJoEbiLuVbQwBYdVNoibEdzDd2WC?usp=sharing) and put the contents in this folder. Or use `zemberek_downloader.py` to automatically download the files. 
-------------------------------------------------------------------------------- /data/normalization/document.txt: -------------------------------------------------------------------------------- 1 | Türk Vatanı ve Milletinin ebedi varlığxını ve Yüce Türk Devlatinin bölünmez bütünlüğünü belirleyen bu Anayasa, 2 | Türkiye Cumhuriyetinin kurucusu, ölümsüz önder ve eşsiz kahraman Atatürk’ün belirlediği milliyetçilik anlayışı ve onun 3 | inkılap ve ilkeleri doğrultusunda; 4 | Dünya milletleri ailesinin eşit haklara sahip şerefli bir üyesi olarak, Türkiye Cumhuriyetinin ebedi varlığı, refahı, 5 | maddi ve manelvi mutluluğu ile çağdaş medeniyet düzeyine ulaşma azmi yönünde; 6 | Millet iradesinin mutlak üstünlüğü, egemenliğin kayıtsız şartsız Türk Milletine ait olduğu ve bunu millet adına 7 | kullanmaya yetkili kılınan hiçbir kişi ve kuruluşun, bu Anayasada gösterilen hürriyetçi demokrasi ve bunun icaplarıyla 8 | belirlenmiş hukuk düzeni dışına çıkamayacağı; 9 | Kuvvetler ayrımının, Devlet organları arasında üstünlük sıralaması anlamına gelmeyip, belli Devlet yetki ve 10 | görevlerinin kullanılmasından ibaret ve bununla sınırlı medeni bir işbölümü ve işbirliği olduğu ve üstünlüğün ancak Anayasa 11 | ve kanunlarda bulunduğu; 12 | Hiçbir faaliyetin Türk milli menfaatlerinin, Türk varlığının, Devleti ve ülkesiyle bölünmezliği esasının, Türklüğün 13 | tarihi ve manevi değerlerinin, Atatürk milliyetçiliği, ilke ve inkılapları ve medeniyetçiliğinin karşısında korunma 14 | göremeyeceği ve laiklik ilkesinin gereği olarak kutsal din duygularının, Devlet işlerine ve politikaya kesinlikle 15 | karıştırılamayacağı; (5) 16 | 17 | Her Türk vatandaşının bu Anayasadaki temel hak ve hürriyetlerden eşitlik ve sosyal adalet gereklerince yararlanarak 18 | milli kültür, medeniyet ve hukuk düzeni içinde onurlu bir hayat sürdürme ve maddi ve manevi varlığını bu yönde geliştirme 19 | hak ve yetkisine doğuştan sahip olduğu; 20 | ––––––––––––––––––––––––––––– 21 | (1) Bu Anayasa; Kuruczu Meclis 
tarafından 18/10/1982’de Halkoylamasına sunulmak üzere kabul edilmiş ve 20/10/1982 22 | tarihli ve 17844 sayılı Resmî Gazete’de yayımlanmış; 7/11/1982’de Halkoylamasına sunulduktan sonra 9/11/1982 tarihli 23 | ve 17863 Mükerrer sayıli Resmî Gazete’de yeniden yayımlanmıştır. 24 | (2) 7/5/2010 tarihli ve 5982 sayılı Kanun ile yapılan Anayasa değişiklikleri 12/9/2010 tarihinde Halkoyuna sunularak kabul 25 | edilmiş, buna ilişkin 22/9/2010 tarihli ve 846 sayılı Yüksek Seçim Kurulu Kararı 23/9/2010 tarihli ve 27708 sayılı Resmî 26 | Gazete’de yayımlanmıştır. 27 | (3) 21/1/2017 tarihli ve 6771 sayılı Kanun ile yapılan Anayasa değişiklikleri 16/4/2017 tarihinde Halkoyuna sunularak kabul 28 | edilmiş, buna ilişkin 27/4/2017 tarihli ve 663 sayılı Yüksek Seçim Kurulu Kararı 27/4/2017 tarihli ve 30050 Mükerrer 29 | sayılı Resmî Gazete’de yayımlanmıştır. 30 | (4) Anayasa’nın Başlangıc metni 23/7/1995 tarih ve 4121 sayılı Kanun’un 1 inci maddesi ile değiştirilmiş ve metne 31 | işlenmiştir. 32 | (5) Bu fıkrada geçen, “Hiçbir düşünce ve mülahazanın” ibaresi, 3/10/2001 tarih ve 4709 sayılı Kanunun 1 inci maddesiyle 33 | “Hiçbir faaliyetin” şeklinde değiştirilmiş ve metne işlenmiştir. 34 | 130 35 | Topluca Türk vatandaşlarının milli gurur ve iftiharlarda, milli sevinç ve kederlerde, milli varlığa karşı hak ve 36 | ödevlerde, nimet ve külfetlerde ve millet hayatının her türlü tecellisinde ortak olduğu, birbirinin hak ve hürriyetlerine kesin 37 | saygı, karşılıklı içten sevgi ve kardeşlik duygularıyla ve 'Yurtta sulh, cihanda sulh' arzu ve inancı içinde, huzurlu bir hayat 38 | talebine hakları bulunduğu; 39 | FİKİR, İNANÇ VE KARARIYLA anlaşılmak, sözüne ve ruhuna bu yönde saygı ve mutlak sadakatle yorumlanıp 40 | uygulanmak üzere. 41 | TÜRK MİLLETİ TARAFINDAN, demokrasiye aşık Türk evlatlarının vatan ve millet sevgisine emanet ve tevdi 42 | olunur. 43 | BİRİNCİ KISIM 44 | GENEL ESASLAR 45 | I. Devletin şekli 46 | Madde 1 – Türkiye Devleti bir Cumhuriyettir. 47 | II. 
Cumhuriyetin nitelikleri 48 | Madde 2 – Türkiye Cumhuriyeti, toplumun huzuru, milli dayanışma ve adalet anlayışı içinde, insan haklarına saygılı, 49 | Atatürk milliyetçiliğine bağlı, başlangıçta belirtilen temel ilkelere dayanan, demokratik, laik ve sosyal bir hukuk Devletidir. 50 | III. Devletin bütünlüğü, Resmî dili, bayrağı, milli marşı ve başkenti 51 | Madde 3 – Türkiye Devleti, ülkesi ve milletiyle bölünmez bir bütündür. Dili Türkçedir. 52 | Bayrağı, şekli kanununda belirtilen, beyaz ay yıldızlı al bayraktır. 53 | Milli marşı 'İstiklal Marşı'dır. 54 | Başkenti Ankara'dır. 55 | IV. Değiştirilemeyecek hükümler 56 | Madde 4 – Anayasanın 1 inci maddesindeki Devletin şeklinin Cumhuriyet olduğu hakkındaki hüküm ile, 2 nci 57 | maddesindeki Cumhuriyetin nitelikleri ve 3 üncü maddesi hükümleri değiştirilemez ve değiştirilmesi teklif edilemez. 58 | V. Devletin temel amaç ve görevleri 59 | Madde 5 – Devletin temel amaç ve görevleri, Türk milletinin bağımsızlığını ve bütünlüğünü, ülkenin bölünmezliğini, 60 | Cumhuriyeti ve demokrasiyi korumak, kişilerin ve toplumun refah, huzur ve mutluluğunu sağlamak; kişinin temel hak ve 61 | hürriyetlerini, sosyal hukuk devleti ve adalet ilkeleriyle bağdaşmayacak surette sınırlayan siyasal, ekonomik ve sosyal 62 | engelleri kaldırmaya, insanın maddi ve manevi varlığının gelişmesi için gerekli şartları hazırlamaya çalışmaktır. 63 | VI. Egemenlik 64 | Madde 6 – Egemenlik, kayıtsız şartsız Milletindir. 65 | Türk Milleti, egemenliğini, Anayasanın koyduğu esaslara göre, yetkili organları eliyle kullanır. 66 | Egemenliğin kullanılması, hiçbir surette hiçbir kişiye, zümreye veya sınıfa bırakılamaz. 
# Mapping of repository-relative target directories to the Zemberek files
# (file name -> Google Drive file ID) that must be fetched into them.
ZEMBEREK_FILES: Dict[str, Dict[str, str]] = {
    'bin': {'zemberek-full.jar': '1RRuFK43JqcHcthB3fV2IEpPftWoeoHAu'},
    'data/classification': {
        'news-title-category-set': '13d6TjKSk8Uy0FNHrbqJQv1hHalKUg1l5',
        'news-title-category-set.lemmas': '1VP-DcPDY423cU48CP675yT6RS_hygtpX',
        'news-title-category-set.tokenized': (
            '1xt81joeOA7nOTYNUKdxKBOeDtLGMCwNO'
        ),
    },
    'data/lm': {'lm.2gram.slm': '1JZG0I8jUS511lFVg0M-QAA4QRqydlCiX'},
    'data/normalization': {
        'ascii-map': '1ptbPoGZrKxXS5PNr5kpGfHziGdIUP-n7',
        'lookup-from-graph': '1ko31lO1yrYf1twjZOl_vHikmsGkQGKt5',
        'split': '1X8UpIE0ifYF1_tpMQp7o5PLEqms3Ew5Q',
    },
}


def download_drive_files(
    file_map: Dict[str, Dict[str, str]], skip_if_exists: bool = True
) -> None:
    """
    Downloads the files from Google Drive.

    Target paths are resolved relative to the directory containing this
    file, so the function works regardless of the current working
    directory.

    Args:
        file_map (Dict[str, Dict[str, str]]): Mapping of directories to
            install to, with the file names and Google Drive ID's.
        skip_if_exists (bool): Skips the download if the target file already
            exists on the local filesystem.
    """
    base_path: Path = Path(__file__).parent

    for folder, files in file_map.items():

        for file_name, drive_id in files.items():

            target_path: Path = base_path.joinpath(
                folder, file_name
            ).absolute()

            if skip_if_exists and target_path.exists():
                print(f'Target already exists, skipping: {target_path}')
                continue

            # gdown writes to the exact output path and does not create
            # missing parent directories itself; make sure they exist so a
            # fresh clone (or a pruned data tree) does not make it fail.
            target_path.parent.mkdir(parents=True, exist_ok=True)

            gdown.download(
                url=f'https://drive.google.com/u/0/uc?id={drive_id}',
                output=str(target_path),
            )


if __name__ == '__main__':
    download_drive_files(ZEMBEREK_FILES)
class ClassificationExampleBase:
    """
    Exact Python implementation of the original Java code.

    Shared preprocessing helpers for the fastText classification examples.
    Each ``generate_set_*`` classmethod rewrites a labeled dataset (lines of
    the form ``__label__X some text``) into a preprocessed, lower-cased copy
    on disk, one output line per input line.
    """

    # Shared Turkish morphology analyzer, created once at class-definition
    # time and reused by every classmethod.
    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()

    @classmethod
    def generate_set_with_lemmas(
        cls, lines: List[str], lemmas_path: Path
    ) -> None:
        """
        Write a lemmatized, cleaned, lower-cased copy of ``lines`` to
        ``lemmas_path``.

        Args:
            lines (List[str]): Labeled dataset lines.
            lemmas_path (Path): Output file path.
        """
        with open(lemmas_path, 'w', encoding='utf-8') as lemma_file:
            for line in [
                cls.remove_non_words(
                    cls.replace_words_with_lemma(line)
                ).lower()
                for line in lines
            ]:
                lemma_file.write(f'{line}\n')

    @classmethod
    def generate_set_with_split(
        cls, lines: List[str], split_path: Path
    ) -> None:
        """
        Write a stem+ending, cleaned, lower-cased copy of ``lines`` to
        ``split_path``.

        Args:
            lines (List[str]): Labeled dataset lines.
            split_path (Path): Output file path.
        """
        with open(split_path, 'w', encoding='utf-8') as split_file:
            for line in [
                cls.remove_non_words(cls.split_words(line)).lower()
                for line in lines
            ]:
                split_file.write(f'{line}\n')

    @classmethod
    def generate_set_tokenized(
        cls, lines: List[str], tokenized_path: Path
    ) -> None:
        """
        Write a tokenized, cleaned, lower-cased copy of ``lines`` to
        ``tokenized_path``.

        Args:
            lines (List[str]): Labeled dataset lines.
            tokenized_path (Path): Output file path.
        """
        with open(tokenized_path, 'w', encoding='utf-8') as tokens_file:
            for line in [
                cls.remove_non_words(
                    java.lang.String.join(
                        JString(' '),
                        TurkishTokenizer.DEFAULT.tokenizeToStrings(
                            JString(line)
                        ),
                    )
                ).lower()
                for line in lines
            ]:
                tokens_file.write(f'{line}\n')

    @classmethod
    def split_words(cls, sentence: str) -> str:
        """
        Replace each word of a labeled line with ``lemma _ending`` pairs.

        The first whitespace token is treated as the label and kept as-is.
        Words the analyzer cannot resolve are kept unchanged.

        Args:
            sentence (str): Labeled dataset line.

        Returns:
            str: Space-joined result (a ``java.lang.String`` from JPype;
            an empty input returns a ``JString`` as well).
        """
        tokens: List[str] = sentence.split()
        label: java.lang.String = JString(tokens[0])
        del tokens[0]
        sentence = ' '.join(tokens)

        if len(sentence) == 0:
            return JString(sentence)

        analysis: SentenceAnalysis = cls.morphology.analyzeAndDisambiguate(
            JString(sentence)
        )
        res: java.util.ArrayList = java.util.ArrayList()
        res.add(label)

        for word_analysis in analysis:
            best: SingleAnalysis = word_analysis.getBestAnalysis()
            inp: java.lang.String = word_analysis.getWordAnalysis().getInput()

            if best.isUnknown():
                res.add(inp)
                continue

            lemmas: java.util.ArrayList = best.getLemmas()

            if len(lemmas[0]) < len(inp):
                res.add(lemmas[0])
                # NOTE(review): `inp[len(lemmas[0])]` is a single-index
                # access, so only the FIRST character of the ending is
                # appended (prefixed with '_'). Confirm against the original
                # Java, which presumably used substring() for the whole
                # ending.
                res.add(JString('_' + str(inp[len(lemmas[0])])))
            else:
                res.add(lemmas[0])

        return java.lang.String.join(JString(' '), res)

    @classmethod
    def process_ending(cls, inp: str) -> str:
        """
        Collapse Turkish vowel/consonant alternation groups in an ending to
        canonical meta-characters (e.g. ``a``/``e`` -> ``A``).

        Not referenced by the other methods visible in this file chunk.

        Args:
            inp (str): Word ending to normalize.

        Returns:
            str: Normalized ending.
        """
        for pattern, value in {
            r'[ae]': 'A',
            r'[ıiuü]': 'I',
            r'[kğ]': 'K',
            r'[cç]': 'C',
            r'[dt]': 'D',
        }.items():
            inp = re.sub(pattern, value, inp)
        return inp

    @classmethod
    def replace_words_with_lemma(cls, sentence: str) -> str:
        """
        Replace each word of a labeled line with its first lemma.

        The first whitespace token is treated as the label and kept as-is.
        Words the analyzer cannot resolve are kept unchanged.

        Args:
            sentence (str): Labeled dataset line.

        Returns:
            str: Space-joined result (a ``java.lang.String`` from JPype;
            an empty input returns the empty Python ``str``).
        """
        tokens: List[str] = sentence.split()
        label: str = tokens[0]
        del tokens[0]
        sentence = ' '.join(tokens)

        if len(sentence) == 0:
            return sentence

        analysis: SentenceAnalysis = cls.morphology.analyzeAndDisambiguate(
            JString(sentence)
        )
        res: java.util.ArrayList = java.util.ArrayList()
        res.add(JString(label))

        for word_analysis in analysis:
            best: SingleAnalysis = word_analysis.getBestAnalysis()

            if best.isUnknown():
                res.add(word_analysis.getWordAnalysis().getInput())
                continue

            lemmas: java.util.ArrayList = best.getLemmas()
            res.add(lemmas[0])

        return java.lang.String.join(JString(' '), res)

    @classmethod
    def remove_non_words(cls, sentence: JString) -> str:
        """
        Drop non-word tokens (mentions, hashtags, URLs, punctuation, Roman
        numerals, times, unknowns) from a sentence.

        Tokens starting with ``_`` or containing ``__`` are always kept;
        these are the ending markers produced by ``split_words`` and the
        fastText ``__label__`` prefix.

        Args:
            sentence (JString): Sentence to clean (falsy input yields '').

        Returns:
            str: Space-joined surviving tokens.
        """
        if not sentence:
            return ''

        doc_tokens: List[Token] = list(
            TurkishTokenizer.DEFAULT.tokenize(sentence)
        )
        reduced: List[str] = []

        for token in doc_tokens:
            text: str = str(token.getText())

            if text[0] == '_' or '__' in text:
                reduced.append(text)
                continue

            token_type: Token.Type = token.getType()

            # NOTE(review): `Type` is the module-level alias for
            # `Token.Type`; the mixed spellings below refer to the same
            # Java enum.
            if token_type in {
                Token.Type.Mention,
                Token.Type.HashTag,
                Token.Type.URL,
                Token.Type.Punctuation,
                Type.RomanNumeral,
                Token.Type.Time,
                Token.Type.UnknownWord,
                Token.Type.Unknown,
            }:
                continue

            reduced.append(text)

        return ' '.join(reduced)
class NewsTitleCategoryFinder(ClassificationExampleBase):
    """
    News-title category classification helpers (Python port of the original
    Java example): dataset inspection, train/test splitting, model training
    and evaluation via the Zemberek fastText apps.
    """

    @classmethod
    def _read(cls, path: Path) -> List[str]:
        """
        Return every line of ``path`` (trailing newlines preserved).

        Args:
            path (Path): Dataset file path.

        Returns:
            List[str]: Raw dataset lines.
        """
        with open(path, 'r', encoding='utf-8') as lines_file:
            return lines_file.readlines()

    @classmethod
    def data_info(cls, path: Path) -> List[str]:
        """
        Print the line count and per-category counts, then return the lines.

        The category label is the first whitespace-delimited token of each
        line (fastText ``__label__...`` convention).

        Args:
            path (Path): Dataset file path.

        Returns:
            List[str]: Raw dataset lines.
        """
        lines = cls._read(path)
        print(f'Total Lines: {len(lines)}')
        label_counts = collections.Counter(
            line[: line.find(' ')] for line in lines
        )
        for label, count in label_counts.most_common():
            print(f'({count})\t{label}')
        return lines

    @classmethod
    def evaluate(
        cls,
        test_size: int,
        path: Optional[Path] = None,
        lines: Optional[List[str]] = None,
    ) -> None:
        """
        Split the dataset, train a model if none exists, and evaluate it.

        Args:
            test_size (int): Number of lines reserved for the test split.
            path (Optional[Path]): Dataset path; also used to derive the
                ``.train``, ``.test``, ``.model`` and ``.predictions``
                sibling paths, so it is required.
            lines (Optional[List[str]]): Pre-read dataset lines; read from
                ``path`` when omitted.

        Raises:
            ValueError: If ``path`` is missing, or ``test_size`` is not in
                ``(0, len(lines)]``.
        """
        # 'path' is needed unconditionally to derive the output paths, so
        # validate it even when 'lines' is supplied. (The original only
        # checked it in the lines-is-None branch and crashed later with
        # AttributeError on None.with_suffix otherwise.)
        if path is None:
            raise ValueError('You should provide a path!')
        if lines is None:
            lines = cls._read(path)

        if test_size <= 0 or test_size > len(lines):
            raise ValueError(
                '\'test_size\' must be bigger than'
                ' 0 and less than the dataset size!'
            )

        train_path: Path = path.with_suffix('.train')
        test_path: Path = path.with_suffix('.test')

        # First 'test_size' lines become the test split; the rest train.
        with open(train_path, 'w', encoding='utf-8') as train_file:
            train_file.writelines(lines[test_size:])

        with open(test_path, 'w', encoding='utf-8') as test_file:
            test_file.writelines(lines[:test_size])

        model_path: Path = path.with_suffix('.model')

        # Training is skipped when a model file already exists; evaluation
        # always runs.
        if not model_path.is_file():
            TrainClassifier().execute(
                JString('-i'),
                JString(str(train_path)),
                JString('-o'),
                JString(str(model_path)),
                JString('--learningRate'),
                JString('0.1'),
                JString('--epochCount'),
                JString('70'),
                JString('--dimension'),
                JString('100'),
                JString('--wordNGrams'),
                JString('2'),
            )
        print('Testing...')
        cls.test(
            test_path, path.with_suffix('.predictions'), model_path
        )

    @classmethod
    def test(
        cls, test_path: Path, predictions_path: Path, model_path: Path
    ) -> None:
        """
        Run the Zemberek classifier evaluation app on the test split.

        Args:
            test_path (Path): Test split file.
            predictions_path (Path): Output file for predictions.
            model_path (Path): Trained model file.
        """
        EvaluateClassifier().execute(
            JString('-i'),
            JString(str(test_path)),
            JString('-m'),
            JString(str(model_path)),
            JString('-o'),
            JString(str(predictions_path)),
            JString('-k'),
            JString('1'),
        )
def run(sentence: str) -> None:
    """
    News classification example. Trains a new model if there are no model
    available.

    Args:
        sentence (str): Sentence to classify.
    """
    label_data_path = DATA_PATH.joinpath(
        'classification', 'news-title-category-set'
    )
    model_path = label_data_path.with_suffix('.model')

    if not model_path.is_file():
        print(
            'Could not find a model, training a new one. FastText will print'
            ' some errors, do not terminate the process!'
        )

        # Training is impossible without the labeled dataset on disk.
        if not label_data_path.is_file():
            raise FileNotFoundError(
                'Could not train a model!'
                ' Please include news-title-category-set!'
            )

        # Invoke the Zemberek jar's TrainClassifier app as a subprocess.
        train_command = [
            str(JAVA_PATH.absolute()),
            '-jar',
            str(ZEMBEREK_PATH.absolute()),
            'TrainClassifier',
            '-i',
            str(label_data_path.absolute()),
            '-o',
            str(model_path.absolute()),
            '--learningRate',
            '0.1',
            '--epochCount',
            '50',
            '--applyQuantization',
            '--cutOff',
            '15000',
        ]
        subprocess.run(train_command, check=True)

    classifier = FastTextClassifier.load(model_path)

    # The model was trained on tokenized lower-case text, so the input
    # sentence is preprocessed the same way.
    raw_tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(JString(sentence))
    processed = ' '.join(str(token) for token in raw_tokens).lower()

    results = classifier.predict(processed, 3)

    print(f'Sentence: {sentence}')

    for i, result in enumerate(results):
        print(
            f'\nItem {i + 1}: {result.item}',
            f'\nScore {i + 1}: {result.score}',
        )
def _create_histograms() -> Tuple[Histogram, Histogram]:
    """
    Build two small fruit-count Histograms used by the example functions.

    Returns:
        Tuple[Histogram, Histogram]: Example Histogram data.
    """
    first_fruits = [
        'apple', 'pear', 'grape', 'apple', 'apple', 'appricot', 'grape'
    ]
    second_fruits = ['apple', 'apple', 'banana']

    histogram_1 = Histogram()
    histogram_1.add(first_fruits)

    histogram_2 = Histogram()
    histogram_2.add(second_fruits)

    print('Histogram 1:', histogram_1)
    print('Histogram 2:', histogram_2)

    return histogram_1, histogram_2
def _print_histogram_int_comp(
    histogram_1: Histogram, histogram_2: Histogram
) -> None:
    """
    Prints Integer comparison examples for the Histogram items.

    Args:
        histogram_1 (Histogram): First example Histogram.
        histogram_2 (Histogram): Second example Histogram.
    """
    print('\nHistogram 1, Less Than 2:', histogram_1.sizeSmaller(JInt(2)))
    print('Histogram 2, Less Than 2:', histogram_2.sizeSmaller(JInt(2)))

    print('\nHistogram 1, More Than 2:', histogram_1.sizeLarger(JInt(2)))
    print('Histogram 2, More Than 2:', histogram_2.sizeLarger(JInt(2)))

    print(
        '\nHistogram 1, Between 1 and 3:',
        histogram_1.totalCount(JInt(1), JInt(3)),
    )
    print(
        'Histogram 2, Between 1 and 3:',
        histogram_2.totalCount(JInt(1), JInt(3)),
    )

    print(
        '\nHistogram 1, Equals to 2:', histogram_1.getItemsWithCount(JInt(2))
    )
    print('Histogram 2, Equals to 2:', histogram_2.getItemsWithCount(JInt(2)))

    # FIX: the upper bound JInt(3) used to be passed as an extra argument
    # to print() instead of to getItemsWithCount(), so histogram_1 showed
    # the one-argument result followed by a stray '3'. Mirror the correct
    # two-argument call used for histogram_2.
    print(
        '\nHistogram 1, >= 2 AND <= 3:',
        histogram_1.getItemsWithCount(JInt(2), JInt(3)),
    )
    print(
        'Histogram 2, >= 2 AND <= 3:',
        histogram_2.getItemsWithCount(JInt(2), JInt(3)),
    )

    print(
        '\nHistogram 1, % of >= 2 AND <= 3:',
        histogram_1.countPercent(JInt(2), JInt(3)),
    )
    print(
        'Histogram 2, % of >= 2 AND <= 3:',
        histogram_2.countPercent(JInt(2), JInt(3)),
    )

    # These two pass a plain Python int — presumably to demonstrate JPype's
    # automatic primitive conversion; TODO confirm intent.
    print('\nHistogram 1, More Than 2:', histogram_1.sizeLarger(2))
    print('Histogram 2, More Than 2:', histogram_2.sizeLarger(2))
def _print_histogram_agg(
    histogram_1: Histogram, histogram_2: Histogram
) -> None:
    """
    Prints Histogram aggregation examples.

    Args:
        histogram_1 (Histogram): First example Histogram.
        histogram_2 (Histogram): Second example Histogram.
    """

    def show(metric: str, value_1: object, value_2: object) -> None:
        # Each metric is printed for both histograms; the leading newline
        # on the first line visually separates the metric groups.
        print(f'\nHistogram 1, {metric}:', value_1)
        print(f'Histogram 2, {metric}:', value_2)

    show('Total Count', histogram_1.totalCount(), histogram_2.totalCount())
    show('Size', histogram_1.size(), histogram_2.size())
    show(
        "'apple' Count",
        histogram_1.getCount(JString('apple')),
        histogram_2.getCount(JString('apple')),
    )
    show('Max Count', histogram_1.maxValue(), histogram_2.maxValue())
    show('Min Count', histogram_1.minValue(), histogram_2.minValue())
def _test(
    morphology: TurkishMorphology, inp: str, new_item: DictionaryItem
) -> None:
    """
    Testing analysis before and after adding dictionary item.

    Args:
        morphology (TurkishMorphology): Turkish morphology analyzer.
        inp (str): Input to analyze.
        new_item (DictionaryItem): Item to add to the dictionary.
    """
    print(f'Parses for {inp} before adding {new_item}')
    before: WordAnalysis = morphology.analyze(JString(inp))
    print_results(before)
    # Invalidate the analysis cache so the 'after' call is not answered
    # from the cached 'before' result.
    morphology.invalidateCache()
    morphology.getMorphotactics().getStemTransitions().addDictionaryItem(
        new_item
    )
    # CONSISTENCY FIX: wrap the input in JString like the 'before' call
    # above (the original passed a bare Python str here).
    after: WordAnalysis = morphology.analyze(JString(inp))
    print(f'Parses for {inp} after adding {new_item}')
    print_results(after)
def run() -> None:
    """
    Dictionary item addition tests.

    Runs three before/after analyses: two proper nouns and one verb added
    to the live lexicon.
    """
    morphology = TurkishMorphology.createWithDefaults()

    # (header, word to analyze, DictionaryItem constructor arguments)
    test_cases = (
        (
            '\nProper Noun Test - 1:\n',
            'Meydan\'a',
            ('Meydan', 'meydan', 'meydan',
             PrimaryPos.Noun, SecondaryPos.ProperNoun),
        ),
        (
            '\nProper Noun Test - 2:\n',
            'Meeeydan\'a',
            ('Meeeydan', 'meeeydan', 'meeeydan',
             PrimaryPos.Noun, SecondaryPos.ProperNoun),
        ),
        (
            '\nVerb Test:\n',
            'tweetleyeyazdım',
            ('tweetlemek', 'tweetle', 'tivitle',
             PrimaryPos.Verb, SecondaryPos.None_),
        ),
    )

    for header, word, (lemma, root, pronunciation, primary, secondary) in (
        test_cases
    ):
        print(header)
        _test(
            morphology,
            word,
            DictionaryItem(
                JString(lemma),
                JString(root),
                JString(pronunciation),
                primary,
                secondary,
            ),
        )
31 | """ 32 | morphology: TurkishMorphology = TurkishMorphology.createWithDefaults() 33 | 34 | new_stem: DictionaryItem = ( 35 | morphology.getLexicon().getMatchingItems(target_word).get(0) 36 | ) 37 | 38 | results: WordAnalysis = morphology.analyze(JString(source_word)) 39 | 40 | for result in results: 41 | generated: java.util.ArrayList = ( 42 | morphology.getWordGenerator().generate( 43 | new_stem, result.getMorphemes() 44 | ) 45 | ) 46 | for gen_word in generated: 47 | print( 48 | f'\nInput Analysis: {str(result.formatLong())}' 49 | f'\nAfter Stem Change, Word: {str(gen_word.surface)}' 50 | '\nAfter Stem Change, Analysis:' 51 | f'{str(gen_word.analysis.formatLong())}' 52 | ) 53 | -------------------------------------------------------------------------------- /examples/morphology/diacritic_analysis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Zemberek: Diacritic Analysis Example 3 | Documentation: https://bit.ly/2PsyRHk 4 | Java Code Example: https://bit.ly/2Jx7zfk 5 | """ 6 | from typing import List 7 | 8 | from jpype import JClass 9 | 10 | __all__: List[str] = ['run'] 11 | 12 | TurkishMorphology: JClass = JClass('zemberek.morphology.TurkishMorphology') 13 | RootLexicon: JClass = JClass('zemberek.morphology.lexicon.RootLexicon') 14 | 15 | 16 | def run(word: str) -> None: 17 | """ 18 | Diacritic analysis example. 19 | 20 | Args: 21 | word (str): Word to apply diacritic analysis. 
def run(sentence: str) -> None:
    """
    POS tag detection example.

    Args:
        sentence (str): Sentence to find POS tags on.
    """

    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()

    # FIX: keep the iterable and the loop variable distinct — the original
    # reused the name 'analysis' for both, shadowing the collection.
    analyses: java.util.ArrayList = morphology.analyzeAndDisambiguate(
        sentence
    ).bestAnalysis()

    pos: List[str] = []

    for i, analysis in enumerate(analyses, start=1):
        print(
            f'\nAnalysis {i}: {analysis}',
            f'\nPrimary POS {i}: {analysis.getPos()}'
            f'\nPrimary POS (Short Form) {i}: {analysis.getPos().shortForm}',
        )
        pos.append(
            f'{str(analysis.getLemmas()[0])}-{analysis.getPos().shortForm}'
        )

    print(f'\nFull sentence with POS tags: {" ".join(pos)}')
def run(sentence: str) -> None:
    """
    Informal words analysis example.

    Args:
        sentence (str): Sentence to search for informal words.
    """
    # Analyzer configured to tolerate missing diacritics and to recognize
    # informal (colloquial) morphology.
    morphology = (
        TurkishMorphology.builder()
        .setLexicon(RootLexicon.getDefault())
        .ignoreDiacriticsInAnalysis()
        .useInformalAnalysis()
        .build()
    )

    best_analyses = morphology.analyzeAndDisambiguate(sentence).bestAnalysis()

    print('\nAnalysis:\n')

    for single_analysis in best_analyses:
        print(f'{str(single_analysis.surfaceForm())}-{single_analysis}')

    print('\nConverting formal surface form:\n')

    formal_converter = InformalAnalysisConverter(
        morphology.getWordGenerator()
    )

    for single_analysis in best_analyses:
        converted = formal_converter.convert(
            single_analysis.surfaceForm(), single_analysis
        )
        print(str(converted))
def run(word: str) -> None:
    """
    Stemming and lemmatization example.

    Args:
        word (str): Word to apply stemming and lemmatization.
    """
    morphology = TurkishMorphology.createWithDefaults()

    print('\nResults:')

    for analysis in morphology.analyze(JString(word)):
        stems = ', '.join(str(stem) for stem in analysis.getStems())
        lemmas = ', '.join(str(lemma) for lemma in analysis.getLemmas())
        print(
            f'{str(analysis.formatLong())}'
            f'\n\tStems = {stems}'
            f'\n\tLemmas = {lemmas}'
        )
def _generate_nouns(root_word: str) -> None:
    """
    Generates inflections of the given root word using possessive and case
    suffix combinations.

    Args:
        root_word (str): Root word to generate inflections from.
    """

    print('\nGenerating nouns.\n')

    # Morpheme IDs combined below: number x possessive x case.
    number_morphemes = [JString('A3sg'), JString('A3pl')]
    possessive_morphemes = [
        JString('P1sg'),
        JString('P2sg'),
        JString('P3sg'),
    ]
    case_morphemes = [JString('Dat'), JString('Loc'), JString('Abl')]

    # A minimal lexicon containing only the root word, without caching.
    morphology = (
        TurkishMorphology.builder()
        .setLexicon(root_word)
        .disableCache()
        .build()
    )

    root_item = (
        morphology.getLexicon().getMatchingItems(root_word).get(0)
    )

    for number in number_morphemes:
        for possessive in possessive_morphemes:
            for case in case_morphemes:
                for generated in morphology.getWordGenerator().generate(
                    root_item, number, possessive, case
                ):
                    print(str(generated.surface))
def _generate_verbs(infinitive: str, stem: str) -> None:
    """
    Generates conjugated verb forms from a given stem.

    Args:
        infinitive (str): Infinitive form of the verb used to build the
            lexicon.
        stem (str): Stem to generate words for.
    """

    print('\nGenerating verbs.\n')

    # Plain str values; they are wrapped in JString exactly once, when
    # added to the morpheme sequence below. (The original annotated these
    # as List[JString] while actually storing str, and double-wrapped the
    # positive/negative morphemes.)
    positive_negatives: List[str] = ['', 'Neg']
    times: List[str] = [
        'Imp',
        'Aor',
        'Past',
        'Prog1',
        'Prog2',
        'Narr',
        'Fut',
    ]
    people: List[str] = ['A1sg', 'A2sg', 'A3sg', 'A1pl', 'A2pl', 'A3pl']

    morphology = (
        TurkishMorphology.builder()
        .setLexicon(infinitive)
        .disableCache()
        .build()
    )

    for pos_neg in positive_negatives:
        for time in times:
            for person in people:
                seq: java.util.ArrayList = java.util.ArrayList()
                # Empty string marks the positive form and adds no morpheme.
                for morpheme in (pos_neg, time, person):
                    if morpheme:
                        seq.add(JString(morpheme))
                results = list(
                    morphology.getWordGenerator().generate(JString(stem), seq)
                )
                if not results:
                    print(
                        f'Cannot generate Stem = ["{stem}"]'
                        f'\n | Morphemes = {[str(morph) for morph in seq]}'
                    )
                    continue
                print(' '.join(str(result.surface) for result in results))


def run(noun_root_word: str, verb_infinitive: str, verb_stem: str) -> None:
    """
    Generate nouns and verbs.

    Args:
        noun_root_word (str): Root word to generate inflections from.
        verb_infinitive (str): Infinitive form of the verb to create the
            lexicon for verb generation.
        verb_stem (str): Stem to generate verbs for.
    """
    _generate_nouns(noun_root_word)
    _generate_verbs(verb_infinitive, verb_stem)
def run() -> None:
    """
    Document correction example.

    Spell-checks every word token of the sample document and replaces each
    misspelled word with its top suggestion.
    """

    tokenizer: TurkishTokenizer = TurkishTokenizer.ALL
    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()
    spell_checker: TurkishSpellChecker = TurkishSpellChecker(morphology)

    document_path = DATA_PATH.joinpath('normalization', 'document.txt')
    with open(document_path, 'r', encoding='utf-8') as document_file:
        document = document_file.read()

    # Token types that should never be spell-checked.
    skipped_types = {
        Token.Type.NewLine,
        Token.Type.SpaceTab,
        Token.Type.Punctuation,
        Token.Type.RomanNumeral,
        Token.Type.UnknownWord,
        Token.Type.Unknown,
    }

    corrected_tokens: List[str] = []

    for token in tokenizer.tokenize(JString(document)):
        content: JString = token.content
        checkable = token.type not in skipped_types
        if checkable and not spell_checker.check(content):
            suggestions: List[JString] = list(
                spell_checker.suggestForWord(token.content)
            )
            if suggestions:
                best = str(suggestions[0])
                print(f'Correction: {token.content} -> {best}.')
                corrected_tokens.append(best)
                continue
        corrected_tokens.append(str(token.content))

    print('\nCorrected Document:\n', ''.join(corrected_tokens))
def run(text: str) -> None:
    """
    Noisy text normalization example.

    Args:
        text (str): Noisy text to normalize.
    """

    # Root directory with the normalization lookup data and the 2-gram
    # language model used for candidate ranking.
    normalization_root = Paths.get(str(DATA_PATH.joinpath('normalization')))
    language_model = Paths.get(str(DATA_PATH.joinpath('lm', 'lm.2gram.slm')))

    normalizer = TurkishSentenceNormalizer(
        TurkishMorphology.createWithDefaults(),
        normalization_root,
        language_model,
    )

    print(f'\nNormalized: {normalizer.normalize(JString(text))}')
def run(sentence: str) -> None:
    """
    Spell checking example.

    Args:
        sentence (str): Sentence to check for spelling errors.
    """

    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()
    spell_checker: TurkishSpellChecker = TurkishSpellChecker(morphology)

    fixed_words: List[str] = []

    for word in sentence.split(' '):
        if spell_checker.check(JString(word)):
            # Correctly spelled words pass through unchanged.
            fixed_words.append(word)
            continue
        print(f'Spelling error: {word}')
        suggestions: java.util.ArrayList = spell_checker.suggestForWord(
            JString(word)
        )
        if not suggestions:
            print(f'No suggestions found for "{word}".')
            fixed_words.append(word)
            continue
        print(f'\nSuggestions for "{word}":')
        for suggestion in suggestions:
            print(f' | {suggestion}')
        # Use the first (best-ranked) suggestion as the fix.
        fixed_words.append(str(suggestions[0]))

    print('\nFixed sentence:', ' '.join(fixed_words))
def run(paragraph: str) -> None:
    """
    Sentence boundary detection example.

    Args:
        paragraph (str): Paragraph to detect sentence boundaries in.
    """

    extractor: TurkishSentenceExtractor = TurkishSentenceExtractor.DEFAULT

    sentences = extractor.fromParagraph(paragraph)

    # start=1 replaces the original's manual `i+1`; output is identical.
    for i, sentence in enumerate(sentences, start=1):
        print(f'Sentence {i}: {sentence}')


def run(sentence: str) -> None:
    """
    Turkish sentence tokenization example.

    Args:
        sentence (str): Sentence to tokenize.
    """
    tokenizer: TurkishTokenizer = TurkishTokenizer.DEFAULT

    print('\nToken Iterator Example:\n')

    token_iterator: TokenIterator = tokenizer.getTokenIterator(
        JString(sentence)
    )

    for token in token_iterator:
        print(
            f'Token = {token}'
            f'\n | Content = {token.content}'
            f'\n | Normalized = {token.normalized}'
            f'\n | Type = {token.type}'
            f'\n | Start = {token.start}'
            f'\n | End = {token.end}\n'
        )

    print('Default Tokenization Example:\n')

    # The DEFAULT tokenizer assigned above is reused; the original
    # redundantly re-created it here.
    for i, token in enumerate(tokenizer.tokenizeToStrings(JString(sentence))):
        print(f' | Token String {i} = {token}')

    print('\nCustom Tokenization With Ignored Types Example:\n')

    custom_tokenizer: TurkishTokenizer = (
        TurkishTokenizer.builder()
        .ignoreTypes(
            Token.Type.Punctuation, Token.Type.NewLine, Token.Type.SpaceTab
        )
        .build()
    )
    for i, token in enumerate(custom_tokenizer.tokenize(JString(sentence))):
        print(f' | Token {i} = {token}')
import argparse
import importlib
from pathlib import Path
from typing import List

from jpype import shutdownJVM, startJVM

from examples import JVM_KWARGS


def get_runnable_modules() -> List[str]:
    """
    Returns a list of example scripts which implement the run function.

    Returns:
        List[str]: List of script names in 'category.script' format
        (e.g. 'morphology.word_analysis').
    """
    runnable_modules: List[str] = []
    for script in Path('examples').glob('**/*.py'):
        if script.name == '__init__.py':
            continue
        # `parent.name` is the example category directory and `stem` is the
        # file name without its suffix -- idiomatic pathlib instead of the
        # original `parents[0].name` / `name.split(".")[0]`.
        module_name: str = f'{script.parent.name}.{script.stem}'
        if hasattr(importlib.import_module(f'examples.{module_name}'), 'run'):
            runnable_modules.append(module_name)
    return runnable_modules


if __name__ == '__main__':

    # The JVM must be running before any zemberek JClass is touched.
    startJVM(**JVM_KWARGS)

    parser = argparse.ArgumentParser(
        description=(
            'Run a Zemberek example. Example usage: python -m main'
            ' morphology.word_analysis kelime'
        )
    )
    parser.add_argument(
        'example',
        type=str,
        help='The run() function from the chosen script will be invoked.',
        choices=get_runnable_modules(),
    )
    parser.add_argument(
        'args',
        type=str,
        default=[],
        nargs='*',
        help='Arguments to pass to the run function.',
    )

    args = parser.parse_args()

    example = importlib.import_module(f'examples.{args.example}')

    print(example.run.__doc__)

    example.run(*args.args)

    shutdownJVM()