├── .gitignore ├── LICENSE ├── README.md ├── bin └── README.md ├── data ├── classification │ └── README.md ├── dictionaries │ └── README.md ├── lm │ └── README.md ├── ner │ └── README.md └── normalization │ ├── README.md │ └── document.txt ├── documentation.yml ├── downloader.py ├── examples ├── __init__.py ├── classification │ ├── __init__.py │ ├── classification_example_base.py │ ├── news_title_category_finder.py │ ├── simple_classification.py │ └── train_classifier.py ├── core │ ├── __init__.py │ └── histogram.py ├── morphology │ ├── __init__.py │ ├── add_dictionary_item.py │ ├── change_stem.py │ ├── diacritic_analysis.py │ ├── find_pos.py │ ├── informal_words_analysis.py │ ├── sentence_disambiguation.py │ ├── stem_and_lemmatize.py │ ├── word_analysis.py │ └── word_generation.py ├── normalization │ ├── __init__.py │ ├── document_correction.py │ ├── noisy_text_normalization.py │ └── spell_checking.py └── tokenization │ ├── sentence_boundary_detection.py │ └── turkish_tokenization.py ├── main.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .env 3 | .mypy_cache/ 4 | .vscode/ 5 | bin/*.jar 6 | data/classification/news* 7 | data/lm/lm.2gram.slm 8 | data/normalization/ascii-map 9 | data/normalization/lookup-from-graph 10 | data/normalization/split -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Python Examples 2 | 3 | > Zemberek Turkish NLP examples written in Python using the JPype package. 4 | 5 | Zemberek is a Java-based natural language processing (NLP) tool created for the Turkish language. This repository contains the Python implementations of the [official Zemberek examples](https://github.com/ahmetaa/zemberek-nlp/tree/master/examples/src/main/java/zemberek/examples) for learning purposes. 
6 | 7 | ## Table of Contents 8 | 9 | | Folder | Description | 10 | | ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 11 | | classification | fastText examples | 12 | | core | histogram | 13 | | morphology | stemming, lemmatization, diacritics analysis, POS tag analysis, morphological analysis, word generation, sentence disambiguation, informal word analysis, adding dictionary items | 14 | | named-entitiy-recognition | on hold | 15 | | normalization | document correction, noisy text normalization, spell checking | 16 | | tokenization | sentence boundary detection, turkish tokenization | 17 | 18 | ## Requirements 19 | 20 | 1. Python 3.6+ 21 | 22 | ## Getting Started 23 | 24 | 1. Clone this library and `cd` into it. 25 | ```shell 26 | $ git clone https://github.com/ozturkberkay/Zemberek-Python-Examples.git 27 | $ cd Zemberek-Python-Examples 28 | ``` 29 | 30 | 2. Install the required packages. Using `virtualenv` is highly encouraged! 31 | 32 | ```shell 33 | $ python -m pip install --upgrade pip virtualenv 34 | $ python -m virtualenv .env 35 | $ # Windows: .env\Scripts\activate 36 | $ source .env/bin/activate 37 | $ python -m pip install -r requirements.txt 38 | ``` 39 | 40 | 3. Download the required Zemberek files: 41 | 42 | ```shell 43 | $ python -m downloader 44 | ``` 45 | 46 | Optionally, you can manually download all the data and version `0.17.1` of Zemberek distribution from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/0B9TrB39LQKZWSjNKdVcwWUxxUm8?usp=sharing) and put the files in the corresponding folders: 47 | 48 | . 
49 | +-- bin 50 | | +-- zemberek-full.jar 51 | +-- data 52 | | +-- classification 53 | | +-- news-title-category-set 54 | | +-- news-title-category-set.lemmas 55 | | +-- news-title-category-set.tokenized 56 | | +-- dictionaries 57 | | +-- lm 58 | | +-- lm.2gram.slm 59 | | +-- ner 60 | | +-- normalization 61 | | +-- ascii-map 62 | | +-- lookup-from-graph 63 | | +-- split 64 | 65 | ## Usage 66 | 67 | 1. Run `python -m main category.example args`. 68 | 69 | ```shell 70 | $ python -m main classification.simple_classification "Fenerbahçe bu maçı galibiyet ile sonlandırdı." 71 | ... 72 | 73 | News classification example. Trains a new model if there is no model 74 | available. 75 | 76 | Args: 77 | sentence (str): Sentence to classify. 78 | 79 | Sentence: Fenerbahçe bu maçı galibiyet ile sonlandırdı. 80 | 81 | Item 1: __label__spor 82 | Score 1: -0.009194993413984776 83 | 84 | Item 2: __label__magazin 85 | Score 2: -6.12613582611084 86 | 87 | Item 3: __label__kültür_sanat 88 | Score 3: -6.226541996002197 89 | ``` 90 | 91 | ## Known Bugs 92 | 93 | - During the model training, `fastText` will print errors. It still works, just ignore them. 94 | 95 | ## Changelog 96 | 97 | - 2020-12-05 98 | - Automatic downloader for Zemberek files. 99 | - Simple CLI entry-point to run the examples with custom data. 100 | - JPype1 v1.2.0 upgrade. This should fix some memory leak issues. 101 | - Code quality improvements. 102 | - Fixes for broken links. 103 | - 2019-10-29 104 | - Zemberek v0.17.1 upgrade. 105 | - JPype1 v0.7.0 upgrade. 106 | - Code style changes. 107 | - Bug-fixes. 108 | - License is now the same with Zemberek (Apache v2.0). 109 | - 2018-12-01 110 | - Classification, morphology, normalization and tokenization examples. 
111 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek JAR Distribution 2 | > Download the latest version of Zemberek from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/0B9TrB39LQKZWX1RSang3M1VkYjQ?usp=sharing) and put the `zemberek-full.jar` file in this folder. Or use `zemberek_downloader.py` to automatically download the files. -------------------------------------------------------------------------------- /data/classification/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Classification Data 2 | > Download the classification data files from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/1JBPExAeRctAXL2oGW2U6CbqfwIJ84BG7?usp=sharing) and put the contents in this folder. Or use `zemberek_downloader.py` to automatically download the files. -------------------------------------------------------------------------------- /data/dictionaries/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Dictionary Data 2 | 3 | > You can utilize [the official wiki](https://github.com/ahmetaa/zemberek-nlp/wiki/Text-Dictionary-Rules) to create your own lexicon and save it in this folder. Or use `zemberek_downloader.py` to automatically download the files. 4 | 5 | ## Usage 6 | 7 | 1. Create your dictionary inside this folder. Read [the related wiki article](https://github.com/ahmetaa/zemberek-nlp/wiki/Text-Dictionary-Rules) for more information. 8 | 9 | 2. Create a `TurkishMorphology` object using your custom dictionary Read [the related wiki article](https://github.com/ahmetaa/zemberek-nlp/tree/master/morphology#creating-turkishmorphology-object) for more information. 
10 | 11 | ```python 12 | lexicon = RootLexicon.builder().addDefaultLexicon().addTextDictionaries(Paths.get('PATH')).build() 13 | morphology = TurkishMorphology.create(lexicon) 14 | ``` 15 | -------------------------------------------------------------------------------- /data/lm/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Language Model Data 2 | > Download the language model data files from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/1jb4ei8rbBRfBmK1WrhdjqyEkVZmieZpI?usp=sharing) and put the contents in this folder. Or use `zemberek_downloader.py` to automatically download the files. -------------------------------------------------------------------------------- /data/ner/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Named Entity Recognition Data 2 | > Development of NER examples are on hold. Once they are completed, you will need to put your own training and test datasets named `ner-train` and `ner-test` inside this folder. -------------------------------------------------------------------------------- /data/normalization/README.md: -------------------------------------------------------------------------------- 1 | # Zemberek Normalization Data 2 | > Download the normalization data files from [the official Zemberek Drive folder](https://drive.google.com/drive/folders/1jNT6BJoEbiLuVbQwBYdVNoibEdzDd2WC?usp=sharing) and put the contents in this folder. Or use `zemberek_downloader.py` to automatically download the files. 
-------------------------------------------------------------------------------- /data/normalization/document.txt: -------------------------------------------------------------------------------- 1 | Türk Vatanı ve Milletinin ebedi varlığxını ve Yüce Türk Devlatinin bölünmez bütünlüğünü belirleyen bu Anayasa, 2 | Türkiye Cumhuriyetinin kurucusu, ölümsüz önder ve eşsiz kahraman Atatürk’ün belirlediği milliyetçilik anlayışı ve onun 3 | inkılap ve ilkeleri doğrultusunda; 4 | Dünya milletleri ailesinin eşit haklara sahip şerefli bir üyesi olarak, Türkiye Cumhuriyetinin ebedi varlığı, refahı, 5 | maddi ve manelvi mutluluğu ile çağdaş medeniyet düzeyine ulaşma azmi yönünde; 6 | Millet iradesinin mutlak üstünlüğü, egemenliğin kayıtsız şartsız Türk Milletine ait olduğu ve bunu millet adına 7 | kullanmaya yetkili kılınan hiçbir kişi ve kuruluşun, bu Anayasada gösterilen hürriyetçi demokrasi ve bunun icaplarıyla 8 | belirlenmiş hukuk düzeni dışına çıkamayacağı; 9 | Kuvvetler ayrımının, Devlet organları arasında üstünlük sıralaması anlamına gelmeyip, belli Devlet yetki ve 10 | görevlerinin kullanılmasından ibaret ve bununla sınırlı medeni bir işbölümü ve işbirliği olduğu ve üstünlüğün ancak Anayasa 11 | ve kanunlarda bulunduğu; 12 | Hiçbir faaliyetin Türk milli menfaatlerinin, Türk varlığının, Devleti ve ülkesiyle bölünmezliği esasının, Türklüğün 13 | tarihi ve manevi değerlerinin, Atatürk milliyetçiliği, ilke ve inkılapları ve medeniyetçiliğinin karşısında korunma 14 | göremeyeceği ve laiklik ilkesinin gereği olarak kutsal din duygularının, Devlet işlerine ve politikaya kesinlikle 15 | karıştırılamayacağı; (5) 16 | 17 | Her Türk vatandaşının bu Anayasadaki temel hak ve hürriyetlerden eşitlik ve sosyal adalet gereklerince yararlanarak 18 | milli kültür, medeniyet ve hukuk düzeni içinde onurlu bir hayat sürdürme ve maddi ve manevi varlığını bu yönde geliştirme 19 | hak ve yetkisine doğuştan sahip olduğu; 20 | ––––––––––––––––––––––––––––– 21 | (1) Bu Anayasa; Kuruczu Meclis 
tarafından 18/10/1982’de Halkoylamasına sunulmak üzere kabul edilmiş ve 20/10/1982 22 | tarihli ve 17844 sayılı Resmî Gazete’de yayımlanmış; 7/11/1982’de Halkoylamasına sunulduktan sonra 9/11/1982 tarihli 23 | ve 17863 Mükerrer sayıli Resmî Gazete’de yeniden yayımlanmıştır. 24 | (2) 7/5/2010 tarihli ve 5982 sayılı Kanun ile yapılan Anayasa değişiklikleri 12/9/2010 tarihinde Halkoyuna sunularak kabul 25 | edilmiş, buna ilişkin 22/9/2010 tarihli ve 846 sayılı Yüksek Seçim Kurulu Kararı 23/9/2010 tarihli ve 27708 sayılı Resmî 26 | Gazete’de yayımlanmıştır. 27 | (3) 21/1/2017 tarihli ve 6771 sayılı Kanun ile yapılan Anayasa değişiklikleri 16/4/2017 tarihinde Halkoyuna sunularak kabul 28 | edilmiş, buna ilişkin 27/4/2017 tarihli ve 663 sayılı Yüksek Seçim Kurulu Kararı 27/4/2017 tarihli ve 30050 Mükerrer 29 | sayılı Resmî Gazete’de yayımlanmıştır. 30 | (4) Anayasa’nın Başlangıc metni 23/7/1995 tarih ve 4121 sayılı Kanun’un 1 inci maddesi ile değiştirilmiş ve metne 31 | işlenmiştir. 32 | (5) Bu fıkrada geçen, “Hiçbir düşünce ve mülahazanın” ibaresi, 3/10/2001 tarih ve 4709 sayılı Kanunun 1 inci maddesiyle 33 | “Hiçbir faaliyetin” şeklinde değiştirilmiş ve metne işlenmiştir. 34 | 130 35 | Topluca Türk vatandaşlarının milli gurur ve iftiharlarda, milli sevinç ve kederlerde, milli varlığa karşı hak ve 36 | ödevlerde, nimet ve külfetlerde ve millet hayatının her türlü tecellisinde ortak olduğu, birbirinin hak ve hürriyetlerine kesin 37 | saygı, karşılıklı içten sevgi ve kardeşlik duygularıyla ve 'Yurtta sulh, cihanda sulh' arzu ve inancı içinde, huzurlu bir hayat 38 | talebine hakları bulunduğu; 39 | FİKİR, İNANÇ VE KARARIYLA anlaşılmak, sözüne ve ruhuna bu yönde saygı ve mutlak sadakatle yorumlanıp 40 | uygulanmak üzere. 41 | TÜRK MİLLETİ TARAFINDAN, demokrasiye aşık Türk evlatlarının vatan ve millet sevgisine emanet ve tevdi 42 | olunur. 43 | BİRİNCİ KISIM 44 | GENEL ESASLAR 45 | I. Devletin şekli 46 | Madde 1 – Türkiye Devleti bir Cumhuriyettir. 47 | II. 
Cumhuriyetin nitelikleri 48 | Madde 2 – Türkiye Cumhuriyeti, toplumun huzuru, milli dayanışma ve adalet anlayışı içinde, insan haklarına saygılı, 49 | Atatürk milliyetçiliğine bağlı, başlangıçta belirtilen temel ilkelere dayanan, demokratik, laik ve sosyal bir hukuk Devletidir. 50 | III. Devletin bütünlüğü, Resmî dili, bayrağı, milli marşı ve başkenti 51 | Madde 3 – Türkiye Devleti, ülkesi ve milletiyle bölünmez bir bütündür. Dili Türkçedir. 52 | Bayrağı, şekli kanununda belirtilen, beyaz ay yıldızlı al bayraktır. 53 | Milli marşı 'İstiklal Marşı'dır. 54 | Başkenti Ankara'dır. 55 | IV. Değiştirilemeyecek hükümler 56 | Madde 4 – Anayasanın 1 inci maddesindeki Devletin şeklinin Cumhuriyet olduğu hakkındaki hüküm ile, 2 nci 57 | maddesindeki Cumhuriyetin nitelikleri ve 3 üncü maddesi hükümleri değiştirilemez ve değiştirilmesi teklif edilemez. 58 | V. Devletin temel amaç ve görevleri 59 | Madde 5 – Devletin temel amaç ve görevleri, Türk milletinin bağımsızlığını ve bütünlüğünü, ülkenin bölünmezliğini, 60 | Cumhuriyeti ve demokrasiyi korumak, kişilerin ve toplumun refah, huzur ve mutluluğunu sağlamak; kişinin temel hak ve 61 | hürriyetlerini, sosyal hukuk devleti ve adalet ilkeleriyle bağdaşmayacak surette sınırlayan siyasal, ekonomik ve sosyal 62 | engelleri kaldırmaya, insanın maddi ve manevi varlığının gelişmesi için gerekli şartları hazırlamaya çalışmaktır. 63 | VI. Egemenlik 64 | Madde 6 – Egemenlik, kayıtsız şartsız Milletindir. 65 | Türk Milleti, egemenliğini, Anayasanın koyduğu esaslara göre, yetkili organları eliyle kullanır. 66 | Egemenliğin kullanılması, hiçbir surette hiçbir kişiye, zümreye veya sınıfa bırakılamaz. 
# Mapping of repository-relative target directories to the Zemberek files
# (file name -> Google Drive file ID) that must be fetched into them.
ZEMBEREK_FILES: Dict[str, Dict[str, str]] = {
    'bin': {'zemberek-full.jar': '1RRuFK43JqcHcthB3fV2IEpPftWoeoHAu'},
    'data/classification': {
        'news-title-category-set': '13d6TjKSk8Uy0FNHrbqJQv1hHalKUg1l5',
        'news-title-category-set.lemmas': '1VP-DcPDY423cU48CP675yT6RS_hygtpX',
        'news-title-category-set.tokenized': (
            '1xt81joeOA7nOTYNUKdxKBOeDtLGMCwNO'
        ),
    },
    'data/lm': {'lm.2gram.slm': '1JZG0I8jUS511lFVg0M-QAA4QRqydlCiX'},
    'data/normalization': {
        'ascii-map': '1ptbPoGZrKxXS5PNr5kpGfHziGdIUP-n7',
        'lookup-from-graph': '1ko31lO1yrYf1twjZOl_vHikmsGkQGKt5',
        'split': '1X8UpIE0ifYF1_tpMQp7o5PLEqms3Ew5Q',
    },
}


def download_drive_files(
    file_map: Dict[str, Dict[str, str]], skip_if_exists: bool = True
) -> None:
    """
    Downloads the files from Google Drive.

    Target paths are resolved relative to the directory containing this
    file, so the function works regardless of the current working
    directory.

    Args:
        file_map (Dict[str, Dict[str, str]]): Mapping of directories to
            install to, with the file names and Google Drive ID's.
        skip_if_exists (bool): Skips the download if the target file already
            exists on the local filesystem.
    """
    base_path: Path = Path(__file__).parent

    for folder, files in file_map.items():

        for file_name, drive_id in files.items():

            target_path: Path = base_path.joinpath(
                folder, file_name
            ).absolute()

            if skip_if_exists and target_path.exists():
                print(f'Target already exists, skipping: {target_path}')
                continue

            # gdown writes to the exact output path and does not create
            # missing parent directories itself; make sure they exist so a
            # fresh clone (or a pruned data tree) does not make it fail.
            target_path.parent.mkdir(parents=True, exist_ok=True)

            gdown.download(
                url=f'https://drive.google.com/u/0/uc?id={drive_id}',
                output=str(target_path),
            )


if __name__ == '__main__':
    download_drive_files(ZEMBEREK_FILES)
class ClassificationExampleBase:
    """
    Exact Python implementation of the original Java code.

    Shared preprocessing helpers for the fastText classification examples.
    Each ``generate_set_*`` classmethod rewrites a labeled dataset (lines of
    the form ``__label__X some text``) into a preprocessed, lower-cased copy
    on disk, one output line per input line.
    """

    # Shared Turkish morphology analyzer, created once at class-definition
    # time and reused by every classmethod.
    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()

    @classmethod
    def generate_set_with_lemmas(
        cls, lines: List[str], lemmas_path: Path
    ) -> None:
        """
        Write a lemmatized, cleaned, lower-cased copy of ``lines`` to
        ``lemmas_path``.

        Args:
            lines (List[str]): Labeled dataset lines.
            lemmas_path (Path): Output file path.
        """
        with open(lemmas_path, 'w', encoding='utf-8') as lemma_file:
            for line in [
                cls.remove_non_words(
                    cls.replace_words_with_lemma(line)
                ).lower()
                for line in lines
            ]:
                lemma_file.write(f'{line}\n')

    @classmethod
    def generate_set_with_split(
        cls, lines: List[str], split_path: Path
    ) -> None:
        """
        Write a stem+ending, cleaned, lower-cased copy of ``lines`` to
        ``split_path``.

        Args:
            lines (List[str]): Labeled dataset lines.
            split_path (Path): Output file path.
        """
        with open(split_path, 'w', encoding='utf-8') as split_file:
            for line in [
                cls.remove_non_words(cls.split_words(line)).lower()
                for line in lines
            ]:
                split_file.write(f'{line}\n')

    @classmethod
    def generate_set_tokenized(
        cls, lines: List[str], tokenized_path: Path
    ) -> None:
        """
        Write a tokenized, cleaned, lower-cased copy of ``lines`` to
        ``tokenized_path``.

        Args:
            lines (List[str]): Labeled dataset lines.
            tokenized_path (Path): Output file path.
        """
        with open(tokenized_path, 'w', encoding='utf-8') as tokens_file:
            for line in [
                cls.remove_non_words(
                    java.lang.String.join(
                        JString(' '),
                        TurkishTokenizer.DEFAULT.tokenizeToStrings(
                            JString(line)
                        ),
                    )
                ).lower()
                for line in lines
            ]:
                tokens_file.write(f'{line}\n')

    @classmethod
    def split_words(cls, sentence: str) -> str:
        """
        Replace each word of a labeled line with ``lemma _ending`` pairs.

        The first whitespace token is treated as the label and kept as-is.
        Words the analyzer cannot resolve are kept unchanged.

        Args:
            sentence (str): Labeled dataset line.

        Returns:
            str: Space-joined result (a ``java.lang.String`` from JPype;
            an empty input returns a ``JString`` as well).
        """
        tokens: List[str] = sentence.split()
        label: java.lang.String = JString(tokens[0])
        del tokens[0]
        sentence = ' '.join(tokens)

        if len(sentence) == 0:
            return JString(sentence)

        analysis: SentenceAnalysis = cls.morphology.analyzeAndDisambiguate(
            JString(sentence)
        )
        res: java.util.ArrayList = java.util.ArrayList()
        res.add(label)

        for word_analysis in analysis:
            best: SingleAnalysis = word_analysis.getBestAnalysis()
            inp: java.lang.String = word_analysis.getWordAnalysis().getInput()

            if best.isUnknown():
                res.add(inp)
                continue

            lemmas: java.util.ArrayList = best.getLemmas()

            if len(lemmas[0]) < len(inp):
                res.add(lemmas[0])
                # NOTE(review): `inp[len(lemmas[0])]` is a single-index
                # access, so only the FIRST character of the ending is
                # appended (prefixed with '_'). Confirm against the original
                # Java, which presumably used substring() for the whole
                # ending.
                res.add(JString('_' + str(inp[len(lemmas[0])])))
            else:
                res.add(lemmas[0])

        return java.lang.String.join(JString(' '), res)

    @classmethod
    def process_ending(cls, inp: str) -> str:
        """
        Collapse Turkish vowel/consonant alternation groups in an ending to
        canonical meta-characters (e.g. ``a``/``e`` -> ``A``).

        Not referenced by the other methods visible in this file chunk.

        Args:
            inp (str): Word ending to normalize.

        Returns:
            str: Normalized ending.
        """
        for pattern, value in {
            r'[ae]': 'A',
            r'[ıiuü]': 'I',
            r'[kğ]': 'K',
            r'[cç]': 'C',
            r'[dt]': 'D',
        }.items():
            inp = re.sub(pattern, value, inp)
        return inp

    @classmethod
    def replace_words_with_lemma(cls, sentence: str) -> str:
        """
        Replace each word of a labeled line with its first lemma.

        The first whitespace token is treated as the label and kept as-is.
        Words the analyzer cannot resolve are kept unchanged.

        Args:
            sentence (str): Labeled dataset line.

        Returns:
            str: Space-joined result (a ``java.lang.String`` from JPype;
            an empty input returns the empty Python ``str``).
        """
        tokens: List[str] = sentence.split()
        label: str = tokens[0]
        del tokens[0]
        sentence = ' '.join(tokens)

        if len(sentence) == 0:
            return sentence

        analysis: SentenceAnalysis = cls.morphology.analyzeAndDisambiguate(
            JString(sentence)
        )
        res: java.util.ArrayList = java.util.ArrayList()
        res.add(JString(label))

        for word_analysis in analysis:
            best: SingleAnalysis = word_analysis.getBestAnalysis()

            if best.isUnknown():
                res.add(word_analysis.getWordAnalysis().getInput())
                continue

            lemmas: java.util.ArrayList = best.getLemmas()
            res.add(lemmas[0])

        return java.lang.String.join(JString(' '), res)

    @classmethod
    def remove_non_words(cls, sentence: JString) -> str:
        """
        Drop non-word tokens (mentions, hashtags, URLs, punctuation, Roman
        numerals, times, unknowns) from a sentence.

        Tokens starting with ``_`` or containing ``__`` are always kept;
        these are the ending markers produced by ``split_words`` and the
        fastText ``__label__`` prefix.

        Args:
            sentence (JString): Sentence to clean (falsy input yields '').

        Returns:
            str: Space-joined surviving tokens.
        """
        if not sentence:
            return ''

        doc_tokens: List[Token] = list(
            TurkishTokenizer.DEFAULT.tokenize(sentence)
        )
        reduced: List[str] = []

        for token in doc_tokens:
            text: str = str(token.getText())

            if text[0] == '_' or '__' in text:
                reduced.append(text)
                continue

            token_type: Token.Type = token.getType()

            # NOTE(review): `Type` is the module-level alias for
            # `Token.Type`; the mixed spellings below refer to the same
            # Java enum.
            if token_type in {
                Token.Type.Mention,
                Token.Type.HashTag,
                Token.Type.URL,
                Token.Type.Punctuation,
                Type.RomanNumeral,
                Token.Type.Time,
                Token.Type.UnknownWord,
                Token.Type.Unknown,
            }:
                continue

            reduced.append(text)

        return ' '.join(reduced)
class NewsTitleCategoryFinder(ClassificationExampleBase):
    """
    News-title category classification helpers (Python port of the original
    Java example): dataset inspection, train/test splitting, model training
    and evaluation via the Zemberek fastText apps.
    """

    @classmethod
    def _read(cls, path: Path) -> List[str]:
        """
        Return every line of ``path`` (trailing newlines preserved).

        Args:
            path (Path): Dataset file path.

        Returns:
            List[str]: Raw dataset lines.
        """
        with open(path, 'r', encoding='utf-8') as lines_file:
            return lines_file.readlines()

    @classmethod
    def data_info(cls, path: Path) -> List[str]:
        """
        Print the line count and per-category counts, then return the lines.

        The category label is the first whitespace-delimited token of each
        line (fastText ``__label__...`` convention).

        Args:
            path (Path): Dataset file path.

        Returns:
            List[str]: Raw dataset lines.
        """
        lines = cls._read(path)
        print(f'Total Lines: {len(lines)}')
        label_counts = collections.Counter(
            line[: line.find(' ')] for line in lines
        )
        for label, count in label_counts.most_common():
            print(f'({count})\t{label}')
        return lines

    @classmethod
    def evaluate(
        cls,
        test_size: int,
        path: Optional[Path] = None,
        lines: Optional[List[str]] = None,
    ) -> None:
        """
        Split the dataset, train a model if none exists, and evaluate it.

        Args:
            test_size (int): Number of lines reserved for the test split.
            path (Optional[Path]): Dataset path; also used to derive the
                ``.train``, ``.test``, ``.model`` and ``.predictions``
                sibling paths, so it is required.
            lines (Optional[List[str]]): Pre-read dataset lines; read from
                ``path`` when omitted.

        Raises:
            ValueError: If ``path`` is missing, or ``test_size`` is not in
                ``(0, len(lines)]``.
        """
        # 'path' is needed unconditionally to derive the output paths, so
        # validate it even when 'lines' is supplied. (The original only
        # checked it in the lines-is-None branch and crashed later with
        # AttributeError on None.with_suffix otherwise.)
        if path is None:
            raise ValueError('You should provide a path!')
        if lines is None:
            lines = cls._read(path)

        if test_size <= 0 or test_size > len(lines):
            raise ValueError(
                '\'test_size\' must be bigger than'
                ' 0 and less than the dataset size!'
            )

        train_path: Path = path.with_suffix('.train')
        test_path: Path = path.with_suffix('.test')

        # First 'test_size' lines become the test split; the rest train.
        with open(train_path, 'w', encoding='utf-8') as train_file:
            train_file.writelines(lines[test_size:])

        with open(test_path, 'w', encoding='utf-8') as test_file:
            test_file.writelines(lines[:test_size])

        model_path: Path = path.with_suffix('.model')

        # Training is skipped when a model file already exists; evaluation
        # always runs.
        if not model_path.is_file():
            TrainClassifier().execute(
                JString('-i'),
                JString(str(train_path)),
                JString('-o'),
                JString(str(model_path)),
                JString('--learningRate'),
                JString('0.1'),
                JString('--epochCount'),
                JString('70'),
                JString('--dimension'),
                JString('100'),
                JString('--wordNGrams'),
                JString('2'),
            )
        print('Testing...')
        cls.test(
            test_path, path.with_suffix('.predictions'), model_path
        )

    @classmethod
    def test(
        cls, test_path: Path, predictions_path: Path, model_path: Path
    ) -> None:
        """
        Run the Zemberek classifier evaluation app on the test split.

        Args:
            test_path (Path): Test split file.
            predictions_path (Path): Output file for predictions.
            model_path (Path): Trained model file.
        """
        EvaluateClassifier().execute(
            JString('-i'),
            JString(str(test_path)),
            JString('-m'),
            JString(str(model_path)),
            JString('-o'),
            JString(str(predictions_path)),
            JString('-k'),
            JString('1'),
        )
def run(sentence: str) -> None:
    """
    News classification example. Trains a new model if there are no model
    available.

    Args:
        sentence (str): Sentence to classify.
    """
    label_data_path = DATA_PATH.joinpath(
        'classification', 'news-title-category-set'
    )
    model_path = label_data_path.with_suffix('.model')

    if not model_path.is_file():
        print(
            'Could not find a model, training a new one. FastText will print'
            ' some errors, do not terminate the process!'
        )

        # Training is impossible without the labeled dataset on disk.
        if not label_data_path.is_file():
            raise FileNotFoundError(
                'Could not train a model!'
                ' Please include news-title-category-set!'
            )

        # Invoke the Zemberek jar's TrainClassifier app as a subprocess.
        train_command = [
            str(JAVA_PATH.absolute()),
            '-jar',
            str(ZEMBEREK_PATH.absolute()),
            'TrainClassifier',
            '-i',
            str(label_data_path.absolute()),
            '-o',
            str(model_path.absolute()),
            '--learningRate',
            '0.1',
            '--epochCount',
            '50',
            '--applyQuantization',
            '--cutOff',
            '15000',
        ]
        subprocess.run(train_command, check=True)

    classifier = FastTextClassifier.load(model_path)

    # The model was trained on tokenized lower-case text, so the input
    # sentence is preprocessed the same way.
    raw_tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(JString(sentence))
    processed = ' '.join(str(token) for token in raw_tokens).lower()

    results = classifier.predict(processed, 3)

    print(f'Sentence: {sentence}')

    for i, result in enumerate(results):
        print(
            f'\nItem {i + 1}: {result.item}',
            f'\nScore {i + 1}: {result.score}',
        )
def _create_histograms() -> Tuple[Histogram, Histogram]:
    """
    Build two small fruit-count Histograms used by the example functions.

    Returns:
        Tuple[Histogram, Histogram]: Example Histogram data.
    """
    first_fruits = [
        'apple', 'pear', 'grape', 'apple', 'apple', 'appricot', 'grape'
    ]
    second_fruits = ['apple', 'apple', 'banana']

    histogram_1 = Histogram()
    histogram_1.add(first_fruits)

    histogram_2 = Histogram()
    histogram_2.add(second_fruits)

    print('Histogram 1:', histogram_1)
    print('Histogram 2:', histogram_2)

    return histogram_1, histogram_2
def _print_histogram_int_comp(
    histogram_1: Histogram, histogram_2: Histogram
) -> None:
    """
    Prints Integer comparison examples for the Histogram items.

    Args:
        histogram_1 (Histogram): First example Histogram.
        histogram_2 (Histogram): Second example Histogram.
    """
    print('\nHistogram 1, Less Than 2:', histogram_1.sizeSmaller(JInt(2)))
    print('Histogram 2, Less Than 2:', histogram_2.sizeSmaller(JInt(2)))

    print('\nHistogram 1, More Than 2:', histogram_1.sizeLarger(JInt(2)))
    print('Histogram 2, More Than 2:', histogram_2.sizeLarger(JInt(2)))

    print(
        '\nHistogram 1, Between 1 and 3:',
        histogram_1.totalCount(JInt(1), JInt(3)),
    )
    print(
        'Histogram 2, Between 1 and 3:',
        histogram_2.totalCount(JInt(1), JInt(3)),
    )

    print(
        '\nHistogram 1, Equals to 2:', histogram_1.getItemsWithCount(JInt(2))
    )
    print('Histogram 2, Equals to 2:', histogram_2.getItemsWithCount(JInt(2)))

    # FIX: the upper bound JInt(3) used to be passed as an extra argument
    # to print() instead of to getItemsWithCount(), so histogram_1 showed
    # the one-argument result followed by a stray '3'. Mirror the correct
    # two-argument call used for histogram_2.
    print(
        '\nHistogram 1, >= 2 AND <= 3:',
        histogram_1.getItemsWithCount(JInt(2), JInt(3)),
    )
    print(
        'Histogram 2, >= 2 AND <= 3:',
        histogram_2.getItemsWithCount(JInt(2), JInt(3)),
    )

    print(
        '\nHistogram 1, % of >= 2 AND <= 3:',
        histogram_1.countPercent(JInt(2), JInt(3)),
    )
    print(
        'Histogram 2, % of >= 2 AND <= 3:',
        histogram_2.countPercent(JInt(2), JInt(3)),
    )

    # These two pass a plain Python int — presumably to demonstrate JPype's
    # automatic primitive conversion; TODO confirm intent.
    print('\nHistogram 1, More Than 2:', histogram_1.sizeLarger(2))
    print('Histogram 2, More Than 2:', histogram_2.sizeLarger(2))
def _print_histogram_agg(
    histogram_1: Histogram, histogram_2: Histogram
) -> None:
    """
    Prints Histogram aggregation examples.

    Args:
        histogram_1 (Histogram): First example Histogram.
        histogram_2 (Histogram): Second example Histogram.
    """

    def show(metric: str, value_1: object, value_2: object) -> None:
        # Each metric is printed for both histograms; the leading newline
        # on the first line visually separates the metric groups.
        print(f'\nHistogram 1, {metric}:', value_1)
        print(f'Histogram 2, {metric}:', value_2)

    show('Total Count', histogram_1.totalCount(), histogram_2.totalCount())
    show('Size', histogram_1.size(), histogram_2.size())
    show(
        "'apple' Count",
        histogram_1.getCount(JString('apple')),
        histogram_2.getCount(JString('apple')),
    )
    show('Max Count', histogram_1.maxValue(), histogram_2.maxValue())
    show('Min Count', histogram_1.minValue(), histogram_2.minValue())
def _test(
    morphology: TurkishMorphology, inp: str, new_item: DictionaryItem
) -> None:
    """
    Testing analysis before and after adding dictionary item.

    Args:
        morphology (TurkishMorphology): Turkish morphology analyzer.
        inp (str): Input to analyze.
        new_item (DictionaryItem): Item to add to the dictionary.
    """
    print(f'Parses for {inp} before adding {new_item}')
    before: WordAnalysis = morphology.analyze(JString(inp))
    print_results(before)
    # Invalidate the analysis cache so the 'after' call is not answered
    # from the cached 'before' result.
    morphology.invalidateCache()
    morphology.getMorphotactics().getStemTransitions().addDictionaryItem(
        new_item
    )
    # CONSISTENCY FIX: wrap the input in JString like the 'before' call
    # above (the original passed a bare Python str here).
    after: WordAnalysis = morphology.analyze(JString(inp))
    print(f'Parses for {inp} after adding {new_item}')
    print_results(after)
def run() -> None:
    """
    Dictionary item addition tests.

    Runs three before/after analyses: two proper nouns and one verb added
    to the live lexicon.
    """
    morphology = TurkishMorphology.createWithDefaults()

    # (header, word to analyze, DictionaryItem constructor arguments)
    test_cases = (
        (
            '\nProper Noun Test - 1:\n',
            'Meydan\'a',
            ('Meydan', 'meydan', 'meydan',
             PrimaryPos.Noun, SecondaryPos.ProperNoun),
        ),
        (
            '\nProper Noun Test - 2:\n',
            'Meeeydan\'a',
            ('Meeeydan', 'meeeydan', 'meeeydan',
             PrimaryPos.Noun, SecondaryPos.ProperNoun),
        ),
        (
            '\nVerb Test:\n',
            'tweetleyeyazdım',
            ('tweetlemek', 'tweetle', 'tivitle',
             PrimaryPos.Verb, SecondaryPos.None_),
        ),
    )

    for header, word, (lemma, root, pronunciation, primary, secondary) in (
        test_cases
    ):
        print(header)
        _test(
            morphology,
            word,
            DictionaryItem(
                JString(lemma),
                JString(root),
                JString(pronunciation),
                primary,
                secondary,
            ),
        )
31 | """ 32 | morphology: TurkishMorphology = TurkishMorphology.createWithDefaults() 33 | 34 | new_stem: DictionaryItem = ( 35 | morphology.getLexicon().getMatchingItems(target_word).get(0) 36 | ) 37 | 38 | results: WordAnalysis = morphology.analyze(JString(source_word)) 39 | 40 | for result in results: 41 | generated: java.util.ArrayList = ( 42 | morphology.getWordGenerator().generate( 43 | new_stem, result.getMorphemes() 44 | ) 45 | ) 46 | for gen_word in generated: 47 | print( 48 | f'\nInput Analysis: {str(result.formatLong())}' 49 | f'\nAfter Stem Change, Word: {str(gen_word.surface)}' 50 | '\nAfter Stem Change, Analysis:' 51 | f'{str(gen_word.analysis.formatLong())}' 52 | ) 53 | -------------------------------------------------------------------------------- /examples/morphology/diacritic_analysis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Zemberek: Diacritic Analysis Example 3 | Documentation: https://bit.ly/2PsyRHk 4 | Java Code Example: https://bit.ly/2Jx7zfk 5 | """ 6 | from typing import List 7 | 8 | from jpype import JClass 9 | 10 | __all__: List[str] = ['run'] 11 | 12 | TurkishMorphology: JClass = JClass('zemberek.morphology.TurkishMorphology') 13 | RootLexicon: JClass = JClass('zemberek.morphology.lexicon.RootLexicon') 14 | 15 | 16 | def run(word: str) -> None: 17 | """ 18 | Diacritic analysis example. 19 | 20 | Args: 21 | word (str): Word to apply diacritic analysis. 
def run(sentence: str) -> None:
    """
    POS tag detection example.

    Args:
        sentence (str): Sentence to find POS tags on.
    """

    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()

    # FIX: keep the iterable and the loop variable distinct — the original
    # reused the name 'analysis' for both, shadowing the collection.
    analyses: java.util.ArrayList = morphology.analyzeAndDisambiguate(
        sentence
    ).bestAnalysis()

    pos: List[str] = []

    for i, analysis in enumerate(analyses, start=1):
        print(
            f'\nAnalysis {i}: {analysis}',
            f'\nPrimary POS {i}: {analysis.getPos()}'
            f'\nPrimary POS (Short Form) {i}: {analysis.getPos().shortForm}',
        )
        pos.append(
            f'{str(analysis.getLemmas()[0])}-{analysis.getPos().shortForm}'
        )

    print(f'\nFull sentence with POS tags: {" ".join(pos)}')
def run(sentence: str) -> None:
    """
    Informal words analysis example.

    Args:
        sentence (str): Sentence to search for informal words.
    """
    # Analyzer configured to tolerate missing diacritics and to recognize
    # informal (colloquial) morphology.
    morphology = (
        TurkishMorphology.builder()
        .setLexicon(RootLexicon.getDefault())
        .ignoreDiacriticsInAnalysis()
        .useInformalAnalysis()
        .build()
    )

    best_analyses = morphology.analyzeAndDisambiguate(sentence).bestAnalysis()

    print('\nAnalysis:\n')

    for single_analysis in best_analyses:
        print(f'{str(single_analysis.surfaceForm())}-{single_analysis}')

    print('\nConverting formal surface form:\n')

    formal_converter = InformalAnalysisConverter(
        morphology.getWordGenerator()
    )

    for single_analysis in best_analyses:
        converted = formal_converter.convert(
            single_analysis.surfaceForm(), single_analysis
        )
        print(str(converted))
def run(word: str) -> None:
    """
    Stemming and lemmatization example.

    Args:
        word (str): Word to apply stemming and lemmatization.
    """
    morphology = TurkishMorphology.createWithDefaults()

    print('\nResults:')

    for analysis in morphology.analyze(JString(word)):
        stems = ', '.join(str(stem) for stem in analysis.getStems())
        lemmas = ', '.join(str(lemma) for lemma in analysis.getLemmas())
        print(
            f'{str(analysis.formatLong())}'
            f'\n\tStems = {stems}'
            f'\n\tLemmas = {lemmas}'
        )
def _generate_nouns(root_word: str) -> None:
    """
    Generates inflections of the given root word using possessive and case
    suffix combinations.

    Args:
        root_word (str): Root word to generate inflections from.
    """

    print('\nGenerating nouns.\n')

    # Morpheme IDs combined below: number x possessive x case.
    number_morphemes = [JString('A3sg'), JString('A3pl')]
    possessive_morphemes = [
        JString('P1sg'),
        JString('P2sg'),
        JString('P3sg'),
    ]
    case_morphemes = [JString('Dat'), JString('Loc'), JString('Abl')]

    # A minimal lexicon containing only the root word, without caching.
    morphology = (
        TurkishMorphology.builder()
        .setLexicon(root_word)
        .disableCache()
        .build()
    )

    root_item = (
        morphology.getLexicon().getMatchingItems(root_word).get(0)
    )

    for number in number_morphemes:
        for possessive in possessive_morphemes:
            for case in case_morphemes:
                for generated in morphology.getWordGenerator().generate(
                    root_item, number, possessive, case
                ):
                    print(str(generated.surface))
def _generate_verbs(infinitive: str, stem: str) -> None:
    """
    Generates conjugated verb forms from a given stem.

    Args:
        infinitive (str): Infinitive form of the verb used to build the
            lexicon.
        stem (str): Stem to generate words for.
    """

    print('\nGenerating verbs.\n')

    # Plain str values; they are wrapped in JString exactly once, when
    # added to the morpheme sequence below. (The original annotated these
    # as List[JString] while actually storing str, and double-wrapped the
    # positive/negative morphemes.)
    positive_negatives: List[str] = ['', 'Neg']
    times: List[str] = [
        'Imp',
        'Aor',
        'Past',
        'Prog1',
        'Prog2',
        'Narr',
        'Fut',
    ]
    people: List[str] = ['A1sg', 'A2sg', 'A3sg', 'A1pl', 'A2pl', 'A3pl']

    morphology = (
        TurkishMorphology.builder()
        .setLexicon(infinitive)
        .disableCache()
        .build()
    )

    for pos_neg in positive_negatives:
        for time in times:
            for person in people:
                seq: java.util.ArrayList = java.util.ArrayList()
                # Empty string marks the positive form and adds no morpheme.
                for morpheme in (pos_neg, time, person):
                    if morpheme:
                        seq.add(JString(morpheme))
                results = list(
                    morphology.getWordGenerator().generate(JString(stem), seq)
                )
                if not results:
                    print(
                        f'Cannot generate Stem = ["{stem}"]'
                        f'\n | Morphemes = {[str(morph) for morph in seq]}'
                    )
                    continue
                print(' '.join(str(result.surface) for result in results))


def run(noun_root_word: str, verb_infinitive: str, verb_stem: str) -> None:
    """
    Generate nouns and verbs.

    Args:
        noun_root_word (str): Root word to generate inflections from.
        verb_infinitive (str): Infinitive form of the verb to create the
            lexicon for verb generation.
        verb_stem (str): Stem to generate verbs for.
    """
    _generate_nouns(noun_root_word)
    _generate_verbs(verb_infinitive, verb_stem)
def run() -> None:
    """
    Document correction example.

    Spell-checks every word token of the sample document and replaces each
    misspelled word with its top suggestion.
    """

    tokenizer: TurkishTokenizer = TurkishTokenizer.ALL
    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()
    spell_checker: TurkishSpellChecker = TurkishSpellChecker(morphology)

    document_path = DATA_PATH.joinpath('normalization', 'document.txt')
    with open(document_path, 'r', encoding='utf-8') as document_file:
        document = document_file.read()

    # Token types that should never be spell-checked.
    skipped_types = {
        Token.Type.NewLine,
        Token.Type.SpaceTab,
        Token.Type.Punctuation,
        Token.Type.RomanNumeral,
        Token.Type.UnknownWord,
        Token.Type.Unknown,
    }

    corrected_tokens: List[str] = []

    for token in tokenizer.tokenize(JString(document)):
        content: JString = token.content
        checkable = token.type not in skipped_types
        if checkable and not spell_checker.check(content):
            suggestions: List[JString] = list(
                spell_checker.suggestForWord(token.content)
            )
            if suggestions:
                best = str(suggestions[0])
                print(f'Correction: {token.content} -> {best}.')
                corrected_tokens.append(best)
                continue
        corrected_tokens.append(str(token.content))

    print('\nCorrected Document:\n', ''.join(corrected_tokens))
def run(text: str) -> None:
    """
    Noisy text normalization example.

    Args:
        text (str): Noisy text to normalize.
    """

    # Root directory with the normalization lookup data and the 2-gram
    # language model used for candidate ranking.
    normalization_root = Paths.get(str(DATA_PATH.joinpath('normalization')))
    language_model = Paths.get(str(DATA_PATH.joinpath('lm', 'lm.2gram.slm')))

    normalizer = TurkishSentenceNormalizer(
        TurkishMorphology.createWithDefaults(),
        normalization_root,
        language_model,
    )

    print(f'\nNormalized: {normalizer.normalize(JString(text))}')
def run(sentence: str) -> None:
    """
    Spell checking example.

    Args:
        sentence (str): Sentence to check for spelling errors.
    """

    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()
    spell_checker: TurkishSpellChecker = TurkishSpellChecker(morphology)

    fixed_words: List[str] = []

    for word in sentence.split(' '):
        if spell_checker.check(JString(word)):
            # Correctly spelled words pass through unchanged.
            fixed_words.append(word)
            continue
        print(f'Spelling error: {word}')
        suggestions: java.util.ArrayList = spell_checker.suggestForWord(
            JString(word)
        )
        if not suggestions:
            print(f'No suggestions found for "{word}".')
            fixed_words.append(word)
            continue
        print(f'\nSuggestions for "{word}":')
        for suggestion in suggestions:
            print(f' | {suggestion}')
        # Use the first (best-ranked) suggestion as the fix.
        fixed_words.append(str(suggestions[0]))

    print('\nFixed sentence:', ' '.join(fixed_words))
def run(paragraph: str) -> None:
    """
    Sentence boundary detection example.

    Args:
        paragraph (str): Paragraph to detect sentence boundaries in.
    """

    extractor: TurkishSentenceExtractor = TurkishSentenceExtractor.DEFAULT

    sentences = extractor.fromParagraph(paragraph)

    # start=1 replaces the original's manual `i+1`; output is identical.
    for i, sentence in enumerate(sentences, start=1):
        print(f'Sentence {i}: {sentence}')


def run(sentence: str) -> None:
    """
    Turkish sentence tokenization example.

    Args:
        sentence (str): Sentence to tokenize.
    """
    tokenizer: TurkishTokenizer = TurkishTokenizer.DEFAULT

    print('\nToken Iterator Example:\n')

    token_iterator: TokenIterator = tokenizer.getTokenIterator(
        JString(sentence)
    )

    for token in token_iterator:
        print(
            f'Token = {token}'
            f'\n | Content = {token.content}'
            f'\n | Normalized = {token.normalized}'
            f'\n | Type = {token.type}'
            f'\n | Start = {token.start}'
            f'\n | End = {token.end}\n'
        )

    print('Default Tokenization Example:\n')

    # The DEFAULT tokenizer assigned above is reused; the original
    # redundantly re-created it here.
    for i, token in enumerate(tokenizer.tokenizeToStrings(JString(sentence))):
        print(f' | Token String {i} = {token}')

    print('\nCustom Tokenization With Ignored Types Example:\n')

    custom_tokenizer: TurkishTokenizer = (
        TurkishTokenizer.builder()
        .ignoreTypes(
            Token.Type.Punctuation, Token.Type.NewLine, Token.Type.SpaceTab
        )
        .build()
    )
    for i, token in enumerate(custom_tokenizer.tokenize(JString(sentence))):
        print(f' | Token {i} = {token}')
import argparse
import importlib
from pathlib import Path
from typing import List

from jpype import shutdownJVM, startJVM

from examples import JVM_KWARGS


def get_runnable_modules() -> List[str]:
    """
    Returns a list of example scripts which implement the run function.

    Returns:
        List[str]: List of script names in 'category.script' format
        (e.g. 'morphology.word_analysis').
    """
    runnable_modules: List[str] = []
    for script in Path('examples').glob('**/*.py'):
        if script.name == '__init__.py':
            continue
        # `parent.name` is the example category directory and `stem` is the
        # file name without its suffix -- idiomatic pathlib instead of the
        # original `parents[0].name` / `name.split(".")[0]`.
        module_name: str = f'{script.parent.name}.{script.stem}'
        if hasattr(importlib.import_module(f'examples.{module_name}'), 'run'):
            runnable_modules.append(module_name)
    return runnable_modules


if __name__ == '__main__':

    # The JVM must be running before any zemberek JClass is touched.
    startJVM(**JVM_KWARGS)

    parser = argparse.ArgumentParser(
        description=(
            'Run a Zemberek example. Example usage: python -m main'
            ' morphology.word_analysis kelime'
        )
    )
    parser.add_argument(
        'example',
        type=str,
        help='The run() function from the chosen script will be invoked.',
        choices=get_runnable_modules(),
    )
    parser.add_argument(
        'args',
        type=str,
        default=[],
        nargs='*',
        help='Arguments to pass to the run function.',
    )

    args = parser.parse_args()

    example = importlib.import_module(f'examples.{args.example}')

    print(example.run.__doc__)

    example.run(*args.args)

    shutdownJVM()