├── .gitignore ├── LICENSE ├── README.md ├── Tokenizer_sample.ipynb ├── data ├── phrases-example.csv └── phrases.csv ├── main.py ├── persian_tokenizer ├── requirements.txt └── tokenizer ├── __init__.py ├── _tokenizer.py └── lookup_dic.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | ._* 3 | .apdisk 4 | .AppleDB 5 | .AppleDesktop 6 | .AppleDouble 7 | .apt_generated 8 | .apt_generated/ 9 | atlassian-ide-plugin.xml 10 | .autotools 11 | *.bak 12 | bin/ 13 | *.box 14 | build/ 15 | .buildpath 16 | .cache 17 | .cache-main 18 | celerybeat-schedule 19 | *.class 20 | .classpath 21 | cmake-build-*/ 22 | .com.apple.timemachine.donotpresent 23 | com_crashlytics_export_strings.xml 24 | *.cover 25 | .coverage 26 | .coverage.* 27 | coverage.xml 28 | .cproject 29 | crashlytics-build.properties 30 | crashlytics.properties 31 | db.sqlite3 32 | develop-eggs/ 33 | .directory 34 | dist/ 35 | .dmypy.json 36 | dmypy.json 37 | docs/_build/ 38 | .DocumentRevisions-V100 39 | downloads/ 40 | .DS_Store 41 | *.egg 42 | *.egg-info/ 43 | .eggs/ 44 | eggs/ 45 | .env 46 | env/ 47 | ENV/ 48 | env.bak/ 49 | .externalToolBuilders/ 50 | fabric.properties 51 | .factorypath 52 | .fseventsd 53 | .fuse_hidden* 54 | htmlcov/ 55 | .hypothesis/ 56 | Icon 57 | .idea/ 58 | .idea/caches/build_file_checksums.ser 59 | .idea/**/contentModel.xml 60 | .idea/**/dataSources/ 61 | .idea/**/dataSources.ids 62 | .idea/**/dataSources.local.xml 63 | .idea/**/dbnavigator.xml 64 | .idea/**/dictionaries 65 | .idea/**/dynamic.xml 66 | .idea/**/gradle.xml 67 | .idea/httpRequests 68 | .idea/**/libraries 69 | .idea/misc.xml 70 | .idea_modules/ 71 | .idea/**/mongoSettings.xml 72 | .idea/replstate.xml 73 | .idea/**/shelf 74 | .idea/sonarlint 75 | .idea/**/sqlDataSources.xml 76 | .idea/**/tasks.xml 77 | .idea/**/uiDesigner.xml 78 | .idea/**/usage.statistics.xml 79 | .idea/**/workspace.xml 80 | *.iml 81 | .installed.cfg 82 | instance/ 83 | *.ipr 84 | */.ipynb_checkpoints/* 85 | .ipynb_checkpoints 86 | ipython_config.py 87 | *.iws 88 | *.launch 89 | lib/ 90 | lib64/ 91 | .loadpath 92 | local.properties 93 | local_settings.py 94 | *.log 95 | .LSOverride 96 | *.manifest 97 | MANIFEST 98 | .metadata 99 | *.mo 100 | modules.xml 101 | .mr.developer.cfg 102 | .mypy_cache/ 103 | .netrwhist 104 | Network Trash Folder 105 | .nfs* 106 | *~.nib 107 | nosetests.xml 108 | .nox/ 109 | out/ 110 | parts/ 111 | pip-delete-this-directory.txt 112 | pip-log.txt 113 | pip-wheel-metadata/ 114 | *.pot 115 | profile_default/ 116 | .project 117 | __pycache__/ 118 | *.py[cod] 119 | *.pydevproject 120 | .pydevproject 121 | .pyre/ 122 | .pytest_cache/ 123 | .Python 124 | .python-version 125 | .recommenders 126 | .recommenders/ 127 | .ropeproject 128 | *.sage.py 129 | [._]s[a-rt-v][a-z] 130 | [._]*.s[a-v][a-z] 131 | .scala_dependencies 132 | .scrapy 133 | sdist/ 134 | Session.vim 135 | Sessionx.vim 136 | .settings/ 137 | share/python-wheels/ 138 | /site 139 | *.so 140 | *.spec 141 | .Spotlight-V100 142 | .springBeans 143 | .spyderproject 144 | .spyproject 145 | [._]ss[a-gi-z] 146 | .sts4-cache/ 147 | [._]*.sw[a-p] 148 | [._]sw[a-p] 149 | *.swp 150 | tags 151 | .target 152 | target/ 153 | .TemporaryItems 154 | Temporary Items 155 | .tern-project 156 | .texlipse 157 | *.tmp 158 | tmp/ 159 | .tox/ 160 | .Trash-* 161 | .Trashes 162 | [._]*.un~ 163 | .vagrant/* 164 | var/ 165 | .venv 166 | venv/ 167 | venv.bak/ 168 | .vim 169 | .VolumeIcon.icns 170 | .webassets-cache 171 | wheels/ 172 | .worksheet 173 | *.BAK 174 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 skorani
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PizzaTokenizer
2 | 
3 | The Semantic Tokenizer "Pizza Tokenizer" by S. Korani.
4 | 
5 | ## Usage
6 | 
7 | **Note 1**: We are constantly developing new features, which may cause incompatibilities with older versions.
8 | **Note 2**: This tokenizer needs normalized data. Before testing this module, remove punctuation and extra whitespace from your text.
9 | 
10 | To use the tokenizer, copy the `tokenizer` directory into your project path or add
11 | its path to Python's module search path. Then:
12 | 
13 |     import tokenizer
14 | 
15 | Initialize the tokenizer by instantiating the `Tokenizer` class:
16 | 
17 |     tokenizer_object = tokenizer.Tokenizer()
18 | 
19 | Now you can tokenize ***normalized*** text by calling the object directly:
20 | 
21 |     tokenizer_object("your normalized string")
22 | 
23 | This call returns a list of semantic tokens.
--------------------------------------------------------------------------------
/Tokenizer_sample.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Tokenizer (v0.0.1)"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "Just some examples showing how to use this tokenizer.\n",
15 |     "\n",
16 |     "**Please note** that this tokenizer is in its early stages and may introduce API changes that\n",
17 |     "are ***not*** *backward-compatible*!"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "markdown",
22 |    "metadata": {},
23 |    "source": [
24 |     "## Usage"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "markdown",
29 |    "metadata": {},
30 |    "source": [
31 |     "### Example strings:"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": 1,
37 |    "metadata": {},
38 |    "outputs": [],
39 |    "source": [
40 |     "first_example = \"سلام بر شما\"\n",
41 |     "second_example = \"اینجا هگمتانه نام گرفت\"\n",
42 |     "third_example = \"که با این درد اگر در بند در مانند، درمانند.\""
43 |    ]
44 |   },
45 |   {
46 |    "cell_type": "markdown",
47 |    "metadata": {},
48 |    "source": [
49 |     "## Instructions:"
50 |    ]
51 |   },
52 |   {
53 |    "cell_type": "markdown",
54 |    "metadata": {},
55 |    "source": [
56 |     "First import the `tokenizer` module."
57 |    ]
58 |   },
59 |   {
60 |    "cell_type": "code",
61 |    "execution_count": 2,
62 |    "metadata": {},
63 |    "outputs": [],
64 |    "source": [
65 |     "import tokenizer"
66 |    ]
67 |   },
68 |   {
69 |    "cell_type": "markdown",
70 |    "metadata": {},
71 |    "source": [
72 |     "`tokenizer` has a class named `Tokenizer` which does the tokenization."
73 |    ]
74 |   },
75 |   {
76 |    "cell_type": "markdown",
77 |    "metadata": {},
78 |    "source": [
79 |     "When we instantiate `Tokenizer`, it initializes itself."
80 |    ]
81 |   },
82 |   {
83 |    "cell_type": "code",
84 |    "execution_count": 3,
85 |    "metadata": {},
86 |    "outputs": [],
87 |    "source": [
88 |     "tokenizer_object = tokenizer.Tokenizer()"
89 |    ]
90 |   },
91 |   {
92 |    "cell_type": "markdown",
93 |    "metadata": {},
94 |    "source": [
95 |     "Now just use the `tokenizer_object` functor on texts.\n",
96 |     "The `Tokenizer` class implements the `__call__` magic method, hence it can be called directly.\n",
97 |     "Here, we use the example strings defined above:"
98 |    ]
99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 4,
103 |    "metadata": {},
104 |    "outputs": [
105 |     {
106 |      "data": {
107 |       "text/plain": [
108 |        "['سلام', 'بر', 'شما']"
109 |       ]
110 |      },
111 |      "execution_count": 4,
112 |      "metadata": {},
113 |      "output_type": "execute_result"
114 |     }
115 |    ],
116 |    "source": [
117 |     "tokenizer_object(first_example)"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 5,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "data": {
127 |       "text/plain": [
128 |        "['اینجا', 'هگمتانه', 'نام گرفت']"
129 |       ]
130 |      },
131 |      "execution_count": 5,
132 |      "metadata": {},
133 |      "output_type": "execute_result"
134 |     }
135 |    ],
136 |    "source": [
137 |     "tokenizer_object(second_example)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 6,
143 |    "metadata": {},
144 |    "outputs": [
145 |     {
146 |      "data": {
147 |       "text/plain": [
148 |        "['که', 'با', 'این', 'درد', 'اگر', 'در', 'بند', 'در', 'مانند،', 'درمانند.']"
149 |       ]
150 |      },
151 |      "execution_count": 6,
152 |      "metadata": {},
153 |      "output_type": "execute_result"
154 |     }
155 |    ],
156 |    "source": [
157 |     "tokenizer_object(third_example)"
158 |    ]
159 |   }
160 |  ],
161 |  "metadata": {
162 |   "kernelspec": {
163 |    "display_name": "Python 3",
164 |    "language": "python",
165 |    "name": "python3"
166 |   },
167 |   "language_info": {
168 |    "codemirror_mode": {
169 |     "name": "ipython",
170 |     "version": 3
171 |    },
172 |    "file_extension": ".py",
173 |    "mimetype": "text/x-python",
174 |    "name": "python",
175 |    "nbconvert_exporter": "python",
176 |    "pygments_lexer": "ipython3",
177 |    "version": "3.7.5"
178 |   }
179 |  },
180 |  "nbformat": 4,
181 |  "nbformat_minor": 2
182 | }
183 | 
--------------------------------------------------------------------------------
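A note on normalization: both the README and the sample notebook assume the input is already normalized, and the third notebook example shows what happens when it is not — the trailing "،" and "." stay attached to the tokens "مانند،" and "درمانند.". The repository does not ship a normalizer, so the sketch below only illustrates the kind of pre-processing the tokenizer expects; the `normalize` helper is hypothetical and not part of the `tokenizer` package.

    import re

    import tokenizer

    def normalize(text):
        # Drop common Persian and Latin punctuation, then collapse repeated whitespace.
        # A real normalizer would also unify character variants (e.g. Arabic vs. Persian Yeh).
        text = re.sub(r"[.,;:!?()«»،؛؟]", " ", text)
        return " ".join(text.split())

    tokenizer_object = tokenizer.Tokenizer()
    tokenizer_object(normalize("که با این درد اگر در بند در مانند، درمانند."))
    # e.g. ['که', 'با', 'این', 'درد', 'اگر', 'در', 'بند', 'در', 'مانند', 'درمانند'],
    # assuming none of these words form a phrase in data/phrases.csv

Run this from the repository root so that the default `data/phrases.csv` path resolves.
--------------------------------------------------------------------------------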
/data/phrases-example.csv: -------------------------------------------------------------------------------- 1 | "آدم تو آفتابه پپسی بخوره، خیط نشه","_tkph_00000d55__" 2 | "آشغال کله","_tkph_00000d56__" 3 | "آمپر چسبوندن","_tkph_00000d59__" 4 | "آنتی حال زدن","_tkph_00000d5a__" 5 | "آرنولد فشرده","_tkph_00000d5b__" 6 | "آینه ی بغل اتوبوس","_tkph_00000d5c__" 7 | "ابوالحسن نجفی","_tkph_00000d5d__" 8 | "ارایه دادن","_tkph_00000d5e__" 9 | "اسگل کردن","_tkph_00000d5f__" 10 | "اِندِ مرام بازی","_tkph_00000d5g__" 11 | "اوپدیس کردن","_tkph_00000d5h__" 12 | "اوپدیس بازی کردن","_tkph_00000d5i__" 13 | "با دنده سنگین رفتن","_tkph_00000d5j__" 14 | "برو بکس یا فقط بکس","_tkph_00000d5k__" 15 | "پارس خود رو","_tkph_00000d5l__" 16 | "پایه بودن","_tkph_00000d5m__" 17 | "پسته خانم","_tkph_00000d5n__" 18 | "تو سایت کسی رفتن","_tkph_00000d5p__" 19 | "تو کار کسی بودن","_tkph_00000d5q__" 20 | "تو کف چیزی بودن","_tkph_00000d5r__" 21 | "تو کف کسی بودن","_tkph_00000d5s__" 22 | "تیکه انداختن","_tkph_00000d5t__" 23 | "جنده ی دولتی","_tkph_00000d5u__" 24 | "جوات مخفی","_tkph_00000d5v__" 25 | "جیب ملا","_tkph_00000d5w__" 26 | "چپو کردن","_tkph_00000d5x__" 27 | "حسین صاف کار","_tkph_00000d5z__" 28 | "خار داشتن","_tkph_00000d5A__" 29 | "خر به خراسان بردن","_tkph_00000d5C__" 30 | "خز و پیل","_tkph_00000d5E__" 31 | "خط خطی بودن","_tkph_00000d5F__" 32 | "خفن بازار","_tkph_00000d5G__" 33 | "خلافی داشتن","_tkph_00000d5H__" 34 | "داف بازی","_tkph_00000d5I__" 35 | "درایوری رانندگی کردن","_tkph_00000d5J__" 36 | "درد کشیده طبیبه","_tkph_00000d5K__" 37 | "دو دره","_tkph_00000d5L__" 38 | "دور سه فرمان","_tkph_00000d5M__" 39 | "دهن کسی کف کردن","_tkph_00000d5O__" 40 | "راه دادن","_tkph_00000d5P__" 41 | "رَ دَ دَ","_tkph_00000d5Q__" 42 | "رفیق دُنگ","_tkph_00000d5R__" 43 | "روی آنتن رفتن","_tkph_00000d5S__" 44 | "ریلیف کردن","_tkph_00000d5T__" 45 | "زاب چک","_tkph_00000d5U__" 46 | "زیب چک","_tkph_00000d5V__" 47 | "زارت زرت","_tkph_00000d5Y__" 48 | "ز ذ","_tkph_00000d60__" 49 | "ساختن خود","_tkph_00000d63__" 50 | "سازمان سنجش","_tkph_00000d64__" 51 | "سازمان گوشت","_tkph_00000d65__" 52 | "سفره الفقرا","_tkph_00000d66__" 53 | "سکه رایج بلاد اسلامی","_tkph_00000d67__" 54 | "سوار درخت انگور","_tkph_00000d68__" 55 | "سوپر قلعه","_tkph_00000d69__" 56 | "سوسک کردن کسی","_tkph_00000d6a__" 57 | "سوپر کالی‌ فرا جلیس تیک ِ اِکس پیا ل ِ دوشِز","_tkph_00000d6b__" 58 | "سوتی دادن","_tkph_00000d6c__" 59 | "سوراخ جورابتیم","_tkph_00000d6d__" 60 | "سه دروغ بزرگ","_tkph_00000d6e__" 61 | "سیامک سنجرانی","_tkph_00000d6f__" 62 | "سیم های کسی قاطی کردن","_tkph_00000d6h__" 63 | "شوخی افغانی","_tkph_00000d6l__" 64 | "صاف شدن","_tkph_00000d6n__" 65 | "ضد حال","_tkph_00000d6o__" 66 | "ضد حال زدن","_tkph_00000d6p__" 67 | "عمرنات پتاسیم","_tkph_00000d6r__" 68 | "فر دادن","_tkph_00000d6u__" 69 | "فر خوردن","_tkph_00000d6v__" 70 | "فک کسی به زمین خوردن","_tkph_00000d6w__" 71 | "فلفل سبز","_tkph_00000d6x__" 72 | "فنچ فنچول","_tkph_00000d6y__" 73 | "قات زدن","_tkph_00000d6z__" 74 | "قُزل قورت","_tkph_00000d6A__" 75 | "قه ثانیه","_tkph_00000d6C__" 76 | "کره کردن","_tkph_00000d6G__" 77 | "کف کسی بریدن","_tkph_00000d6H__" 78 | "کف دستش مثل کون بچه صافه","_tkph_00000d6I__" 79 | "کف و خون بالا آوردن","_tkph_00000d6J__" 80 | "کلان از کلانتر","_tkph_00000d6K__" 81 | "گوجه زدن","_tkph_00000d6N__" 82 | "گوشت کوب","_tkph_00000d6O__" 83 | "لاو انداختن","_tkph_00000d6Q__" 84 | "لایی کشیدن","_tkph_00000d6R__" 85 | "مال دوره ی گروهبان یکی هیتلر","_tkph_00000d6T__" 86 | "ماهی شو برو","_tkph_00000d6V__" 87 | "مخ زدن","_tkph_00000d6W__" 88 | "مخ گایی","_tkph_00000d6X__" 89 | "مگسی 
شدن","_tkph_00000d6Y__" 90 | "ملی شدن","_tkph_00000d6Z__" 91 | "مماس بودن","_tkph_00000d70__" 92 | "مُهرمون هم خرابه","_tkph_00000d71__" 93 | "میخ شدن","_tkph_00000d72__" 94 | "میرزا مقوا","_tkph_00000d73__" 95 | "نا فرم","_tkph_00000d74__" 96 | "نبشی دادن","_tkph_00000d75__" 97 | "نک و نال","_tkph_00000d76__" 98 | "نمره ی شهرستان","_tkph_00000d77__" 99 | "نمور نموره","_tkph_00000d78__" 100 | "هندونه گذاشتن","_tkph_00000d7a__" 101 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import logging 5 | 6 | from collections import Counter 7 | 8 | from tokenizer import Tokenizer 9 | 10 | logging.basicConfig() 11 | 12 | 13 | def main(): 14 | clean_text = [input()] 15 | tok = Tokenizer() 16 | for text in clean_text: 17 | tokens = tok(text) 18 | print(tokens) 19 | counts = Counter(tokens) 20 | for token, count in counts.items(): 21 | print(f"{token}: {count}") 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /persian_tokenizer: -------------------------------------------------------------------------------- 1 | improve dataset 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flashtext 2 | -------------------------------------------------------------------------------- /tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | import logging as __logging 2 | 3 | from ._tokenizer import Tokenizer 4 | 5 | __logging.getLogger(f"pizza_nlp.{__name__}").addHandler(__logging.NullHandler()) 6 | -------------------------------------------------------------------------------- /tokenizer/_tokenizer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from flashtext import KeywordProcessor 4 | 5 | 6 | log = logging.getLogger(f"pizza_nlp.{__name__}") 7 | 8 | 9 | class Tokenizer(object): 10 | def __init__(self): 11 | log.info("Tokenizer initialization") 12 | 13 | from .lookup_dic import _PhraseDictionary as __PD 14 | log.debug("Tokenizer: calls lookup_dic.read_phrases") 15 | self.lookup_dic = __PD() 16 | 17 | log.debug("Instanciate flashtext.KeyworkProcessor") 18 | self.__keyword_processor = KeywordProcessor() 19 | log.debug("Insert data into flashtext.KeyworkProcessor instance.") 20 | self.__keyword_processor.add_keywords_from_dict( 21 | self.lookup_dic.lookup_dic_CODE) 22 | log.info("Tokenizer initialization successful") 23 | 24 | 25 | def tokenize(self, text): 26 | log.debug(f"Tokenizer called on {text}") 27 | 28 | log.debug("Phase I: Replacing phrases.") 29 | text = self.__keyword_processor.replace_keywords(text) 30 | 31 | log.debug("Phase II: Split by space.") 32 | tokens_list = text.split() 33 | 34 | log.debug("Phase III: Replace back token id to its original form.") 35 | tokens_list = [ 36 | self.lookup_dic.reverse_replace(token) 37 | if token in self.lookup_dic.lookup_dic_CODE else token 38 | for token in tokens_list 39 | ] 40 | 41 | return tokens_list 42 | 43 | def __call__(self, text): 44 | return self.tokenize(text) 45 | -------------------------------------------------------------------------------- /tokenizer/lookup_dic.py: 
--------------------------------------------------------------------------------
1 | import csv
2 | import logging
3 | 
4 | 
5 | log = logging.getLogger(f"pizza_nlp.{__name__}")
6 | 
7 | 
8 | class _PhraseDictionary(object):
9 |     def __init__(self, phrase_file="data/phrases.csv"):
10 |         self._phrase_file = phrase_file
11 |         log.debug("Populating phrase dictionary: started")
12 |         self.lookup_reverse_dic_CODE = dict()
13 |         self.lookup_dic_CODE = dict()
14 |         log.debug("Reading phrase datafile: started")
15 |         log.info("Reading phrase datafile: {}".format(phrase_file))
16 |         with open(phrase_file, newline="", encoding="utf-8") as f:
17 |             reader = csv.reader(f)
18 |             for row in reader:
19 |                 try:
20 |                     phrase, replace_with = row
21 |                     log.debug(f" phrase: {phrase}, rw: {replace_with}")
22 |                 except ValueError:
23 |                     log.error(
24 |                         "Bad input: {} - "
25 |                         "csv parser could not unpack properly.".format(
26 |                             repr(row)))
27 |                     continue  # skip malformed rows instead of reusing stale values
28 |                 self.lookup_dic_CODE.update({replace_with: [phrase]})
29 |                 self.lookup_reverse_dic_CODE.update({phrase: [replace_with]})
30 |                 log.debug(
31 |                     f"Phrase: {phrase} with id {replace_with} "
32 |                     "added to phrase dictionary.")
33 |         log.debug("Populating phrase dictionary: finished")
34 | 
35 |     def reverse_replace(self, _token_):
36 |         log.debug(f'Reverse lookup call for "{_token_}".')
37 |         for item in self.lookup_dic_CODE.items():
38 |             # Map the phrase id back to its original phrase.
39 |             _token_ = _token_.replace(item[0], item[1][0])
40 |         log.debug(f'Found "{_token_}".')
41 | 
42 |         return _token_
43 | 
--------------------------------------------------------------------------------
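How the phrase round-trip in `tokenizer/_tokenizer.py` works: Phase I replaces every phrase known to `_PhraseDictionary` with its id (via flashtext), Phase II splits on whitespace, and Phase III maps each id back to its phrase, so multi-word expressions such as "نام گرفت" survive as single tokens — exactly what the second notebook example shows. The self-contained sketch below reproduces that round-trip with flashtext directly; the one-entry phrase table and its id are made up for illustration and stand in for data/phrases.csv.

    from flashtext import KeywordProcessor

    # Illustrative stand-in for data/phrases.csv: phrase id -> [phrase]
    lookup_dic_CODE = {"_tkph_demo_0001__": ["نام گرفت"]}

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(lookup_dic_CODE)

    text = "اینجا هگمتانه نام گرفت"
    # Phase I: replace the known phrase with its id.
    replaced = keyword_processor.replace_keywords(text)
    # Phase II: split on whitespace.
    tokens = replaced.split()
    # Phase III: map ids back to their original phrases.
    tokens = [lookup_dic_CODE[t][0] if t in lookup_dic_CODE else t for t in tokens]

    print(tokens)  # ['اینجا', 'هگمتانه', 'نام گرفت']

The same three phases run inside `Tokenizer.tokenize`, with the table loaded from the CSV instead of the literal dictionary above.
--------------------------------------------------------------------------------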