├── .gitattributes ├── .github └── workflows │ ├── linter.yaml │ ├── python-publish.yaml │ └── test.yaml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── chikkarpy ├── __init__.py ├── chikkar.py ├── command_line.py ├── config.py ├── dictionarylib │ ├── __init__.py │ ├── binarydictionary.py │ ├── dictionary.py │ ├── dictionarybuilder.py │ ├── dictionaryheader.py │ ├── dictionaryversion.py │ ├── doublearraytrie.py │ ├── flags.py │ ├── format │ │ ├── __init__.py │ │ └── format.py │ ├── idtable.py │ ├── jtypedbytebuffer.py │ └── synonym_group_list.py ├── synonym.py └── synonymgroup.py ├── requirements.txt ├── scripts ├── flake8.cfg ├── license-header.txt ├── lint.sh └── test.sh ├── setup.py └── tests ├── __init__.py ├── dictionarylib ├── __init__.py ├── test_dictionary.py ├── test_dictionaryheader.py ├── test_doublearraytrie.py └── test_flags.py ├── resources ├── system.csv ├── user.csv └── user2.csv ├── test_chikkar.py └── test_synonymgroup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | 3 | *.in text 4 | *.md text 5 | *.py text 6 | *.txt text 7 | 8 | *.pyc binary 9 | *.pyd binary 10 | *.pyo binary 11 | *.pyw binary 12 | *.dic binary -------------------------------------------------------------------------------- /.github/workflows/linter.yaml: -------------------------------------------------------------------------------- 1 | name: linter 2 | 3 | on: 4 | push: 5 | branches: [develop] 6 | pull_request: 7 | types: [opened, synchronize, reopened] 8 | branches: [develop] 9 | 10 | jobs: 11 | linter: 12 | name: flake8 linter 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: Install Dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install flake8 flake8-builtins flake8-import-order 22 | 23 | - name: Code style check by flake8 24 | run: | 25 | cd scripts && ./lint.sh 26 | 
-------------------------------------------------------------------------------- /.github/workflows/python-publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install cython>=0.28 25 | pip install -r requirements.txt 26 | pip install setuptools wheel twine 27 | - name: Build and publish 28 | env: 29 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 30 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 31 | run: | 32 | python setup.py sdist 33 | twine upload dist/* 34 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | branches: [develop] 6 | pull_request: 7 | types: [opened, synchronize, reopened] 8 | branches: [develop] 9 | 10 | jobs: 11 | test: 12 | name: Test package 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, macos-latest, windows-latest] 17 | python-version: ['3.5', '3.x'] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Display Python version 28 | run: | 29 | python -c "import sys; print(sys.version)" 30 
| 31 | - name: Install Dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install cython>=0.28 35 | pip install -r requirements.txt 36 | 37 | - name: Set up Test 38 | run: | 39 | TEST_RESOURCES_DIR="tests/resources/" 40 | for DIC_TYPE in {system,user,user2}; do 41 | IN="${TEST_RESOURCES_DIR}${DIC_TYPE}.csv" 42 | OUT="${TEST_RESOURCES_DIR}${DIC_TYPE}.dic" 43 | DES="the ${DIC_TYPE} dictionary for the unit tests" 44 | python -c "import sys; from chikkarpy.command_line import build_dictionary; build_dictionary(sys.argv[1], sys.argv[2], sys.argv[3]);" "${IN}" "${OUT}" "${DES}" 45 | done 46 | shell: bash 47 | 48 | - name: Run Test 49 | run: | 50 | python -m unittest discover tests 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | notebooks/ 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # IDE, Editor 105 | .idea/ 106 | .vscode/ 107 | 108 | # dictionary 109 | *.dic 110 | 111 | _*/ 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE requirements.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chikkarpy 2 | [![PyPi version](https://img.shields.io/pypi/v/chikkarpy.svg)](https://pypi.python.org/pypi/chikkarpy/) 3 | [![](https://img.shields.io/badge/python-3.5+-blue.svg)](https://www.python.org/downloads/release/python-350/) 4 | [![test](https://github.com/t-yamamura/chikkarpy/actions/workflows/test.yaml/badge.svg)](https://github.com/t-yamamura/chikkarpy/actions/workflows/test.yaml) 5 | [![](https://img.shields.io/github/license/t-yamamura/chikkarpy.svg)](https://github.com/t-yamamura/chikkarpy/blob/master/LICENSE) 6 | 7 | chikkarpyは[chikkar](https://github.com/WorksApplications/chikkar)のPython版です。 8 | chikkarpy は [Sudachi 
同義語辞書](https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md)を利用し、[SudachiPy](https://github.com/WorksApplications/SudachiPy)の出力に同義語展開を追加するために開発されたライブラリです。 9 | 単体でも同義語辞書の検索ツールとして利用できます。 10 | 11 | chikkarpy is a Python version of [chikkar](https://github.com/WorksApplications/chikkar). 12 | chikkarpy is developed to utilize the [Sudachi synonym dictionary](https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md) and add synonym expansion to the output of [SudachiPy](https://github.com/WorksApplications/SudachiPy). 13 | This library alone can be used as a search tool for our synonym dictionaries. 14 | 15 | ## 利用方法 Usage 16 | ## TL;DR 17 | ```bash 18 | $ pip install chikkarpy 19 | 20 | $ echo "閉店" | chikkarpy 21 | 閉店 クローズ,close,店仕舞い 22 | ``` 23 | 24 | ## Step 1. chikkarpyのインストール Install chikkarpy 25 | ```bash 26 | $ pip install chikkarpy 27 | ``` 28 | 29 | ## Step 2. 使用方法 Usage 30 | ### コマンドライン Command Line 31 | ```bash 32 | $ echo "閉店" | chikkarpy 33 | 閉店 クローズ,close,店仕舞い 34 | ``` 35 | chikkarpyは入力された単語を見て一致する同義語のリストを返します。 36 | chikkarpy looks up the entered word among the headwords of the synonym dictionary and returns a list of matching synonyms. 37 | 38 | 同義語辞書内の曖昧性フラグが`1`の見出し語をトリガーにすることはできません。 39 | You cannot use a headword with an ambiguity flag of `1` in a synonym dictionary as a search trigger. 40 | 41 | 出力は`クエリ\t同義語リスト`の形式です。 42 | The output is in the form of a `query \t synonym list`. 43 | 44 | デフォルトの [Sudachi 同義語辞書](https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md) の見出し語は、 45 | SudachiPyの正規化形 (`normalized_form()`) で登録されています。 46 | 47 | The headwords in the Sudachi synonym dictionary are registered in SudachiPy's normalized form, `normalized_form()`. 
48 | 49 | ```bash 50 | $ chikkarpy search -h 51 | usage: chikkarpy search [-h] [-d [file [file ...]]] [-ev] [-o file] [-v] 52 | [file [file ...]] 53 | 54 | Search synonyms 55 | 56 | positional arguments: 57 | file text written in utf-8 58 | 59 | optional arguments: 60 | -h, --help show this help message and exit 61 | -d [file [file ...]] synonym dictionary (default: system synonym 62 | dictionary) 63 | -ev Enable verb and adjective synonyms. 64 | -o file the output file 65 | -v, --version print chikkarpy version 66 | ``` 67 | 68 | 自分で用意したユーザー辞書を使いたい場合は`-d`で読み込むバイナリ辞書を指定できます。 69 | (バイナリ辞書のビルドは[辞書の作成](#辞書の作成-Build-a-dictionary)を参照してください。) 70 | When you use your user dictionary, you should specify the binary dictionary to read with `-d`. 71 | (For building a binary dictionary, see [Building a Dictionary](#辞書の作成-Build-a-dictionary).) 72 | 73 | 複数辞書を読み込む場合は順番に注意してください。 74 | When reading multiple dictionaries, pay attention to the order. 75 | 76 | 以下の場合,**user2 > user > system** の順で同義語を検索して見つかった時点で検索結果を返します。 77 | In the following case, synonyms are searched in the order **user2 > user > system**, and the results from the first dictionary that contains a match are returned. 78 | 79 | ```bash 80 | chikkarpy -d system.dic user.dic user2.dic 81 | ``` 82 | 83 | また、出力はデフォルトで**体言**のみです。 84 | Also, the output is **noun** only by default. 85 | 86 | **用言**も出力したい場合は`-ev`を有効にしてください。 87 | When you want to output **verbs** as well, please enable `-ev`. 
88 | 89 | ```bash 90 | $ echo "開放" | chikkarpy 91 | 開放 オープン,open 92 | $ echo "開放" | chikkarpy -ev 93 | 開放 開け放す,開く,オープン,open 94 | ``` 95 | 96 | ### Python ライブラリ / Python library 97 | 使用例 Example of use 98 | 99 | ```python 100 | from chikkarpy import Chikkar 101 | from chikkarpy.dictionarylib import Dictionary 102 | 103 | chikkar = Chikkar() 104 | 105 | # デフォルトのシステム同義語辞書を使う場合,Dictionaryの引数は省略可能 You may omit the ``Dictionary`` arguments if you want to use the system synonym dictionary 106 | system_dic = Dictionary() 107 | chikkar.add_dictionary(system_dic) 108 | 109 | print(chikkar.find("閉店")) 110 | # => ['クローズ', 'close', '店仕舞い'] 111 | 112 | print(chikkar.find("閉店", group_ids=[5])) # グループIDによる検索 Search by group ID 113 | # => ['クローズ', 'close', '店仕舞い'] 114 | 115 | print(chikkar.find("開放")) 116 | # => ['オープン', 'open'] 117 | 118 | chikkar.enable_verb() # 用言の出力制御(デフォルトは体言のみ出力) Output control of verbs (default is to output only nouns) 119 | print(chikkar.find("開放")) 120 | # => ['開け放す', '開く', 'オープン', 'open'] 121 | ``` 122 | 123 | `chikkar.add_dictionary()`で複数の辞書を読み込ませる場合は順番に注意してください。 124 | 最後に読み込んだ辞書を優先して検索します。 125 | また、`enable_trie`を`False`に設定した辞書では、同義語を検索するときに見出し語よりもグループIDを優先して検索します。 126 | 127 | When you read multiple dictionaries with `chikkar.add_dictionary()`, pay attention to the order. 128 | Priority is given to the last read dictionary. 129 | If ``enable_trie`` is ``False``, a search by synonym group IDs takes precedence over a search by the headword. 
130 | 131 | ```python 132 | chikkar = Chikkar() 133 | 134 | system_dic = Dictionary(enable_trie=False) 135 | user_dic = Dictionary(user_dict_path, enable_trie=True) 136 | user2_dic = Dictionary(user_dict_path, enable_trie=True) 137 | 138 | chikkar.add_dictionary(system_dic) 139 | chikkar.add_dictionary(user_dic) 140 | chikkar.add_dictionary(user2_dic) 141 | ``` 142 | 143 | 144 | ## 辞書の作成 Build a dictionary 145 | 146 | 新しく辞書を追加する場合は、利用前にバイナリ形式辞書の作成が必要です。 147 | Before using new dictionary, you need to create a binary format dictionary. 148 | 149 | 同義語辞書のフォーマットは[Sudachi 同義語辞書](https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md)に従ってください。 150 | Follow the [Sudachi Synonym Dictionary](https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md) for the format of the synonym dictionary. 151 | 152 | ```bash 153 | $ chikkarpy build -i synonym_dict.csv -o system.dic 154 | ``` 155 | 156 | ```bash 157 | $ chikkarpy build -h 158 | usage: chikkarpy build [-h] -i file [-o file] [-d string] 159 | 160 | Build Synonym Dictionary 161 | 162 | optional arguments: 163 | -h, --help show this help message and exit 164 | -i file dictionary file (csv) 165 | -o file output file (default: synonym.dic) 166 | -d string description comment to be embedded on dictionary 167 | ``` 168 | 169 | ## 開発者向け 170 | 171 | ### Code Format 172 | 173 | `scripts/lint.sh` を実行して、コードが正しいフォーマットかを確認してください。 174 | Run `scripts/lint.sh` to check if your code is formatted correctly. 175 | 176 | `flake8` `flake8-import-order` `flake8-builtins` が必要です。 177 | You need packages `flake8` `flake8-import-order` `flake8-builtins`. 178 | 179 | ### Test 180 | 181 | `scripts/test.sh` を実行してテストしてください。 182 | Run `scripts/test.sh` to run the tests. 183 | 184 | ## Contact 185 | 186 | chikkarpyは[WAP Tokushima Laboratory of AI and NLP](http://nlp.worksap.co.jp/)によって開発されています。 187 | chikkarpy is developed by WAP Tokushima Laboratory of AI and NLP. 
188 | 189 | 開発者やユーザーの方々が質問したり議論するためのSlackワークスペースを用意しています。 190 | Open an issue, or come to our Slack workspace for questions and discussion. 191 | - https://sudachi-dev.slack.com/ ([招待を受ける/Get invitation](https://join.slack.com/t/sudachi-dev/shared_invite/enQtMzg2NTI2NjYxNTUyLTMyYmNkZWQ0Y2E5NmQxMTI3ZGM3NDU0NzU4NGE1Y2UwYTVmNTViYjJmNDI0MWZiYTg4ODNmMzgxYTQ3ZmI2OWU)) 192 | -------------------------------------------------------------------------------- /chikkarpy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .chikkar import Chikkar 16 | 17 | from pkg_resources import get_distribution, DistributionNotFound 18 | try: 19 | __version__ = get_distribution(__name__).version 20 | except DistributionNotFound: 21 | # package is not installed 22 | pass 23 | -------------------------------------------------------------------------------- /chikkarpy/chikkar.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class Chikkar(object):
    """A container of synonym dictionaries.

    Dictionaries are registered with ``add_dictionary`` and searched with
    ``find``. The dictionary added last is consulted first.
    """

    def __init__(self):
        # Kept in search order: index 0 is the most recently added dictionary.
        self._dictionaries = []
        # While False, ``find`` drops every synonym whose ``is_noun`` is falsy.
        self._can_search_verb = False

    def enable_verb(self):
        """Enable verb and adjective synonyms.

        After this method is called, ``self.find()`` also returns synonyms
        that are not flagged as nouns.
        """
        self._can_search_verb = True

    def add_dictionary(self, dictionary):
        """Add a synonym dictionary.

        Adds a ``dictionary`` to be used for search. When searching, the
        dictionary added later takes precedence.

        Args:
            dictionary (Dictionary): a synonym dictionary
        """
        # Prepending (rather than appending) is what gives later additions
        # priority, because ``find`` walks the list front to back.
        self._dictionaries.insert(0, dictionary)

    def find(self, word, group_ids=None):
        """Return synonyms for the specified word.

        Dictionaries are consulted in priority order (most recently added
        first). The first dictionary whose ``lookup`` yields at least one
        synonym group ID determines the result; the remaining dictionaries
        are not consulted, even if that dictionary contributes no synonyms.

        Whether ``word`` or ``group_ids`` is used as the search key is decided
        by each dictionary's ``lookup(word, group_ids)``. According to the
        project README, group IDs take precedence for dictionaries created
        with ``enable_trie=False``.

        If ``enable_verb`` has not been called, only noun synonyms are
        returned.

        Args:
            word (str): keyword
            group_ids (list[int]): synonym group IDs

        Returns:
            list[str]: a list of synonym head words (empty if nothing matched)

        Raises:
            ValueError: propagated from ``gather_head_word`` when a matched
                group does not contain ``word``.
        """
        for dictionary in self._dictionaries:
            gids = dictionary.lookup(word, group_ids)
            # ``len`` is used on purpose: the concrete sequence type returned
            # by ``lookup`` is not known here, so truthiness is not assumed.
            if len(gids) == 0:
                continue

            synonyms = []
            for gid in gids:
                head_words = self.gather_head_word(word, gid, dictionary)
                if head_words:  # skips both ``None`` and empty lists
                    synonyms.extend(head_words)
            return synonyms

        return []

    def gather_head_word(self, word, group_id, dictionary):
        """Search synonyms by the ``group_id`` from the ``dictionary``.

        Args:
            word (str): keyword
            group_id (int): synonym group ID
            dictionary (Dictionary): a synonym dictionary

        Returns:
            list[str] | None: head words of the synonyms in the group,
            excluding ``word`` itself and, unless ``enable_verb`` was called,
            excluding entries that are not nouns.

            Returns ``None`` in the following cases:
                1. The synonym group with the ``group_id`` does not exist in
                   the ``dictionary``.
                2. The ``word`` is ambiguous, which is not a trigger of
                   synonym expansion.

        Raises:
            ValueError: The ``group_id`` is defined in the dictionary, but the
                ``word`` does not exist in the group.
        """
        synonym_group = dictionary.get_synonym_group(group_id)
        if synonym_group is None:
            return None

        looked_up = synonym_group.lookup(word)
        if looked_up is None:
            raise ValueError(
                "The dictionary (``{}``) has a group ID of {}, "
                "but the key (``{}``) does not exist in the group.".format(dictionary.filename, group_id, word)
            )
        if looked_up.has_ambiguity:
            return None

        return [
            synonym.head_word
            for synonym in synonym_group.get_synonyms()
            if synonym.head_word != word
            and (self._can_search_verb or synonym.is_noun)
        ]
import Chikkar  # second half of the ``from . import Chikkar`` statement begun on the previous physical line
from .dictionarylib import Dictionary
from .dictionarylib.dictionarybuilder import DictionaryBuilder
from .dictionarylib.dictionaryheader import DictionaryHeader
from .dictionarylib.dictionaryversion import SYSTEM_DICT_VERSION_1


def _set_default_subparser(self, name, args=None):
    """Set a default subparser

    copy and modify code from https://bitbucket.org/ruamel/std.argparse

    The default sub-command is inserted only when the inspected argument list
    contains neither a help flag nor the name of a registered sub-command.

    Args:
        name (str): name of the sub-command to use as default
        args (list[str] | None): argument list to inspect and patch in place.
            ``None`` means ``sys.argv`` (whose first element is skipped).
    """
    # Inspect the list that will actually be parsed; the previous version
    # always looked at sys.argv, even when an explicit ``args`` was supplied.
    argv = sys.argv[1:] if args is None else args

    if any(arg in ('-h', '--help') for arg in argv):
        # global help if no subparser
        return

    subparser_names = set()
    for action in self._subparsers._actions:
        if isinstance(action, argparse._SubParsersAction):
            subparser_names.update(action._name_parser_map)
    if subparser_names.intersection(argv):
        return

    # insert default in first position, this implies no
    # global options without a sub_parsers specified
    if args is None:
        sys.argv.insert(1, name)
    else:
        args.insert(0, name)


argparse.ArgumentParser.set_default_subparser = _set_default_subparser


def print_version():
    """Prints the chikkarpy version to the standard output."""
    from . import __version__
    print('chikkarpy {}'.format(__version__))


def search_synonyms(enable_verb, dictionaries, input_, stdout_logger):
    """Looks up every input line and logs ``word<TAB>synonym,synonym,...``.

    The searcher and its dictionaries are created once and reused for all
    lines (they used to be rebuilt, and every dictionary file re-opened, for
    each single line). As a consequence the dictionaries are opened even
    when ``input_`` turns out to be empty.

    Args:
        enable_verb (bool): ``True`` to also search verb and adjective synonyms
        dictionaries (list[str | None]): dictionary file paths;
            ``None`` selects the system synonym dictionary
        input_ (Iterable[str]): lines to look up, one keyword per line
        stdout_logger (logging.Logger): a logger that receives the results
    """
    chikkar = Chikkar()
    if enable_verb:
        chikkar.enable_verb()
    for dictionary in dictionaries:
        chikkar.add_dictionary(Dictionary(filename=dictionary))

    for word in input_:
        word = word.rstrip('\n')
        stdout_logger.info("{}\t{}".format(word, ','.join(chikkar.find(word))))


def _command_search(args, print_usage):
    """Runs the ``search`` sub-command.

    Args:
        args (argparse.Namespace): parsed arguments of the ``search`` parser
        print_usage (Callable): usage printer of the parser (unused here)
    """
    if args.version:
        print_version()
        return

    stdout_logger = logging.getLogger(__name__)

    output = open(args.fpath_out, "w", encoding="utf-8") if args.fpath_out else sys.stdout

    handler = logging.StreamHandler(output)
    handler.setLevel(logging.DEBUG)
    stdout_logger.addHandler(handler)
    stdout_logger.setLevel(logging.DEBUG)
    stdout_logger.propagate = False

    try:
        # The context manager closes the input files once the search is done.
        with fileinput.input(args.in_files, openhook=fileinput.hook_encoded("utf-8")) as input_:
            search_synonyms(args.enable_verb, args.dictionaries, input_, stdout_logger)
    finally:
        # Detach the handler so that a later call does not write to a closed
        # stream or emit every result twice.
        stdout_logger.removeHandler(handler)
        if args.fpath_out:
            output.close()


def _input_files_checker(args, print_usage):
    """Exits with status 1 if any of the input files does not exist.

    Args:
        args (argparse.Namespace): parsed arguments providing ``in_files``
        print_usage (Callable): usage printer called before the error message
    """
    for file in args.in_files:
        if not os.path.exists(file):
            print_usage()
            print('{}: error: {} doesn\'t exist'.format(__name__, file), file=sys.stderr)
            # sys.exit instead of the ``exit`` helper, which only exists
            # when the ``site`` module has been loaded.
            sys.exit(1)


def build_dictionary(input_file, output_file, description):
    """Builds a binary synonym dictionary from a csv file.

    Args:
        input_file (str): path of the source dictionary (csv)
        output_file (str): path of the binary dictionary to write
        description (str): a comment embedded in the dictionary header
    """
    header = DictionaryHeader(SYSTEM_DICT_VERSION_1, int(time.time()), description)
    with open(output_file, 'wb') as wf:
        wf.write(header.to_byte())

        builder = DictionaryBuilder()
        builder.build(input_file, wf)


def _command_build(args, print_usage):
    """Runs the ``build`` sub-command.

    Args:
        args (argparse.Namespace): parsed arguments of the ``build`` parser
        print_usage (Callable): usage printer of the parser (unused here)
    """
    build_dictionary(args.input_file, args.out_file, args.description)


def main():
    """Entry point of the command line interface.

    ``search`` is used as the sub-command when none is given.
    """
    parser = argparse.ArgumentParser(description="Japanese Morphological Analyzer")

    subparsers = parser.add_subparsers(description='')

    # root, search synonyms
    parser_ss = subparsers.add_parser('search', help='(default) see `search -h`', description='Search synonyms')
    parser_ss.add_argument('-d', dest='dictionaries', metavar='file', nargs=argparse.ZERO_OR_MORE, default=[None],
                           help='synonym dictionary (default: system synonym dictionary)')
    parser_ss.add_argument('-ev', dest='enable_verb', action='store_true', default=False,
                           help='Enable verb and adjective synonyms.')
    parser_ss.add_argument('-o', dest='fpath_out', metavar='file', help='the output file')
    parser_ss.add_argument('in_files', metavar='file', nargs=argparse.ZERO_OR_MORE, help='text written in utf-8')
    parser_ss.add_argument('-v', '--version', action='store_true', dest='version', help='print chikkarpy version')
    parser_ss.set_defaults(handler=_command_search, print_usage=parser_ss.print_usage)

    # build dictionary parser
    parser_bd = subparsers.add_parser('build', help='see `build -h`', description='Build Synonym Dictionary')
    parser_bd.add_argument('-i', dest='input_file', metavar='file', required=True,
                           help='dictionary file (csv)')
    parser_bd.add_argument('-o', dest='out_file', metavar='file', default='synonym.dic', required=False,
                           help='output file (default: synonym.dic)')
    parser_bd.add_argument('-d', dest='description', metavar='string', default='', required=False,
                           help='description comment to be embedded on dictionary')

    parser_bd.set_defaults(handler=_command_build, print_usage=parser_bd.print_usage)

    parser.set_default_subparser('search')

    args = parser.parse_args()

    if hasattr(args, 'handler'):
        args.handler(args, args.print_usage)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /chikkarpy/config.py:
# --------------------------------------------------------------------------------
# Copyright (c) 2021 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shutil
import tempfile
from logging import getLogger
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlretrieve
from zipfile import ZipFile


# Directory (next to this module) that holds the downloaded dictionary.
DEFAULT_RESOURCEDIR = Path(__file__).absolute().parent / 'resources'
DEFAULT_RESOURCEDIR = DEFAULT_RESOURCEDIR.as_posix()


DICT_VERSION = "20200722"
DICT_PREFIX = "sudachi-synonym"
BINARY_NAME = "system_synonym.dic"

ZIP_URL = (
    "https://sudachi.s3-ap-northeast-1.amazonaws.com/sudachisynonym/"
    "{}-{}.zip".format(DICT_PREFIX, DICT_VERSION)
)
# File name of the archive (last path component of the URL).
ZIP_NAME = urlparse(ZIP_URL).path.split("/")[-1]
# Name of the directory the archive is unpacked to.
UNZIP_NAME = "{}-{}".format(DICT_PREFIX, DICT_VERSION)

logger = getLogger(__name__)


def download_dictionary():
    """Downloads the Sudachi synonym dictionary into ``DEFAULT_RESOURCEDIR``.

    Only logs a message when the resource directory already exists.

    The archive is fetched and unpacked inside a temporary directory, so the
    current working directory is not written to and nothing is left behind
    when the download or the extraction fails.
    """
    if os.path.exists(DEFAULT_RESOURCEDIR):
        logger.warning("Resource is already installed at `{}`.".format(DEFAULT_RESOURCEDIR))
        return

    logger.warning("Downloading the Sudachi Synonym dictionary (It may take a while) ...")

    with tempfile.TemporaryDirectory() as work_dir:
        zip_path = os.path.join(work_dir, ZIP_NAME)
        urlretrieve(ZIP_URL, zip_path)
        with ZipFile(zip_path) as z:
            z.extractall(work_dir)

        # shutil.move also works when the temporary directory and the package
        # are on different file systems, where os.rename fails.
        shutil.move(os.path.join(work_dir, UNZIP_NAME), DEFAULT_RESOURCEDIR)

    logger.warning("... downloaded and placed the dictionary at `{}`.".format(DEFAULT_RESOURCEDIR))


def get_system_dictionary_path():
    """Returns the path of the installed system synonym dictionary.

    Returns:
        str: the path of the system synonym dictionary

    Raises:
        FileNotFoundError: The dictionary file does not exist in ``DEFAULT_RESOURCEDIR``.
    """
    dictionary_path = os.path.join(DEFAULT_RESOURCEDIR, BINARY_NAME)
    if not os.path.exists(dictionary_path):
        raise FileNotFoundError("Synonym dictionary is not installed.")

    return dictionary_path

# --------------------------------------------------------------------------------
# /chikkarpy/dictionarylib/__init__.py:
# --------------------------------------------------------------------------------
# Copyright (c) 2021 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .dictionary import Dictionary

# --------------------------------------------------------------------------------
# /chikkarpy/dictionarylib/binarydictionary.py:
# --------------------------------------------------------------------------------
# Copyright (c) 2021 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import mmap

from .dictionaryheader import DictionaryHeader
from .dictionaryversion import is_dictionary
from .doublearraytrie import DoubleArrayTrie


class BinaryDictionary(object):
    """
    A memory-mapped synonym dictionary file with its parsed header and trie.
    """

    def __init__(self, bytes_, header, trie, offset):
        """Constructs a new dictionary.

        Args:
            bytes_ (mmap.mmap): a memory-mapped dictionary
            header (DictionaryHeader): a header of dictionary
            trie (DoubleArrayTrie): a double array trie
            offset (int): byte offset
        """
        self._bytes = bytes_
        self._header = header
        self._trie = trie
        self._offset = offset

    @staticmethod
    def _read_dictionary(filename, access=mmap.ACCESS_READ):
        """Reads the synonym dictionary from the specified file.

        Args:
            filename (str): the file path of a synonym dictionary
            access (int): access mode of the memory map

        Returns:
            tuple[mmap.mmap, DictionaryHeader, DoubleArrayTrie, int]:
                the memory map, the header, the trie and the byte offset
                just behind the trie

        Raises:
            ValueError: The header does not carry a known dictionary version.
        """
        with open(filename, 'rb') as system_dic:
            bytes_ = mmap.mmap(system_dic.fileno(), 0, access=access)

        try:
            offset = 0

            header = DictionaryHeader.from_bytes(bytes_, offset)
            offset += header.storage_size()

            if not is_dictionary(header.version):
                raise ValueError('invalid dictionary version')

            trie = DoubleArrayTrie(bytes_, offset)
            offset += trie.get_storage_size()
        except Exception:
            # Do not leak the mapping when the file is not a valid dictionary.
            try:
                bytes_.close()
            except BufferError:
                # A partially built trie may still hold a memoryview of the
                # mapping; the mapping is then released with that view.
                pass
            raise

        return bytes_, header, trie, offset

    @classmethod
    def from_system_dictionary(cls, filename):
        """Constructs a new dictionary and return a ``BinaryDictionary`` object.

        Args:
            filename (str): the file path of a synonym dictionary

        Returns:
            BinaryDictionary: a binary dictionary
        """
        args = cls._read_dictionary(filename)
        return cls(*args)

    def close(self):
        """Releases the trie and closes the memory map.

        Calling it again is harmless; ``trie`` is ``None`` afterwards.
        """
        # Drop the reference to the trie before closing the mapping, but keep
        # the attribute so that a second close() does not raise AttributeError.
        self._trie = None
        self._bytes.close()

    @property
    def bytes_(self):
        """mmap.mmap: a memory-mapped dictionary"""
        return self._bytes

    @property
    def header(self):
        """DictionaryHeader: a header of dictionary"""
        return self._header

    @property
    def trie(self):
        """DoubleArrayTrie | None: a double array trie (``None`` once closed)"""
        return self._trie

    @property
    def offset(self):
        """int: byte offset"""
        return self._offset

# --------------------------------------------------------------------------------
# /chikkarpy/dictionarylib/dictionary.py:
# --------------------------------------------------------------------------------
# Copyright (c) 2021 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .binarydictionary import BinaryDictionary
from .synonym_group_list import SynonymGroupList
from ..config import get_system_dictionary_path
from ..synonymgroup import SynonymGroup


class Dictionary(object):
    """
    A synonym dictionary read from a binary dictionary file
    """
    def __init__(self, filename=None, enable_trie=False):
        """Opens the synonym dictionary stored in the specified file.

        Unless ``enable_trie`` is ``True``, synonym group IDs handed to
        ``lookup`` win over a search by the headword.

        Args:
            filename (str | None): path of synonym dictionary file;
                ``None`` selects the system synonym dictionary
            enable_trie (bool): ``True`` to enable trie, otherwise ``False``
        """
        if filename is None:
            filename = get_system_dictionary_path()
        self.filename = filename
        self.dict_ = BinaryDictionary.from_system_dictionary(self.filename)
        self.enable_trie = enable_trie
        self.group_list = SynonymGroupList(self.dict_.bytes_, self.dict_.offset)

    def lookup(self, word, group_ids):
        """Returns the synonym group IDs to use for the specified headword.

        The given ``group_ids`` are handed back unchanged when the trie is
        disabled and ``group_ids`` is not ``None``; otherwise the headword is
        searched in the trie by exact match.

        Args:
            word (str): a headword to search for
            group_ids (list[int] | None): an array of synonym group IDs to search for

        Returns:
            list[int]: an array of synonym group IDs found, or an empty array if not found
        """
        trust_given_ids = group_ids is not None and not self.enable_trie
        if trust_given_ids:
            return group_ids

        encoded_word = word.encode('utf-8')
        return self.dict_.trie.lookup_by_exact_match(encoded_word)

    def get_synonym_group(self, group_id):
        """Fetches the group of synonyms registered under the specified ID.

        Args:
            group_id (int): a synonym group ID

        Returns:
            SynonymGroup | None: the group of synonyms with the specified ID, or None if no ID matches
        """
        synonym_group = self.group_list.get_synonym_group(group_id)
        return synonym_group

    def close(self):
        """Closes the underlying binary dictionary."""
        self.dict_.close()

# --------------------------------------------------------------------------------
# /chikkarpy/dictionarylib/dictionarybuilder.py:
# --------------------------------------------------------------------------------
# Copyright (c) 2021 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from io import BufferedWriter, TextIOWrapper
from logging import DEBUG, StreamHandler, getLogger

from dartsclone import DoubleArray

from sortedcontainers import SortedDict

from .flags import Flags
from .format import Acronym, Ambiguity, Column, Form, IsNoun, Variant
from .jtypedbytebuffer import JTypedByteBuffer
from ..synonym import Synonym


class SynonymWithGroupId:
    """
    A synonym entry paired with the ID of the group it was read from.
    """
    def __init__(self, group_id, synonym):
        """Constructs a synonym with its group ID

        Args:
            group_id (int): a group ID
            synonym (Synonym): a synonym object
        """
        self._synonym = synonym
        self._group_id = group_id

    @property
    def group_id(self):
        """int: a group ID"""
        return self._group_id

    @property
    def headword(self):
        """the head word of the wrapped synonym"""
        return self._synonym.head_word

    @property
    def lexeme_ids(self):
        """the lexeme IDs of the wrapped synonym"""
        return self._synonym.lexeme_ids

    @property
    def flags(self):
        """the flags of the wrapped synonym"""
        return self._synonym.flags

    @property
    def category(self):
        """the category of the wrapped synonym"""
        return self._synonym.category


class DictionaryBuilder:
    """
    A builder that converts a synonym dictionary source (csv) to the binary format.
    """
    # Longest string length that is written as a single byte.
    __BYTE_MAX_VALUE = 127

    @staticmethod
    def __default_logger():
        """Sets and returns a default logging.

        The stream handler is attached only once, so that creating several
        builders does not print every message several times.

        Returns:
            logging.Logger: a default logger
        """
        logger = getLogger(__name__)
        logger.setLevel(DEBUG)
        logger.propagate = False
        if not logger.handlers:
            handler = StreamHandler()
            # Messages carry their own line breaks ("...done\n").
            handler.terminator = ""
            handler.setLevel(DEBUG)
            logger.addHandler(handler)

        return logger

    def __init__(self, *, logger=None):
        """Constructs a builder.

        Args:
            logger (logging.Logger | None): a logger for progress messages;
                a default logger is used if omitted
        """
        self.byte_buffer = JTypedByteBuffer()
        self.trie_keys = SortedDict()
        self.synonym_groups = []
        self.is_dictionary = False
        self.logger = logger or self.__default_logger()

    def build(self, input_path, out_stream):
        """Builds the synonym dictionary from the specified input file and writes it to the specified output.

        Args:
            input_path (str): an input file path
            out_stream (BufferedWriter): a seekable binary output stream
        """
        self.logger.info('reading the source file...')
        with open(input_path, 'r', encoding='utf-8') as rf:
            self.build_synonym(rf)
        self.write_trie(out_stream)
        self.write_synonym_groups(out_stream)

    def build_synonym(self, synonym_input_stream):
        """Reads lines in the specified input file.

        Blank lines separate synonym groups; all entries of a group must
        share one group ID.

        Args:
            synonym_input_stream (TextIOWrapper): an input stream

        Raises:
            ValueError: Group ID is changed in a group.
        """
        block = []
        line_no = -1
        group_id = -1
        try:
            # NOTE(review): the index is 0-based, so the logged line number
            # is one less than the number shown by an editor.
            for i, row in enumerate(synonym_input_stream):
                line_no = i
                if not row or row.isspace():
                    if len(block) == 0:
                        continue
                    else:
                        self.synonym_groups.append(block)
                        block = []
                        group_id = -1
                else:
                    entry = self.parse_line(row)
                    if not entry:
                        continue
                    if group_id < 0:
                        group_id = entry.group_id
                    elif group_id != entry.group_id:
                        raise ValueError("Group ID is changed in block.")
                    self.add_to_trie(entry.headword, group_id)
                    block.append(entry)
            if len(block) > 0:
                self.synonym_groups.append(block)
        except Exception as e:
            if line_no >= 0:
                # Reporting must not fail itself: exceptions may carry no
                # arguments and in-memory streams have no ``name``.
                reason = e.args[0] if e.args else e
                source = getattr(synonym_input_stream, 'name', 'input stream')
                self.logger.error(
                    '{} at line {} in {}\n'.format(reason, line_no, source))
            raise

    def parse_line(self, line):
        """Parses a line in a dictionary file (csv).

        Args:
            line (str): each line in a csv file

        Returns:
            SynonymWithGroupId | None: encoded line, or ``None`` if the
                ambiguity column marks the line as invalid

        Raises:
            ValueError: Too few columns in a specified line
        """
        cols = line.split(",")
        if len(cols) <= max(map(int, Column)):
            raise ValueError('Too few columns. {} <= n are allowed.'.format(max(map(int, Column))))
        if int(cols[Column.AMBIGUITY]) == Ambiguity.INVALID:
            return None

        group_id = int(cols[Column.GROUP_ID])

        # NOTE(review): with an empty LEXEME_IDS column this falls back to the
        # GROUP_ID *string*, not a list of ints; write_short_array would then
        # iterate its characters. Confirm the intended fallback.
        lexeme_ids = cols[Column.GROUP_ID] if cols[Column.LEXEME_IDS] == "" else list(map(int, cols[Column.LEXEME_IDS].split("/")))
        headword = cols[Column.HEAD_WORD]
        _is_noun = self.parse_boolean(cols[Column.IS_NOUN], IsNoun.FALSE, IsNoun.TRUE)
        _has_ambiguity = self.parse_boolean(cols[Column.AMBIGUITY], Ambiguity.FALSE, Ambiguity.TRUE)
        _form_type = self.parse_int(cols[Column.FORM_TYPE], max(map(int, Form)))
        _acronym_type = self.parse_int(cols[Column.ACRONYM_TYPE], max(map(int, Acronym)))
        _variant_type = self.parse_int(cols[Column.VARIANT_TYPE], max(map(int, Variant)))
        flags = Flags(_has_ambiguity, _is_noun, _form_type, _acronym_type, _variant_type)
        category = cols[Column.CATEGORY]

        entry = SynonymWithGroupId(group_id, Synonym(headword, lexeme_ids, flags, category))

        return entry

    @staticmethod
    def parse_boolean(s, false_value, true_value):
        """Parses and validates a str-type boolean value.

        Args:
            s (str): a str-type boolean value
            false_value (int): false value
            true_value (int): true value

        Returns:
            bool: validated and parsed value

        Raises:
            ValueError: ``s`` is neither ``false_value`` nor ``true_value``
        """
        v = int(s)
        if v == false_value:
            return False
        elif v == true_value:
            return True
        else:
            raise ValueError("'{}' is an invalid value. '{}' or '{}' are allowed.".format(s, false_value, true_value))

    @staticmethod
    def parse_int(s, limit):
        """Parses and validates a str-type numeric value.

        Args:
            s (str): a str-type numeric value
            limit (int): an allowed maximum value

        Returns:
            int: validated and parsed value

        Raises:
            ValueError: the value is negative or greater than ``limit``
        """
        v = int(s)
        if v < 0 or v > limit:
            raise ValueError("'{}' is an invalid value. 0 <= n <= '{}' are allowed.".format(s, limit))
        return v

    def add_to_trie(self, headword, group_id):
        """Adds ``headword``-``group_id`` pairs to a trie.

        Args:
            headword (str): a headword
            group_id (int): a synonym group ID
        """
        key = headword.encode('utf-8')
        self.trie_keys.setdefault(key, []).append(group_id)

    def write_trie(self, io_out):
        """Writes ``headword``-``group_id`` pairs to the specified output file.

        The trie (size, then array) is written first, followed by the
        word-ID table (size, then content).

        Args:
            io_out (BufferedWriter): an output stream
        """
        trie = DoubleArray()
        keys = []
        vals = []
        id_table = JTypedByteBuffer()
        for key, ids in self.trie_keys.items():
            keys.append(key)
            # The value stored in the trie is the position of the ID list in the table.
            vals.append(id_table.tell())
            id_table.write_int(len(ids), 'byte')
            for _id in ids:
                id_table.write_int(_id, 'int')

        self.logger.info('building the trie...')
        trie.build(keys, lengths=[len(k) for k in keys], values=vals)
        self.logger.info('done\n')
        self.logger.info('writing the trie...')
        self.byte_buffer.clear()
        self.byte_buffer.write_int(trie.size(), 'int')
        self.byte_buffer.seek(0)
        io_out.write(self.byte_buffer.read())
        self.byte_buffer.clear()
        io_out.write(trie.array())
        self.__logging_size(trie.size() * 4 + 4)
        trie.clear()
        del trie

        self.logger.info('writing the word-ID table...')
        self.byte_buffer.write_int(id_table.tell(), 'int')
        self.byte_buffer.seek(0)
        io_out.write(self.byte_buffer.read())
        self.byte_buffer.clear()
        id_table.seek(0)
        io_out.write(id_table.read())
        self.__logging_size(id_table.tell() + 4)
        del id_table

    def write_synonym_groups(self, io_out):
        """Writes synonym groups to the specified output file.

        Space for the offset table is skipped first, the groups are written
        behind it, and the offset table is filled in at the end.

        Args:
            io_out (BufferedWriter): an output stream
        """
        mark = io_out.tell()
        # Reserve the group count (4 bytes) and an (ID, offset) int pair per group.
        io_out.seek(mark + 4 * len(self.synonym_groups) * 2 + 4)
        offsets = JTypedByteBuffer()
        offsets.write_int(len(self.synonym_groups), 'int')
        self.logger.info('writing the word_infos...')
        base = io_out.tell()
        for entries in self.synonym_groups:
            if len(entries) == 0:
                continue
            offsets.write_int(entries[0].group_id, 'int')
            offsets.write_int(io_out.tell(), 'int')

            self.byte_buffer.write_int(len(entries), 'short')
            for entry in entries:
                self.write_string(entry.headword)
                self.write_short_array(entry.lexeme_ids)
                self.byte_buffer.write_int(entry.flags.encode(), 'short')
                self.write_string(entry.category)
            self.byte_buffer.seek(0)
            io_out.write(self.byte_buffer.read())
            self.byte_buffer.clear()

        self.__logging_size(io_out.tell() - base)
        self.logger.info('writing synonym groups offsets...')
        io_out.seek(mark)
        offsets.seek(0)
        io_out.write(offsets.read())
        self.__logging_size(offsets.tell())

    def write_string(self, text):
        """Converts a string to bytes and writes it to a buffer.

        The string is preceded by its length, where a character outside the
        Basic Multilingual Plane counts as two.

        Args:
            text (str): a string
        """
        len_ = 0
        for c in text:
            if 0x10000 <= ord(c) <= 0x10FFFF:
                len_ += 2
            else:
                len_ += 1
        self.write_string_length(len_)
        self.byte_buffer.write_str(text)

    def write_short_array(self, array):
        """Converts a list of short to bytes and writes it to a buffer.

        Args:
            array (list[int]): a list of short
        """
        self.byte_buffer.write_int(len(array), 'byte')
        for item in array:
            self.byte_buffer.write_int(item, 'short')

    def write_string_length(self, len_):
        """Converts a length of a string and writes it to a buffer.

        Lengths up to 127 take one byte; longer ones take two bytes with the
        highest bit of the first byte set.

        Args:
            len_ (int): a length of a string
        """
        if len_ <= self.__BYTE_MAX_VALUE:
            self.byte_buffer.write_int(len_, 'byte')
        else:
            self.byte_buffer.write_int((len_ >> 8) | 0x80, 'byte')
            self.byte_buffer.write_int((len_ & 0xFF), 'byte')

    def __logging_size(self, size):
        """Logs the number of bytes written."""
        self.logger.info('{} bytes\n'.format(size))

# --------------------------------------------------------------------------------
# /chikkarpy/dictionarylib/dictionaryheader.py:
# --------------------------------------------------------------------------------
# Copyright (c) 2021 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import struct

from . import dictionaryversion
from .jtypedbytebuffer import JTypedByteBuffer


class DictionaryHeader(object):
    """
    A header of a dictionary file.
    """
    __DESCRIPTION_SIZE = 256
    # version (8 bytes) + creation time (8 bytes) + description
    __STORAGE_SIZE = 8 + 8 + __DESCRIPTION_SIZE

    def __init__(self, version, create_time, description):
        """Constructs a dictionary header.

        Args:
            version (int): a dictionary version ID
            create_time (int): dictionary creation time (unix time)
            description (str): description of a dictionary
        """
        self._version = version
        self._create_time = create_time
        self._description = description

    @classmethod
    def from_bytes(cls, bytes_, offset):
        """Reads the dictionary header from the specified byte object and returns a ``DictionaryHeader`` object.

        Args:
            bytes_ (mmap.mmap): a memory-mapped dictionary
            offset (int): byte offset

        Returns:
            DictionaryHeader: a dictionary header
        """
        version, create_time = struct.unpack_from("<2Q", bytes_, offset)
        offset += 16

        # The description ends at the first NUL byte or at the end of its field.
        len_ = 0
        while len_ < cls.__DESCRIPTION_SIZE:
            if bytes_[offset + len_] == 0:
                break
            len_ += 1
        description = bytes_[offset:offset + len_].decode("utf-8")
        return cls(version, create_time, description)

    def storage_size(self):
        """int: a storage size of the dictionary header"""
        return self.__STORAGE_SIZE

    def to_byte(self):
        """DictionaryHeader to binary converter.

        Returns:
            bytes: a binarized dictionary header

        Raises:
            ValueError: The encoded description is longer than 256 bytes.
        """
        buf = JTypedByteBuffer(b'\x00' * (16 + self.__DESCRIPTION_SIZE))
        buf.seek(0)
        buf.write_int(self.version, 'long', signed=False)
        buf.write_int(self.create_time, 'long')
        dbesc = self.description.encode('utf-8')
        if len(dbesc) > self.__DESCRIPTION_SIZE:
            raise ValueError('description is too long')
        buf.write(dbesc)
        return buf.getvalue()

    @property
    def version(self):
        """int: a dictionary version ID"""
        return self._version

    @property
    def create_time(self):
        """int: dictionary creation time (unix time)"""
        return self._create_time

    @property
    def description(self):
        """str: description of a dictionary"""
        return self._description

    def is_dictionary(self):
        """Returns ``True`` if, and only if, the file is a system dictionary.

        Returns:
            bool: ``True`` if the file is a system dictionary, otherwise ``False``
        """
        return dictionaryversion.is_dictionary(self.version)

# --------------------------------------------------------------------------------
# /chikkarpy/dictionarylib/dictionaryversion.py:
# --------------------------------------------------------------------------------
# Copyright (c) 2021 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 | 15 | # the first version of system dictionaries 16 | SYSTEM_DICT_VERSION_1 = 0xeb5b87cc8b3f406c 17 | 18 | 19 | def is_dictionary(version): 20 | """Returns ``True`` if, and only if, the file is a system dictionary. 21 | 22 | Args: 23 | version (int): a dictionary version ID 24 | 25 | Returns: 26 | bool: ``True`` if the file is a system dictionary, otherwise ``False`` 27 | """ 28 | return version == SYSTEM_DICT_VERSION_1 29 | -------------------------------------------------------------------------------- /chikkarpy/dictionarylib/doublearraytrie.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import mmap 16 | 17 | from dartsclone import DoubleArray 18 | 19 | from . 
class DoubleArrayTrie(object):
    """A double-array trie paired with the ID table that holds its synonym group IDs."""

    def __init__(self, bytes_, offset):
        """Constructs a new double-array trie

        Starting at ``offset`` the buffer is read as: a 4-byte little-endian trie size,
        the trie array itself, and then the group ID table.

        Args:
            bytes_ (mmap.mmap): a memory-mapped dictionary
            offset (int): byte offset at which the trie section starts
        """
        position = offset
        self.trie = DoubleArray()
        bytes_.seek(position)

        # trie size (number of array units, each taking 4 bytes below)
        size = int.from_bytes(bytes_.read(4), 'little')
        position += 4

        # trie array: a view onto the mapped buffer, not a copy
        array = memoryview(bytes_)[position:position + size * 4]
        self.trie.set_array(array, size)
        # NOTE(review): total_size() is presumably size * 4 bytes -- confirm against dartsclone
        position += self.trie.total_size()

        # the group ID table follows the trie array
        self.group_id_table = idtable.IdTable(bytes_, position)
        position += self.group_id_table.storage_size()

        # number of bytes consumed by this section (size field + trie array + ID table)
        self.storage_size = position - offset

    def lookup_by_common_prefix(self, text, offset):
        """Searches group IDs with the `text` by common prefix.

        Args:
            text (bytes): an encoded text to search in
            offset (int): byte offset in ``text`` at which the search starts

        Yields:
            tuple[int, int]: a group ID and the length reported by the trie for the
                matched prefix plus ``offset``
        """
        key = text[offset:]
        result = self.trie.common_prefix_search(key, length=len(key))
        for index, length in result:
            # one trie hit may carry several group IDs
            group_ids = self.group_id_table.get(index)
            # the trie length is relative to ``key``; shift it back onto ``text``
            length += offset
            for group_id in group_ids:
                yield group_id, length

    def lookup_by_exact_match(self, text):
        """Searches group IDs with the ``text`` by exact match.

        Args:
            text (bytes): a head word to search for

        Returns:
            list[int]: a list of synonym group IDs, empty if the first element of the
                trie search result is negative
        """
        results = self.trie.exact_match_search(text)
        if results[0] < 0:
            return []
        else:
            return list(self.group_id_table.get(results[0]))

    def get_storage_size(self):
        """int: a storage size of the double-array trie"""
        return self.storage_size
class Flags:
    """Attribute flags of one synonym, convertible to and from a single integer.

    Bit layout used by ``from_int`` and ``encode`` (least significant bit first):
    bit 0 ambiguity, bit 1 noun, bits 2-4 form type, bits 5-6 acronym type,
    bits 7-8 variant type.
    """

    def __init__(self, has_ambiguity, is_noun, form_type, acronym_type, variant_type):
        """Builds the flags of a synonym from their individual values.

        Args:
            has_ambiguity (bool): whether the synonym is ambiguous
            is_noun (bool): whether the synonym is a noun
            form_type (int): word form type of the synonym
            acronym_type (int): acronym type of the synonym
            variant_type (int): variant type of the synonym
        """
        self._has_ambiguity, self._is_noun = has_ambiguity, is_noun
        self._form_type = form_type
        self._acronym_type = acronym_type
        self._variant_type = variant_type

    @classmethod
    def from_int(cls, flags):
        """Decodes an integer produced by ``encode`` back into a ``Flags`` object.

        Args:
            flags (int): the packed flags

        Returns:
            Flags: the decoded flags
        """
        return cls(
            (flags & 0b01) == 1,
            ((flags >> 1) & 0b01) == 1,
            (flags >> 2) & 0b111,
            (flags >> 5) & 0b11,
            (flags >> 7) & 0b11,
        )

    @property
    def has_ambiguity(self):
        """bool: whether the synonym is ambiguous"""
        return self._has_ambiguity

    @property
    def is_noun(self):
        """bool: whether the synonym is a noun"""
        return self._is_noun

    @property
    def form_type(self):
        """int: word form type of the synonym"""
        return self._form_type

    @property
    def acronym_type(self):
        """int: acronym type of the synonym"""
        return self._acronym_type

    @property
    def variant_type(self):
        """int: variant type of the synonym"""
        return self._variant_type

    def encode(self):
        """Packs this object into one integer.

        Returns:
            int: the packed flags
        """
        # (value, left shift) for every field, lowest bits first
        fields = (
            (1 if self.has_ambiguity else 0, 0),
            (1 if self.is_noun else 0, 1),
            (self.form_type, 2),
            (self.acronym_type, 5),
            (self.variant_type, 7),
        )
        packed = 0
        for value, shift in fields:
            packed |= value << shift
        return packed
class Column(IntEnum):
    """Column positions of one row of the synonym dictionary source.

    Format description:
    https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md
    """
    GROUP_ID = 0
    IS_NOUN = 1
    AMBIGUITY = 2
    LEXEME_IDS = 3
    FORM_TYPE = 4
    ACRONYM_TYPE = 5
    VARIANT_TYPE = 6
    CATEGORY = 7
    HEAD_WORD = 8


class IsNoun(IntEnum):
    """Values of the ``IS_NOUN`` column."""
    TRUE = 1
    # NOTE: the "false" value is 2, not 0
    FALSE = 2


class Ambiguity(IntEnum):
    """Values of the ``AMBIGUITY`` column."""
    FALSE = 0
    TRUE = 1
    INVALID = 2


class Form(IntEnum):
    """Values of the ``FORM_TYPE`` column."""
    # Typical form
    NONE = 0
    # Translated from another language
    TRANSLATION = 1
    # Alias or common name
    ALIAS = 2
    # Old name
    OLD_NAME = 3
    # Misused words
    MISNOMER = 4


class Acronym(IntEnum):
    """Values of the ``ACRONYM_TYPE`` column."""
    # Typical Abbreviations
    # NOTE(review): 0 presumably means "not an abbreviation" -- confirm against synonyms.md
    NONE = 0
    # Abbreviations written in Latin letters
    ALPHABET = 1
    # Abbreviations written outside the Latin alphabet
    OTHERS = 2


class Variant(IntEnum):
    """Values of the ``VARIANT_TYPE`` column."""
    # Typical form
    NONE = 0
    # Original spelling of foreign words or romanization of Japanese words
    ALPHABET = 1
    # Variant notation
    GENERAL = 2
    # Misspelled words
    MISSPELLED = 3
class IdTable(object):
    """A table of synonym group IDs stored inside a dictionary buffer.

    The table is a 4-byte little-endian size followed by ``size`` bytes of entries.
    Each entry is one count byte followed by that many little-endian unsigned
    32-bit group IDs.
    """

    def __init__(self, bytes_, offset):
        """Construct a ID table of synonyms.

        Args:
            bytes_ (mmap.mmap): a memory-mapped dictionary
            offset (int): byte offset at which the table starts
        """
        bytes_.seek(offset)
        self.size = int.from_bytes(bytes_.read(4), 'little')

        # entries start right after the 4-byte size field
        self.offset = offset + 4
        # a view onto the mapped buffer, not a copy; released again in __del__
        self._bytes_view = memoryview(bytes_)[self.offset: self.offset + self.size]

    def __del__(self):
        # __init__ may have raised before the view was created; in that case there is
        # nothing to release and touching the attribute would raise AttributeError.
        view = getattr(self, '_bytes_view', None)
        if view is not None:
            view.release()

    def storage_size(self):
        """int: a storage size of the ID table (the size field plus the entries)"""
        return 4 + self.size

    def get(self, index):
        """Reads bytes with synonym group IDs from the specified index and returns the group IDs.

        Args:
            index (int): offset of the entry, relative to the first entry of the table

        Returns:
            tuple[int]: the synonym group IDs of the entry
        """
        # the first byte of an entry is the number of IDs that follow it
        length = self._bytes_view[index]
        result = struct.unpack_from("<{}I".format(length), self._bytes_view, index + 1)
        return result
class JTypedByteBuffer(BytesIO):
    """
    A BytesIO with helpers for writing the typed, little-endian values of a dictionary
    """

    __ENDIAN = 'little'

    @classmethod
    def from_bytes(cls, bytes_io):
        """Creates a new buffer holding a copy of the content of ``bytes_io``."""
        content = bytes_io.getvalue()
        return cls(content)

    def write_int(self, int_, type_, signed=True):
        """Writes ``int_`` using the width of the named integer type.

        ``'byte'`` and ``'char'`` are always written unsigned, whatever ``signed`` says.
        A ``type_`` other than ``'byte'``, ``'int'``, ``'char'``, ``'short'`` or ``'long'``
        raises ``ValueError``.
        """
        # (type name, width in bytes, always unsigned)
        layouts = (
            ('byte', 1, True),
            ('int', 4, False),
            ('char', 2, True),
            ('short', 2, False),
            ('long', 8, False),
        )
        for name, width, always_unsigned in layouts:
            if type_ == name:
                break
        else:
            raise ValueError('{} is invalid type'.format(type_))
        if always_unsigned:
            signed = False
        encoded = int_.to_bytes(width, byteorder=self.__ENDIAN, signed=signed)
        self.write(encoded)

    def write_str(self, text):
        """Writes ``text`` encoded as UTF-16 little-endian."""
        encoded = text.encode('utf-16-le')
        self.write(encoded)

    def clear(self):
        """Empties the buffer and moves the position back to its start."""
        self.seek(0)
        self.truncate(0)
class SynonymGroupList(object):
    """Reads synonym groups out of the synonym group section of a dictionary buffer.

    The section starts with a signed 4-byte little-endian group count, followed by one
    (group ID, offset) pair of signed 4-byte little-endian integers per group. Each
    offset is an absolute position in the buffer at which the group's synonyms start.
    """

    def __init__(self, bytes_, offset):
        """Constructs a new synonym group list.

        Args:
            bytes_ (mmap.mmap): a memory-mapped dictionary
            offset (int): byte offset at which the section starts
        """
        self.bytes_ = bytes_
        # position of the buffer before this object started reading from it
        self.orig_pos = self.bytes_.tell()
        self.bytes_.seek(offset)
        self.size = int.from_bytes(self.bytes_.read(4), 'little', signed=True)

        self.group_id_to_offset = {}
        for _ in range(self.size):
            group_id = int.from_bytes(self.bytes_.read(4), 'little', signed=True)
            group_offset = int.from_bytes(self.bytes_.read(4), 'little', signed=True)
            self.group_id_to_offset[group_id] = group_offset

    def get_synonym_group(self, group_id):
        """Search a synonym group with the ``group_id`` and return the ``SynonymGroup`` object.

        Args:
            group_id (int): a synonym group ID

        Returns:
            SynonymGroup | None: the ``SynonymGroup`` with the ``group_id``, or ``None`` if no group is found.
        """
        if group_id not in self.group_id_to_offset:
            return None

        self.bytes_.seek(self.group_id_to_offset[group_id])

        # a group is a 2-byte synonym count followed by the synonyms themselves
        synonyms = []
        n = int.from_bytes(self.bytes_.read(2), 'little')
        for _ in range(n):
            # the field order below is the order in which the fields are stored
            head_word = self.buffer_to_string()
            lexeme_ids = self.buffer_to_short_array()
            flags = int.from_bytes(self.bytes_.read(2), 'little')
            category = self.buffer_to_string()
            synonyms.append(Synonym(head_word, lexeme_ids, Flags.from_int(flags), category))

        return SynonymGroup(group_id, synonyms)

    def buffer_to_string_length(self):
        """Reads a byte with a length of a subsequent string and returns the string length.

        A first byte below 128 is the length itself. Otherwise its low 7 bits are the high
        part of the length and the following byte is the low part.

        Returns:
            int: a string length
        """
        length = self.bytes_.read_byte()
        if length < 128:
            return length
        low = self.bytes_.read_byte()
        return ((length & 0x7F) << 8) | low

    def buffer_to_string(self):
        """Reads bytes with a string of the appropriate length and returns the string.

        Returns:
            str: a string
        """
        # the length counts UTF-16 code units, i.e. 2 bytes each
        length = self.buffer_to_string_length()
        return self.bytes_.read(2 * length).decode('utf-16-le')

    def buffer_to_short_array(self):
        """Reads a count byte followed by that many signed 16-bit values.

        Returns:
            list[int]: a list of short
        """
        length = self.bytes_.read_byte()
        _bytes = self.bytes_.read(2 * length)
        # '<' pins little-endian, like every other read of this format; without a prefix
        # struct falls back to the byte order of the machine running the code.
        return list(struct.unpack('<{}h'.format(length), _bytes))
class Synonym(object):
    """
    One entry of a synonym group
    """
    def __init__(self, head_word, lexeme_ids, flags, category):
        """Creates a synonym from its notation, lexeme IDs, flags and category.

        Args:
            head_word (str): the notation of the synonym
            lexeme_ids (list[int]): IDs of the lexemes within the synonym group
            flags (Flags): the flags of the synonym
            category (str): the category information of the synonym
        """
        self._head_word, self._lexeme_ids = head_word, lexeme_ids
        self._flags, self._category = flags, category

    # --- values stored on the synonym itself ---

    @property
    def head_word(self):
        """str: notation of the synonym"""
        return self._head_word

    @property
    def lexeme_ids(self):
        """list[int]: IDs of the lexemes this synonym corresponds to"""
        return self._lexeme_ids

    @property
    def category(self):
        """str: category information of the synonym"""
        return self._category

    @property
    def flags(self):
        """Flags: the flags of the synonym"""
        return self._flags

    # --- shortcuts that forward to the flags ---

    @property
    def has_ambiguity(self):
        """bool: whether the synonym is ambiguous"""
        flags = self._flags
        return flags.has_ambiguity

    @property
    def is_noun(self):
        """bool: whether the synonym is a noun"""
        flags = self._flags
        return flags.is_noun

    @property
    def form_type(self):
        """int: word form type of the synonym"""
        flags = self._flags
        return flags.form_type

    @property
    def acronym_type(self):
        """int: acronym type of the synonym"""
        flags = self._flags
        return flags.acronym_type

    @property
    def variant_type(self):
        """int: variant type of the synonym"""
        flags = self._flags
        return flags.variant_type
class SynonymGroup(object):
    """
    A synonym group: a group ID together with its synonyms
    """
    def __init__(self, group_id, synonyms):
        """Creates a group from its ID and its synonyms.

        Args:
            group_id (int): the ID of the synonym group
            synonyms (list[Synonym]): the synonyms belonging to the group
        """
        self._group_id, self._synonyms = group_id, synonyms

    def get_id(self):
        """Gives the ID of this group.

        Returns:
            int: the group ID
        """
        return self._group_id

    def get_synonyms(self):
        """Gives the synonyms of this group.

        Returns:
            list[Synonym]: the synonyms of the group
        """
        return self._synonyms

    def lookup(self, word):
        """Finds the first synonym of this group whose head word equals ``word``.

        Args:
            word (str): the head word to look for

        Returns:
            Synonym | None: the matching synonym, or ``None`` when the group has none
        """
        matches = (candidate for candidate in self._synonyms if candidate.head_word == word)
        return next(matches, None)
#!/usr/bin/env bash
# Runs flake8 over the code base and checks that every Python file starts with the
# license header. Everything reported is printed; the exit status is 1 if anything
# was reported.

# Work from the repository root; give up if it cannot be reached.
cd "$(dirname "$0")" && cd .. || exit 1

LOG=scripts/.log
# Start from an empty log so that leftovers of an interrupted run are not reported again.
: > "${LOG}"

flake8 --show-source --config=scripts/flake8.cfg setup.py chikkarpy/ tests/ >> "${LOG}" 2>&1

# Collect the files to check; NUL-separated so that unusual file names stay intact.
array=()
while IFS= read -r -d '' FILE; do
    array+=( "${FILE}" )
done < <(find ./chikkarpy ./tests -type f -name "*.py" -print0)
array+=( ./setup.py )

HEADER=$(cat scripts/license-header.txt)
for FILE in "${array[@]}"; do
    FILE_CONTENTS=$(cat "${FILE}")
    # Quoting the header makes it match literally instead of as a glob pattern.
    if [[ ${FILE_CONTENTS} != "${HEADER}"* ]]; then
        echo "invalid license header on ${FILE}" >> "${LOG}" 2>&1
    fi
done

cat "${LOG}"
ERROR_LINE_NUM=$(wc -l < "${LOG}")
rm "${LOG}"

if [ "${ERROR_LINE_NUM}" -gt 0 ]; then
    exit 1
fi
# Read the long description first so that the file is closed again right away
# instead of being left open until the interpreter collects it.
with open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

setup(
    name="chikkarpy",
    use_scm_version=True,
    setup_requires=['setuptools_scm'],
    description="Python version of chikkar, a library for using the Sudachi synonym dictionary",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/WorksApplications/chikkarpy",
    license="Apache-2.0",
    author="Works Applications",
    author_email="sudachi@worksap.co.jp",
    packages=find_packages(include=["chikkarpy", "chikkarpy.*"]),
    package_data={"": ["resources/*"]},
    entry_points={
        "console_scripts": ["chikkarpy=chikkarpy.command_line:main"]
    },
    install_requires=[
        "dartsclone~=0.9.0",
        "sortedcontainers>=2.1.0"
    ]
)

# Downloads the Sudachi Synonym dictionary
download_dictionary()
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/dictionarylib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/dictionarylib/test_dictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class TestDictionary(TestCase):
    """Tests ``Dictionary`` against the system dictionary built for the unit tests."""

    def setUp(self):
        # the same dictionary file is opened twice, once per value of the second argument
        here = os.path.dirname(os.path.abspath(__file__))
        dic_file = os.path.join(here, os.pardir, 'resources', 'system.dic')
        self.dict = Dictionary(dic_file, True)
        self.dict_group_id = Dictionary(dic_file, False)

    def tearDown(self):
        for dictionary in (self.dict, self.dict_group_id):
            dictionary.dict_.close()

    def test_lookup(self):
        expected = [6, 100006]

        # opened with ``True``: the given group IDs make no difference
        self.assertCountEqual(self.dict.lookup("open", group_ids=None), expected)
        self.assertCountEqual(self.dict.lookup("open", group_ids=[4]), expected)

        # opened with ``False``: given group IDs are returned as they are
        self.assertCountEqual(self.dict_group_id.lookup("open", group_ids=None), expected)
        self.assertCountEqual(self.dict_group_id.lookup("open", group_ids=[4]), [4])

    def test_get_synonyms(self):
        found = self.dict.get_synonym_group(6)
        self.assertTrue(found)
        self.assertEqual(found.get_id(), 6)

        # a group ID that the dictionary does not contain
        missing = self.dict.get_synonym_group(200)
        self.assertFalse(missing)
class TestDictionaryHeader(TestCase):
    """Tests ``DictionaryHeader`` against the system dictionary built for the unit tests."""

    def setUp(self):
        dic_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources', 'system.dic')
        with open(dic_file, 'rb') as f:
            bytes_ = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        self.header = DictionaryHeader.from_bytes(bytes_, 0)

    def test_version(self):
        # assertTrue(a, b) would use ``b`` as the failure message and pass for any
        # non-zero version; assertEqual actually compares the two values.
        self.assertEqual(self.header.version, SYSTEM_DICT_VERSION_1)

    def test_create_time(self):
        self.assertTrue(self.header.create_time > 0)

    def test_description(self):
        self.assertEqual(self.header.description, "the system dictionary for the unit tests")

    def test_is_dictionary(self):
        self.assertTrue(self.header.is_dictionary())
class TestDoubleArrayTrie(TestCase):
    """Tests ``DoubleArrayTrie`` against the system dictionary built for the unit tests."""

    ENCODING = "utf-8"

    def setUp(self):
        here = os.path.dirname(os.path.abspath(__file__))
        dic_file = os.path.join(here, os.pardir, 'resources', 'system.dic')
        with open(dic_file, 'rb') as f:
            bytes_ = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        # the trie is read from the position given by the header's storage size
        header = DictionaryHeader.from_bytes(bytes_, 0)
        self.trie = DoubleArrayTrie(bytes_, header.storage_size())

    def test_common_prefix_search(self):
        key = "open".encode(self.ENCODING)
        results = list(self.trie.lookup_by_common_prefix(key, 0))
        self.assertEqual(len(results), 2)
        first, second = results
        self.assertEqual(first[0], 6)
        self.assertEqual(first[1], 4)
        self.assertEqual(second[0], 100006)
        self.assertEqual(second[1], 4)

    def test_exact_match(self):
        known = "open".encode(self.ENCODING)
        unknown = "nothing".encode(self.ENCODING)
        self.assertCountEqual(self.trie.lookup_by_exact_match(known), [6, 100006])
        self.assertFalse(self.trie.lookup_by_exact_match(unknown))

    def test_storage_size(self):
        size = self.trie.get_storage_size()
        self.assertEqual(size, 1095)
class TestFlags(TestCase):
    """Round-trip tests: ``Flags.encode()`` followed by ``Flags.from_int()``."""

    def test_all_zero(self):
        """Both booleans unset and every type set to ``Form.NONE`` survive the round trip."""
        original = Flags(False, False, Form.NONE, Form.NONE, Form.NONE)
        restored = Flags.from_int(original.encode())

        self.assertFalse(restored.has_ambiguity)
        self.assertFalse(restored.is_noun)
        # Checked in the order form, acronym, variant.
        for attr in ("form_type", "acronym_type", "variant_type"):
            self.assertEqual(getattr(restored, attr), Form.NONE)

    def test_max(self):
        """Both booleans set plus ``MISNOMER``/``OTHERS``/``MISSPELLED`` survive the round trip."""
        original = Flags(True, True, Form.MISNOMER, Acronym.OTHERS, Variant.MISSPELLED)
        restored = Flags.from_int(original.encode())

        self.assertTrue(restored.has_ambiguity)
        self.assertTrue(restored.is_noun)
        expected_types = (
            ("form_type", Form.MISNOMER),
            ("acronym_type", Acronym.OTHERS),
            ("variant_type", Variant.MISSPELLED),
        )
        for attr, expected in expected_types:
            self.assertEqual(getattr(restored, attr), expected)
000005,1,0,3,0,0,0,(),店仕舞い,, 5 | 6 | 000006,1,0,1,0,0,0,(),開店,, 7 | 000006,1,0,2,0,0,0,(),始業,, 8 | 000006,1,0,3,0,0,0,(),営業開始,, 9 | 000006,1,1,4,0,0,0,(),店開き,, 10 | 000006,1,1,5,0,0,0,(),オープン,, 11 | 000006,1,1,5,0,0,1,(),open,, 12 | 13 | 100006,1,0,1,0,0,0,(),公然,, 14 | 100006,1,1,2,0,0,0,(),オープン,, 15 | 100006,1,1,2,0,0,1,(),open,, 16 | -------------------------------------------------------------------------------- /tests/resources/user.csv: -------------------------------------------------------------------------------- 1 | 1000001,1,0,1,0,0,0,(),開放,, 2 | 1000001,2,0,2,0,0,0,(),開け放す,, 3 | 1000001,2,0,3,0,0,0,(),開く,, 4 | 1000001,1,0,4,0,0,0,(),オープン,, 5 | 1000001,1,0,4,0,0,1,(),open,, -------------------------------------------------------------------------------- /tests/resources/user2.csv: -------------------------------------------------------------------------------- 1 | 1000001,1,0,4,0,0,0,(),オープン,, 2 | 1000001,1,1,4,0,0,1,(),open,, -------------------------------------------------------------------------------- /tests/test_chikkar.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class TestChikkar(TestCase):
    """Synonym lookup tests for ``Chikkar`` using the binary dictionaries in ``resources``."""

    def setUp(self):
        """Open the three test dictionaries and register only the system one."""
        dict_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources')

        self.system_dict = Dictionary(os.path.join(dict_dir, 'system.dic'), False)
        self.user_dict = Dictionary(os.path.join(dict_dir, 'user.dic'), True)
        self.user2_dict = Dictionary(os.path.join(dict_dir, 'user2.dic'), True)

        self.chikkar = Chikkar()
        self.chikkar.add_dictionary(self.system_dict)

    def tearDown(self):
        """Close the underlying ``dict_`` object of every dictionary opened in ``setUp``."""
        self.system_dict.dict_.close()
        self.user_dict.dict_.close()
        self.user2_dict.dict_.close()

    def test_find(self):
        """``find`` returns the expected synonyms for one word and a falsy result for the others."""
        self.assertCountEqual(self.chikkar.find("開店"), ["始業", "営業開始", "店開き", "オープン", "open"])
        self.assertFalse(self.chikkar.find("オープン"))
        self.assertFalse(self.chikkar.find("nothing"))

    def test_find_with_group_ids(self):
        """``find`` with explicit ``group_ids`` returns the same synonyms or a falsy result."""
        group_ids = [6]
        self.assertCountEqual(self.chikkar.find("開店", group_ids=group_ids), ["始業", "営業開始", "店開き", "オープン", "open"])
        self.assertFalse(self.chikkar.find("オープン", group_ids=group_ids))
        self.assertFalse(self.chikkar.find("nothing", group_ids=[0]))

    def test_find_oov_with_group_ids(self):
        """``find("nothing", group_ids=[6])`` must raise ``ValueError``."""
        with self.assertRaises(ValueError):
            # Only the raise matters here; asserting on a return value inside
            # this block would be dead code once the exception is thrown.
            self.chikkar.find("nothing", group_ids=[6])

    def test_find_with_user_dict(self):
        """Results for ``open`` change as the user dictionaries are added one after another."""
        self.chikkar.add_dictionary(self.user_dict)
        self.assertCountEqual(self.chikkar.find("open"), ["開放", "オープン"])
        self.chikkar.add_dictionary(self.user2_dict)
        self.assertFalse(self.chikkar.find("open"))
        self.assertCountEqual(self.chikkar.find("開店"), ["始業", "営業開始", "店開き", "オープン", "open"])

    def test_enable_verb(self):
        """After ``enable_verb()`` the lookup for ``open`` returns two additional entries."""
        self.chikkar.add_dictionary(self.user_dict)
        self.chikkar.enable_verb()
        self.assertCountEqual(self.chikkar.find("open"), ["開放", "開け放す", "開く", "オープン"])


# The class below originates from tests/test_synonymgroup.py.
class TestSynonymGroup(TestCase):
    """Tests for ``SynonymGroup`` built from two in-memory ``Synonym`` objects."""

    def setUp(self):
        """Create group 1 holding the synonyms ``aaa`` and ``bbb`` with shared flags."""
        flags = Flags(False, True, 0, 0, 0)
        self.synonym_a = Synonym("aaa", [1], flags, "")
        self.synonym_b = Synonym("bbb", [2], flags, "")
        self.group = SynonymGroup(1, [self.synonym_a, self.synonym_b])

    def test_get_id(self):
        """``get_id()`` returns the id passed to the constructor."""
        self.assertEqual(self.group.get_id(), 1)

    def test_get_synonyms(self):
        """``get_synonyms()`` returns both synonyms in order, exposing the fields set in ``setUp``."""
        synonyms = self.group.get_synonyms()
        self.assertEqual(len(synonyms), 2)

        s = synonyms[0]
        self.assertEqual(s.head_word, "aaa")
        self.assertFalse(s.has_ambiguity)
        self.assertTrue(s.is_noun)
        self.assertListEqual(s.lexeme_ids, [1])
        self.assertEqual(s.form_type, 0)
        self.assertEqual(s.acronym_type, 0)
        self.assertEqual(s.variant_type, 0)
        s = synonyms[1]
        self.assertEqual(s.head_word, "bbb")

    def test_lookup(self):
        """``lookup`` returns the matching synonym, or ``None`` for an unknown head word."""
        s = self.group.lookup("aaa")
        self.assertIsNotNone(s)
        self.assertEqual(s.head_word, "aaa")
        s = self.group.lookup("ccc")
        self.assertIsNone(s)