├── MANIFEST.in ├── tests ├── resources │ ├── rewrite_error_ignorelist.def │ ├── rewrite_error_dup.def │ ├── rewrite_error_replacelist.def │ ├── rewrite.def │ ├── dict │ │ ├── user2.csv │ │ ├── user.csv │ │ ├── matrix.def │ │ └── lex.csv │ ├── char.def │ ├── sudachi.json │ ├── numeric_sudachi.json │ ├── sudachi_large_user.json │ ├── unk.def │ └── joinnumeric │ │ └── char.def ├── __init__.py ├── plugin │ ├── __init__.py │ ├── test_join_katakana_oov_plugin.py │ ├── test_default_input_text_plugin.py │ ├── test_prolongedsoundmarkinput.py │ ├── test_numericparser.py │ └── test_join_numeric_plugin.py ├── dictionarylib │ ├── __init__.py │ ├── test_dictionaryheader.py │ ├── test_userdictionarybuilder.py │ └── test_doublearraylexicon.py ├── test_dictionary.py ├── test_large_userdict.py ├── mock_grammar.py ├── mock_inputtext.py ├── test_tokenizer.py ├── test_grammar.py └── test_switchdictionary.py ├── .github ├── FUNDING.yml └── workflows │ ├── python-publish.yml │ ├── python-publish-macos.yml │ ├── python-publish-windows.yml │ └── build.yml ├── requirements.txt ├── scripts ├── flake8.cfg ├── format.sh ├── checkheader.sh ├── license-header.txt └── test.sh ├── .gitattributes ├── sudachipy ├── latticenode.pxd ├── lattice.pxd ├── plugin │ ├── connect_cost │ │ ├── __init__.py │ │ ├── inhibitconnectioncost.py │ │ └── editconnectioncost.py │ ├── oov │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── oov_provider_plugin.py │ │ ├── simple_oov_plugin.py │ │ └── mecab_oov_plugin.py │ ├── input_text │ │ ├── __init__.py │ │ ├── input_text.py │ │ ├── utils.py │ │ ├── prolongedsoundmark.py │ │ └── default_input_text_plugin.py │ ├── path_rewrite │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── join_katakana_oov_plugin.py │ │ ├── path_rewrite_plugin.py │ │ └── join_numeric_plugin.py │ └── __init__.py ├── __init__.py ├── dictionarylib │ ├── __init__.py │ ├── wordidtable.py │ ├── categorytype.py │ ├── wordinfo.py │ ├── jtypedbytebuffer.py │ ├── lexicon.py │ ├── dictionaryversion.py │ ├── 
wordparameterlist.py │ ├── dictionaryheader.py │ ├── userdictionarybuilder.py │ ├── binarydictionary.py │ ├── wordinfolist.py │ ├── grammar.py │ ├── doublearraylexicon.py │ ├── lexiconset.py │ └── charactercategory.py ├── resources │ ├── sudachi.json │ ├── unk.def │ └── char.def ├── morpheme.py ├── morphemelist.py ├── utf8inputtext.py ├── latticenode.pyx ├── dictionary.py ├── config.py ├── utf8inputtextbuilder.py └── lattice.pyx ├── .gitignore └── setup.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE requirements.txt 2 | -------------------------------------------------------------------------------- /tests/resources/rewrite_error_ignorelist.def: -------------------------------------------------------------------------------- 1 | # there are two characters in ignore list 2 | 12 -------------------------------------------------------------------------------- /tests/resources/rewrite_error_dup.def: -------------------------------------------------------------------------------- 1 | # there are ad uplicated replacement. 
2 | 12 21 3 | 12 31 4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: WorksApplications 4 | -------------------------------------------------------------------------------- /tests/resources/rewrite_error_replacelist.def: -------------------------------------------------------------------------------- 1 | # there are three columns in replace list 2 | 12 21 31 3 | -------------------------------------------------------------------------------- /tests/resources/rewrite.def: -------------------------------------------------------------------------------- 1 | # ignore normalize list 2 | Ⅲ 3 | ⅲ 4 | ⼼ 5 | 6 | # replace char list 7 | ガ ガ 8 | ウ゛ ヴ 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sortedcontainers~=2.1.0 2 | dartsclone~=0.9.0 3 | # flake8 4 | # flake8-import-order 5 | # flake8-buitins 6 | -------------------------------------------------------------------------------- /tests/resources/dict/user2.csv: -------------------------------------------------------------------------------- 1 | ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,* 2 | かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,* 3 | -------------------------------------------------------------------------------- /scripts/flake8.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = \ 3 | E501, \ # line too long ( > _ characters) 4 | C901, \ # too complex 5 | max-line-length = 140 6 | exclude = __init__.py 7 | max-complexity = 10 8 | -------------------------------------------------------------------------------- /scripts/format.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd $(dirname $0) 4 | 5 | flake8 --show --config=flake8.cfg ../sudachipy 6 | flake8 --show --config=flake8.cfg ../tests 7 | 8 | cd .. 9 | scripts/checkheader.sh -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | 3 | *.def text 4 | *.in text 5 | *.json text 6 | *.md text 7 | *.py text 8 | *.txt text 9 | 10 | *.pyc binary 11 | *.pyd binary 12 | *.pyo binary 13 | *.pyw binary 14 | *.dic binary 15 | *.png binary 16 | *.jpg binary 17 | -------------------------------------------------------------------------------- /tests/resources/dict/user.csv: -------------------------------------------------------------------------------- 1 | ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,* 2 | 府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,* 3 | 東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3 4 | すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,* 5 | -------------------------------------------------------------------------------- /scripts/checkheader.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | HEADER=scripts/license-header.txt 4 | SIZE=`wc -c < "$HEADER"` 5 | 6 | RES=`find setup.py sudachipy tests -type f -name '*.py' -exec cmp -n "$SIZE" "$HEADER" {} \;` 7 | if [ -n "$RES" ]; then 8 | echo "$RES" | awk '{print "invalid license header on " $2}' >&2 9 | exit 1 10 | fi -------------------------------------------------------------------------------- /sudachipy/latticenode.pxd: -------------------------------------------------------------------------------- 1 | cdef class LatticeNode: 2 | 3 | cdef int begin 4 | cdef int end 5 | cdef int total_cost 6 | cdef int word_id 7 | cdef bint _is_oov 8 | cdef LatticeNode 
best_previous_node 9 | cdef bint is_connected_to_bos 10 | cdef object extra_word_info 11 | cdef object undefined_word_info 12 | cdef bint _is_defined 13 | cdef object lexicon 14 | cdef int left_id 15 | cdef int right_id 16 | cdef int cost 17 | 18 | -------------------------------------------------------------------------------- /sudachipy/lattice.pxd: -------------------------------------------------------------------------------- 1 | from .latticenode cimport LatticeNode 2 | 3 | cdef extern from "limits.h": 4 | cdef int INT_MAX 5 | 6 | cdef class Lattice: 7 | 8 | cdef int size 9 | cdef int capacity 10 | cdef LatticeNode eos_node 11 | 12 | cdef list end_lists 13 | cdef object grammar 14 | cdef object eos_params 15 | cdef const short[:,:] connect_costs 16 | 17 | cpdef void resize(self, int size) 18 | cpdef void insert(self, int begin, int end, LatticeNode node) 19 | cdef void connect_node(self, LatticeNode r_node) 20 | cdef void connect_eos_node(self) 21 | -------------------------------------------------------------------------------- /tests/resources/char.def: -------------------------------------------------------------------------------- 1 | 0x0030..0x0039 NUMERIC #0-9 2 | 0x0041..0x005A ALPHA #A-Z 3 | 0x0061..0x007A ALPHA #a-z 4 | 0x00C0..0x00FF ALPHA # Latin 1 #À->ÿ 5 | 0x3041..0x309F HIRAGANA 6 | 0x30A1..0x30FF KATAKANA 7 | 0x30A1 NOOOVBOW 8 | 0xFF66..0xFF9D KATAKANA 9 | 0xFF9E..0xFF9F KATAKANA 10 | 0x2E80..0x2EF3 KANJI # CJK Raidcals Supplement 11 | 0x2F00..0x2FD5 KANJI 12 | 0x3005 KANJI 13 | 0x3007 KANJI 14 | 0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extention 15 | 0x4E00..0x9FA5 KANJI 16 | 0xF900..0xFA2D KANJI 17 | 0xFA30..0xFA6A KANJI 18 | 0xFF10..0xFF19 NUMERIC 19 | 0xFF21..0xFF3A ALPHA 20 | 0xFF41..0xFF5A ALPHA 21 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /scripts/license-header.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/dictionarylib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | -------------------------------------------------------------------------------- /sudachipy/plugin/connect_cost/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .editconnectioncost import EditConnectionCostPlugin 16 | from .inhibitconnectioncost import InhibitConnectionPlugin 17 | -------------------------------------------------------------------------------- /sudachipy/plugin/oov/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .oov_provider_plugin import OovProviderPlugin 16 | from .mecab_oov_plugin import MeCabOovPlugin 17 | from .simple_oov_plugin import SimpleOovPlugin 18 | from .utils import get_oov_plugins 19 | -------------------------------------------------------------------------------- /tests/resources/sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "userDict" : [ "user.dic" ], 4 | "characterDefinitionFile" : "char.def", 5 | "inputTextPlugin" : [ 6 | { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" } 7 | ], 8 | "oovProviderPlugin" : [ 9 | { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin", 10 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 11 | "leftId" : 8, 12 | "rightId" : 8, 13 | "cost" : 6000 } 14 | ], 15 | "pathRewritePlugin" : [ 16 | { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin", 17 | "enableNormalize" : true }, 18 | { "class" : "sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin", 19 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 20 | "minLength" : 3 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /sudachipy/plugin/input_text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .input_text import InputTextPlugin 16 | from .default_input_text_plugin import DefaultInputTextPlugin 17 | from .prolongedsoundmark import ProlongedSoundMarkInputTextPlugin 18 | from .utils import get_input_text_plugins 19 | -------------------------------------------------------------------------------- /sudachipy/plugin/path_rewrite/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .join_katakana_oov_plugin import JoinKatakanaOovPlugin 16 | from .join_numeric_plugin import JoinNumericPlugin 17 | from .path_rewrite_plugin import PathRewritePlugin 18 | from .utils import get_path_rewrite_plugins 19 | -------------------------------------------------------------------------------- /tests/resources/numeric_sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "userDict" : [ "user.dic" ], 4 | "characterDefinitionFile" : "joinnumeric/char.def", 5 | "inputTextPlugin" : [ 6 | { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" } 7 | ], 8 | "oovProviderPlugin" : [ 9 | { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin", 10 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 11 | "leftId" : 8, 12 | "rightId" : 8, 13 | "cost" : 6000 } 14 | ], 15 | "pathRewritePlugin" : [ 16 | { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin", 17 | "enableNormalize" : true }, 18 | { "class" : "sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin", 19 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 20 | "minLength" : 3 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /tests/resources/sudachi_large_user.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "userDict" : [ "user.dic", "large_user.dic" ], 4 | "characterDefinitionFile" : "char.def", 5 | "inputTextPlugin" : [ 6 | { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" } 7 | ], 8 | "oovProviderPlugin" : [ 9 | { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin", 10 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 11 | "leftId" : 8, 12 | "rightId" : 8, 13 | "cost" : 6000 } 14 | ], 15 | "pathRewritePlugin" : [ 16 | { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin", 17 | "enableNormalize" : true }, 18 | { "class" : 
"sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin", 19 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 20 | "minLength" : 3 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /sudachipy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import utf8inputtextbuilder 16 | from . import tokenizer 17 | from . 
import config 18 | 19 | from pkg_resources import get_distribution, DistributionNotFound 20 | try: 21 | __version__ = get_distribution(__name__).version 22 | except DistributionNotFound: 23 | # package is not installed 24 | pass 25 | -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # print error message only when it fails 4 | # python unittest print message in stderr even if it succeed 5 | # You need to prepare system.dic in resources and tests/resources 6 | # see README 7 | 8 | set -e 9 | 10 | # build dictionaries 11 | if !(type sudachipy > /dev/null 2>&1); then 12 | python setup.py develop 13 | fi 14 | sudachipy build -o tests/resources/system.dic -d "the system dictionary for the unit tests" -m tests/resources/dict/matrix.def tests/resources/dict/lex.csv 15 | sudachipy ubuild -o tests/resources/user.dic -s tests/resources/system.dic tests/resources/dict/user.csv 16 | sudachipy ubuild -o tests/resources/large_user.dic -s tests/resources/system.dic tests/resources/dict/large_user.csv 17 | 18 | set +e 19 | 20 | # unittest 21 | RES=`python -m unittest discover tests -p '*test*.py' 2>&1` 22 | STATUS=$? 
23 | RES_TAIL=`echo "$RES" | tail -1` 24 | if [[ $RES_TAIL != "OK" ]]; then 25 | >&2 echo "$RES" 26 | fi 27 | 28 | exit $STATUS 29 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/python-publish-macos.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package for macOS 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ${{ matrix.os }} 11 | 12 | strategy: 13 | matrix: 14 | os: [macos-latest] 15 | architecture: [x64] 16 | python-version: [3.6, 3.7, 3.8] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | architecture: ${{ matrix.architecture }} 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m 
pip install --upgrade pip 28 | pip install setuptools wheel twine 29 | - name: Build and publish 30 | env: 31 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 32 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 33 | run: | 34 | python setup.py bdist_wheel 35 | twine upload dist/* 36 | -------------------------------------------------------------------------------- /.github/workflows/python-publish-windows.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package for Windows 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ${{ matrix.os }} 11 | 12 | strategy: 13 | matrix: 14 | os: [windows-latest] 15 | architecture: [x64, x86] 16 | python-version: [3.6, 3.7, 3.8] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | architecture: ${{ matrix.architecture }} 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install setuptools wheel twine 29 | - name: Build and publish 30 | env: 31 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 32 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 33 | run: | 34 | python setup.py bdist_wheel 35 | twine upload dist/* 36 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import grammar 16 | from . import charactercategory 17 | from . import categorytype 18 | from . import lexiconset 19 | from . import doublearraylexicon 20 | from . import dictionaryheader 21 | from .dictionaryversion import ( 22 | SYSTEM_DICT_VERSION_1, SYSTEM_DICT_VERSION_2, USER_DICT_VERSION_1, USER_DICT_VERSION_2, USER_DICT_VERSION_3, 23 | ) 24 | from .binarydictionary import BinaryDictionary 25 | -------------------------------------------------------------------------------- /sudachipy/plugin/input_text/input_text.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from abc import ABC, abstractmethod 16 | 17 | from sudachipy.utf8inputtextbuilder import UTF8InputTextBuilder 18 | 19 | 20 | class InputTextPlugin(ABC): 21 | 22 | Builder = UTF8InputTextBuilder 23 | 24 | @abstractmethod 25 | def set_up(self) -> None: 26 | raise NotImplementedError 27 | 28 | @abstractmethod 29 | def rewrite(self, builder: Builder) -> None: 30 | raise NotImplementedError 31 | -------------------------------------------------------------------------------- /sudachipy/plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #from sudachipy.plugin.oov import simple_oov_plugin 16 | #from sudachipy.plugin.oov import mecab_oov_plugin 17 | #from sudachipy.plugin.path_rewrite import join_numeric_plugin 18 | #from sudachipy.plugin.path_rewrite import join_katakana_oov_plugin 19 | #from sudachipy.plugin.input_text import default_input_text_plugin 20 | #from .connect_cost.inhibitconnectioncost import InhibitConnectionPlugin 21 | #from . import oov 22 | #from . import input_text 23 | #from . 
import path_rewrite 24 | -------------------------------------------------------------------------------- /sudachipy/resources/sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "", 3 | "characterDefinitionFile" : "char.def", 4 | "inputTextPlugin" : [ 5 | { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" }, 6 | { "class" : "sudachipy.plugin.input_text.ProlongedSoundMarkInputTextPlugin", 7 | "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"], 8 | "replacementSymbol": "ー"} 9 | ], 10 | "oovProviderPlugin" : [ 11 | { "class" : "sudachipy.plugin.oov.MeCabOovProviderPlugin", 12 | "charDef" : "char.def", 13 | "unkDef" : "unk.def" }, 14 | { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin", 15 | "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ], 16 | "leftId" : 5968, 17 | "rightId" : 5968, 18 | "cost" : 3857 } 19 | ], 20 | "pathRewritePlugin" : [ 21 | { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin", 22 | "enableNormalize" : true }, 23 | { "class" : "sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin", 24 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 25 | "minLength": 3 } 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/wordidtable.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import struct 16 | 17 | 18 | class WordIdTable(object): 19 | def __init__(self, bytes_, offset): 20 | bytes_.seek(offset) 21 | self.size = int.from_bytes(bytes_.read(4), 'little') 22 | self.offset = offset + 4 23 | self._bytes_view = memoryview(bytes_)[self.offset: self.offset + self.size] 24 | 25 | def __del__(self): 26 | self._bytes_view.release() 27 | 28 | def storage_size(self): 29 | return 4 + self.size 30 | 31 | def get(self, index): 32 | length = self._bytes_view[index] 33 | result = struct.unpack_from("<{}I".format(length), self._bytes_view, index + 1) 34 | return result 35 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: build 5 | 6 | on: 7 | push: 8 | branches: [ develop ] 9 | pull_request: 10 | branches: [ develop ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.9 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.9 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install flake8 27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | - name: Check license header 35 | run: | 36 | scripts/checkheader.sh 37 | - name: Test with unittest 38 | run: | 39 | python setup.py build_ext --inplace 40 | scripts/test.sh -------------------------------------------------------------------------------- /tests/resources/dict/matrix.def: -------------------------------------------------------------------------------- 1 | 10 10 2 | 0 0 0 3 | 0 1 863 4 | 0 2 2124 5 | 0 3 1032 6 | 0 4 591 7 | 0 5 -162 8 | 0 6 -79 9 | 0 7 887 10 | 0 8 447 11 | 0 9 -535 12 | 1 0 -3689 13 | 1 1 -3361 14 | 1 2 -7643 15 | 1 3 -3267 16 | 1 4 809 17 | 1 5 -1098 18 | 1 6 4606 19 | 1 7 4269 20 | 1 8 4567 21 | 1 9 1635 22 | 2 0 -1959 23 | 2 1 2457 24 | 2 2 811 25 | 2 3 840 26 | 2 4 903 27 | 2 5 -958 28 | 2 6 517 29 | 2 7 2037 30 | 2 8 1392 31 | 2 9 -193 32 | 3 0 -2288 33 | 3 1 1741 34 | 3 2 487 35 | 3 3 792 36 | 3 4 -1474 37 | 3 5 -3429 38 | 3 6 126 39 | 3 7 437 40 | 3 8 605 41 | 3 9 -547 42 | 4 0 -2809 43 | 4 1 -3584 44 | 4 2 -6743 45 | 4 3 -2869 46 | 4 4 -2805 47 | 4 5 -407 48 | 4 6 3422 49 | 4 7 5642 50 | 4 8 6382 51 | 4 9 2165 52 | 5 0 -509 53 | 5 1 -3665 54 | 5 2 -3882 55 | 5 3 -572 56 | 5 4 -1036 57 | 5 5 -54 58 | 5 6 2570 59 | 5 7 3319 60 | 5 8 4059 61 | 5 9 882 62 | 6 0 101 63 | 6 1 2933 64 | 6 2 2198 65 | 6 3 -2004 66 | 6 4 4392 67 | 6 5 4017 68 | 6 6 569 69 | 6 7 475 70 | 6 8 -390 71 | 6 9 852 72 | 7 0 -852 73 | 7 1 2079 74 | 7 2 1180 75 | 7 3 -3084 76 | 7 4 2010 77 | 7 5 1570 78 | 7 6 746 79 | 7 7 2341 80 | 7 8 2051 81 | 7 9 1393 82 | 8 0 -522 83 | 8 1 3354 84 | 8 2 2037 85 | 8 3 -2542 86 | 8 4 3071 87 | 8 5 2631 88 | 8 6 -352 89 | 8 7 2847 90 | 8 8 1134 91 | 8 9 1256 92 | 9 0 -975 93 | 9 1 2498 94 | 9 2 1690 95 | 9 3 -1523 96 | 9 4 3023 97 | 9 5 3139 98 | 9 6 2562 99 | 9 7 3962 100 | 9 8 418 101 | 9 9 -2490 102 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/categorytype.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from enum import Enum 16 | 17 | 18 | class CategoryType(Enum): 19 | DEFAULT = 1 20 | SPACE = 1 << 1 21 | KANJI = 1 << 2 22 | SYMBOL = 1 << 3 23 | NUMERIC = 1 << 4 24 | ALPHA = 1 << 5 25 | HIRAGANA = 1 << 6 26 | KATAKANA = 1 << 7 27 | KANJINUMERIC = 1 << 8 28 | GREEK = 1 << 9 29 | CYRILLIC = 1 << 10 30 | USER1 = 1 << 11 31 | USER2 = 1 << 12 32 | USER3 = 1 << 13 33 | USER4 = 1 << 14 34 | NOOOVBOW = 1 << 15 35 | 36 | def get_id(self): 37 | return self.id 38 | 39 | def get_type(self, id_): 40 | for type_ in CategoryType.values(): 41 | if type_.get_id() is id_: 42 | return type_ 43 | return None 44 | 45 | @staticmethod 46 | def get(str_): 47 | try: 48 | return CategoryType[str_] 49 | except KeyError: 50 | return None 51 | -------------------------------------------------------------------------------- /tests/resources/unk.def: -------------------------------------------------------------------------------- 1 | DEFAULT,5968,5968,3857,補助記号,一般,*,*,*,* 2 | SPACE,5966,5966,6056,空白,*,*,*,*,* 3 | KANJI,5139,5139,14657,名詞,普通名詞,一般,*,*,* 4 | KANJI,5129,5129,17308,名詞,普通名詞,サ変可能,*,*,* 5 | KANJI,4785,4785,18181,名詞,固有名詞,一般,*,*,* 6 | KANJI,4787,4787,18086,名詞,固有名詞,人名,一般,*,* 7 | KANJI,4791,4791,19198,名詞,固有名詞,地名,一般,*,* 8 | SYMBOL,5129,5129,17094,名詞,普通名詞,サ変可能,*,*,* 
9 | NUMERIC,4794,4794,12450,名詞,数詞,*,*,*,* 10 | ALPHA,5139,5139,11633,名詞,普通名詞,一般,*,*,* 11 | ALPHA,4785,4785,13620,名詞,固有名詞,一般,*,*,* 12 | ALPHA,4787,4787,14228,名詞,固有名詞,人名,一般,*,* 13 | ALPHA,4791,4791,15793,名詞,固有名詞,地名,一般,*,* 14 | ALPHA,5687,5687,15246,感動詞,一般,*,*,*,* 15 | HIRAGANA,5139,5139,16012,名詞,普通名詞,一般,*,*,* 16 | HIRAGANA,5129,5129,20012,名詞,普通名詞,サ変可能,*,*,* 17 | HIRAGANA,4785,4785,18282,名詞,固有名詞,一般,*,*,* 18 | HIRAGANA,4787,4787,18269,名詞,固有名詞,人名,一般,*,* 19 | HIRAGANA,4791,4791,20474,名詞,固有名詞,地名,一般,*,* 20 | HIRAGANA,5687,5687,17786,感動詞,一般,*,*,*,* 21 | KATAKANA,5139,5139,10980,名詞,普通名詞,一般,*,*,* 22 | KATAKANA,5129,5129,14802,名詞,普通名詞,サ変可能,*,*,* 23 | KATAKANA,4785,4785,13451,名詞,固有名詞,一般,*,*,* 24 | KATAKANA,4787,4787,13759,名詞,固有名詞,人名,一般,*,* 25 | KATAKANA,4791,4791,14554,名詞,固有名詞,地名,一般,*,* 26 | KATAKANA,5687,5687,15272,感動詞,一般,*,*,*,* 27 | KANJINUMERIC,4794,4794,14170,名詞,数詞,*,*,*,* 28 | GREEK,5139,5139,11051,名詞,普通名詞,一般,*,*,* 29 | GREEK,4785,4785,13353,名詞,固有名詞,一般,*,*,* 30 | GREEK,4787,4787,13671,名詞,固有名詞,人名,一般,*,* 31 | GREEK,4791,4791,14862,名詞,固有名詞,地名,一般,*,* 32 | CYRILLIC,5139,5139,11140,名詞,普通名詞,一般,*,*,* 33 | CYRILLIC,4785,4785,13174,名詞,固有名詞,一般,*,*,* 34 | CYRILLIC,4787,4787,13495,名詞,固有名詞,人名,一般,*,* 35 | CYRILLIC,4791,4791,14700,名詞,固有名詞,地名,一般,*,* 36 | -------------------------------------------------------------------------------- /sudachipy/resources/unk.def: -------------------------------------------------------------------------------- 1 | DEFAULT,5968,5968,3857,補助記号,一般,*,*,*,* 2 | SPACE,5966,5966,6056,空白,*,*,*,*,* 3 | KANJI,5139,5139,14657,名詞,普通名詞,一般,*,*,* 4 | KANJI,5129,5129,17308,名詞,普通名詞,サ変可能,*,*,* 5 | KANJI,4785,4785,18181,名詞,固有名詞,一般,*,*,* 6 | KANJI,4787,4787,18086,名詞,固有名詞,人名,一般,*,* 7 | KANJI,4791,4791,19198,名詞,固有名詞,地名,一般,*,* 8 | SYMBOL,5129,5129,17094,名詞,普通名詞,サ変可能,*,*,* 9 | NUMERIC,4794,4794,12450,名詞,数詞,*,*,*,* 10 | ALPHA,5139,5139,11633,名詞,普通名詞,一般,*,*,* 11 | ALPHA,4785,4785,13620,名詞,固有名詞,一般,*,*,* 12 | ALPHA,4787,4787,14228,名詞,固有名詞,人名,一般,*,* 13 | 
ALPHA,4791,4791,15793,名詞,固有名詞,地名,一般,*,* 14 | ALPHA,5687,5687,15246,感動詞,一般,*,*,*,* 15 | HIRAGANA,5139,5139,16012,名詞,普通名詞,一般,*,*,* 16 | HIRAGANA,5129,5129,20012,名詞,普通名詞,サ変可能,*,*,* 17 | HIRAGANA,4785,4785,18282,名詞,固有名詞,一般,*,*,* 18 | HIRAGANA,4787,4787,18269,名詞,固有名詞,人名,一般,*,* 19 | HIRAGANA,4791,4791,20474,名詞,固有名詞,地名,一般,*,* 20 | HIRAGANA,5687,5687,17786,感動詞,一般,*,*,*,* 21 | KATAKANA,5139,5139,10980,名詞,普通名詞,一般,*,*,* 22 | KATAKANA,5129,5129,14802,名詞,普通名詞,サ変可能,*,*,* 23 | KATAKANA,4785,4785,13451,名詞,固有名詞,一般,*,*,* 24 | KATAKANA,4787,4787,13759,名詞,固有名詞,人名,一般,*,* 25 | KATAKANA,4791,4791,14554,名詞,固有名詞,地名,一般,*,* 26 | KATAKANA,5687,5687,15272,感動詞,一般,*,*,*,* 27 | KANJINUMERIC,4794,4794,14170,名詞,数詞,*,*,*,* 28 | GREEK,5139,5139,11051,名詞,普通名詞,一般,*,*,* 29 | GREEK,4785,4785,13353,名詞,固有名詞,一般,*,*,* 30 | GREEK,4787,4787,13671,名詞,固有名詞,人名,一般,*,* 31 | GREEK,4791,4791,14862,名詞,固有名詞,地名,一般,*,* 32 | CYRILLIC,5139,5139,11140,名詞,普通名詞,一般,*,*,* 33 | CYRILLIC,4785,4785,13174,名詞,固有名詞,一般,*,*,* 34 | CYRILLIC,4787,4787,13495,名詞,固有名詞,人名,一般,*,* 35 | CYRILLIC,4791,4791,14700,名詞,固有名詞,地名,一般,*,* 36 | -------------------------------------------------------------------------------- /sudachipy/plugin/oov/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import List 16 | 17 | from sudachipy import config 18 | 19 | from . import MeCabOovPlugin, OovProviderPlugin, SimpleOovPlugin 20 | 21 | 22 | def get_oov_plugin(json_obj) -> OovProviderPlugin: 23 | # In the future, users can define plugin by themselves 24 | try: 25 | if json_obj['class'] == 'sudachipy.plugin.oov.MeCabOovProviderPlugin': 26 | return MeCabOovPlugin(json_obj) 27 | if json_obj['class'] == 'sudachipy.plugin.oov.SimpleOovProviderPlugin': 28 | return SimpleOovPlugin(json_obj) 29 | raise ValueError('{} is invalid OovProviderPlugin class'.format(json_obj['class'])) 30 | except KeyError: 31 | raise ValueError('config file is invalid format') 32 | 33 | 34 | def get_oov_plugins() -> List[OovProviderPlugin]: 35 | key_word = 'oovProviderPlugin' 36 | if key_word not in config.settings: 37 | return [] 38 | ps = [] 39 | for obj in config.settings[key_word]: 40 | ps.append(get_oov_plugin(obj)) 41 | return ps 42 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/wordinfo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | class WordInfo: 17 | def __init__(self, 18 | surface, 19 | head_word_length, 20 | pos_id, 21 | normalized_form, 22 | dictionary_form_word_id, 23 | dictionary_form, 24 | reading_form, 25 | a_unit_split, 26 | b_unit_split, 27 | word_structure, 28 | synonym_group_ids): 29 | self.surface = surface 30 | self.head_word_length = head_word_length 31 | self.pos_id = pos_id 32 | self.normalized_form = normalized_form 33 | self.dictionary_form_word_id = dictionary_form_word_id 34 | self.dictionary_form = dictionary_form 35 | self.reading_form = reading_form 36 | self.a_unit_split = a_unit_split 37 | self.b_unit_split = b_unit_split 38 | self.word_structure = word_structure 39 | self.synonym_group_ids = synonym_group_ids 40 | 41 | def length(self): 42 | return self.head_word_length 43 | -------------------------------------------------------------------------------- /sudachipy/plugin/path_rewrite/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List 16 | 17 | from sudachipy import config 18 | 19 | from . 
import JoinKatakanaOovPlugin, JoinNumericPlugin, PathRewritePlugin 20 | 21 | 22 | def get_path_rewrite_plugin(json_obj) -> PathRewritePlugin: 23 | # In the future, users can define plugin by themselves 24 | try: 25 | if json_obj['class'] == 'sudachipy.plugin.path_rewrite.JoinNumericPlugin': 26 | return JoinNumericPlugin(json_obj) 27 | if json_obj['class'] == 'sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin': 28 | return JoinKatakanaOovPlugin(json_obj) 29 | raise ValueError('{} is invalid PathRewritePlugin class'.format(json_obj['class'])) 30 | except KeyError: 31 | raise ValueError('config file is invalid format') 32 | 33 | 34 | def get_path_rewrite_plugins() -> List[PathRewritePlugin]: 35 | if 'pathRewritePlugin' not in config.settings: 36 | return [] 37 | ps = [] 38 | for obj in config.settings['pathRewritePlugin']: 39 | ps.append(get_path_rewrite_plugin(obj)) 40 | return ps 41 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/jtypedbytebuffer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from io import BytesIO 16 | 17 | 18 | class JTypedByteBuffer(BytesIO): 19 | """ 20 | A interface of BytesIO to write dictionary 21 | """ 22 | 23 | __ENDIAN = 'little' 24 | 25 | @classmethod 26 | def from_bytes(cls, bytes_io): 27 | return cls(bytes_io.getvalue()) 28 | 29 | def write_int(self, int_, type_, signed=True): 30 | if type_ == 'byte': 31 | len_ = 1 32 | signed = False 33 | elif type_ == 'int': 34 | len_ = 4 35 | elif type_ == 'char': 36 | len_ = 2 37 | signed = False 38 | elif type_ == 'short': 39 | len_ = 2 40 | elif type_ == 'long': 41 | len_ = 8 42 | else: 43 | raise ValueError('{} is invalid type'.format(type_)) 44 | self.write(int_.to_bytes(len_, byteorder=self.__ENDIAN, signed=signed)) 45 | 46 | def write_str(self, text): 47 | self.write(text.encode('utf-16-le')) 48 | 49 | def clear(self): 50 | self.seek(0) 51 | self.truncate(0) 52 | -------------------------------------------------------------------------------- /sudachipy/plugin/oov/oov_provider_plugin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from abc import ABC, abstractmethod 16 | from typing import List 17 | 18 | from sudachipy.dictionarylib.grammar import Grammar 19 | from sudachipy.latticenode import LatticeNode 20 | from sudachipy.utf8inputtext import UTF8InputText 21 | 22 | 23 | class OovProviderPlugin(ABC): 24 | 25 | @abstractmethod 26 | def set_up(self, grammar: Grammar) -> None: 27 | raise NotImplementedError 28 | 29 | @abstractmethod 30 | def provide_oov(self, input_text: UTF8InputText, offset: int, has_other_words: bool) -> List[LatticeNode]: 31 | raise NotImplementedError 32 | 33 | def get_oov(self, input_text: UTF8InputText, offset: int, has_other_words: bool) -> List[LatticeNode]: 34 | nodes = self.provide_oov(input_text, offset, has_other_words) 35 | for node in nodes: 36 | node.set_begin(offset) 37 | node.set_end(offset + node.get_word_info().length()) 38 | return nodes 39 | 40 | @staticmethod 41 | def create_node() -> LatticeNode: 42 | node = LatticeNode() 43 | node.set_oov() 44 | return node 45 | -------------------------------------------------------------------------------- /sudachipy/plugin/input_text/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List 16 | 17 | from sudachipy import config 18 | 19 | from . 
import DefaultInputTextPlugin, InputTextPlugin, ProlongedSoundMarkInputTextPlugin 20 | 21 | 22 | def get_input_text_plugin(json_obj) -> InputTextPlugin: 23 | # In the future, users can define plugin by themselves 24 | try: 25 | if json_obj['class'] == 'sudachipy.plugin.input_text.DefaultInputTextPlugin': 26 | return DefaultInputTextPlugin() 27 | if json_obj['class'] == 'sudachipy.plugin.input_text.ProlongedSoundMarkInputTextPlugin': 28 | return ProlongedSoundMarkInputTextPlugin(json_obj) 29 | raise ValueError('{} is invalid InputTextPlugin class'.format(json_obj['class'])) 30 | except KeyError: 31 | raise ValueError('config file is invalid format') 32 | 33 | 34 | def get_input_text_plugins() -> List[InputTextPlugin]: 35 | key_word = 'inputTextPlugin' 36 | if key_word not in config.settings: 37 | return [] 38 | ps = [] 39 | for obj in config.settings[key_word]: 40 | ps.append(get_input_text_plugin(obj)) 41 | return ps 42 | -------------------------------------------------------------------------------- /tests/test_dictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import unittest 17 | 18 | from sudachipy import tokenizer 19 | from sudachipy.dictionary import Dictionary 20 | 21 | 22 | class TestDictionary(unittest.TestCase): 23 | 24 | def setUp(self): 25 | resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources') 26 | self.dict_ = Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir=resource_dir) 27 | 28 | def tearDown(self) -> None: 29 | self.dict_.close() 30 | 31 | def test_create(self): 32 | self.assertEqual(tokenizer.Tokenizer, type(self.dict_.create())) 33 | 34 | def test_get_part_of_speech_size(self): 35 | self.assertEqual(9, self.dict_.grammar.get_part_of_speech_size()) 36 | 37 | def test_get_part_of_speech_string(self): 38 | pos = self.dict_.grammar.get_part_of_speech_string(0) 39 | self.assertIsNotNone(pos) 40 | self.assertEqual('助動詞', pos[0]) 41 | 42 | # def test_creat_with_merging_settings 43 | 44 | # def test_creat_with_merging_null_ settings 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # IDE, Editor 104 | .idea/ 105 | .vscode/ 106 | 107 | # Sudachi dictionary 108 | *.dic 109 | -------------------------------------------------------------------------------- /tests/resources/joinnumeric/char.def: -------------------------------------------------------------------------------- 1 | # 2 | # Japanese charcter category map 3 | # 4 | # $Id: char.def 9 2012-12-12 04:13:15Z togiso $; 5 | # 6 | 7 | ################################################################################### 8 | # 9 | # CHARACTER CATEGORY DEFINITION 10 | # 11 | # CATEGORY_NAME INVOKE GROUP LENGTH 12 | # 13 | # - CATEGORY_NAME: Name of category. you have to define DEFAULT class. 
14 | # - INVOKE: 1/0: always invoke unknown word processing, evan when the word can be found in the lexicon 15 | # - GROUP: 1/0: make a new word by grouping the same chracter category 16 | # - LENGTH: n: 1 to n length new words are added 17 | # 18 | DEFAULT 0 1 0 # DEFAULT is a mandatory category! 19 | SPACE 0 1 0 20 | KANJI 0 0 2 21 | SYMBOL 1 1 0 22 | NUMERIC 1 1 0 23 | ALPHA 1 1 0 24 | HIRAGANA 0 1 2 25 | KATAKANA 1 1 2 26 | KANJINUMERIC 0 1 0 #change INVOKE 1->0 27 | GREEK 1 1 0 28 | CYRILLIC 1 1 0 29 | 30 | ################################################################################### 31 | # 32 | # CODE(UCS2) TO CATEGORY MAPPING 33 | # 34 | 35 | # SPACE 36 | 0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE 37 | 38 | # ASCII 39 | 0x0030..0x0039 NUMERIC #0-9 40 | 41 | # KANJI-NUMERIC (〇 一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆) 42 | 0x3007 KANJINUMERIC KANJI 43 | 0x4E00 KANJINUMERIC KANJI 44 | 0x4E8C KANJINUMERIC KANJI 45 | 0x4E09 KANJINUMERIC KANJI 46 | 0x56DB KANJINUMERIC KANJI 47 | 0x4E94 KANJINUMERIC KANJI 48 | 0x516D KANJINUMERIC KANJI 49 | 0x4E03 KANJINUMERIC KANJI 50 | 0x516B KANJINUMERIC KANJI 51 | 0x4E5D KANJINUMERIC KANJI 52 | 0x5341 KANJINUMERIC KANJI 53 | 0x767E KANJINUMERIC KANJI 54 | 0x5343 KANJINUMERIC KANJI 55 | 0x4E07 KANJINUMERIC KANJI 56 | 0x5104 KANJINUMERIC KANJI 57 | 0x5146 KANJINUMERIC KANJI 58 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/lexicon.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from typing import Iterator, List 17 | 18 | from .wordinfo import WordInfo 19 | 20 | 21 | class Lexicon(ABC): 22 | 23 | Itr = Iterator[List[int]] 24 | 25 | @abstractmethod 26 | def lookup(self, text: str, offset: int) -> Itr: # noqa: F821 27 | raise NotImplementedError 28 | 29 | @abstractmethod 30 | def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int: 31 | raise NotImplementedError 32 | 33 | @abstractmethod 34 | def get_left_id(self, word_id: int) -> int: 35 | raise NotImplementedError 36 | 37 | @abstractmethod 38 | def get_right_id(self, word_id: int) -> int: 39 | raise NotImplementedError 40 | 41 | @abstractmethod 42 | def get_cost(self, word_id: int) -> int: 43 | raise NotImplementedError 44 | 45 | @abstractmethod 46 | def get_word_info(self, word_id: int) -> 'WordInfo': 47 | raise NotImplementedError 48 | 49 | @abstractmethod 50 | def get_dictionary_id(self, word_id: int) -> int: 51 | raise NotImplementedError 52 | 53 | @abstractmethod 54 | def size(self) -> int: 55 | raise NotImplementedError 56 | -------------------------------------------------------------------------------- /tests/test_large_userdict.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import unittest 17 | 18 | from string import ascii_lowercase 19 | from itertools import product 20 | 21 | from sudachipy import dictionary 22 | 23 | 24 | class TestLargeUserDict(unittest.TestCase): 25 | 26 | def setUp(self): 27 | resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources') 28 | self.dict_ = dictionary.Dictionary(os.path.join(resource_dir, 'sudachi_large_user.json'), resource_dir) 29 | self.tokenizer_obj = self.dict_.create() 30 | 31 | def test_part_of_speech(self): 32 | ms = self.tokenizer_obj.tokenize('やまもも') 33 | self.assertEqual(1, len(ms)) 34 | m = ms[0] 35 | pid = m.part_of_speech_id() 36 | self.assertTrue(self.dict_.grammar.get_part_of_speech_size() > pid) 37 | 38 | # Exploit the cache space 39 | num = 0 40 | for combo in product(ascii_lowercase, repeat=3): 41 | if num > 1024: 42 | break 43 | lex = ''.join(combo) 44 | self.tokenizer_obj.tokenize(lex) 45 | num += 1 46 | 47 | ms = self.tokenizer_obj.tokenize('やまもも') 48 | self.assertEqual(pid, ms[0].part_of_speech_id()) 49 | -------------------------------------------------------------------------------- /tests/mock_grammar.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from unittest import mock 17 | 18 | from sudachipy.dictionarylib.charactercategory import CharacterCategory 19 | from sudachipy.dictionarylib.grammar import Grammar 20 | 21 | mocked_grammar = mock.Mock(spec=Grammar) 22 | mocked_grammar.get_part_of_speech_size.return_value = 0 23 | mocked_grammar.get_part_of_speech_string.return_value = None 24 | mocked_grammar.get_part_of_speech_id.return_value = 0 25 | mocked_grammar.get_connect_cost.return_value = 0 26 | # mocked_grammar.set_connect_cost.return_value = None 27 | mocked_grammar.get_bos_parameter.return_value = None 28 | mocked_grammar.get_eos_parameter.return_value = None 29 | 30 | 31 | def mocked_get_character_category(): 32 | cat = CharacterCategory() 33 | test_resources_dir = os.path.join( 34 | os.path.dirname(os.path.abspath(__file__)), 35 | os.pardir, 36 | 'sudachipy', 37 | 'resources') 38 | try: 39 | cat.read_character_definition(os.path.join(test_resources_dir, 'char.def')) 40 | except IOError as e: 41 | print(e) 42 | return cat 43 | 44 | 45 | mocked_grammar.get_character_category.side_effect = mocked_get_character_category 46 | 47 | 48 | mocked_grammar.set_character_category.return_value = None 49 | -------------------------------------------------------------------------------- /tests/dictionarylib/test_dictionaryheader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 
class TestDictionaryHeader(unittest.TestCase):
    """Header checks against the prebuilt test system dictionary."""

    def setUp(self):
        # Mirrors sudachipy.dictionary.Dictionary.read_system_dictionary.
        resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            'resources')
        dict_path = os.path.join(resources_dir, 'system.dic')
        with open(dict_path, 'rb') as system_dic:
            mapped = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
            self.header = DictionaryHeader.from_bytes(mapped, 0)

    def test_version(self):
        self.assertEqual(SYSTEM_DICT_VERSION_2, self.header.version)

    def test_create_time(self):
        self.assertTrue(self.header.create_time > 0)

    def test_description(self):
        self.assertEqual("the system dictionary for the unit tests", self.header.description)


if __name__ == '__main__':
    unittest.main()
# the first version of system dictionaries
SYSTEM_DICT_VERSION_1 = 0x7366d3f18bd111e7

# the second version of system dictionaries
SYSTEM_DICT_VERSION_2 = 0xce9f011a92394434

# the first version of user dictionaries
USER_DICT_VERSION_1 = 0xa50f31188bd211e7

# the second version of user dictionaries
USER_DICT_VERSION_2 = 0x9fdeb5a90168d868

# the third version of user dictionaries
USER_DICT_VERSION_3 = 0xca9811756ff64fb0


def is_dictionary(version):
    """True if *version* is any known dictionary magic number."""
    return is_system_dictionary(version) or is_user_dictionary(version)


def is_system_dictionary(version):
    """True if *version* denotes a system dictionary."""
    return version in (SYSTEM_DICT_VERSION_1, SYSTEM_DICT_VERSION_2)


def is_user_dictionary(version):
    """True if *version* denotes a user dictionary."""
    return version in (USER_DICT_VERSION_1, USER_DICT_VERSION_2, USER_DICT_VERSION_3)


def has_grammar(version):
    """True if dictionaries of this version embed a grammar section."""
    return is_system_dictionary(version) or version in (USER_DICT_VERSION_2, USER_DICT_VERSION_3)


def has_synonym_group_ids(version):
    """True if word-info records of this version carry synonym group ids."""
    return version in (SYSTEM_DICT_VERSION_2, USER_DICT_VERSION_3)
from setuptools import setup, find_packages

from distutils.extension import Extension

# Cython sources compiled to C extension modules.
extensions = [
    Extension('sudachipy.latticenode', ['sudachipy/latticenode.pyx']),
    Extension('sudachipy.lattice', ['sudachipy/lattice.pyx']),
    Extension('sudachipy.tokenizer', ['sudachipy/tokenizer.pyx']),
]

# FIX: the README was read with a bare open() whose file object was never
# closed; use a context manager so the handle is released deterministically.
with open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

setup(name="SudachiPy",
      use_scm_version=True,
      setup_requires=['setuptools_scm', 'cython'],
      description="Python version of Sudachi, the Japanese Morphological Analyzer",
      long_description=long_description,
      long_description_content_type="text/markdown",
      url="https://github.com/WorksApplications/SudachiPy",
      license="Apache-2.0",
      author="Works Applications",
      author_email="sudachi@worksap.co.jp",
      packages=find_packages(include=["sudachipy", "sudachipy.*"]),
      package_data={"": ["resources/*.json", "resources/*.dic", "resources/*.def"]},
      entry_points={
          "console_scripts": ["sudachipy=sudachipy.command_line:main"],
      },
      install_requires=[
          "sortedcontainers~=2.1.0",
          'dartsclone~=0.9.0',
      ],
      ext_modules=extensions,
      )
class InhibitConnectionPlugin(EditConnectionCostPlugin):
    """Edit-connection-cost plugin that forbids specific connections.

    Example settings:

    ``
    {
        {
            "class" : "sudachipy.plugin.connect_cost.InhibitConnectionPlugin",
            "inhibitedPair" : [ [ 0, 233 ], [435, 332] ]
        }
    }
    ``

    Attributes:
        _inhibit_pairs: pairs of ints; in each pair, the first value is the
            right-ID of the left node and the second value is the left-ID of
            the right node of the connection to inhibit.
    """

    def __init__(self):
        self._inhibit_pairs = []

    def set_up(self, grammar: Grammar) -> None:
        # Pairs come straight from the global settings, when configured.
        if 'inhibitedPair' in config.settings:
            self._inhibit_pairs = config.settings['inhibitedPair']

    def edit(self, grammar: Grammar) -> None:
        # Pairs shorter than two elements are silently ignored.
        for left, right, *_ in (p for p in self._inhibit_pairs if len(p) >= 2):
            self.inhibit_connection(grammar, left, right)
class WordParameterList(object):
    """Fixed-width (left-id, right-id, cost) triples viewed over a shared buffer."""

    ELEMENT_SIZE = 2 * 3            # bytes per entry: three 16-bit values
    ELEMENT_SIZE_AS_SHORT = 3       # shorts per entry

    def __init__(self, bytes_, offset):
        # Read the entry count at *offset*, then map the triple array as a
        # memoryview of signed shorts; the buffer position is restored after.
        saved_pos = bytes_.tell()
        bytes_.seek(offset)
        self.size = int.from_bytes(bytes_.read(4), 'little')
        start = bytes_.tell()
        end = start + self.size * self.ELEMENT_SIZE
        self._array_view = memoryview(bytes_)[start:end].cast('h')
        bytes_.seek(saved_pos)

    def __del__(self):
        # Release the view so the underlying mmap can be closed.
        self._array_view.release()

    def storage_size(self):
        """Bytes occupied on disk: 4-byte count plus the entries."""
        return 4 + self.ELEMENT_SIZE * self.size

    def get_size(self):
        return self.size

    def get_left_id(self, word_id):
        return self._array_view[word_id * self.ELEMENT_SIZE_AS_SHORT]

    def get_right_id(self, word_id):
        return self._array_view[word_id * self.ELEMENT_SIZE_AS_SHORT + 1]

    def get_cost(self, word_id):
        return self._array_view[word_id * self.ELEMENT_SIZE_AS_SHORT + 2]

    def set_cost(self, word_id, cost):
        # The backing buffer must be writable (e.g. mmap.ACCESS_COPY).
        self._array_view[word_id * self.ELEMENT_SIZE_AS_SHORT + 2] = cost
class SimpleOovPlugin(OovProviderPlugin):
    """OOV provider that emits one fixed-parameter node when nothing else matched."""

    def __init__(self, json_obj):
        self.left_id = json_obj['leftId']
        self.right_id = json_obj['rightId']
        self.cost = json_obj['cost']
        self.__oov_pos_strings = json_obj['oovPOS']
        self.oov_pos_id = -1  # resolved against the grammar in set_up()

    def set_up(self, grammar):
        self.oov_pos_id = grammar.get_part_of_speech_id(self.__oov_pos_strings)

    def provide_oov(self, input_text, offset, has_other_words):
        # Only kicks in when no dictionary word covers this position.
        if has_other_words:
            return []
        node = self.create_node()
        node.set_parameter(self.left_id, self.right_id, self.cost)
        length = input_text.get_word_candidate_length(offset)
        surface = input_text.get_substring(offset, offset + length)
        node.set_word_info(wordinfo.WordInfo(
            surface=surface, head_word_length=length, pos_id=self.oov_pos_id,
            normalized_form=surface, dictionary_form_word_id=-1,
            dictionary_form=surface, reading_form="",
            a_unit_split=[], b_unit_split=[], word_structure=[],
            synonym_group_ids=[]))
        return [node]
class ProlongedSoundMarkInputTextPlugin(InputTextPlugin):
    """Collapses runs of prolonged sound marks into a single symbol.

    A run of two or more characters listed under ``prolongedSoundMarks`` in
    the settings is replaced by one ``replacementSymbol`` (default 'ー')
    before tokenization.
    """

    def __init__(self, json_obj):
        # Code points treated as prolonged sound marks.
        self._psm_set = set()
        # The single character that replaces a run of marks.
        self._replace_symbol = 'ー'
        if not json_obj:
            return
        if 'prolongedSoundMarks' in json_obj:
            self._psm_set = set([ord(psm) for psm in json_obj['prolongedSoundMarks']])
        if 'replacementSymbol' in json_obj:
            self._replace_symbol = json_obj['replacementSymbol']

    def set_up(self) -> None:
        # No grammar-dependent preparation needed.
        pass

    def rewrite(self, builder: InputTextPlugin.Builder) -> None:
        """Replace each run of two or more marks with the replacement symbol.

        ``offset`` accumulates how many characters earlier replacements
        removed, so indices into the already-edited text are shifted before
        each ``builder.replace`` call.
        NOTE(review): the offset arithmetic assumes _replace_symbol is a
        single character — confirm against the settings schema.
        """
        text = builder.get_text()
        n = len(text)
        offset = 0
        is_psm = False       # currently inside a run of marks
        m_start_idx = n      # index where the current run started
        for i in range(n):
            cp = ord(text[i])
            if not is_psm and cp in self._psm_set:
                # A run begins here.
                is_psm = True
                m_start_idx = i
            elif is_psm and cp not in self._psm_set:
                # The run ended at i; collapse it only if it spanned >= 2 chars.
                if i - m_start_idx > 1:
                    builder.replace(m_start_idx - offset, i - offset, self._replace_symbol)
                    offset += i - m_start_idx - 1
                is_psm = False
        # A run may extend to the very end of the text.
        if is_psm and n - m_start_idx > 1:
            builder.replace(m_start_idx - offset, n - offset, self._replace_symbol)
class EditConnectionCostPlugin(ABC):
    """Base class for plugins that tweak the connection-cost matrix.

    Example settings:

    ``
    {
        {
            "class" : "sudachipy.plugin.connect_cost.SampleEditConnectionPlugin",
            "example" : "example setting"
        }
    }
    ``
    """

    @abstractmethod
    def set_up(self, grammar: Grammar) -> None:
        """Prepare the plugin.

        Args:
            grammar: grammar of the system dictionary
        """
        raise NotImplementedError

    @abstractmethod
    def edit(self, grammar: Grammar) -> None:
        """Apply this plugin's connection-cost edits.

        Args:
            grammar: grammar of the system dictionary
        """
        raise NotImplementedError

    @staticmethod
    def inhibit_connection(grammar: Grammar, left: int, right: int) -> None:
        """Mark one connection as forbidden in the grammar.

        Args:
            grammar: grammar of the system dictionary
            left: right-ID of the left node
            right: left-ID of the right node
        """
        grammar.set_connect_cost(left, right, Grammar.INHIBITED_CONNECTION)
class DictionaryHeader:
    """Fixed-size header at the top of a binary dictionary file.

    Layout: 8-byte version, 8-byte creation time, then a 256-byte
    NUL-padded UTF-8 description (272 bytes total).
    """

    __DESCRIPTION_SIZE = 256
    __STORAGE_SIZE = 8 + 8 + __DESCRIPTION_SIZE

    def __init__(self, version, create_time, description):
        self.version = version
        self.create_time = create_time
        self.description = description

    @classmethod
    def from_bytes(cls, bytes_, offset):
        """Parse a header from *bytes_* starting at *offset*."""
        version, create_time = struct.unpack_from("<2Q", bytes_, offset)
        offset += 16
        # The description is NUL-terminated within its fixed-size field.
        field = bytes_[offset:offset + cls.__DESCRIPTION_SIZE]
        terminator = field.find(0)
        if terminator < 0:
            terminator = len(field)
        description = field[:terminator].decode("utf-8")
        return cls(version, create_time, description)

    def to_bytes(self):
        """Serialize to the fixed 272-byte on-disk layout.

        Raises:
            ValueError: if the UTF-8 description exceeds 256 bytes.
        """
        out = JTypedByteBuffer(b'\x00' * (16 + self.__DESCRIPTION_SIZE))
        out.seek(0)
        out.write_int(self.version, 'long', signed=False)
        out.write_int(self.create_time, 'long')
        encoded = self.description.encode('utf-8')
        if len(encoded) > self.__DESCRIPTION_SIZE:
            raise ValueError('description is too long')
        out.write(encoded)
        return out.getvalue()

    def storage_size(self):
        return self.__STORAGE_SIZE

    def is_system_dictionary(self):
        return dictionaryversion.is_system_dictionary(self.version)

    def is_user_dictionary(self):
        return dictionaryversion.is_user_dictionary(self.version)
class UserDictionaryBuilder(DictionaryBuilder):
    """Dictionary builder variant for user dictionaries.

    POS ids and word ids are resolved against the system dictionary first;
    words defined in the user dictionary itself are flagged with bit 28.
    """

    def __init__(self, grammar, system_lexicon, *, logger=None):
        super().__init__(logger=logger)
        self.is_user_dictionary = True
        self.grammar = grammar
        self.system_lexicon = system_lexicon

    def build(self, lexicon_paths, matrix_input_stream, out_stream):
        """Build the user dictionary.

        NOTE: the signature differs from DictionaryBuilder.build
        (matrix_input_stream is unused) — a known LSP violation kept for
        compatibility.
        """
        self.logger.info('reading the source file...')
        for lexicon_path in lexicon_paths:
            with open(lexicon_path, 'r', encoding='utf-8') as source:
                self.build_lexicon(source)
        self.logger.info('{} words\n'.format(len(self.entries)))

        self.write_grammar(None, out_stream)
        self.write_lexicon(out_stream)

    def get_posid(self, strs):
        # Prefer the system grammar's POS id; otherwise allocate a new id
        # offset past the end of the system POS table.
        pos_id = self.grammar.get_part_of_speech_id(strs)
        if pos_id >= 0:
            return pos_id
        return super().get_posid(strs) + self.grammar.get_part_of_speech_size()

    def get_wordid(self, headword, pos_id, reading_form):
        wid = super().get_wordid(headword, pos_id, reading_form)
        if wid >= 0:
            # Defined in this user dictionary: tag with the user-dict flag.
            return wid | (1 << 28)
        return self.system_lexicon.get_word_id(headword, pos_id, reading_form)

    def check_wordid(self, wid):
        if wid >= (1 << 28):
            # User-dictionary id: validate the unflagged value.
            super().check_wordid(wid & ((1 << 28) - 1))
        elif wid < 0 or wid >= self.system_lexicon.size():
            raise ValueError('invalid word id')
class Morpheme:
    """A single morpheme: a thin, lazy view over one node of a MorphemeList."""

    def __init__(self, list_, index):
        self.word_info = None  # fetched on demand by get_word_info()
        self.list = list_
        self.index = index

    def __str__(self):
        return self.surface()

    def begin(self):
        return self.list.get_begin(self.index)

    def end(self):
        return self.list.get_end(self.index)

    def surface(self):
        return self.list.get_surface(self.index)

    def part_of_speech(self):
        return self.list.grammar.get_part_of_speech_string(self.get_word_info().pos_id)

    def part_of_speech_id(self):
        return self.get_word_info().pos_id

    def dictionary_form(self):
        return self.get_word_info().dictionary_form

    def normalized_form(self):
        return self.get_word_info().normalized_form

    def reading_form(self):
        return self.get_word_info().reading_form

    def split(self, mode):
        return self.list.split(mode, self.index, self.get_word_info())

    def is_oov(self):
        return self.list.is_oov(self.index)

    def word_id(self):
        return self.list.path[self.index].get_word_id()

    def dictionary_id(self):
        return self.list.path[self.index].get_dictionary_id()

    def synonym_group_ids(self):
        return self.get_word_info().synonym_group_ids

    def get_word_info(self):
        # Fetch once and cache on the instance.
        if not self.word_info:
            self.word_info = self.list.get_word_info(self.index)
        return self.word_info
# A UTF8InputText stand-in whose behavior is driven by the module globals
# `text` and `types` (per-character category sets), set via set_text() /
# set_category_type().
mocked_input_text = mock.Mock(spec=UTF8InputText)
text = ''
types = []


def set_text(text_: str) -> None:
    """Set the mocked text and reset the per-character category sets."""
    global text, types
    text = text_
    types = [set() for _ in text]


def set_category_type(begin: int, end: int, type_) -> None:
    """Attach *type_* to every character in [begin, end)."""
    for i in range(begin, end):
        types[i].add(type_)


# BUG FIX: get_text/get_original_text were bound with `.return_value = text`
# at import time, so they kept returning the initial empty string even after
# set_text() rebound the global. side_effect reads the current value instead.
mocked_input_text.get_text.side_effect = lambda: text
mocked_input_text.get_original_text.side_effect = lambda: text


def _mocked_get_substring(begin: int, end: int) -> str:
    return text[begin:end]


mocked_input_text.get_substring.side_effect = _mocked_get_substring


def _mocked_get_char_category_types(begin: int, end: int = None) -> set:
    # One index: that character's categories. A range: the intersection of
    # categories shared by every character in [begin, end).
    if end is None:
        return types[begin]
    continuous_category = deepcopy(types[begin])
    for i in range(begin + 1, end):
        continuous_category = continuous_category.intersection(types[i])
    return continuous_category


mocked_input_text.get_char_category_types.side_effect = _mocked_get_char_category_types


def _mocked_get_char_category_continuous_length(idx: int) -> int:
    # Length of the run starting at idx over which at least one category
    # is shared by all characters.
    continuous_category = deepcopy(types[idx])
    for i in range(idx + 1, len(text)):
        continuous_category = continuous_category.intersection(types[i])
        if not continuous_category:
            return i - idx
    return len(text) - idx


mocked_input_text.get_char_category_continuous_length.side_effect = _mocked_get_char_category_continuous_length


def _mocked_get_code_points_offset_length(idx: int, offset: int) -> int:
    # Code points map 1:1 to offsets in this mock.
    return offset


mocked_input_text.get_code_points_offset_length.side_effect = _mocked_get_code_points_offset_length
class BinaryDictionary(object):
    """A memory-mapped dictionary file: header, optional grammar, lexicon."""

    def __init__(self, bytes_: mmap.mmap, grammar: Grammar, header: DictionaryHeader, lexicon: DoubleArrayLexicon):
        self._bytes = bytes_
        self._grammar = grammar
        self._header = header
        self._lexicon = lexicon

    @staticmethod
    def _read_dictionary(filename, access=mmap.ACCESS_READ):
        """mmap *filename* and parse its sections.

        Returns:
            (mmap, grammar-or-None, header, lexicon)
        Raises:
            Exception: if the header carries an unknown version number.
        """
        with open(filename, 'rb') as dict_file:
            mapped = mmap.mmap(dict_file.fileno(), 0, access=access)
            position = 0
            header = DictionaryHeader.from_bytes(mapped, position)
            position += header.storage_size()
            if not is_dictionary(header.version):
                raise Exception('invalid dictionary version')
            grammar = None
            if has_grammar(header.version):
                grammar = Grammar(mapped, position)
                position += grammar.get_storage_size()
            lexicon = DoubleArrayLexicon(mapped, position, has_synonym_group_ids(header.version))
            return mapped, grammar, header, lexicon

    @classmethod
    def from_system_dictionary(cls, filename):
        parts = cls._read_dictionary(filename)
        if not parts[2].is_system_dictionary():
            raise IOError('invalid system dictionary')
        return cls(*parts)

    @classmethod
    def from_user_dictionary(cls, filename):
        # ACCESS_COPY so user-dictionary costs can be patched in memory.
        parts = cls._read_dictionary(filename, mmap.ACCESS_COPY)
        if not parts[2].is_user_dictionary():
            raise IOError('invalid user dictionary')
        return cls(*parts)

    def close(self):
        # Drop section objects (and their buffer views) before closing the map.
        del self._grammar
        del self._lexicon
        self._bytes.close()

    @property
    def grammar(self) -> Grammar:
        return self._grammar

    @property
    def header(self) -> DictionaryHeader:
        return self._header

    @property
    def lexicon(self) -> DoubleArrayLexicon:
        return self._lexicon
class WordInfoList(object):
    """Reads WordInfo records from the word-info section of a binary dictionary."""

    def __init__(self, bytes_, offset, word_size, has_synonym_gid):
        # bytes_: seekable mmap-like buffer holding the whole dictionary
        # offset: start of the per-word offset table
        # word_size: number of words in this lexicon
        # has_synonym_gid: whether records include a synonym-group-id array
        self.bytes = bytes_
        self.offset = offset
        self._word_size = word_size
        self.has_synonym_gid = has_synonym_gid

    def get_word_info(self, word_id):
        """Deserialize the WordInfo record for *word_id*.

        The buffer position is saved and restored so callers sharing the
        buffer are not disturbed.  Fields must be read in exactly this
        order; the record layout is sequential.
        """
        orig_pos = self.bytes.tell()
        index = self.word_id_to_offset(word_id)
        self.bytes.seek(index)
        surface = self.buffer_to_string()
        head_word_length = self.buffer_to_string_length()
        pos_id = int.from_bytes(self.bytes.read(2), 'little')
        normalized_form = self.buffer_to_string()
        if not normalized_form:
            # Empty on disk means "same as surface".
            normalized_form = surface
        dictionary_form_word_id = int.from_bytes(self.bytes.read(4), 'little', signed=True)
        reading_form = self.buffer_to_string()
        if not reading_form:
            reading_form = surface
        a_unit_split = self.buffer_to_int_array()
        b_unit_split = self.buffer_to_int_array()
        word_structure = self.buffer_to_int_array()

        synonym_gids = []
        if self.has_synonym_gid:
            synonym_gids = self.buffer_to_int_array()

        dictionary_form = surface
        if dictionary_form_word_id >= 0 and dictionary_form_word_id != word_id:
            # Recursive lookup to resolve the dictionary form's surface.
            wi = self.get_word_info(dictionary_form_word_id)
            dictionary_form = wi.surface

        self.bytes.seek(orig_pos)

        return WordInfo(surface, head_word_length, pos_id, normalized_form, dictionary_form_word_id,
                        dictionary_form, reading_form, a_unit_split, b_unit_split, word_structure, synonym_gids)

    def word_id_to_offset(self, word_id):
        # Offset table: one little-endian uint32 per word.
        i = self.offset + 4 * word_id
        return int.from_bytes(self.bytes[i:i + 4], 'little', signed=False)

    def buffer_to_string_length(self):
        # Variable-length encoding: one byte if < 128, otherwise two bytes
        # with the high bit of the first byte set.
        length = self.bytes.read_byte()
        if length < 128:
            return length
        low = self.bytes.read_byte()
        return ((length & 0x7F) << 8) | low

    def buffer_to_string(self):
        # Strings are stored as UTF-16-LE code units (2 bytes per unit).
        length = self.buffer_to_string_length()
        return self.bytes.read(2 * length).decode('utf-16-le')

    def buffer_to_int_array(self):
        # A 1-byte count followed by that many int32s
        # (native byte order per struct's default — presumably little-endian
        # on supported platforms; confirm against the dictionary writer).
        length = self.bytes.read_byte()
        _bytes = self.bytes.read(4 * length)
        return list(struct.unpack('{}i'.format(length), _bytes))

    def size(self):
        # Number of words, as recorded by the lexicon, not derived from the buffer.
        return self._word_size
import tokenizer 18 | 19 | 20 | class MorphemeList: 21 | 22 | @classmethod 23 | def empty(cls): 24 | return MorphemeList(None, None, None, []) 25 | 26 | def __init__(self, input_, grammar, lexicon, path): 27 | self.input_text = input_ 28 | self.grammar = grammar 29 | self.lexicon = lexicon 30 | self.path = path 31 | 32 | def __getitem__(self, index): 33 | n_morphs = len(self.path) 34 | if index >= n_morphs or index < -n_morphs: 35 | raise IndexError("Morpheme list index out of range") 36 | if index < 0: 37 | return morpheme.Morpheme(self, n_morphs + index) 38 | return morpheme.Morpheme(self, index) 39 | 40 | def __len__(self): 41 | return len(self.path) 42 | 43 | def __iter__(self): 44 | for index in range(len(self.path)): 45 | yield morpheme.Morpheme(self, index) 46 | return 47 | 48 | def __str__(self): 49 | return ''.join([mm.surface() for mm in self]) 50 | 51 | def get_begin(self, index): 52 | return self.input_text.get_original_index(self.path[index].get_begin()) 53 | 54 | def get_end(self, index): 55 | return self.input_text.get_original_index(self.path[index].get_end()) 56 | 57 | def get_surface(self, index): 58 | begin = self.get_begin(index) 59 | end = self.get_end(index) 60 | return self.input_text.get_original_text()[begin:end] 61 | 62 | def get_word_info(self, index): 63 | return self.path[index].get_word_info() 64 | 65 | def split(self, mode, index, wi): 66 | if mode is tokenizer.Tokenizer.SplitMode.A: 67 | word_ids = wi.a_unit_split 68 | elif mode is tokenizer.Tokenizer.SplitMode.B: 69 | word_ids = wi.b_unit_split 70 | else: 71 | return [self.__getitem__(index)] 72 | 73 | if len(word_ids) == 0 or len(word_ids) == 1: 74 | return [self.__getitem__(index)] 75 | 76 | offset = self.path[index].get_begin() 77 | nodes = [] 78 | for wid in word_ids: 79 | n = latticenode.LatticeNode(self.lexicon, 0, 0, 0, wid) 80 | n.set_begin(offset) 81 | offset += n.get_word_info().head_word_length 82 | n.set_end(offset) 83 | nodes.append(n) 84 | 85 | return 
MorphemeList(self.input_text, self.grammar, self.lexicon, nodes) 86 | 87 | def is_oov(self, index): 88 | return self.path[index].is_oov() 89 | 90 | def get_internal_cost(self): 91 | return self.path[-1].get_path_cost() - self.path[0].get_path_cost() 92 | 93 | def size(self): 94 | return len(self.path) 95 | -------------------------------------------------------------------------------- /tests/resources/dict/lex.csv: -------------------------------------------------------------------------------- 1 | た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,* 2 | に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 3 | に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 4 | 京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5 5 | 東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* 6 | 東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 7 | 東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* 8 | 行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 9 | 行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,* 10 | 都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,* 11 | アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,* 12 | アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,* 13 | アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* 14 | 0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,* 15 | 1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,* 16 | 2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,* 17 | 3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,* 18 | 4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,* 19 | 5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,* 20 | 6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,* 21 | 7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,* 22 | 8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,* 23 | 9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,* 24 | 〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,* 25 | 一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,* 26 | 二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,* 27 | 三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,* 28 | 四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,* 29 | 
五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,* 30 | 六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,* 31 | 七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,* 32 | 八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,* 33 | 九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,* 34 | 六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,* 35 | いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 36 | いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 37 | 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,2478,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* 38 | 特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,* 
class TestJoinKatakanaOOVPlugin(unittest.TestCase):
    """Tests for JoinKatakanaOovPlugin against the bundled test dictionary.

    The test lexicon (tests/resources/dict/lex.csv) contains アイ and アイウ
    as ordinary nouns and アイアイウ as a 名詞-固有名詞-地名-一般 entry, which
    the individual tests rely on.
    """

    def setUp(self):
        # build a dictionary and tokenizer from the bundled test resources
        resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
        self.dict_ = Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir)
        self.tokenizer = self.dict_.create()
        # construct the plugin from its entry in the pathRewritePlugin settings
        self.plugin = JoinKatakanaOovPlugin(settings['pathRewritePlugin'][1])

    def test_katakana_length(self):
        # アイ, アイウ in the dictionary
        # below the threshold the two known words are kept as two nodes;
        # at min_length 3 the whole run is joined into a single OOV node
        self.plugin._min_length = 0
        path = self.get_path('アイアイウ')
        self.assertEqual(2, len(path))

        self.plugin._min_length = 1
        path = self.get_path('アイアイウ')
        self.assertEqual(2, len(path))

        self.plugin._min_length = 2
        path = self.get_path('アイアイウ')
        self.assertEqual(2, len(path))

        self.plugin._min_length = 3
        path = self.get_path('アイアイウ')
        self.assertEqual(1, len(path))

    def test_pos(self):
        # アイアイウ is 名詞-固有名詞-地名-一般 in the dictionary
        # a known word covering the whole run must win over an OOV join
        self.plugin._min_length = 3
        path = self.get_path('アイアイウ')
        self.assertEqual(1, len(path))
        self.assertFalse(path[0].is_oov())
        self.assertEqual(['名詞', '固有名詞', '地名', '一般', '*', '*'],
                         self.dict_.grammar.get_part_of_speech_string(path[0].get_word_info().pos_id))

    def test_starts_with_middle(self):
        # the joinable run starts after a leading known word
        self.plugin._min_length = 3
        path = self.get_path('アイウアイアイウ')
        self.assertEqual(1, len(path))

    def test_starts_with_tail(self):
        # the joinable run ends at the tail of the input
        self.plugin._min_length = 3
        path = self.get_path('アイウアイウアイ')
        self.assertEqual(1, len(path))

    def test_with_nooovbow(self):
        # ァ is NOOOVBOW, so a join must not begin on it at the head of input
        self.plugin._min_length = 3
        path = self.get_path('ァアイアイウ')
        self.assertEqual(2, len(path))
        self.assertEqual('ァ', path[0].get_word_info().surface)

        path = self.get_path('アイウァアイウ')
        self.assertEqual(1, len(path))

    def get_path(self, text: str):
        """Tokenize ``text``, run the plugin on the best path, and return it."""
        input_ = UTF8InputTextBuilder(text, self.tokenizer._grammar).build()
        self.tokenizer._build_lattice(input_)
        path = self.tokenizer._lattice.get_best_path()
        self.plugin.rewrite(input_, path, self.tokenizer._lattice)
        self.tokenizer._lattice.clear()
        return path
class JoinKatakanaOovPlugin(PathRewritePlugin):
    """Path-rewrite plugin that joins adjacent katakana nodes into one OOV node.

    A join is triggered by any node that is OOV or shorter than
    ``_min_length`` code points and consists of katakana; the run of
    katakana nodes around it is then concatenated into a single node with
    the configured OOV part of speech.
    """

    def __init__(self, json_obj):
        # settings keys: 'oovPOS' (required, validated in set_up) and
        # optional 'minLength' (code points; default 1)
        self.__pos = json_obj['oovPOS']
        self._min_length = 1
        if 'minLength' in json_obj:
            self._min_length = json_obj['minLength']
        self.oov_pos_id = None

    def set_up(self, grammar):
        """Resolve the configured OOV POS to a grammar POS id.

        Raises ValueError when 'oovPOS' is missing/empty or not present in
        the grammar's POS table.
        """
        if not self.__pos:
            raise ValueError("oovPOS is undefined")
        self.oov_pos_id = grammar.get_part_of_speech_id(self.__pos)
        if self.oov_pos_id < 0:
            raise ValueError("oovPOS is invalid")

    def rewrite(self, text, path, lattice):
        """Scan ``path`` and splice each joinable katakana run into one node."""
        i = 0
        while True:
            if i >= len(path):
                break
            node = path[i]
            # only a node that is (OOV or shorter than min_length) AND
            # katakana can trigger a join
            if not (node.is_oov() or self.is_shorter(self._min_length, text, node)) or \
                    not self.is_katakana_node(text, node):
                i += 1
                continue
            # widen left: step back over katakana nodes, then step forward
            # once past the first non-katakana node (or clamp at 0)
            begin = i - 1
            while True:
                if begin < 0:
                    break
                if not self.is_katakana_node(text, path[begin]):
                    begin += 1
                    break
                begin -= 1
            begin = max(0, begin)
            # widen right: extend over consecutive katakana nodes
            end = i + 1
            while True:
                if end >= len(path):
                    break
                if not self.is_katakana_node(text, path[end]):
                    break
                end += 1
            pass
            # advance begin past nodes that may not start an OOV word
            # (NOOOVBOW characters)
            while begin != end and not self.can_oov_bow_node(text, path[begin]):
                begin += 1
            if (end - begin) > 1:
                # splice the run [begin, end) into a single OOV node
                self.concatenate_oov(path, begin, end, self.oov_pos_id, lattice)
                i = begin + 1  # skip next node, as we already know it is not a joinable katakana
            i += 1

    def is_katakana_node(self, text, node):
        """Whether every character spanned by ``node`` is in the KATAKANA category."""
        return CategoryType.KATAKANA in self.get_char_category_types(text, node)

    def is_one_char(self, text, node):
        """Whether ``node`` spans exactly one code point."""
        b = node.get_begin()
        return b + text.get_code_points_offset_length(b, 1) == node.get_end()

    def can_oov_bow_node(self, text, node):
        """Whether an OOV word may begin at ``node`` (not a NOOOVBOW character)."""
        return CategoryType.NOOOVBOW not in text.get_char_category_types(node.get_begin())

    @staticmethod
    def is_shorter(length: int, text: UTF8InputText, node: LatticeNode):
        """Whether ``node`` spans fewer than ``length`` code points."""
        return text.code_point_count(node.get_begin(), node.get_end()) < length
class UTF8InputText:
    """Tokenizer input held both as a (possibly normalized) string and as its
    UTF-8 byte sequence.

    ``offsets`` maps byte positions back to original-text offsets and
    ``byte_indexes`` maps byte positions to character indices of the
    modified text; the category lists are indexed per character/byte as
    built by the input-text builder.
    """

    def __init__(self, grammar, original_text, modified_text, bytes_, offsets, byte_indexes, char_categories, char_category_continuities, can_bow_list=None):
        # note: ``grammar`` is accepted for interface compatibility but not stored
        self.original_text = original_text
        self.modified_text = modified_text
        self.bytes = bytes_
        self.offsets = offsets
        self.byte_indexes = byte_indexes
        self.char_categories = char_categories
        self.char_category_continuities = char_category_continuities
        self.can_bow_list = can_bow_list

    def get_original_text(self):
        """The text as supplied by the caller, before normalization."""
        return self.original_text

    def get_text(self):
        """The modified (normalized) text."""
        return self.modified_text

    def get_byte_text(self):
        """The UTF-8 encoding of the modified text."""
        return self.bytes

    def get_substring(self, begin, end):
        """Substring of the modified text between byte offsets begin and end.

        Raises IndexError for a negative begin, an end past the byte length,
        or an inverted range.
        """
        if begin < 0:
            raise IndexError(begin)
        if end > len(self.bytes):
            raise IndexError(end)
        if begin > end:
            raise IndexError(end - begin)

        start = self.byte_indexes[begin]
        stop = self.byte_indexes[end]
        return self.modified_text[start:stop]

    def get_offset_text_length(self, index):
        """Character index of the modified text at byte offset ``index``."""
        return self.byte_indexes[index]

    def get_original_index(self, index):
        """Original-text offset corresponding to byte offset ``index``."""
        return self.offsets[index]

    def get_char_category_types(self, begin, end=None):
        """Category types at byte offset ``begin``, or the categories common
        to every character in [begin, end).

        Returns [] when the continuous category run starting at ``begin``
        does not reach ``end``.
        """
        if end is None:
            return self.char_categories[self.byte_indexes[begin]]
        if begin + self.get_char_category_continuous_length(begin) < end:
            return []
        lo = self.byte_indexes[begin]
        hi = self.byte_indexes[end]
        common = set(self.char_categories[lo])
        for pos in range(lo + 1, hi):
            common &= self.char_categories[pos]
        return common

    def get_char_category_continuous_length(self, index):
        """Remaining length (in bytes) of the category run starting at ``index``."""
        return self.char_category_continuities[index]

    def get_code_points_offset_length(self, index, code_point_offset):
        """Byte length covering ``code_point_offset`` code points from ``index``."""
        target = self.byte_indexes[index] + code_point_offset
        byte_length = 0
        for pos in range(index, len(self.bytes)):
            if self.byte_indexes[pos] >= target:
                break
            byte_length += 1
        return byte_length

    def can_bow(self, idx: int) -> bool:
        """Whether a word may begin at byte offset ``idx``.

        True only on a UTF-8 lead byte whose character is flagged as a
        possible beginning-of-word.
        """
        is_lead_byte = (self.bytes[idx] & 0xC0) != 0x80
        return is_lead_byte and self.can_bow_list[self.byte_indexes[idx]]

    def code_point_count(self, begin: int, end: int):
        """Number of code points between byte offsets begin and end."""
        return self.byte_indexes[end] - self.byte_indexes[begin]

    def get_word_candidate_length(self, index):
        """Byte length from ``index`` to the next possible word boundary."""
        total = len(self.bytes)
        for pos in range(index + 1, total):
            if self.can_bow(pos):
                return pos - index
        return total - index
class PathRewritePlugin(ABC):
    """Base class for plugins that rewrite the best lattice path after search.

    Subclasses implement ``set_up`` and ``rewrite``; the ``concatenate*``
    helpers splice a range of path nodes into a single replacement node.
    """

    @abstractmethod
    def set_up(self, grammar):
        """Prepare the plugin with the dictionary grammar."""
        raise NotImplementedError

    @abstractmethod
    def rewrite(self, text, path, lattice):
        """Rewrite ``path`` (a list of lattice nodes) in place."""
        raise NotImplementedError

    def concatenate(self, path, begin, end, lattice, normalized_form):
        """Replace path[begin:end] with one node merging the covered words.

        The merged node takes the POS of the first node; surface, dictionary
        form and reading are concatenations of the members.  When
        ``normalized_form`` is falsy it is built by concatenation as well.
        Raises IndexError when the range is empty or inverted.
        Returns the new node.
        """
        if begin >= end:
            raise IndexError("begin >= end")
        b = path[begin].get_begin()
        e = path[end - 1].get_end()
        pos_id = path[begin].get_word_info().pos_id
        surface = ""
        length = 0
        normalized_builder, dictionary_builder, reading_builder = "", "", ""
        for i in range(begin, end):
            info = path[i].get_word_info()
            surface += info.surface
            length += info.head_word_length
            if not normalized_form:
                normalized_builder += info.normalized_form
            dictionary_builder += info.dictionary_form
            reading_builder += info.reading_form

        # prefer the caller-supplied normalized form over the concatenation
        normalized_form = normalized_form if normalized_form else normalized_builder
        wi = WordInfo(surface=surface, head_word_length=length, pos_id=pos_id,
                      normalized_form=normalized_form, dictionary_form=dictionary_builder, dictionary_form_word_id=-1,
                      reading_form=reading_builder, a_unit_split=[], b_unit_split=[], word_structure=[], synonym_group_ids=[])

        node = lattice.create_node()
        node.set_range(b, e)
        node.set_word_info(wi)

        # splice the new node into the path in place of the merged range
        path[begin:end] = [node]
        return node

    def concatenate_oov(self, path, begin, end, pos_id, lattice):
        """Replace path[begin:end] with one OOV node of the given POS id.

        If the lattice already has a minimum-cost node spanning the same
        byte range, that node is reused instead of building a new one.
        Raises IndexError when the range is empty or inverted.
        Returns the spliced-in node.
        """
        if begin >= end:
            raise IndexError("begin >= end")
        b = path[begin].get_begin()
        e = path[end - 1].get_end()

        n = lattice.get_minimum_node(b, e)
        if n is not None:
            path[begin:end] = [n]
            return n

        surface = ""
        length = 0
        for i in range(begin, end):
            info = path[i].get_word_info()
            surface += info.surface
            length += info.head_word_length

        # OOV words use their surface as normalized and dictionary form
        wi = WordInfo(surface=surface, head_word_length=length, pos_id=pos_id,
                      normalized_form=surface, dictionary_form=surface, dictionary_form_word_id=-1,
                      reading_form="", a_unit_split=[], b_unit_split=[], word_structure=[], synonym_group_ids=[])

        node = lattice.create_node()
        node.set_range(b, e)
        node.set_word_info(wi)
        node.set_oov()

        path[begin:end] = [node]
        return node

    def get_char_category_types(self, text, node):
        """Category types common to all characters spanned by ``node``."""
        return text.get_char_category_types(node.get_begin(), node.get_end())
from .dictionarylib.wordinfo import WordInfo

# placeholder WordInfo returned for nodes that have no definition yet
__NULL_SURFACE = '(null)'
UNK =\
    WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
             __NULL_SURFACE, __NULL_SURFACE, [], [], [], [])

cdef class LatticeNode:
    """A node of the tokenization lattice.

    A node covers a byte range [begin, end) of the input and carries
    connection ids, a cost and a word id resolved through ``lexicon``.
    Constructed with no arguments it is an "undefined" node whose word
    info is the UNK placeholder until ``set_word_info`` is called.
    """

    def __init__(self, lexicon=None, left_id=None, right_id=None, cost=None, word_id=None):

        self.begin = 0
        self.end = 0
        self.word_id = 0
        self._is_oov = False
        self.best_previous_node = None
        self.is_connected_to_bos = False
        # overrides the lexicon lookup when set (see set_word_info)
        self.extra_word_info = None

        self._is_defined = True
        # all-None arguments mean an empty, undefined node
        if lexicon is left_id is right_id is cost is word_id is None:
            self._is_defined = False
            return
        self.lexicon = lexicon
        self.left_id = left_id
        self.right_id = right_id
        self.cost = cost
        self.word_id = word_id

    def set_parameter(self, left_id: int, right_id: int, cost: int) -> None:
        """Set the connection ids and word cost."""
        self.left_id = left_id
        self.right_id = right_id
        self.cost = cost

    def get_begin(self) -> int:
        return self.begin

    def set_begin(self, begin) -> None:
        self.begin = begin

    def get_end(self) -> int:
        return self.end

    def set_end(self, end) -> None:
        self.end = end

    def set_range(self, begin: int, end: int) -> None:
        """Set both byte offsets of the covered range at once."""
        self.begin = begin
        self.end = end

    def is_oov(self):
        """Whether this node represents an out-of-vocabulary word."""
        return self._is_oov

    def set_oov(self):
        self._is_oov = True

    def is_defined(self):
        return self._is_defined

    def set_defined(self):
        self._is_defined = True

    def get_word_info(self) -> WordInfo:
        """Return this node's WordInfo.

        Resolution order: UNK placeholder for undefined nodes, then any
        explicitly attached info, then the lexicon lookup by word id.
        """
        if not self._is_defined:
            return UNK
        if self.extra_word_info:
            return self.extra_word_info
        return self.lexicon.get_word_info(self.word_id)

    def set_word_info(self, word_info: WordInfo) -> None:
        """Attach explicit word info, overriding the lexicon, and mark defined."""
        self.extra_word_info = word_info
        self._is_defined = True

    def get_path_cost(self) -> int:
        return self.cost

    def get_left_id(self) -> int:
        return self.left_id

    def get_right_id(self) -> int:
        return self.right_id

    def get_word_id(self) -> int:
        return self.word_id

    def get_dictionary_id(self) -> int:
        """Id of the dictionary this word came from; -1 for undefined or
        explicitly attached (non-lexicon) word info."""
        if not self._is_defined or self.extra_word_info:
            return -1
        return self.lexicon.get_dictionary_id(self.word_id)  # self.word_id >> 28

    def __str__(self):
        surface = "(None)"
        if self.word_id >= 0 or self.extra_word_info:
            surface = self.get_word_info().surface

        return "{} {} {}({}) {} {} {}".format(
            self.get_begin(), self.get_end(), surface, self.word_id, self.left_id, self.right_id, self.cost
        )
class Grammar:
    """Part-of-speech table and connection-cost matrix of a binary dictionary.

    Parses the grammar section of ``bytes_`` starting at ``offset``:
    a POS table (each entry is ``_POS_DEPTH`` strings) followed by the
    left-id/right-id sizes and the 16-bit connection-cost matrix, which is
    exposed zero-copy as a memoryview over the underlying buffer.
    """

    # cost value meaning "these two nodes may never connect"
    INHIBITED_CONNECTION = 0x7fff

    def __init__(self, bytes_, offset):
        self._POS_DEPTH = 6
        self._BOS_PARAMETER = [0, 0, 0]
        self._EOS_PARAMETER = [0, 0, 0]

        self.char_category = None
        self.is_copied_connect_table = False

        # save/restore the caller's read position around the parse
        original_offset = bytes_.tell()
        bytes_.seek(offset)
        pos_size = self.bytes_get_short(bytes_)
        self.pos_list = []
        for i in range(pos_size):
            pos = []
            for j in range(self._POS_DEPTH):
                pos.append(self.bytes_get_string(bytes_))
            self.pos_list.append(pos)
        left_id_size = self.bytes_get_short(bytes_)
        right_id_size = self.bytes_get_short(bytes_)
        connect_table_offset = bytes_.tell()

        # header + 2 bytes per matrix cell
        self.storage_size = (connect_table_offset - offset) + 2 * left_id_size * right_id_size

        # zero-copy view onto the connection matrix; cast to 2-D shorts
        # unless the matrix is empty (cast would reject a zero-size shape)
        self._matrix_view = \
            memoryview(bytes_)[connect_table_offset: connect_table_offset + 2 * left_id_size * right_id_size]
        if left_id_size * right_id_size != 0:
            self._matrix_view = self._matrix_view.cast('h', shape=[left_id_size, right_id_size])
        bytes_.seek(original_offset)

    def get_storage_size(self):
        """Byte length of the grammar section, including the matrix."""
        return self.storage_size

    def get_part_of_speech_size(self):
        """Number of POS entries in the table."""
        return len(self.pos_list)

    def get_part_of_speech_string(self, pos_id):
        """The POS entry (list of strings) for ``pos_id``."""
        return self.pos_list[pos_id]

    def get_part_of_speech_id(self, pos):
        """Id of the POS entry equal to ``pos``, or -1 when absent."""
        return self.pos_list.index(pos) if pos in self.pos_list else -1

    def get_connect_cost(self, left: int, right: int) -> int:
        """ Returns connection cost of nodes

        Args:
            left: right-ID of left node
            right: left-ID of right node

        Returns:
            cost of connection

        """
        # note the [right, left] index order into the cast view; this matches
        # the on-disk layout of the connection matrix
        return self._matrix_view[right, left]

    def set_connect_cost(self, left: int, right: int, cost: int) -> None:
        """ Sets connection cost of nodes

        Note: bytes_ must be ACCESS_COPY mode

        Args:
            left: right-ID of left node
            right: left-ID of right node
            cost: cost of connection

        """
        self._matrix_view[right, left] = cost

    def get_bos_parameter(self):
        """[left_id, right_id, cost] of the beginning-of-sentence node."""
        return self._BOS_PARAMETER

    def get_eos_parameter(self):
        """[left_id, right_id, cost] of the end-of-sentence node."""
        return self._EOS_PARAMETER

    def get_character_category(self):
        return self.char_category

    def set_character_category(self, char_category):
        self.char_category = char_category

    @staticmethod
    def bytes_get_string(bytes_):
        """Read a length-prefixed UTF-16 string from the current position."""
        length = bytes_.read_byte()
        string = bytes_.read(2 * length)
        return string.decode('utf-16')

    @staticmethod
    def bytes_get_short(bytes_):
        """Read a 16-bit little-endian signed integer from the current position."""
        return int.from_bytes(bytes_.read(2), 'little', signed=True)

    def add_pos_list(self, grammar):
        """Append another grammar's POS entries (used when merging user dictionaries)."""
        self.pos_list.extend(grammar.pos_list)
class DefaultInputTextPlugin(InputTextPlugin):
    """Input-text plugin applying character replacement and NFKC normalization.

    Rules come from a ``rewrite.def`` file: single-character lines are
    exempted from normalization, two-column lines define verbatim
    replacements that take precedence over normalization.
    """

    def __init__(self):
        # characters exempt from NFKC normalization
        self.ignore_normalize_set = set()
        # first char of a replacement key -> longest key length starting with it
        self.key_lengths = {}
        # replacement key string -> replacement target string
        self.replace_char_map = {}

    def set_up(self) -> None:
        """Load the rewrite rules from the default resource directory."""
        rewrite_def = os.path.join(config.DEFAULT_RESOURCEDIR, "rewrite.def")
        if not rewrite_def:
            raise AttributeError("rewriteDef is not defined")
        self.read_rewrite_lists(rewrite_def)

    def rewrite(self, builder: InputTextPlugin.Builder) -> None:
        """Apply replacement rules, lowercasing and NFKC normalization.

        Replacements are recorded through ``builder.replace`` with offsets
        adjusted for earlier length-changing edits.
        """
        offset = 0
        next_offset = 0
        text = builder.get_text()

        i = -1
        while True:
            i += 1
            if i >= len(text):
                break
            textloop = False
            offset += next_offset
            next_offset = 0
            original = text[i]

            # 1. replace char without normalize
            # try the longest replacement key starting at i first
            max_length = min(self.key_lengths.get(original, 0), len(text) - i)
            for length in range(max_length, 0, -1):
                replace = self.replace_char_map.get(text[i:i + length])
                if replace:
                    builder.replace(i + offset, i + length + offset, replace)
                    next_offset += len(replace) - length
                    # skip over the consumed key (the while loop adds 1 more)
                    i += length - 1
                    textloop = True
                    break
            if textloop:
                continue

            # 2. normalize
            # 2-1. capital alphabet (not only Latin but Greek, Cyrillic, etc.) -> small
            lower = original.lower()
            if lower in self.ignore_normalize_set:
                if original == lower:
                    continue
                replace = lower
            else:
                # 2-2. normalize (except in ignoreNormalize)
                # e.g. full-width alphabet -> half-width / ligature / etc.
                replace = normalize("NFKC", lower)
            next_offset = len(replace) - 1
            if original != replace:
                builder.replace(i + offset, i + 1 + offset, replace)

    def read_rewrite_lists(self, rewrite_def):
        """Parse a rewrite definition file into the rule tables.

        Blank lines and '#' comments are skipped.  Raises RuntimeError on a
        malformed line, reporting its 1-based line number.
        """
        with open(rewrite_def, "r", encoding="utf-8") as f:
            # enumerate from 1 so diagnostics report human-readable line
            # numbers (enumerate(f) would be off by one)
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if (not line) or line.startswith("#"):
                    continue
                cols = line.split()

                # ignored normalize list
                if len(cols) == 1:
                    key = cols[0]
                    if len(key) != 1:
                        raise RuntimeError("{} is not character at line {}".format(key, line_no))
                    self.ignore_normalize_set.add(key)
                # replace char list
                elif len(cols) == 2:
                    if cols[0] in self.replace_char_map:
                        raise RuntimeError("{} is already defined at line {}".format(cols[0], line_no))
                    if self.key_lengths.get(cols[0][0], 0) < len(cols[0]):
                        self.key_lengths[cols[0][0]] = len(cols[0])
                    self.replace_char_map[cols[0]] = cols[1]
                else:
                    raise RuntimeError("invalid format at line {}".format(line_no))
class UndefinedDictionaryError(Exception):
    """Raised when a required dictionary cannot be resolved."""
    pass


class Dictionary:
    """Loads the system dictionary, plugins and user dictionaries, and
    creates Tokenizer instances that share those resources."""

    def __init__(self, config_path=None, resource_dir=None, dict_type=None):
        config.settings.set_up(config_path, resource_dir, dict_type)
        # populated below, in dependency order
        self.grammar = None
        self.lexicon = None
        self.header = None
        self.dictionaries = []
        self.input_text_plugins = []
        self.edit_connection_plugin = []
        self.oov_provider_plugins = []
        self.path_rewrite_plugins = []

        self._read_system_dictionary(config.settings.system_dict_path())

        # self.edit_connection_plugin = [InhibitConnectionPlugin()]
        # for p in self.edit_connection_plugin:
        #     p.set_up(self.grammar)
        #     p.edit(self.grammar)

        self._read_character_definition(config.settings.char_def_path())

        self.input_text_plugins = get_input_text_plugins()
        for plugin in self.input_text_plugins:
            plugin.set_up()

        self.oov_provider_plugins = get_oov_plugins()
        if not self.oov_provider_plugins:
            raise AttributeError("no OOV provider")
        for plugin in self.oov_provider_plugins:
            plugin.set_up(self.grammar)

        self.path_rewrite_plugins = get_path_rewrite_plugins()
        for plugin in self.path_rewrite_plugins:
            plugin.set_up(self.grammar)

        for user_dict_path in config.settings.user_dict_paths():
            self._read_user_dictionary(user_dict_path)

    def _read_system_dictionary(self, filename):
        """Load the binary system dictionary; it supplies the base grammar and lexicon."""
        if filename is None:
            raise ValueError("system dictionary is not specified")
        system_dict = BinaryDictionary.from_system_dictionary(filename)
        self.dictionaries.append(system_dict)
        self.grammar = system_dict.grammar
        self.lexicon = LexiconSet(system_dict.lexicon)

    def _read_user_dictionary(self, filename):
        """Load one user dictionary and merge it into the lexicon set."""
        if self.lexicon.is_full():
            raise ValueError('too many dictionaries')
        user_dict = BinaryDictionary.from_user_dictionary(filename)
        self.dictionaries.append(user_dict)
        user_lexicon = user_dict.lexicon
        # a throwaway tokenizer (no path-rewrite plugins) estimates word costs
        tokenizer_ = Tokenizer(self.grammar, self.lexicon, self.input_text_plugins, self.oov_provider_plugins, [])
        user_lexicon.calculate_cost(tokenizer_)
        self.lexicon.add(user_lexicon, self.grammar.get_part_of_speech_size())
        if user_dict.grammar:
            self.grammar.add_pos_list(user_dict.grammar)

    def _read_character_definition(self, filename):
        """Attach character-category definitions to the grammar (no-op without a grammar)."""
        if self.grammar is None:
            return
        char_category = dictionarylib.charactercategory.CharacterCategory()
        char_category.read_character_definition(filename)
        self.grammar.set_character_category(char_category)

    def close(self):
        """Drop grammar/lexicon references and close every loaded dictionary."""
        self.grammar = None
        self.lexicon = None
        for loaded in self.dictionaries:
            loaded.close()

    def create(self, mode=None):
        """Build a Tokenizer that shares this dictionary's resources."""
        return Tokenizer(
            self.grammar, self.lexicon, self.input_text_plugins, self.oov_provider_plugins, self.path_rewrite_plugins, mode=mode)
class DoubleArrayLexicon(Lexicon):
    """Lexicon backed by a darts-clone double-array trie over a memory-mapped
    binary dictionary."""

    # word costs are stored as signed 16-bit values
    __SIGNED_SHORT_MIN = -32768
    __SIGNED_SHORT_MAX = 32767
    # bonus applied per morpheme when estimating costs for user-dictionary words
    __USER_DICT_COST_PER_MORPH = -20

    trie = None
    word_id_table = None
    word_params = None

    def __init__(self, bytes_: mmap.mmap, offset: int, has_synonym_gid: bool):
        """Parse the lexicon section of the mapped dictionary starting at ``offset``.

        Sections are laid out back-to-back: trie size (4 bytes LE), trie array,
        word-ID table, word parameters, then word infos.
        """
        self.trie = DoubleArray()
        bytes_.seek(offset)
        size = int.from_bytes(bytes_.read(4), 'little')
        offset += 4

        # zero-copy view over the trie units (4 bytes per unit)
        array = memoryview(bytes_)[offset:offset + size * 4]
        self.trie.set_array(array, size)
        offset += self.trie.total_size()

        self.word_id_table = wordidtable.WordIdTable(bytes_, offset)
        offset += self.word_id_table.storage_size()

        self.word_params = wordparameterlist.WordParameterList(bytes_, offset)
        offset += self.word_params.storage_size()

        self.word_infos = wordinfolist.WordInfoList(bytes_, offset, self.word_params.get_size(), has_synonym_gid)

    def __del__(self):
        # explicitly drop the parameter list so its view into the mmap is released
        del self.word_params

    def lookup(self, text: bytes, offset: int) -> Lexicon.Itr:
        """Yield ``(word_id, end_offset)`` for every entry that is a prefix of
        ``text`` starting at ``offset``."""
        key = text[offset:]
        result = self.trie.common_prefix_search(key, length=len(key))
        for index, length in result:
            word_ids = self.word_id_table.get(index)
            length += offset  # convert match length back to an absolute offset
            for word_id in word_ids:
                yield (word_id, length)

    def get_left_id(self, word_id: int) -> int:
        return self.word_params.get_left_id(word_id)

    def get_right_id(self, word_id: int) -> int:
        return self.word_params.get_right_id(word_id)

    def get_cost(self, word_id: int) -> int:
        return self.word_params.get_cost(word_id)

    def get_word_info(self, word_id: int) -> 'WordInfo':  # noqa: F821
        return self.word_infos.get_word_info(word_id)

    def size(self) -> int:
        return self.word_params.size

    def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int:
        """Return the ID of the entry matching all three attributes, or -1.

        Fast path: surface candidates via trie lookup; fallback: full linear
        scan over every word info.
        """
        for wid, _ in self.lookup(headword.encode('utf-8'), 0):
            if self._compare_word_id(wid, headword, pos_id, reading_form):
                return wid

        for wid in range(self.word_infos.size()):
            if self._compare_word_id(wid, headword, pos_id, reading_form):
                return wid

        return -1

    def _compare_word_id(self, wid: int, headword: str, pos_id: int, reading_form: str) -> bool:
        # True when the stored entry matches surface, POS and reading exactly
        info = self.word_infos.get_word_info(wid)
        return info.surface == headword \
            and info.pos_id == pos_id \
            and info.reading_form == reading_form

    def get_dictionary_id(self, word_id: int) -> int:
        # a standalone DoubleArrayLexicon always belongs to dictionary 0
        return 0

    def calculate_cost(self, tokenizer) -> None:
        """Fill placeholder costs (SHORT_MIN) by tokenizing each entry's surface,
        clamping the estimate into the signed-short range."""
        for wid in range(self.word_params.size):
            if self.get_cost(wid) != self.__SIGNED_SHORT_MIN:
                continue
            surface = self.get_word_info(wid).surface
            ms = tokenizer.tokenize(surface, None)
            cost = ms.get_internal_cost() + self.__USER_DICT_COST_PER_MORPH * len(ms)
            cost = min(cost, self.__SIGNED_SHORT_MAX)
            cost = max(cost, self.__SIGNED_SHORT_MIN)
            self.word_params.set_cost(wid, cost)
class LexiconSet(Lexicon):
    """Compound lexicon merging one system lexicon with up to 15 user lexicons.

    Word IDs handed out by this class encode the dictionary index in the top
    4 bits (bits 28-31) and the per-dictionary word ID in the low 28 bits.
    """

    __MAX_DICTIONARIES = 16

    def __init__(self, system_lexicon: Lexicon):
        # index 0 is always the system lexicon; its POS offset is 0 by definition
        self.lexicons = [system_lexicon]
        self.pos_offsets = [0]

    def add(self, lexicon: Lexicon, pos_offset: int) -> None:
        """Append a user lexicon; pos_offset is where its user-defined POS IDs start."""
        if lexicon not in self.lexicons:
            self.lexicons.append(lexicon)
            self.pos_offsets.append(pos_offset)

    def is_full(self) -> bool:
        return len(self.lexicons) >= self.__MAX_DICTIONARIES

    def lookup(self, text: str, offset: int) -> Lexicon.Itr:
        if len(self.lexicons) == 1:
            # fast path: no user dictionaries, IDs need no re-encoding
            return self.lexicons[0].lookup(text, offset)
        return self.__lookup(text, offset)

    def __lookup(self, text: str, offset: int) -> Lexicon.Itr:
        # user dictionaries take precedence; the system lexicon (index 0) comes last
        indices = list(range(len(self.lexicons)))[1:] + [0]
        for dict_id in indices:
            pairs = self.lexicons[dict_id].lookup(text, offset)
            for pair in pairs:
                yield (self.build_word_id(dict_id, pair[0]), pair[1])

    def get_left_id(self, word_id: int) -> int:
        return self.lexicons[self.get_dictionary_id(word_id)]\
            .get_left_id(self.get_word_id1(word_id))

    def get_right_id(self, word_id: int) -> int:
        return self.lexicons[self.get_dictionary_id(word_id)]\
            .get_right_id(self.get_word_id1(word_id))

    def get_cost(self, word_id: int) -> int:
        return self.lexicons[self.get_dictionary_id(word_id)]\
            .get_cost(self.get_word_id1(word_id))

    # NOTE(review): lru_cache on an instance method keeps `self` alive for the
    # cache's lifetime; acceptable here because a LexiconSet lives as long as
    # the owning Dictionary.
    @lru_cache(1024)
    def get_word_info(self, word_id: int) -> 'WordInfo':  # noqa: F821
        """Return the WordInfo for a composite word ID, remapping user-defined
        POS IDs and split word IDs into this set's combined numbering."""
        dic_id = self.get_dictionary_id(word_id)
        winfo = self.lexicons[dic_id].get_word_info(self.get_word_id1(word_id))
        pos_id = winfo.pos_id
        if dic_id > 0 and pos_id >= self.pos_offsets[1]:  # user defined part-of-speech
            winfo.pos_id = winfo.pos_id - self.pos_offsets[1] + self.pos_offsets[dic_id]
        winfo.a_unit_split = self.convert_split(winfo.a_unit_split, dic_id)
        winfo.b_unit_split = self.convert_split(winfo.b_unit_split, dic_id)
        winfo.word_structure = self.convert_split(winfo.word_structure, dic_id)
        return winfo

    def get_dictionary_id(self, word_id: int) -> int:
        return word_id >> 28

    @staticmethod
    def get_word_id1(word_id: int) -> int:
        return 0x0FFFFFFF & word_id

    def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int:
        """Return the composite word ID matching all three attributes, or the
        system lexicon's result (-1 on miss) when no dictionary has it."""
        for dic_id in range(len(self.lexicons)):
            wid = self.lexicons[dic_id].get_word_id(headword, pos_id, reading_form)
            # BUG FIX: a found word has a non-negative ID (the per-dictionary
            # lookup returns -1 on miss). The original `wid <= 0` skipped every
            # real match except ID 0 and re-encoded the -1 failure sentinel
            # into a bogus composite ID.
            if wid >= 0:
                return self.build_word_id(dic_id, wid)
        return self.lexicons[0].get_word_id(headword, pos_id, reading_form)

    def build_word_id(self, dict_id, word_id):
        """Pack (dict_id, word_id) into a single composite word ID."""
        if word_id > 0x0FFFFFFF:
            raise AttributeError("word ID is too large: ", word_id)
        if dict_id > len(self.lexicons):
            raise AttributeError("dictionary ID is too large: ", dict_id)
        return (dict_id << 28) | word_id

    def size(self) -> int:
        return sum([lex.size() for lex in self.lexicons])

    def convert_split(self, split: List[int], dict_id: int) -> List[int]:
        """Re-tag split word IDs that point into a user dictionary so they
        reference `dict_id` within this set (mutates and returns `split`)."""
        for i in range(len(split)):
            if self.get_dictionary_id(split[i]) > 0:
                split[i] = self.build_word_id(dict_id, self.get_word_id1(split[i]))
        return split
class JoinNumericPlugin(PathRewritePlugin):
    """Path-rewrite plugin that merges consecutive numeric nodes (digits,
    kanji numerals, decimal points and thousands separators) into a single
    node, optionally replacing the surface with a normalized numeric form.
    """

    _numeric_pos_id = None      # POS ID of 名詞/数詞, resolved in set_up()
    _enable_normalize = True    # replace the surface with the parsed value

    def __init__(self, json_obj):
        """``json_obj``: this plugin's settings object from sudachi.json (may be falsy)."""
        self._NUMERIC_POS = ['名詞', '数詞', '*', '*', '*', '*']
        if not json_obj:
            return
        if 'joinKanjiNumeric' in json_obj:
            warnings.warn('joinKanjiNumeric is already nonsense key', SyntaxWarning)
        if 'enableNormalize' in json_obj:
            self._enable_normalize = json_obj['enableNormalize']

    def set_up(self, grammar):
        self._numeric_pos_id = grammar.get_part_of_speech_id(self._NUMERIC_POS)

    def rewrite(self, text, path, lattice):
        """Scan ``path``, feeding numeric-looking nodes into a NumericParser
        and concatenating each maximal well-formed run.

        ``begin_index`` marks where the current numeric run started (-1: none).
        ``comma_as_digit``/``period_as_digit`` are cleared after a parse error
        on that separator, and the scan restarts from the run's beginning.
        """
        begin_index = -1
        comma_as_digit = True
        period_as_digit = True
        parser = NumericParser()
        i = -1

        while i < len(path) - 1:
            i += 1
            node = path[i]
            types = self.get_char_category_types(text, node)
            s = node.get_word_info().normalized_form
            if CategoryType.NUMERIC in types or CategoryType.KANJINUMERIC in types or \
                    (period_as_digit and s == '.') or (comma_as_digit and s == ','):

                if begin_index < 0:
                    parser.clear()
                    begin_index = i

                for c in s:
                    if not parser.append(c):
                        if begin_index >= 0:
                            if parser.error_state == NumericParser.Error.COMMA:
                                comma_as_digit = False
                                i = begin_index - 1  # re-scan without ',' as a digit
                            elif parser.error_state == NumericParser.Error.POINT:
                                period_as_digit = False
                                i = begin_index - 1  # re-scan without '.' as a digit
                        begin_index = -1
                        break
                continue

            if begin_index >= 0:
                if parser.done():
                    self._concat(path, begin_index, i, lattice, parser)
                    i = begin_index + 1  # path shrank; resume right after the merged node
                else:
                    # a trailing separator is not part of the number: join up to it
                    ss = path[i - 1].get_word_info().normalized_form
                    if (parser.error_state == NumericParser.Error.COMMA and ss == ',') or \
                            (parser.error_state == NumericParser.Error.POINT and ss == '.'):
                        self._concat(path, begin_index, i - 1, lattice, parser)
                        i = begin_index + 2
                begin_index = -1
            # separators become usable again once a non-numeric node ends the run
            if not comma_as_digit and s != ',':
                comma_as_digit = True
            if not period_as_digit and s != '.':
                period_as_digit = True

        # flush a numeric run that extends to the end of the path
        if begin_index >= 0:
            if parser.done():
                self._concat(path, begin_index, len(path), lattice, parser)
            else:
                ss = path[-1].get_word_info().normalized_form
                if (parser.error_state == NumericParser.Error.COMMA and ss == ',') or \
                        (parser.error_state == NumericParser.Error.POINT and ss == '.'):
                    self._concat(path, begin_index, len(path) - 1, lattice, parser)

    def _concat(self, path, begin, end, lattice, parser) -> None:
        """Join ``path[begin:end]`` into one node when the run is a numeral."""
        if path[begin].get_word_info().pos_id != self._numeric_pos_id:
            return
        if self._enable_normalize:
            normalized_form = parser.get_normalized()
            # single unchanged node needs no rewrite
            if end - begin > 1 or normalized_form != path[begin].get_word_info().normalized_form:
                self.concatenate(path, begin, end, lattice, normalized_form)
            return
        if end - begin > 1:
            self.concatenate(path, begin, end, lattice, '')
class TestUserDictionaryBuilder(TestCase):
    """Builds a small user dictionary on disk and verifies its contents."""

    def setUp(self):
        """Create a temp dir and load the reference system dictionary."""
        self.test_dir = tempfile.mkdtemp()
        test_resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            'resources')
        self.dict_filename = os.path.join(test_resources_dir, 'system.dic')
        _, _, self.grammar, self.lexicon_set = \
            TestDictionaryBuilder.read_system_dictionary(self.dict_filename)
        self.logger = getLogger()
        self.logger.disabled = True

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_parseline_with_userdefined_POS(self):
        builder = UserDictionaryBuilder(self.grammar, self.lexicon_set, logger=self.logger)
        builder.parse_line('田中,0,0,0,田中,存在,しない,品詞,*,*,*,タナカ,田中,*,A,*,*,*,*'.split(','))
        # the previously unseen POS must be registered in the builder's POS table
        self.assertEqual(1, len(builder.pos_table.get_list()))

    def test_build(self):
        out_path = os.path.join(self.test_dir, 'output.txt')
        in_path = os.path.join(self.test_dir, 'input.txt')

        # lexicon_paths = [self.input_path]
        # matrix_input_stream = open(self.matrix_path, 'r')
        with open(in_path, 'w', encoding='utf-8') as wf:
            wf.write("東京都市,0,0,0,東京都市,名詞,固有名詞,地名,一般,*,*,ヒガシキョウトシ,東京都市,*,B,\"東,名詞,普通名詞,一般,*,*,*,ヒガシ/3/U1\",*,\"4/3/市,名詞,普通名詞,一般,*,*,*,シ\",*\n")
            wf.write('市,-1,-1,0,市,名詞,普通名詞,一般,*,*,*,シ,市,*,A,*,*,*,*\n')

        _, _, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(self.dict_filename)
        header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()), 'test')
        # FIX: manage the output stream with a context manager so the handle is
        # closed even when build() raises (the original leaked it on failure).
        with open(out_path, 'wb') as out_stream:
            out_stream.write(header.to_bytes())
            builder = UserDictionaryBuilder(grammar, lexicon_set, logger=self.logger)
            lexicon_paths = [in_path]
            builder.build(lexicon_paths, None, out_stream)

        buffers, header, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(out_path)
        lexicon = lexicon_set.lexicons[0]

        # header
        self.assertEqual(SYSTEM_DICT_VERSION_2, header.version)
        self.assertEqual('test', header.description)

        # lexicon
        self.assertEqual(0, lexicon.get_left_id(0))
        self.assertEqual(0, lexicon.get_cost(0))
        wi = lexicon.get_word_info(0)
        self.assertEqual('東京都市', wi.surface)
        self.assertEqual('東京都市', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシキョウトシ', wi.reading_form)
        self.assertEqual(3, wi.pos_id)
        self.assertEqual([4, 3, 1 | (1 << 28)], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        self.assertEqual([4, 3, 1 | (1 << 28)], wi.word_structure)
        lst = lexicon.lookup('東京都市'.encode('utf-8'), 0)
        self.assertEqual((0, len('東京都市'.encode('utf-8'))), lst.__next__())
        with self.assertRaises(StopIteration):
            lst.__next__()

        self.assertEqual(-1, lexicon.get_left_id(1))
        self.assertEqual(0, lexicon.get_cost(1))
        wi = lexicon.get_word_info(1)
        self.assertEqual('市', wi.surface)
        self.assertEqual('市', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('シ', wi.reading_form)
        self.assertEqual(4, wi.pos_id)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        lst = lexicon.lookup('東'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            lst.__next__()
14 | 15 | import json 16 | import os 17 | import warnings 18 | from importlib import import_module 19 | from importlib.util import find_spec 20 | from pathlib import Path 21 | from typing import List 22 | 23 | DEFAULT_RESOURCEDIR = Path(__file__).absolute().parent / 'resources' 24 | DEFAULT_SETTINGFILE = DEFAULT_RESOURCEDIR / 'sudachi.json' 25 | DEFAULT_RESOURCEDIR = DEFAULT_RESOURCEDIR.as_posix() 26 | DEFAULT_SETTINGFILE = DEFAULT_SETTINGFILE.as_posix() 27 | 28 | 29 | def get_absolute_dict_path(dict_type: str) -> str: 30 | pkg_path = Path(import_module('sudachidict_' + dict_type).__file__).parent 31 | dic_path = pkg_path / 'resources' / 'system.dic' 32 | return str(dic_path.absolute()) 33 | 34 | 35 | def to_absolute_resource_path(resource_dir: str, dict_path: str) -> str: 36 | if Path(dict_path).is_absolute(): 37 | return dict_path 38 | else: 39 | return os.path.join(resource_dir, dict_path) 40 | 41 | 42 | def find_dict_path(dict_type='core'): 43 | is_installed = find_spec('sudachidict_{}'.format(dict_type)) 44 | if is_installed: 45 | return get_absolute_dict_path(dict_type) 46 | else: 47 | raise ModuleNotFoundError( 48 | 'Package `sudachidict_{}` dose not exist. 
' 49 | 'You may install it with a command `$ pip install sudachidict_{}`'.format(dict_type, dict_type) 50 | ) 51 | 52 | 53 | class _Settings(object): 54 | 55 | DICT_PATH_KEY = 'systemDict' 56 | CHAR_DEF_KEY = 'characterDefinitionFile' 57 | USER_DICT_PATH_KEY = 'userDict' 58 | 59 | def __init__(self): 60 | self.__is_active = False 61 | self.__dict_ = None 62 | self.__config_path = None 63 | self.resource_dir = None 64 | 65 | def set_up(self, config_path=None, resource_dir=None, dict_type=None) -> None: 66 | config_path = config_path or DEFAULT_SETTINGFILE 67 | self.__config_path = config_path 68 | resource_dir = resource_dir or os.path.dirname(config_path) 69 | with open(config_path, 'r', encoding='utf-8') as f: 70 | self.__dict_ = json.load(f) 71 | self.__is_active = True 72 | self.resource_dir = resource_dir 73 | if dict_type is not None: 74 | if dict_type in ['small', 'core', 'full']: 75 | if self.DICT_PATH_KEY in self.__dict_ and self.__dict_[self.DICT_PATH_KEY] and \ 76 | 'sudachidict_{}'.format(dict_type) not in self.__dict_[self.DICT_PATH_KEY]: 77 | warnings.warn( 78 | 'Two system dictionaries may be specified. 
' 79 | 'The `sudachidict_{}` defined "dict_type" overrides those defined in the config file.'.format(dict_type) 80 | ) 81 | self.__dict_[self.DICT_PATH_KEY] = find_dict_path(dict_type=dict_type) 82 | else: 83 | raise ValueError('"dict_type" must be "small", "core", or "full".') 84 | else: 85 | if self.DICT_PATH_KEY not in self.__dict_ or not self.__dict_[self.DICT_PATH_KEY]: 86 | self.__dict_[self.DICT_PATH_KEY] = find_dict_path() 87 | 88 | def __setitem__(self, key, value): 89 | if not self.__is_active: 90 | self.set_up() 91 | self.__dict_[key] = value 92 | 93 | def __getitem__(self, key): 94 | if not self.__is_active: 95 | self.set_up() 96 | return self.__dict_[key] 97 | 98 | def keys(self): 99 | return self.__dict_.keys() 100 | 101 | def __contains__(self, item): 102 | return item in self.__dict_.keys() 103 | 104 | def system_dict_path(self) -> str: 105 | dict_path = self.__dict_[self.DICT_PATH_KEY] 106 | return to_absolute_resource_path(self.resource_dir, dict_path) 107 | 108 | def char_def_path(self) -> str: 109 | key = self.CHAR_DEF_KEY 110 | if key in self.__dict_: 111 | return to_absolute_resource_path(self.resource_dir, self.__dict_[key]) 112 | raise KeyError('`{}` not defined in setting file'.format(key)) 113 | 114 | def user_dict_paths(self) -> List[str]: 115 | key = self.USER_DICT_PATH_KEY 116 | if key in self.__dict_: 117 | return [to_absolute_resource_path(self.resource_dir, path) for path in self.__dict_[key]] 118 | return [] 119 | 120 | 121 | settings = _Settings() 122 | -------------------------------------------------------------------------------- /tests/dictionarylib/test_doublearraylexicon.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class TestDoubleArrayLexicon(unittest.TestCase):
    """Exercises DoubleArrayLexicon against the prebuilt tests/resources/system.dic."""

    # byte size of the grammar section that precedes the lexicon in system.dic
    __GRAMMAR_SIZE = 470

    def setUp(self):
        # Copied from sudachipy.dictionary.Dictionary.read_system_dictionary
        test_resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            'resources')
        filename = os.path.join(test_resources_dir, 'system.dic')
        with open(filename, 'rb') as system_dic:
            bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
            header = DictionaryHeader.from_bytes(bytes_, 0)
            if not header.is_system_dictionary():
                raise Exception('invalid system dictionary')
            # CONSISTENCY FIX: use the named class constant instead of a
            # duplicated magic number 470 for the grammar-section size.
            self.lexicon = DoubleArrayLexicon(bytes_, header.storage_size() + self.__GRAMMAR_SIZE, True)

    def test_lookup(self):
        res = self.lexicon.lookup('東京都'.encode('utf-8'), 0)
        self.assertEqual((4, 3), res.__next__())  # 東

        self.assertEqual((5, 6), res.__next__())  # 東京

        self.assertEqual((6, 9), res.__next__())  # 東京都

        with self.assertRaises(StopIteration):
            res.__next__()

        res = self.lexicon.lookup('東京都に'.encode('utf-8'), 9)
        self.assertEqual((1, 12), res.__next__())  # に(接続助詞)
        self.assertEqual((2, 12), res.__next__())  # に(格助詞)
        with self.assertRaises(StopIteration):
            res.__next__()

        res = self.lexicon.lookup('あれ'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            res.__next__()

    def test_parameters(self):
        # た
        self.assertEqual(1, self.lexicon.get_left_id(0))
        self.assertEqual(1, self.lexicon.get_right_id(0))
        self.assertEqual(8729, self.lexicon.get_cost(0))

        # 東京都
        self.assertEqual(6, self.lexicon.get_left_id(6))
        self.assertEqual(8, self.lexicon.get_right_id(6))
        self.assertEqual(5320, self.lexicon.get_cost(6))

        # 都
        self.assertEqual(8, self.lexicon.get_left_id(9))
        self.assertEqual(8, self.lexicon.get_right_id(9))
        self.assertEqual(2914, self.lexicon.get_cost(9))

    def test_wordinfo(self):
        # た
        wi = self.lexicon.get_word_info(0)
        self.assertEqual('た', wi.surface)
        self.assertEqual(3, wi.head_word_length)
        self.assertEqual(0, wi.pos_id)
        self.assertEqual('た', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('た', wi.dictionary_form)
        self.assertEqual('タ', wi.reading_form)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        self.assertEqual([], wi.word_structure)

        # 行っ
        wi = self.lexicon.get_word_info(8)
        self.assertEqual('行っ', wi.surface)
        self.assertEqual('行く', wi.normalized_form)
        self.assertEqual(7, wi.dictionary_form_word_id)
        self.assertEqual('行く', wi.dictionary_form)

        # 東京都
        wi = self.lexicon.get_word_info(6)
        self.assertEqual('東京都', wi.surface)
        self.assertEqual([5, 9], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        self.assertEqual([5, 9], wi.word_structure)
        self.assertEqual([], wi.synonym_group_ids)

    def test_wordinfo_with_longword(self):
        # 0123456789 * 30
        wi = self.lexicon.get_word_info(36)
        self.assertEqual(300, len(wi.surface))
        self.assertEqual(300, wi.head_word_length)
        self.assertEqual(300, len(wi.normalized_form))
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual(300, len(wi.dictionary_form))
        self.assertEqual(570, len(wi.reading_form))

    def test_size(self):
        self.assertEqual(39, self.lexicon.size())


if __name__ == '__main__':
    unittest.main()
class TestDefaultInputTextPlugin(unittest.TestCase):
    """Verifies DefaultInputTextPlugin normalization and its rewrite.def error handling."""

    # raw input mixing case, full-width forms, combining marks and compatibility chars
    original_text = "ÂBΓД㈱ガウ゛⼼Ⅲ"
    # expected output after lower-casing and NFKC normalization
    normalized_text = "âbγд(株)ガヴ⼼ⅲ"

    def setUp(self):
        self.builder = UTF8InputTextBuilder(self.original_text, mock_grammar.mocked_grammar)

        self.plugin = DefaultInputTextPlugin()

        try:
            self.plugin.set_up()
        except IOError:
            self.fail('no file')

        self.test_resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            'resources')

    def test_before_rewrite(self):
        # without rewrite() the builder must expose the input unchanged
        self.assertEqual(self.original_text, self.builder.get_original_text())
        self.assertEqual(self.original_text, self.builder.get_text())
        text = self.builder.build()
        self.assertEqual(self.original_text, text.get_original_text())
        self.assertEqual(self.original_text, text.get_text())
        bytes_ = text.get_byte_text()
        self.assertEqual(30, len(bytes_))
        expected = b'\xc3\x82\xef\xbc\xa2\xce\x93\xd0\x94\xe3\x88\xb1\xef\xbd\xb6\xef\xbe\x9e\xe3\x82\xa6\xe3\x82\x9b\xe2\xbc\xbc\xe2\x85\xa2'
        self.assertEqual(expected, bytes_)
        # byte offset -> original character index mapping
        self.assertEqual(0, text.get_original_index(0))
        self.assertEqual(0, text.get_original_index(1))
        self.assertEqual(1, text.get_original_index(2))
        self.assertEqual(1, text.get_original_index(4))
        self.assertEqual(3, text.get_original_index(8))
        self.assertEqual(5, text.get_original_index(12))
        self.assertEqual(9, text.get_original_index(24))
        self.assertEqual(9, text.get_original_index(26))

    def test_after_rewrite(self):
        self.assertEqual(self.original_text, self.builder.get_original_text())
        self.assertEqual(self.original_text, self.builder.get_text())
        self.plugin.rewrite(self.builder)
        text = self.builder.build()
        self.assertEqual(self.original_text, text.get_original_text())
        self.assertEqual(self.normalized_text, text.get_text())
        bytes_ = text.get_byte_text()
        self.assertEqual(24, len(bytes_))
        expected = b'\xc3\xa2\x62\xce\xb3\xd0\xb4\x28\xe6\xa0\xaa\x29\xe3\x82\xac\xe3\x83\xb4\xe2\xbc\xbc\xe2\x85\xb2'
        self.assertEqual(expected, bytes_)
        # normalized byte offsets must map back to original character indexes
        self.assertEqual(0, text.get_original_index(0))
        self.assertEqual(0, text.get_original_index(1))
        self.assertEqual(1, text.get_original_index(2))
        self.assertEqual(2, text.get_original_index(3))
        self.assertEqual(4, text.get_original_index(7))
        self.assertEqual(5, text.get_original_index(8))
        self.assertEqual(5, text.get_original_index(11))
        self.assertEqual(7, text.get_original_index(15))
        self.assertEqual(7, text.get_original_index(17))

    # def test_setup_with_null(self):

    def test_invalid_format_ignorelist(self):
        # a multi-character token on a one-column line is rejected
        plugin = DefaultInputTextPlugin()
        with self.assertRaises(RuntimeError) as cm:
            plugin.read_rewrite_lists(os.path.join(self.test_resources_dir, 'rewrite_error_ignorelist.def'))
        self.assertEqual('12 is not character at line 1', cm.exception.args[0])

    def test_invalid_format_replacelist(self):
        # more than two columns is a format error
        plugin = DefaultInputTextPlugin()
        with self.assertRaises(RuntimeError) as cm:
            plugin.read_rewrite_lists(os.path.join(self.test_resources_dir, 'rewrite_error_replacelist.def'))
        self.assertEqual('invalid format at line 1', cm.exception.args[0])

    def test_duplicated_lines_replacelist(self):
        # the same replacement key may only be defined once
        plugin = DefaultInputTextPlugin()
        with self.assertRaises(RuntimeError) as cm:
            plugin.read_rewrite_lists(os.path.join(self.test_resources_dir, 'rewrite_error_dup.def'))
        self.assertEqual('12 is already defined at line 2', cm.exception.args[0])


if __name__ == '__main__':
    unittest.main()
-------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import unittest 17 | 18 | from sudachipy import dictionary 19 | 20 | 21 | class TestTokenizer(unittest.TestCase): 22 | 23 | def setUp(self): 24 | resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources') 25 | self.dict_ = dictionary.Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir) 26 | self.tokenizer_obj = self.dict_.create() 27 | 28 | def test_tokenize_small_katanana_only(self): 29 | ms = self.tokenizer_obj.tokenize('ァ') 30 | self.assertEqual(1, len(ms)) 31 | 32 | def test_part_of_speech(self): 33 | ms = self.tokenizer_obj.tokenize('京都') 34 | self.assertEqual(1, len(ms)) 35 | m = ms[0] 36 | pid = m.part_of_speech_id() 37 | self.assertTrue(self.dict_.grammar.get_part_of_speech_size() > pid) 38 | pos = m.part_of_speech() 39 | self.assertEqual(pos, self.dict_.grammar.get_part_of_speech_string(pid)) 40 | 41 | def test_get_word_id(self): 42 | ms = self.tokenizer_obj.tokenize('京都') 43 | self.assertEqual(1, len(ms)) 44 | self.assertEqual(['名詞', '固有名詞', '地名', '一般', '*', '*'], ms[0].part_of_speech()) 45 | 46 | wid = ms[0].word_id() 47 | ms = self.tokenizer_obj.tokenize('ぴらる') 48 | 
self.assertEqual(1, len(ms)) 49 | self.assertNotEqual(wid, ms[0].word_id()) 50 | self.assertEqual(['名詞', '普通名詞', '一般', '*', '*', '*'], ms[0].part_of_speech()) 51 | 52 | ms = self.tokenizer_obj.tokenize('京') 53 | self.assertEqual(1, len(ms)) 54 | 55 | def test_get_dictionary_id(self): 56 | ms = self.tokenizer_obj.tokenize('京都') 57 | self.assertEqual(1, ms.size()) 58 | self.assertEqual(0, ms[0].dictionary_id()) 59 | 60 | ms = self.tokenizer_obj.tokenize('ぴらる') 61 | self.assertEqual(1, ms.size()) 62 | self.assertEqual(1, ms[0].dictionary_id()) 63 | 64 | ms = self.tokenizer_obj.tokenize('京') 65 | self.assertEqual(1, ms.size()) 66 | self.assertTrue(ms[0].dictionary_id() < 0) 67 | 68 | def test_get_synonym_group_ids(self): 69 | ms = self.tokenizer_obj.tokenize('京都') 70 | self.assertEqual(1, ms.size()) 71 | self.assertEqual([1, 5], ms[0].synonym_group_ids()) 72 | 73 | ms = self.tokenizer_obj.tokenize('ぴらる') 74 | self.assertEqual(1, ms.size()) 75 | self.assertEqual([], ms[0].synonym_group_ids()) 76 | 77 | ms = self.tokenizer_obj.tokenize('東京府') 78 | self.assertEqual(1, ms.size()) 79 | self.assertEqual([1, 3], ms[0].synonym_group_ids()) 80 | 81 | def test_tokenize_kanji_alphabet_word(self): 82 | self.assertEqual(len(self.tokenizer_obj.tokenize('特a')), 1) 83 | self.assertEqual(len(self.tokenizer_obj.tokenize('ab')), 1) 84 | self.assertEqual(len(self.tokenizer_obj.tokenize('特ab')), 2) 85 | 86 | def test_tokenizer_with_dots(self): 87 | ms = self.tokenizer_obj.tokenize('京都…') 88 | self.assertEqual(4, ms.size()) 89 | self.assertEqual(ms[1].surface(), '…') 90 | self.assertEqual(ms[1].normalized_form(), '.') 91 | self.assertEqual(ms[2].surface(), '') 92 | self.assertEqual(ms[2].normalized_form(), '.') 93 | self.assertEqual(ms[3].surface(), '') 94 | self.assertEqual(ms[3].normalized_form(), '.') 95 | 96 | def test_tokenizer_morpheme_split(self): 97 | from sudachipy import tokenizer 98 | ms = self.tokenizer_obj.tokenize('東京都', tokenizer.Tokenizer.SplitMode.C) 99 | 
self.assertEqual(1, ms.size()) 100 | self.assertEqual(ms[0].surface(), '東京都') 101 | 102 | ms_a = ms[0].split(tokenizer.Tokenizer.SplitMode.A) 103 | self.assertEqual(2, ms_a.size()) 104 | self.assertEqual(ms_a[0].surface(), '東京') 105 | self.assertEqual(ms_a[1].surface(), '都') 106 | 107 | def test_tokenizer_morpheme_list_range(self): 108 | from sudachipy import tokenizer 109 | ms = self.tokenizer_obj.tokenize('東京都', tokenizer.Tokenizer.SplitMode.A) 110 | self.assertEqual(2, ms.size()) 111 | self.assertEqual(ms[0].surface(), '東京') 112 | self.assertEqual(ms[1].surface(), '都') 113 | 114 | self.assertEqual(ms[-1].surface(), ms[1].surface()) 115 | self.assertEqual(ms[-2].surface(), ms[0].surface()) 116 | with self.assertRaises(IndexError) as cm: 117 | ms[2] 118 | with self.assertRaises(IndexError) as cm: 119 | ms[-3] 120 | 121 | 122 | if __name__ == '__main__': 123 | unittest.main() 124 | -------------------------------------------------------------------------------- /sudachipy/resources/char.def: -------------------------------------------------------------------------------- 1 | # 2 | # Japanese charcter category map 3 | # 4 | # $Id: char.def 9 2012-12-12 04:13:15Z togiso $; 5 | # 6 | 7 | ################################################################################### 8 | # 9 | # CHARACTER CATEGORY DEFINITION 10 | # 11 | # CATEGORY_NAME INVOKE GROUP LENGTH 12 | # 13 | # - CATEGORY_NAME: Name of category. you have to define DEFAULT class. 14 | # - INVOKE: 1/0: always invoke unknown word processing, evan when the word can be found in the lexicon 15 | # - GROUP: 1/0: make a new word by grouping the same chracter category 16 | # - LENGTH: n: 1 to n length new words are added 17 | # 18 | DEFAULT 0 1 0 # DEFAULT is a mandatory category! 
19 | SPACE 0 1 0 20 | KANJI 0 0 2 21 | SYMBOL 1 1 0 22 | NUMERIC 1 1 0 23 | ALPHA 1 1 0 24 | HIRAGANA 0 1 2 25 | KATAKANA 1 1 2 26 | KANJINUMERIC 0 1 0 #change INVOKE 1->0 27 | GREEK 1 1 0 28 | CYRILLIC 1 1 0 29 | 30 | ################################################################################### 31 | # 32 | # CODE(UCS2) TO CATEGORY MAPPING 33 | # 34 | 35 | # SPACE 36 | 0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE 37 | 0x000D SPACE 38 | 0x0009 SPACE 39 | 0x000B SPACE 40 | 0x000A SPACE 41 | 42 | # ASCII 43 | 0x0021..0x002F SYMBOL #!"#$%&'()*+,-./ 44 | 0x0030..0x0039 NUMERIC #0-9 45 | 0x003A..0x0040 SYMBOL #:;<=>?@ 46 | 0x0041..0x005A ALPHA #A-Z 47 | 0x005B..0x0060 SYMBOL #[\]^_` 48 | 0x0061..0x007A ALPHA #a-z 49 | 0x007B..0x007E SYMBOL #{|}~ 50 | 51 | # Latin 52 | 0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿ 53 | 0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö 54 | 0x00D7 SYMBOL # Latin 1 #× 55 | 0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö 56 | 0x00F7 SYMBOL # Latin 1 #÷ 57 | 0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ 58 | 0x0100..0x017F ALPHA # Latin Extended A 59 | 0x0180..0x0236 ALPHA # Latin Extended B 60 | 0x1E00..0x1EF9 ALPHA # Latin Extended Additional 61 | 62 | # CYRILLIC 63 | 0x0400..0x04F9 CYRILLIC #Ѐ->ӹ 64 | 0x0500..0x050F CYRILLIC # Cyrillic supplementary 65 | 66 | # GREEK 67 | 0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ 68 | 69 | # HIRAGANA 70 | 0x3041..0x309F HIRAGANA 71 | 72 | # KATAKANA 73 | #0x30A1..0x30FF KATAKANA 74 | 0x30A1..0x30FA KATAKANA 75 | 0x30FC..0x30FF KATAKANA 76 | 0x31F0..0x31FF KATAKANA # Small KU .. 
0x2E80..0x2EF3 KANJI # CJK Radicals Supplement
0x2300..0x23FF SYMBOL # Miscellaneous Technical
14 | 15 | import mmap 16 | import os 17 | import shutil 18 | import tempfile 19 | import unittest 20 | 21 | from sudachipy.dictionarylib.grammar import Grammar 22 | 23 | 24 | class TestGrammar(unittest.TestCase): 25 | 26 | alloc_size = 4096 27 | 28 | def setUp(self): 29 | storage = bytearray() 30 | self.build_partofspeech(storage) 31 | self.build_connect_table(storage) 32 | self.test_dir = tempfile.mkdtemp() 33 | f = os.path.join(self.test_dir, 'test_file.txt') 34 | with open(f, 'wb') as wf: 35 | wf.write(bytes(storage)) 36 | self.mmap = None 37 | with open(f, 'rb') as rf: 38 | self.mmap = mmap.mmap(rf.fileno(), 0, access=mmap.ACCESS_COPY) 39 | self.storage_size = self.mmap.size() 40 | offset = 0 41 | self.grammar = Grammar(self.mmap, offset) 42 | 43 | def tearDown(self): 44 | shutil.rmtree(self.test_dir) 45 | 46 | def test_storage_size(self): 47 | self.assertEqual(self.storage_size, self.grammar.storage_size) 48 | 49 | def test_get_partofspeech_string(self): 50 | self.assertEqual(6, len(self.grammar.get_part_of_speech_string(0))) 51 | self.assertEqual("BOS/EOS", self.grammar.get_part_of_speech_string(0)[0]) 52 | self.assertEqual("*", self.grammar.get_part_of_speech_string(0)[5]) 53 | 54 | self.assertEqual("一般", self.grammar.get_part_of_speech_string(1)[1]) 55 | self.assertEqual("*", self.grammar.get_part_of_speech_string(1)[5]) 56 | 57 | self.assertEqual("五段-サ行", self.grammar.get_part_of_speech_string(2)[4]) 58 | self.assertEqual("終止形-一般", self.grammar.get_part_of_speech_string(2)[5]) 59 | 60 | def test_get_connect_cost(self): 61 | self.assertEqual(0, self.grammar.get_connect_cost(0, 0)) 62 | self.assertEqual(-100, self.grammar.get_connect_cost(2, 1)) 63 | self.assertEqual(200, self.grammar.get_connect_cost(1, 2)) 64 | 65 | def test_set_connect_cost(self): 66 | self.grammar.set_connect_cost(0, 0, 300) 67 | self.assertEqual(300, self.grammar.get_connect_cost(0, 0)) 68 | 69 | def test_get_bos_parameters(self): 70 | self.assertEqual(0, 
    @staticmethod
    def build_partofspeech(storage):
        """Append a serialized 3-entry part-of-speech table to *storage*.

        Wire format (as read by Grammar): a little-endian int16 entry count,
        then for each entry six strings, each encoded as a 1-byte character
        count followed by the characters in UTF-16-LE.
        """
        storage.extend((3).to_bytes(2, byteorder='little', signed=True))  # number of part of speech

        # Entry 0: "BOS/EOS,*,*,*,*,*" pre-encoded (7 chars, then five 1-char "*" fields).
        storage.extend(b'\x07B\x00O\x00S\x00/\x00E\x00O\x00S\x00\x01*\x00\x01*\x00\x01*\x00\x01*\x00\x01*\x00')

        # Entry 1: "名刺,一般,*,*,*,*" (each field is length byte + UTF-16-LE text).
        storage.extend(b'\x02')
        storage.extend('名刺'.encode('utf-16-le'))
        storage.extend(b'\x02')
        storage.extend('一般'.encode('utf-16-le'))
        storage.extend(b'\x01*\x00\x01*\x00\x01*\x00\x01*\x00')

        # Entry 2: "動詞,一般,*,*,五段-サ行,終止形-一般".
        storage.extend(b'\x02')
        storage.extend('動詞'.encode('utf-16-le'))
        storage.extend(b'\x02')
        storage.extend('一般'.encode('utf-16-le'))
        storage.extend(b'\x01*\x00\x01*\x00\x05')
        storage.extend('五段-サ行'.encode('utf-16-le'))
        storage.extend(b'\x06')
        storage.extend('終止形-一般'.encode('utf-16-le'))
byteorder='little', signed=True)) # number of rightId 116 | storage.extend((-100).to_bytes(2, byteorder='little', signed=True)) # number of frightId 117 | 118 | storage.extend((-3000).to_bytes(2, byteorder='little', signed=True)) # number of rightId 119 | storage.extend((200).to_bytes(2, byteorder='little', signed=True)) # number of rightId 120 | storage.extend((2000).to_bytes(2, byteorder='little', signed=True)) # number of rightId 121 | 122 | 123 | if __name__ == '__main__': 124 | unittest.main() 125 | -------------------------------------------------------------------------------- /tests/test_switchdictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import json 16 | import os 17 | import shutil 18 | import tempfile 19 | import time 20 | from logging import getLogger 21 | from unittest import TestCase 22 | 23 | from sudachipy.dictionary import Dictionary 24 | from sudachipy.dictionarylib import SYSTEM_DICT_VERSION_2 25 | from sudachipy.dictionarylib.dictionarybuilder import DictionaryBuilder 26 | from sudachipy.dictionarylib.dictionaryheader import DictionaryHeader 27 | 28 | 29 | class TestSwitchDictionary(TestCase): 30 | 31 | def setUp(self): 32 | self.logger = getLogger() 33 | self.logger.disabled = True 34 | 35 | self.temp_dir = tempfile.mkdtemp() 36 | self.resource_dir = os.path.join(self.temp_dir, 'resources') 37 | os.makedirs(self.resource_dir) 38 | 39 | test_resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources') 40 | self.char_def_path = os.path.join(self.resource_dir, 'char.def') 41 | shutil.copy(os.path.join(test_resource_dir, 'char.def'), self.char_def_path) 42 | 43 | self.sudachi_json_path = os.path.join(self.resource_dir, 'sudachi.json') 44 | shutil.copy(os.path.join(test_resource_dir, 'sudachi.json'), self.sudachi_json_path) 45 | self._rewrite_json(self.sudachi_json_path, 'userDict', []) 46 | 47 | self.matrix_path = os.path.join(self.resource_dir, 'matrix.txt') 48 | with open(self.matrix_path, 'w', encoding='utf-8') as wf: 49 | wf.write('1 1\n0 0 200\n') 50 | 51 | small_lexs = ["島,0,0,0,島,名詞,普通名詞,一般,*,*,*,シマ,島,*,A,*,*,*"] 52 | core_lexs = ["徳島本町,0,0,0,徳島本町,名詞,固有名詞,地名,一般,*,*,トクシマホンチョウ,徳島本町,*,A,*,*,*,*"] 53 | notcore_lexs = ["徳島堰,0,0,0,徳島堰,名詞,固有名詞,一般,*,*,*,トクシマセギ,徳島堰,*,A,*,*,*"] 54 | 55 | small_lines = small_lexs 56 | core_lines = small_lexs + core_lexs 57 | full_lines = small_lexs + core_lexs + notcore_lexs 58 | 59 | self.small_txt_path = os.path.join(self.resource_dir, 'small.csv') 60 | self.core_txt_path = os.path.join(self.resource_dir, 'core.csv') 61 | self.full_txt_path = os.path.join(self.resource_dir, 'full.csv') 62 | 63 | self.small_dic_path = 
self._build_dictionary(self.small_txt_path, small_lines, 'small.dic') 64 | self.core_dic_path = self._build_dictionary(self.core_txt_path, core_lines, 'core.dic') 65 | self.full_dic_path = self._build_dictionary(self.full_txt_path, full_lines, 'full.dic') 66 | 67 | def tearDown(self): 68 | shutil.rmtree(self.temp_dir) 69 | 70 | @staticmethod 71 | def _rewrite_json(json_file_path, k, v): 72 | with open(json_file_path, 'r') as f: 73 | obj = json.load(f) 74 | obj[k] = v 75 | with open(json_file_path, 'w') as f: 76 | json.dump(obj, f, ensure_ascii=False, indent=4) 77 | 78 | def _build_dictionary(self, input_txt_path, lex_lines, dictionary_name): 79 | with open(input_txt_path, 'w', encoding='utf-8') as wf: 80 | wf.write("\n".join(lex_lines)) 81 | 82 | out_path = os.path.join(self.resource_dir, dictionary_name) 83 | out_stream = open(out_path, 'wb') 84 | lexicon_paths = [input_txt_path] 85 | matrix_input_stream = open(self.matrix_path, 'r', encoding='utf-8') 86 | 87 | header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()), 'test') 88 | out_stream.write(header.to_bytes()) 89 | builder = DictionaryBuilder(logger=self.logger) 90 | builder.build(lexicon_paths, matrix_input_stream, out_stream) 91 | out_stream.close() 92 | matrix_input_stream.close() 93 | 94 | return out_path 95 | 96 | def test_switch_dictionary(self): 97 | self._rewrite_json(self.sudachi_json_path, 'systemDict', 'small.dic') # relative path 98 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 99 | self.assertEqual(1, self.dict.lexicon.size()) 100 | self._rewrite_json(self.sudachi_json_path, 'systemDict', self.small_dic_path) # abstract path 101 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 102 | self.assertEqual(1, self.dict.lexicon.size()) 103 | 104 | self._rewrite_json(self.sudachi_json_path, 'systemDict', 'core.dic') 105 | self.dict = Dictionary(config_path=self.sudachi_json_path, 
resource_dir=self.resource_dir) 106 | self.assertEqual(2, self.dict.lexicon.size()) 107 | self._rewrite_json(self.sudachi_json_path, 'systemDict', self.core_dic_path) 108 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 109 | self.assertEqual(2, self.dict.lexicon.size()) 110 | 111 | self._rewrite_json(self.sudachi_json_path, 'systemDict', 'full.dic') 112 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 113 | self.assertEqual(3, self.dict.lexicon.size()) 114 | self._rewrite_json(self.sudachi_json_path, 'systemDict', self.full_dic_path) 115 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 116 | self.assertEqual(3, self.dict.lexicon.size()) 117 | -------------------------------------------------------------------------------- /sudachipy/utf8inputtextbuilder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . 
    def replace(self, begin, end, str_):
        """Replace modified_text[begin:end] with str_ and update the index map.

        modified_to_original maps every position of the modified text (plus a
        final sentinel entry) back to an index in the original text, so it
        always holds len(modified_text) + 1 entries.

        Raises:
            IndexError: if begin is negative, past the end of the text,
                or greater than end.
            AttributeError: if the replaced range is empty (begin == end).
        """
        if begin < 0:
            raise IndexError(begin)
        if begin > len(self.modified_text):
            raise IndexError("begin > length")
        if begin > end:
            raise IndexError("begin > end")
        if begin == end:
            raise AttributeError("begin == end")

        # Clamp end to the current text length; the range may overshoot.
        if end > len(self.modified_text):
            end = len(self.modified_text)

        # Splice the replacement between the untouched prefix and suffix.
        self.modified_text = str_.join([self.modified_text[:begin], self.modified_text[end:]])

        # Original-text indices of the replaced range's endpoints, captured
        # before the map is resized below.
        modified_begin = self.modified_to_original[begin]
        modified_end = self.modified_to_original[end]
        length = len(str_)
        # If the replacement is shorter than the replaced range, drop the
        # surplus map entries.
        if end - begin > length:
            del self.modified_to_original[begin + length:end]
        self.modified_to_original[begin] = modified_begin
        # Every subsequent replacement character maps to the original end
        # index: overwrite surviving slots, insert extras when the
        # replacement is longer than the replaced range.
        for i in range(1, length):
            if begin + i < end:
                self.modified_to_original[begin + i] = modified_end
            else:
                self.modified_to_original.insert(begin + i, modified_end)
= self.modified_to_original[-1] 77 | 78 | char_categories = self.get_char_category_types(modified_string_text) 79 | char_category_continuities = self.get_char_category_continuities(modified_string_text, length, char_categories) 80 | can_bow_list = self._build_can_bow_list(modified_string_text, char_categories) 81 | return utf8inputtext.UTF8InputText( 82 | self.grammar, self.original_text, modified_string_text, byte_text, 83 | offsets, byte_indexes, char_categories, char_category_continuities, can_bow_list) 84 | 85 | def get_char_category_types(self, text): 86 | return [self.grammar.get_character_category().get_category_types(ord(c)) for c in text] 87 | 88 | def get_char_category_continuities(self, text, byte_length, char_categories): 89 | if len(text) == 0: 90 | return [] 91 | char_category_continuities = [] 92 | i = 0 93 | while i < len(char_categories): 94 | next_ = i + self.get_char_category_continuous_length(char_categories, i) 95 | length = 0 96 | for j in range(i, next_): 97 | length += self.utf8_byte_length(ord(text[j])) 98 | for k in range(length, 0, -1): 99 | char_category_continuities.append(k) 100 | i = next_ 101 | return char_category_continuities 102 | 103 | def get_char_category_continuous_length(self, char_categories, offset): 104 | continuous_category = set(char_categories[offset]) 105 | for length in range(1, len(char_categories) - offset): 106 | continuous_category = continuous_category & char_categories[offset + length] 107 | if len(continuous_category) == 0: 108 | return length 109 | return len(char_categories) - offset 110 | 111 | def utf8_byte_length(self, cp): 112 | if cp < 0: 113 | return 0 114 | elif cp <= 0x7F: 115 | return 1 116 | elif cp <= 0x7FF: 117 | return 2 118 | elif cp <= 0xFFFF: 119 | return 3 120 | elif cp <= 0x10FFFF: 121 | return 4 122 | else: 123 | return 0 124 | 125 | def _build_can_bow_list(self, text, char_categories): 126 | if not text: 127 | return [] 128 | can_bow_list = [] 129 | for i, cat in 
enumerate(char_categories): 130 | if i == 0: 131 | can_bow_list.append(True) 132 | continue 133 | 134 | if CategoryType.ALPHA in cat or CategoryType.GREEK in cat or CategoryType.CYRILLIC in cat: 135 | types = cat & char_categories[i - 1] 136 | can_bow_list.append(not bool(types)) 137 | continue 138 | 139 | can_bow_list.append(True) 140 | 141 | return can_bow_list 142 | -------------------------------------------------------------------------------- /tests/plugin/test_prolongedsoundmarkinput.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class TestProlongedSoundMarkInputTextPlugin(TestCase):
    """Tests for ProlongedSoundMarkInputTextPlugin.

    The plugin collapses runs of prolonged sound marks ('ー', '〜', '〰')
    into a single mark; each test checks the normalized text, its UTF-8
    byte form, and the byte-offset -> original-character-index mapping.
    """

    def setUp(self) -> None:
        self.plugin = ProlongedSoundMarkInputTextPlugin(None)
        for mark in ('ー', '〜', '〰'):
            self.plugin._psm_set.add(ord(mark))

    def _rewrite(self, original):
        # Build an input text, apply the plugin, and return the built result.
        builder = UTF8InputTextBuilder(original, mocked_grammar)
        self.plugin.rewrite(builder)
        return builder.build()

    def _assert_rewritten(self, original, normalized, expected_bytes, index_map):
        # Shared assertion helper used by every test below.
        text = self._rewrite(original)
        self.assertEqual(original, text.original_text)
        self.assertEqual(normalized, text.get_text())
        byte_text = text.get_byte_text()
        self.assertEqual(len(expected_bytes), len(byte_text))
        self.assertEqual(expected_bytes, byte_text)
        for byte_offset, original_index in index_map:
            self.assertEqual(original_index, text.get_original_index(byte_offset))

    def test_combine_continuous_prolonged_sound_mark(self):
        self._assert_rewritten(
            'ゴーール', 'ゴール',
            b'\xe3\x82\xb4\xe3\x83\xbc\xe3\x83\xab',
            [(0, 0), (3, 1), (6, 3), (9, 4)])

    def test_combined_continuous_prolonged_sound_marks_at_end(self):
        self._assert_rewritten(
            'スーパーー', 'スーパー',
            b'\xe3\x82\xb9\xe3\x83\xbc\xe3\x83\x91\xe3\x83\xbc',
            [(0, 0), (3, 1), (6, 2), (9, 3), (12, 5)])

    def test_combine_continuous_prolonged_sound_marks_multi_times(self):
        self._assert_rewritten(
            'エーービーーーシーーーー', 'エービーシー',
            b'\xe3\x82\xa8\xe3\x83\xbc\xe3\x83\x93\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xbc',
            [(0, 0), (3, 1), (6, 3), (9, 4), (12, 7), (15, 8), (18, 12)])

    def test_combine_continuous_prolonged_sound_marks_multi_symbol_types(self):
        self._assert_rewritten(
            'エーービ〜〜〜シ〰〰〰〰', 'エービーシー',
            b'\xe3\x82\xa8\xe3\x83\xbc\xe3\x83\x93\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xbc',
            [(0, 0), (3, 1), (6, 3), (9, 4), (12, 7), (15, 8), (18, 12)])

    def test_combine_continuous_prolonged_sound_marks_multi_mixed_symbol_types(self):
        self._assert_rewritten(
            'エー〜ビ〜〰ーシ〰ー〰〜', 'エービーシー',
            b'\xe3\x82\xa8\xe3\x83\xbc\xe3\x83\x93\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xbc',
            [(0, 0), (3, 1), (6, 3), (9, 4), (12, 7), (15, 8), (18, 12)])
cdef class Lattice:
    """Viterbi lattice over a UTF-8 input text.

    ``end_lists[i]`` holds every node whose byte range ends at position ``i``;
    index 0 holds the single BOS node.  ``connect_node`` relaxes path costs
    left-to-right using the grammar's connection-cost matrix view.
    """

    def __init__(self, grammar: Grammar):
        # size: current input length in bytes; capacity: allocated end_lists length.
        self.size = 0
        self.capacity = 0


        self.end_lists = []
        self.grammar = grammar
        self.eos_params = grammar.get_eos_parameter()
        cdef LatticeNode bos_node = LatticeNode()
        bos_params = grammar.get_bos_parameter()
        # (left_id, right_id, cost) for the BOS node.
        bos_node.set_parameter(bos_params[0], bos_params[1], bos_params[2])
        bos_node.is_connected_to_bos = True
        self.end_lists.append([bos_node])
        # Matrix view over connection costs, indexed [left_id, right_id] below.
        self.connect_costs = self.grammar._matrix_view

    cpdef void resize(self, int size):
        # Grow the lattice if needed and place a fresh EOS node at `size`.
        if size > self.capacity:
            self.expand(size)
        self.size = size
        self.eos_node = LatticeNode()
        self.eos_node.set_parameter(self.eos_params[0], self.eos_params[1], self.eos_params[2])
        self.eos_node.begin = self.eos_node.end = size

    def clear(self) -> None:
        # Empty per-position node lists but keep capacity (and BOS at index 0).
        for i in range(1, self.size + 1):
            self.end_lists[i].clear()
        self.size = 0
        self.eos_node = None

    def expand(self, new_size: int) -> None:
        # Append empty lists so end_lists can be indexed up to new_size.
        expand_list = [[] for _ in range(self.size, new_size)]
        self.end_lists.extend(expand_list)
        self.capacity = new_size

    def get_nodes_with_end(self, end: int) -> List[LatticeNode]:
        """Return all nodes whose range ends at `end`."""
        return self.end_lists[end]

    def get_nodes(self, begin: int, end: int) -> List[LatticeNode]:
        """Return all nodes spanning exactly [begin, end)."""
        return [node for node in self.end_lists[end] if node.get_begin() == begin]

    def get_minimum_node(self, begin: int, end: int) -> Optional[LatticeNode]:
        """Return the node spanning [begin, end) with the lowest path cost,
        or None when no such node exists."""
        nodes = self.get_nodes(begin, end)
        if not nodes:
            return None
        min_arg = nodes[0]
        for node in nodes[1:]:
            if node.get_path_cost() < min_arg.get_path_cost():
                min_arg = node
        return min_arg

    cpdef void insert(self, int begin, int end, LatticeNode node):
        # Register the node and immediately relax its best path cost.
        self.end_lists[end].append(node)
        node.begin = begin
        node.end = end
        self.connect_node(node)

    def remove(self, begin: int, end: int, node: LatticeNode) -> None:
        # `begin` is unused here; kept for interface symmetry with insert().
        self.end_lists[end].remove(node)

    @staticmethod
    def create_node() -> LatticeNode:
        return LatticeNode()

    def has_previous_node(self, index: int) -> bool:
        """True when at least one node ends at `index`."""
        return bool(self.end_lists[index])

    cdef void connect_node(self, LatticeNode r_node):
        # Viterbi relaxation: choose the cheapest BOS-connected left
        # neighbour for r_node, or leave it disconnected if none exists.
        begin = r_node.begin
        r_node.total_cost = INT_MAX

        cdef LatticeNode l_node
        cdef int connect_cost
        for l_node in self.end_lists[begin]:
            if not l_node.is_connected_to_bos:
                continue
            connect_cost = self.connect_costs[r_node.left_id, l_node.right_id]

            # 0x7fff == Grammar.INHIBITED_CONNECTION:
            if connect_cost == 0x7fff:
                continue
            cost = l_node.total_cost + connect_cost
            if cost < r_node.total_cost:
                r_node.total_cost = cost
                r_node.best_previous_node = l_node

        r_node.is_connected_to_bos = r_node.best_previous_node is not None
        # Add the node's own word cost only after the best predecessor is fixed.
        r_node.total_cost += r_node.cost

    cdef void connect_eos_node(self):
        self.connect_node(self.eos_node)

    def get_best_path(self) -> List[LatticeNode]:
        """Return the minimum-cost path (excluding BOS/EOS), BOS-to-EOS order.

        Raises AttributeError when no path connects EOS back to BOS.
        """
        # self.connect_node(self.eos_node)
        if not self.eos_node.is_connected_to_bos:  # EOS node
            raise AttributeError("EOS is not connected to BOS")
        result = []
        # Walk best_previous_node links back to the BOS node, then reverse.
        node = self.eos_node.best_previous_node
        while node is not self.end_lists[0][0]:
            result.append(node)
            node = node.best_previous_node
        return list(reversed(result))

    def dump(self, logger):
        """Log every node (EOS first, then right-to-left) for debugging."""
        if logger.disabled:
            return
        index = 0
        for i in range(self.size + 1, -1, -1):
            # i > size selects the EOS node; otherwise nodes ending at i.
            r_nodes = self.end_lists[i] if i <= self.size else [self.eos_node]
            for r_node in r_nodes:
                surface = '(null)'
                pos = 'BOS/EOS'

                if r_node.is_defined():
                    wi = r_node.get_word_info()
                    surface = wi.surface
                    pos_id = wi.pos_id
                    pos = '(null)'
                    if pos_id >= 0:
                        pos = ','.join(self.grammar.get_part_of_speech_string(pos_id))

                # Connection costs from every possible left neighbour.
                costs = []
                for l_node in self.end_lists[r_node.get_begin()]:
                    cost = self.grammar.get_connect_cost(l_node.get_right_id(), r_node.get_left_id())
                    costs.append(str(cost))
                index += 1

                logger.info('%d: %d %d %s(%d) %s %d %d %d: %s' %
                            (index, r_node.get_begin(), r_node.get_end(),
                             surface, r_node.get_word_id(), pos, r_node.get_left_id(),
                             r_node.get_right_id(), r_node.get_path_cost(), ' '.join(costs)))
class CharacterCategory(object):
    """Maps Unicode code points to sets of category types.

    Ranges are read from a char.def file and compiled into a sorted,
    non-overlapping list so that lookups in get_category_types can use
    binary search.
    """

    class Range(object):
        """A half-open code-point interval [low, high) with its categories."""

        def __lt__(self, other):
            # Ordering by `high`; this is what the PriorityQueue in
            # _compile relies on.
            return self.high < other.high

        def __init__(self, low=0, high=0, categories=None):
            self.low = low
            self.high = high
            self.categories = categories or []

        def contains(self, cp):
            return self.low <= cp < self.high

        def containing_length(self, text):
            # Length of the prefix of `text` whose code points fall in this
            # range.  NOTE(review): the test is `c > self.high`, i.e. `high`
            # is treated as inclusive here, unlike contains() — confirm
            # whether that asymmetry is intentional before changing it.
            for i in range(len(text)):
                c = ord(text[i])
                if c < self.low or c > self.high:
                    return i
            return len(text)

        def lower(self, cp):
            # True when this whole range lies below cp.
            return self.high <= cp

        def higher(self, cp):
            # True when this whole range lies above cp.
            return self.low > cp

        def match(self, other):
            return self.low == other.low and self.high == other.high

    def __init__(self):
        self.range_list = []

    def _compile(self) -> None:
        """
        _compile transforms self.range_list to non overlapped range list
        to apply binary search in get_category_types
        :return:
        """
        # Sweep-line over range boundaries: `right_chain` holds ranges not
        # yet opened (sorted by low), `left_chain` orders the open ranges by
        # upcoming right boundary, `states` accumulates the categories of
        # all currently open ranges.
        self.range_list.sort(key=lambda x: x.high)
        self.range_list.sort(key=lambda x: x.low)
        new_range_list = []
        left_chain = PriorityQueue()
        right_chain = self.range_list
        states = []
        pivot = 0
        while True:
            if left_chain.empty():
                if not right_chain:
                    break
                right = right_chain.pop(0)
                left_chain.put(right)
                pivot = right.low
                states.extend(right.categories)
                continue
            left = left_chain.get()
            right = right_chain[0] if right_chain else None
            left_end = left.high
            right_begin = right.low if right else math.inf
            if left_end <= right_begin:
                # Next boundary is a closing edge: emit [pivot, left_end).
                new_range_list.append(self.Range(pivot, left_end, set(states)))
                pivot = left_end
                for cat in left.categories:
                    states.remove(cat)
                continue
            else:
                # Next boundary is an opening edge: emit [pivot, right_begin).
                new_range_list.append(self.Range(pivot, right_begin, set(states)))
                pivot = right_begin
                states.extend(right.categories)
                left_chain.put(right)
                left_chain.put(left)
                right_chain.pop(0)
        # Guard against an empty definition: the merge loop below indexes
        # new_range_list[0] and would raise IndexError otherwise.
        if not new_range_list:
            self.range_list = []
            return
        # Merge adjacent ranges that carry identical category sets.
        self.range_list = []
        _range = new_range_list[0]
        for irange in new_range_list[1:]:
            if irange.low == _range.high and irange.categories == _range.categories:
                _range = self.Range(_range.low, irange.high, _range.categories)
            else:
                self.range_list.append(_range)
                _range = irange
        self.range_list.append(_range)

    def get_category_types(self, code_point):
        """Return the category set for `code_point` via binary search over
        the compiled range list, falling back to {DEFAULT} when no range
        contains it."""
        begin = 0
        n = len(self.range_list)
        end = n
        pivot = (begin + end) // 2
        while 0 <= pivot < n:
            range_ = self.range_list[pivot]
            if range_.contains(code_point):
                return range_.categories
            if range_.lower(code_point):
                begin = pivot
            else:  # range_.higher(code_point)
                end = pivot
            new_pivot = (begin + end) // 2
            if new_pivot == pivot:
                break
            pivot = new_pivot
        return {categorytype.CategoryType.DEFAULT}

    def read_character_definition(self, char_def=None):
        """Populate range_list from a char.def file and compile it.

        :param char_def: path to the definition file; defaults to
            "char.def" in the current directory.
        :raises AttributeError: on malformed lines, invalid ranges, or
            unknown category names.
        """
        # `with` guarantees the file is closed on every exit path; the
        # previous version opened the file manually and had to call
        # f.close() by hand before each raise.
        path = char_def if char_def is not None else "char.def"
        with open(path, 'r', encoding="utf-8") as f:
            for i, line in enumerate(f.readlines()):
                line = line.rstrip()
                if re.fullmatch(r"\s*", line) or re.match("#", line):
                    continue
                cols = re.split(r"\s+", line)
                if len(cols) < 2:
                    raise AttributeError("invalid format at line {}".format(i))
                if not re.match("0x", cols[0]):
                    continue
                range_ = self.Range()
                # "0xLOW..0xHIGH" (inclusive) or a single "0xLOW".
                r = re.split("\\.\\.", cols[0])
                range_.low = int(r[0], 16)
                range_.high = range_.low + 1
                if len(r) > 1:
                    range_.high = int(r[1], 16) + 1
                if range_.low >= range_.high:
                    raise AttributeError("invalid range at line {}".format(i))
                for j in range(1, len(cols)):
                    if re.match("#", cols[j]) or cols[j] == '':
                        break
                    type_ = categorytype.CategoryType.get(cols[j])
                    if type_ is None:
                        raise AttributeError("{} is invalid type at line {}".format(cols[j], i))
                    range_.categories.append(type_)
                self.range_list.append(range_)
        self._compile()
class TestNumericParser(TestCase):
    """Unit tests for NumericParser normalization of Arabic/kanji numerals.

    Each test streams characters through the parser via parse() and checks
    the normalized result or, for rejected inputs, the recorded error state.
    """

    def setUp(self) -> None:
        self.parser = NumericParser()

    def test_digits(self):
        self.assertTrue(self.parse('1000'))
        self.assertEqual('1000', self.parser.get_normalized())
        self.parser.clear()

    def test_starts_with_zero(self):
        self.assertTrue(self.parse('001000'))
        self.assertEqual('001000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('〇一〇〇〇'))
        self.assertEqual('01000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('00.1000'))
        self.assertEqual('00.1', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('000'))
        self.assertEqual('000', self.parser.get_normalized())
        self.parser.clear()

    def test_use_small_unit(self):
        self.assertTrue(self.parse('二十七'))
        self.assertEqual('27', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七'))
        self.assertEqual('1327', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千十七'))
        self.assertEqual('1017', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七.〇五'))
        self.assertEqual('1327.05', self.parser.get_normalized())
        self.parser.clear()

        self.assertFalse(self.parse('三百二十百'))
        self.parser.clear()

    def test_use_large_unit(self):
        self.assertTrue(self.parse('1万'))
        self.assertEqual('10000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七万'))
        self.assertEqual('13270000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七万一四'))
        self.assertEqual('13270014', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七万一四.〇五'))
        self.assertEqual('13270014.05', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('三兆2千億千三百二十七万一四.〇五'))
        self.assertEqual('3200013270014.05', self.parser.get_normalized())
        self.parser.clear()

        self.assertFalse(self.parse('億万'))
        self.parser.clear()

    def test_float_with_unit(self):
        self.assertTrue(self.parse('1.5千'))
        self.assertEqual('1500', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('1.5百万'))
        self.assertEqual('1500000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('1.5百万1.5千20'))
        self.assertEqual('1501520', self.parser.get_normalized())
        self.parser.clear()

        self.assertFalse(self.parse('1.5千5百'))
        self.parser.clear()

        self.assertFalse(self.parse('1.5千500'))
        self.parser.clear()

    def test_log_numeric(self):
        self.assertTrue(self.parse('200000000000000000000万'))
        self.assertEqual('2000000000000000000000000', self.parser.get_normalized())
        self.parser.clear()

    def test_with_comma(self):
        self.assertTrue(self.parse('2,000,000'))
        self.assertEqual('2000000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('259万2,300'))
        self.assertEqual('2592300', self.parser.get_normalized())
        self.parser.clear()

        self.assertFalse(self.parse('200,00,000'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse('2,4'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse('000,000'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse(',000'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse('256,55.1'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

    def test_not_digit(self):
        self.assertFalse(self.parse('@@@'))
        self.parser.clear()

    def test_float_point(self):
        self.assertTrue(self.parse('6.0'))
        self.assertEqual('6', self.parser.get_normalized())
        self.parser.clear()

        # BUG FIX: these two checks read `self.parser.error_state`, which is
        # inconsistent with every other assertion in this class (they all
        # read the `_error_state` attribute, e.g. in test_with_comma).
        self.assertFalse(self.parse('6.'))
        self.assertEqual(NumericParser.Error.POINT, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse('1.2.3'))
        self.assertEqual(NumericParser.Error.POINT, self.parser._error_state)
        self.parser.clear()

    def parse(self, text: str) -> bool:
        # Feed `text` one character at a time, mirroring the parser's
        # streaming append/done API.
        for c in text:
            if not self.parser.append(c):
                return False
        return self.parser.done()
class MeCabOovPlugin(OovProviderPlugin):
    """OOV provider configured MeCab-style from char.def and unk.def files.

    For each character category at the lookup position it emits OOV nodes
    built from the unk.def entries registered for that category.
    """

    class CategoryInfo:
        # Per-category behaviour from char.def columns: invoke even when
        # dictionary words exist (is_invoke), group the whole continuous
        # span into one word (is_group), and how many code-point prefixes
        # to emit (length).
        def __init__(self):
            self.type_ = None
            self.is_invoke = None
            self.is_group = None
            self.length = None

    class OOV:
        # Connection parameters and POS id for one unk.def entry.
        def __init__(self):
            self.left_id = -1
            self.right_id = -1
            self.cost = -1
            self.pos_id = None

    def __init__(self, json_obj=None):
        if json_obj:
            self.__chardef_filename = json_obj['charDef']
            self.__unkdef_filename = json_obj['unkDef']
        else:
            # Filenames may legitimately be missing; set_up reports this.
            self.__chardef_filename = None
            self.__unkdef_filename = None
        self.categories = {}
        self.oov_list = defaultdict(list)

    def set_up(self, grammar):
        """Load char.def and unk.def from the configured resource directory.

        :raises AttributeError: when either filename was not configured.
        """
        # BUG FIX: the filenames must be validated *before* os.path.join.
        # The previous `if not char_def` check ran on the joined path, which
        # is always a non-empty string -- and os.path.join raises TypeError
        # when the filename is None -- so the intended error never fired.
        if not self.__chardef_filename:
            raise AttributeError("charDef is not defined")
        char_def = os.path.join(config.settings.resource_dir, self.__chardef_filename)
        self.read_character_property(char_def)

        if not self.__unkdef_filename:
            raise AttributeError("unkDef is not defined")
        unk_def = os.path.join(config.settings.resource_dir, self.__unkdef_filename)
        self.read_oov(unk_def, grammar)

    def provide_oov(self, input_text, offset, has_other_words):
        """Return OOV lattice nodes for the categories found at `offset`."""
        length = input_text.get_char_category_continuous_length(offset)
        if length < 1:
            return []
        nodes = []
        for type_ in input_text.get_char_category_types(offset):
            if type_ not in self.categories:
                continue
            cinfo = self.categories[type_]
            llength = length
            if cinfo.type_ not in self.oov_list:
                continue
            oovs = self.oov_list[cinfo.type_]
            # Categories marked not-invoke only fire when nothing else matched.
            if not cinfo.is_invoke and has_other_words:
                continue
            if cinfo.is_group:
                # One node covering the whole continuous category span.
                s = input_text.get_substring(offset, offset + length)
                for oov in oovs:
                    nodes.append(self.get_oov_node(s, oov, length))
                llength -= 1
            # Additional nodes for prefixes of 1..length code points.
            for i in range(1, cinfo.length + 1):
                sublength = input_text.get_code_points_offset_length(offset, i)
                if sublength > llength:
                    break
                s = input_text.get_substring(offset, offset + sublength)
                for oov in oovs:
                    nodes.append(self.get_oov_node(s, oov, sublength))
        return nodes

    def get_oov_node(self, text, oov, length):
        """Build a lattice node for `text` using the entry's parameters."""
        node = self.create_node()
        node.set_parameter(oov.left_id, oov.right_id, oov.cost)
        info = wordinfo.WordInfo(surface=text, head_word_length=length, pos_id=oov.pos_id, normalized_form=text,
                                 dictionary_form_word_id=-1, dictionary_form=text, reading_form="",
                                 a_unit_split=[], b_unit_split=[], word_structure=[], synonym_group_ids=[])
        node.set_word_info(info)
        return node

    def read_character_property(self, char_def):
        """Parse char.def category lines into self.categories.

        :raises ValueError: on malformed, unknown, or duplicate categories.
        """
        with open(char_def, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, start=1):
                line = line.strip()
                # Skip blanks, comments, and code-point range lines ("0x...").
                if not line or line.startswith("#") or line.startswith("0x"):
                    continue
                cols = line.split()
                if len(cols) < 4:
                    raise ValueError("invalid format at line {}".format(i))
                try:
                    type_ = getattr(categorytype.CategoryType, cols[0])
                except AttributeError:
                    raise ValueError("`{}` is invalid type at line {}".format(cols[0], i))
                if type_ in self.categories:
                    raise ValueError("`{}` is already defined at line {}".format(cols[0], i))

                info = self.CategoryInfo()
                info.type_ = type_
                info.is_invoke = (cols[1] != "0")
                info.is_group = (cols[2] != "0")
                info.length = int(cols[3])
                self.categories[type_] = info

    def read_oov(self, unk_def, grammar):
        """Parse unk.def CSV entries into self.oov_list, keyed by category.

        :raises ValueError: on malformed lines or categories missing from
            char.def.
        """
        with open(unk_def, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                cols = line.split(",")
                if len(cols) < 10:
                    raise ValueError("invalid format at line {}".format(i))
                try:
                    type_ = getattr(categorytype.CategoryType, cols[0])
                except AttributeError:
                    raise ValueError("`{}` is invalid type at line {}".format(cols[0], i))
                if type_ not in self.categories:
                    raise ValueError("`{}` is undefined at line {}".format(cols[0], i))

                oov = self.OOV()
                oov.left_id = int(cols[1])
                oov.right_id = int(cols[2])
                oov.cost = int(cols[3])
                # Columns 4..9 form the six-part POS tag.
                pos = cols[4:10]
                oov.pos_id = grammar.get_part_of_speech_id(pos)
                self.oov_list[type_].append(oov)
class TestJoinNumericOOVPlugin(unittest.TestCase):
    """Integration tests for JoinNumericPlugin path rewriting.

    Builds a lattice for each input with the test dictionary, takes the
    best path, and checks how the plugin joins (and optionally normalizes)
    numeric nodes.
    """

    def setUp(self):
        # BUG FIX: removed a stray `pass` statement that was dead code at
        # the top of this method.
        resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
        self.dict_ = Dictionary(os.path.join(resource_dir, 'numeric_sudachi.json'), resource_dir)
        self.tokenizer = self.dict_.create()
        self.plugin = JoinNumericPlugin(None)
        self.plugin.set_up(self.dict_.grammar)

    def test_digit(self):
        path = self.get_path('123円20銭')
        self.assertEqual(4, len(path))
        self.assertEqual('123', path[0].get_word_info().surface)
        self.assertEqual('20', path[2].get_word_info().surface)

        path = self.get_path('080-121')
        self.assertEqual(3, len(path))
        self.assertEqual('080', path[0].get_word_info().surface)
        self.assertEqual('121', path[2].get_word_info().surface)

    def test_kanji_numeric(self):
        path = self.get_path('一二三万二千円')
        self.assertEqual(2, len(path))
        self.assertEqual('一二三万二千', path[0].get_word_info().surface)

        path = self.get_path('二百百')
        self.assertEqual(3, len(path))

    def test_normalize(self):
        self.plugin._enable_normalize = True
        path = self.get_path('一二三万二千円')
        self.assertEqual(2, len(path))
        self.assertEqual('1232000', path[0].get_word_info().normalized_form)

    def test_normalized_with_not_numeric(self):
        self.plugin._enable_normalize = True
        path = self.get_path('六三四')
        self.assertEqual(1, len(path))
        self.assertEqual('六三四', path[0].get_word_info().normalized_form)

    def test_point(self):
        self.plugin._enable_normalize = True

        path = self.get_path('1.002')
        self.assertEqual(1, len(path))
        self.assertEqual('1.002', path[0].get_word_info().normalized_form)

        path = self.get_path('.002')
        self.assertEqual(2, len(path))
        self.assertEqual('.', path[0].get_word_info().normalized_form)
        self.assertEqual('002', path[1].get_word_info().normalized_form)

        path = self.get_path('22.')
        self.assertEqual(2, len(path))
        self.assertEqual('22', path[0].get_word_info().normalized_form)
        self.assertEqual('.', path[1].get_word_info().normalized_form)

        path = self.get_path('22.節')
        self.assertEqual(3, len(path))
        self.assertEqual('22', path[0].get_word_info().normalized_form)
        self.assertEqual('.', path[1].get_word_info().normalized_form)

        path = self.get_path('.c')
        self.assertEqual(2, len(path))
        self.assertEqual('.', path[0].get_word_info().normalized_form)

        path = self.get_path('1.20.3')
        self.assertEqual(5, len(path))
        self.assertEqual('20', path[2].get_word_info().normalized_form)

        path = self.get_path('652...')
        self.assertEqual(4, len(path))
        self.assertEqual('652', path[0].get_word_info().normalized_form)

    def test_comma(self):
        self.plugin._enable_normalize = True

        path = self.get_path('2,000,000')
        self.assertEqual(1, len(path))
        self.assertEqual('2000000', path[0].get_word_info().normalized_form)

        path = self.get_path('2,00,000,000円')
        self.assertEqual(8, len(path))
        self.assertEqual('2', path[0].get_word_info().normalized_form)
        self.assertEqual(',', path[1].get_word_info().normalized_form)
        self.assertEqual('00', path[2].get_word_info().normalized_form)
        self.assertEqual(',', path[3].get_word_info().normalized_form)
        self.assertEqual('000', path[4].get_word_info().normalized_form)
        self.assertEqual(',', path[5].get_word_info().normalized_form)
        self.assertEqual('000', path[6].get_word_info().normalized_form)

        path = self.get_path(',')
        self.assertEqual(1, len(path))

        path = self.get_path('652,,,')
        self.assertEqual(4, len(path))
        self.assertEqual('652', path[0].get_word_info().normalized_form)

        path = self.get_path('256,5.50389')
        self.assertEqual(3, len(path))
        self.assertEqual('256', path[0].get_word_info().normalized_form)
        self.assertEqual('5.50389', path[2].get_word_info().normalized_form)

        path = self.get_path('256,550.389')
        self.assertEqual(1, len(path))
        self.assertEqual('256550.389', path[0].get_word_info().normalized_form)

    def test_single_node(self):
        self.plugin._enable_normalize = False
        path = self.get_path('猫三匹')
        self.assertEqual(3, len(path))
        self.assertEqual('三', path[1].get_word_info().normalized_form)

        self.plugin._enable_normalize = True
        path = self.get_path('猫三匹')
        self.assertEqual(3, len(path))
        self.assertEqual('3', path[1].get_word_info().normalized_form)

    def get_path(self, text: str):
        # Tokenize `text`, apply the rewrite plugin to the best path, and
        # return the rewritten path; the lattice is cleared for the next call.
        input_ = UTF8InputTextBuilder(text, self.tokenizer._grammar).build()
        self.tokenizer._build_lattice(input_)
        path = self.tokenizer._lattice.get_best_path()
        self.plugin.rewrite(input_, path, self.tokenizer._lattice)
        self.tokenizer._lattice.clear()
        return path


if __name__ == '__main__':
    unittest.main()