├── MANIFEST.in ├── tests ├── resources │ ├── rewrite_error_ignorelist.def │ ├── rewrite_error_dup.def │ ├── rewrite_error_replacelist.def │ ├── rewrite.def │ ├── dict │ │ ├── user2.csv │ │ ├── user.csv │ │ ├── matrix.def │ │ └── lex.csv │ ├── char.def │ ├── sudachi.json │ ├── numeric_sudachi.json │ ├── sudachi_large_user.json │ ├── unk.def │ └── joinnumeric │ │ └── char.def ├── __init__.py ├── plugin │ ├── __init__.py │ ├── test_join_katakana_oov_plugin.py │ ├── test_default_input_text_plugin.py │ ├── test_prolongedsoundmarkinput.py │ ├── test_numericparser.py │ └── test_join_numeric_plugin.py ├── dictionarylib │ ├── __init__.py │ ├── test_dictionaryheader.py │ ├── test_userdictionarybuilder.py │ └── test_doublearraylexicon.py ├── test_dictionary.py ├── test_large_userdict.py ├── mock_grammar.py ├── mock_inputtext.py ├── test_tokenizer.py ├── test_grammar.py └── test_switchdictionary.py ├── .github ├── FUNDING.yml └── workflows │ ├── python-publish.yml │ ├── python-publish-macos.yml │ ├── python-publish-windows.yml │ └── build.yml ├── requirements.txt ├── scripts ├── flake8.cfg ├── format.sh ├── checkheader.sh ├── license-header.txt └── test.sh ├── .gitattributes ├── sudachipy ├── latticenode.pxd ├── lattice.pxd ├── plugin │ ├── connect_cost │ │ ├── __init__.py │ │ ├── inhibitconnectioncost.py │ │ └── editconnectioncost.py │ ├── oov │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── oov_provider_plugin.py │ │ ├── simple_oov_plugin.py │ │ └── mecab_oov_plugin.py │ ├── input_text │ │ ├── __init__.py │ │ ├── input_text.py │ │ ├── utils.py │ │ ├── prolongedsoundmark.py │ │ └── default_input_text_plugin.py │ ├── path_rewrite │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── join_katakana_oov_plugin.py │ │ ├── path_rewrite_plugin.py │ │ └── join_numeric_plugin.py │ └── __init__.py ├── __init__.py ├── dictionarylib │ ├── __init__.py │ ├── wordidtable.py │ ├── categorytype.py │ ├── wordinfo.py │ ├── jtypedbytebuffer.py │ ├── lexicon.py │ ├── dictionaryversion.py │ ├── 
wordparameterlist.py │ ├── dictionaryheader.py │ ├── userdictionarybuilder.py │ ├── binarydictionary.py │ ├── wordinfolist.py │ ├── grammar.py │ ├── doublearraylexicon.py │ ├── lexiconset.py │ └── charactercategory.py ├── resources │ ├── sudachi.json │ ├── unk.def │ └── char.def ├── morpheme.py ├── morphemelist.py ├── utf8inputtext.py ├── latticenode.pyx ├── dictionary.py ├── config.py ├── utf8inputtextbuilder.py └── lattice.pyx ├── .gitignore └── setup.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE requirements.txt 2 | -------------------------------------------------------------------------------- /tests/resources/rewrite_error_ignorelist.def: -------------------------------------------------------------------------------- 1 | # there are two characters in ignore list 2 | 12 -------------------------------------------------------------------------------- /tests/resources/rewrite_error_dup.def: -------------------------------------------------------------------------------- 1 | # there are ad uplicated replacement. 
2 | 12 21 3 | 12 31 4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: WorksApplications 4 | -------------------------------------------------------------------------------- /tests/resources/rewrite_error_replacelist.def: -------------------------------------------------------------------------------- 1 | # there are three columns in replace list 2 | 12 21 31 3 | -------------------------------------------------------------------------------- /tests/resources/rewrite.def: -------------------------------------------------------------------------------- 1 | # ignore normalize list 2 | Ⅲ 3 | ⅲ 4 | ⼼ 5 | 6 | # replace char list 7 | ガ ガ 8 | ウ゛ ヴ 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sortedcontainers~=2.1.0 2 | dartsclone~=0.9.0 3 | # flake8 4 | # flake8-import-order 5 | # flake8-buitins 6 | -------------------------------------------------------------------------------- /tests/resources/dict/user2.csv: -------------------------------------------------------------------------------- 1 | ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,* 2 | かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,* 3 | -------------------------------------------------------------------------------- /scripts/flake8.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = \ 3 | E501, \ # line too long ( > _ characters) 4 | C901, \ # too complex 5 | max-line-length = 140 6 | exclude = __init__.py 7 | max-complexity = 10 8 | -------------------------------------------------------------------------------- /scripts/format.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd $(dirname $0) 4 | 5 | flake8 --show --config=flake8.cfg ../sudachipy 6 | flake8 --show --config=flake8.cfg ../tests 7 | 8 | cd .. 9 | scripts/checkheader.sh -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | 3 | *.def text 4 | *.in text 5 | *.json text 6 | *.md text 7 | *.py text 8 | *.txt text 9 | 10 | *.pyc binary 11 | *.pyd binary 12 | *.pyo binary 13 | *.pyw binary 14 | *.dic binary 15 | *.png binary 16 | *.jpg binary 17 | -------------------------------------------------------------------------------- /tests/resources/dict/user.csv: -------------------------------------------------------------------------------- 1 | ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,* 2 | 府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,* 3 | 東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3 4 | すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,* 5 | -------------------------------------------------------------------------------- /scripts/checkheader.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | HEADER=scripts/license-header.txt 4 | SIZE=`wc -c < "$HEADER"` 5 | 6 | RES=`find setup.py sudachipy tests -type f -name '*.py' -exec cmp -n "$SIZE" "$HEADER" {} \;` 7 | if [ -n "$RES" ]; then 8 | echo "$RES" | awk '{print "invalid license header on " $2}' >&2 9 | exit 1 10 | fi -------------------------------------------------------------------------------- /sudachipy/latticenode.pxd: -------------------------------------------------------------------------------- 1 | cdef class LatticeNode: 2 | 3 | cdef int begin 4 | cdef int end 5 | cdef int total_cost 6 | cdef int word_id 7 | cdef bint _is_oov 8 | cdef LatticeNode 
best_previous_node 9 | cdef bint is_connected_to_bos 10 | cdef object extra_word_info 11 | cdef object undefined_word_info 12 | cdef bint _is_defined 13 | cdef object lexicon 14 | cdef int left_id 15 | cdef int right_id 16 | cdef int cost 17 | 18 | -------------------------------------------------------------------------------- /sudachipy/lattice.pxd: -------------------------------------------------------------------------------- 1 | from .latticenode cimport LatticeNode 2 | 3 | cdef extern from "limits.h": 4 | cdef int INT_MAX 5 | 6 | cdef class Lattice: 7 | 8 | cdef int size 9 | cdef int capacity 10 | cdef LatticeNode eos_node 11 | 12 | cdef list end_lists 13 | cdef object grammar 14 | cdef object eos_params 15 | cdef const short[:,:] connect_costs 16 | 17 | cpdef void resize(self, int size) 18 | cpdef void insert(self, int begin, int end, LatticeNode node) 19 | cdef void connect_node(self, LatticeNode r_node) 20 | cdef void connect_eos_node(self) 21 | -------------------------------------------------------------------------------- /tests/resources/char.def: -------------------------------------------------------------------------------- 1 | 0x0030..0x0039 NUMERIC #0-9 2 | 0x0041..0x005A ALPHA #A-Z 3 | 0x0061..0x007A ALPHA #a-z 4 | 0x00C0..0x00FF ALPHA # Latin 1 #À->ÿ 5 | 0x3041..0x309F HIRAGANA 6 | 0x30A1..0x30FF KATAKANA 7 | 0x30A1 NOOOVBOW 8 | 0xFF66..0xFF9D KATAKANA 9 | 0xFF9E..0xFF9F KATAKANA 10 | 0x2E80..0x2EF3 KANJI # CJK Raidcals Supplement 11 | 0x2F00..0x2FD5 KANJI 12 | 0x3005 KANJI 13 | 0x3007 KANJI 14 | 0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extention 15 | 0x4E00..0x9FA5 KANJI 16 | 0xF900..0xFA2D KANJI 17 | 0xFA30..0xFA6A KANJI 18 | 0xFF10..0xFF19 NUMERIC 19 | 0xFF21..0xFF3A ALPHA 20 | 0xFF41..0xFF5A ALPHA 21 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /scripts/license-header.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/dictionarylib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | -------------------------------------------------------------------------------- /sudachipy/plugin/connect_cost/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .editconnectioncost import EditConnectionCostPlugin 16 | from .inhibitconnectioncost import InhibitConnectionPlugin 17 | -------------------------------------------------------------------------------- /sudachipy/plugin/oov/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .oov_provider_plugin import OovProviderPlugin 16 | from .mecab_oov_plugin import MeCabOovPlugin 17 | from .simple_oov_plugin import SimpleOovPlugin 18 | from .utils import get_oov_plugins 19 | -------------------------------------------------------------------------------- /tests/resources/sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "userDict" : [ "user.dic" ], 4 | "characterDefinitionFile" : "char.def", 5 | "inputTextPlugin" : [ 6 | { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" } 7 | ], 8 | "oovProviderPlugin" : [ 9 | { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin", 10 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 11 | "leftId" : 8, 12 | "rightId" : 8, 13 | "cost" : 6000 } 14 | ], 15 | "pathRewritePlugin" : [ 16 | { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin", 17 | "enableNormalize" : true }, 18 | { "class" : "sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin", 19 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 20 | "minLength" : 3 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /sudachipy/plugin/input_text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .input_text import InputTextPlugin 16 | from .default_input_text_plugin import DefaultInputTextPlugin 17 | from .prolongedsoundmark import ProlongedSoundMarkInputTextPlugin 18 | from .utils import get_input_text_plugins 19 | -------------------------------------------------------------------------------- /sudachipy/plugin/path_rewrite/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .join_katakana_oov_plugin import JoinKatakanaOovPlugin 16 | from .join_numeric_plugin import JoinNumericPlugin 17 | from .path_rewrite_plugin import PathRewritePlugin 18 | from .utils import get_path_rewrite_plugins 19 | -------------------------------------------------------------------------------- /tests/resources/numeric_sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "userDict" : [ "user.dic" ], 4 | "characterDefinitionFile" : "joinnumeric/char.def", 5 | "inputTextPlugin" : [ 6 | { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" } 7 | ], 8 | "oovProviderPlugin" : [ 9 | { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin", 10 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 11 | "leftId" : 8, 12 | "rightId" : 8, 13 | "cost" : 6000 } 14 | ], 15 | "pathRewritePlugin" : [ 16 | { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin", 17 | "enableNormalize" : true }, 18 | { "class" : "sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin", 19 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 20 | "minLength" : 3 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /tests/resources/sudachi_large_user.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "userDict" : [ "user.dic", "large_user.dic" ], 4 | "characterDefinitionFile" : "char.def", 5 | "inputTextPlugin" : [ 6 | { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" } 7 | ], 8 | "oovProviderPlugin" : [ 9 | { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin", 10 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 11 | "leftId" : 8, 12 | "rightId" : 8, 13 | "cost" : 6000 } 14 | ], 15 | "pathRewritePlugin" : [ 16 | { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin", 17 | "enableNormalize" : true }, 18 | { "class" : 
"sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin", 19 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 20 | "minLength" : 3 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /sudachipy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import utf8inputtextbuilder 16 | from . import tokenizer 17 | from . 
import config 18 | 19 | from pkg_resources import get_distribution, DistributionNotFound 20 | try: 21 | __version__ = get_distribution(__name__).version 22 | except DistributionNotFound: 23 | # package is not installed 24 | pass 25 | -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # print error message only when it fails 4 | # python unittest print message in stderr even if it succeed 5 | # You need to prepare system.dic in resources and tests/resources 6 | # see README 7 | 8 | set -e 9 | 10 | # build dictionaries 11 | if !(type sudachipy > /dev/null 2>&1); then 12 | python setup.py develop 13 | fi 14 | sudachipy build -o tests/resources/system.dic -d "the system dictionary for the unit tests" -m tests/resources/dict/matrix.def tests/resources/dict/lex.csv 15 | sudachipy ubuild -o tests/resources/user.dic -s tests/resources/system.dic tests/resources/dict/user.csv 16 | sudachipy ubuild -o tests/resources/large_user.dic -s tests/resources/system.dic tests/resources/dict/large_user.csv 17 | 18 | set +e 19 | 20 | # unittest 21 | RES=`python -m unittest discover tests -p '*test*.py' 2>&1` 22 | STATUS=$? 
23 | RES_TAIL=`echo "$RES" | tail -1` 24 | if [[ $RES_TAIL != "OK" ]]; then 25 | >&2 echo "$RES" 26 | fi 27 | 28 | exit $STATUS 29 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/python-publish-macos.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package for macOS 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ${{ matrix.os }} 11 | 12 | strategy: 13 | matrix: 14 | os: [macos-latest] 15 | architecture: [x64] 16 | python-version: [3.6, 3.7, 3.8] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | architecture: ${{ matrix.architecture }} 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m 
pip install --upgrade pip 28 | pip install setuptools wheel twine 29 | - name: Build and publish 30 | env: 31 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 32 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 33 | run: | 34 | python setup.py bdist_wheel 35 | twine upload dist/* 36 | -------------------------------------------------------------------------------- /.github/workflows/python-publish-windows.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package for Windows 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ${{ matrix.os }} 11 | 12 | strategy: 13 | matrix: 14 | os: [windows-latest] 15 | architecture: [x64, x86] 16 | python-version: [3.6, 3.7, 3.8] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | architecture: ${{ matrix.architecture }} 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install setuptools wheel twine 29 | - name: Build and publish 30 | env: 31 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 32 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 33 | run: | 34 | python setup.py bdist_wheel 35 | twine upload dist/* 36 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import grammar 16 | from . import charactercategory 17 | from . import categorytype 18 | from . import lexiconset 19 | from . import doublearraylexicon 20 | from . import dictionaryheader 21 | from .dictionaryversion import ( 22 | SYSTEM_DICT_VERSION_1, SYSTEM_DICT_VERSION_2, USER_DICT_VERSION_1, USER_DICT_VERSION_2, USER_DICT_VERSION_3, 23 | ) 24 | from .binarydictionary import BinaryDictionary 25 | -------------------------------------------------------------------------------- /sudachipy/plugin/input_text/input_text.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from abc import ABC, abstractmethod 16 | 17 | from sudachipy.utf8inputtextbuilder import UTF8InputTextBuilder 18 | 19 | 20 | class InputTextPlugin(ABC): 21 | 22 | Builder = UTF8InputTextBuilder 23 | 24 | @abstractmethod 25 | def set_up(self) -> None: 26 | raise NotImplementedError 27 | 28 | @abstractmethod 29 | def rewrite(self, builder: Builder) -> None: 30 | raise NotImplementedError 31 | -------------------------------------------------------------------------------- /sudachipy/plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #from sudachipy.plugin.oov import simple_oov_plugin 16 | #from sudachipy.plugin.oov import mecab_oov_plugin 17 | #from sudachipy.plugin.path_rewrite import join_numeric_plugin 18 | #from sudachipy.plugin.path_rewrite import join_katakana_oov_plugin 19 | #from sudachipy.plugin.input_text import default_input_text_plugin 20 | #from .connect_cost.inhibitconnectioncost import InhibitConnectionPlugin 21 | #from . import oov 22 | #from . import input_text 23 | #from . 
import path_rewrite 24 | -------------------------------------------------------------------------------- /sudachipy/resources/sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "", 3 | "characterDefinitionFile" : "char.def", 4 | "inputTextPlugin" : [ 5 | { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" }, 6 | { "class" : "sudachipy.plugin.input_text.ProlongedSoundMarkInputTextPlugin", 7 | "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"], 8 | "replacementSymbol": "ー"} 9 | ], 10 | "oovProviderPlugin" : [ 11 | { "class" : "sudachipy.plugin.oov.MeCabOovProviderPlugin", 12 | "charDef" : "char.def", 13 | "unkDef" : "unk.def" }, 14 | { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin", 15 | "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ], 16 | "leftId" : 5968, 17 | "rightId" : 5968, 18 | "cost" : 3857 } 19 | ], 20 | "pathRewritePlugin" : [ 21 | { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin", 22 | "enableNormalize" : true }, 23 | { "class" : "sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin", 24 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 25 | "minLength": 3 } 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/wordidtable.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import struct 16 | 17 | 18 | class WordIdTable(object): 19 | def __init__(self, bytes_, offset): 20 | bytes_.seek(offset) 21 | self.size = int.from_bytes(bytes_.read(4), 'little') 22 | self.offset = offset + 4 23 | self._bytes_view = memoryview(bytes_)[self.offset: self.offset + self.size] 24 | 25 | def __del__(self): 26 | self._bytes_view.release() 27 | 28 | def storage_size(self): 29 | return 4 + self.size 30 | 31 | def get(self, index): 32 | length = self._bytes_view[index] 33 | result = struct.unpack_from("<{}I".format(length), self._bytes_view, index + 1) 34 | return result 35 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: build 5 | 6 | on: 7 | push: 8 | branches: [ develop ] 9 | pull_request: 10 | branches: [ develop ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.9 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.9 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install flake8 27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | - name: Check license header 35 | run: | 36 | scripts/checkheader.sh 37 | - name: Test with unittest 38 | run: | 39 | python setup.py build_ext --inplace 40 | scripts/test.sh -------------------------------------------------------------------------------- /tests/resources/dict/matrix.def: -------------------------------------------------------------------------------- 1 | 10 10 2 | 0 0 0 3 | 0 1 863 4 | 0 2 2124 5 | 0 3 1032 6 | 0 4 591 7 | 0 5 -162 8 | 0 6 -79 9 | 0 7 887 10 | 0 8 447 11 | 0 9 -535 12 | 1 0 -3689 13 | 1 1 -3361 14 | 1 2 -7643 15 | 1 3 -3267 16 | 1 4 809 17 | 1 5 -1098 18 | 1 6 4606 19 | 1 7 4269 20 | 1 8 4567 21 | 1 9 1635 22 | 2 0 -1959 23 | 2 1 2457 24 | 2 2 811 25 | 2 3 840 26 | 2 4 903 27 | 2 5 -958 28 | 2 6 517 29 | 2 7 2037 30 | 2 8 1392 31 | 2 9 -193 32 | 3 0 -2288 33 | 3 1 1741 34 | 3 2 487 35 | 3 3 792 36 | 3 4 -1474 37 | 3 5 -3429 38 | 3 6 126 39 | 3 7 437 40 | 3 8 605 41 | 3 9 -547 42 | 4 0 -2809 43 | 4 1 -3584 44 | 4 2 -6743 45 | 4 3 -2869 46 | 4 4 -2805 47 | 4 5 -407 48 | 4 6 3422 49 | 4 7 5642 50 | 4 8 6382 51 | 4 9 2165 52 | 5 0 -509 53 | 5 1 -3665 54 | 5 2 -3882 55 | 5 3 -572 56 | 5 4 -1036 57 | 5 5 -54 58 | 5 6 2570 59 | 5 7 3319 60 | 5 8 4059 61 | 5 9 882 62 | 6 0 101 63 | 6 1 2933 64 | 6 2 2198 65 | 6 3 -2004 66 | 6 4 4392 67 | 6 5 4017 68 | 6 6 569 69 | 6 7 475 70 | 6 8 -390 71 | 6 9 852 72 | 7 0 -852 73 | 7 1 2079 74 | 7 2 1180 75 | 7 3 -3084 76 | 7 4 2010 77 | 7 5 1570 78 | 7 6 746 79 | 7 7 2341 80 | 7 8 2051 81 | 7 9 1393 82 | 8 0 -522 83 | 8 1 3354 84 | 8 2 2037 85 | 8 3 -2542 86 | 8 4 3071 87 | 8 5 2631 88 | 8 6 -352 89 | 8 7 2847 90 | 8 8 1134 91 | 8 9 1256 92 | 9 0 -975 93 | 9 1 2498 94 | 9 2 1690 95 | 9 3 -1523 96 | 9 4 3023 97 | 9 5 3139 98 | 9 6 2562 99 | 9 7 3962 100 | 9 8 418 101 | 9 9 -2490 102 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/categorytype.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from enum import Enum 16 | 17 | 18 | class CategoryType(Enum): 19 | DEFAULT = 1 20 | SPACE = 1 << 1 21 | KANJI = 1 << 2 22 | SYMBOL = 1 << 3 23 | NUMERIC = 1 << 4 24 | ALPHA = 1 << 5 25 | HIRAGANA = 1 << 6 26 | KATAKANA = 1 << 7 27 | KANJINUMERIC = 1 << 8 28 | GREEK = 1 << 9 29 | CYRILLIC = 1 << 10 30 | USER1 = 1 << 11 31 | USER2 = 1 << 12 32 | USER3 = 1 << 13 33 | USER4 = 1 << 14 34 | NOOOVBOW = 1 << 15 35 | 36 | def get_id(self): 37 | return self.id 38 | 39 | def get_type(self, id_): 40 | for type_ in CategoryType.values(): 41 | if type_.get_id() is id_: 42 | return type_ 43 | return None 44 | 45 | @staticmethod 46 | def get(str_): 47 | try: 48 | return CategoryType[str_] 49 | except KeyError: 50 | return None 51 | -------------------------------------------------------------------------------- /tests/resources/unk.def: -------------------------------------------------------------------------------- 1 | DEFAULT,5968,5968,3857,補助記号,一般,*,*,*,* 2 | SPACE,5966,5966,6056,空白,*,*,*,*,* 3 | KANJI,5139,5139,14657,名詞,普通名詞,一般,*,*,* 4 | KANJI,5129,5129,17308,名詞,普通名詞,サ変可能,*,*,* 5 | KANJI,4785,4785,18181,名詞,固有名詞,一般,*,*,* 6 | KANJI,4787,4787,18086,名詞,固有名詞,人名,一般,*,* 7 | KANJI,4791,4791,19198,名詞,固有名詞,地名,一般,*,* 8 | SYMBOL,5129,5129,17094,名詞,普通名詞,サ変可能,*,*,* 
9 | NUMERIC,4794,4794,12450,名詞,数詞,*,*,*,* 10 | ALPHA,5139,5139,11633,名詞,普通名詞,一般,*,*,* 11 | ALPHA,4785,4785,13620,名詞,固有名詞,一般,*,*,* 12 | ALPHA,4787,4787,14228,名詞,固有名詞,人名,一般,*,* 13 | ALPHA,4791,4791,15793,名詞,固有名詞,地名,一般,*,* 14 | ALPHA,5687,5687,15246,感動詞,一般,*,*,*,* 15 | HIRAGANA,5139,5139,16012,名詞,普通名詞,一般,*,*,* 16 | HIRAGANA,5129,5129,20012,名詞,普通名詞,サ変可能,*,*,* 17 | HIRAGANA,4785,4785,18282,名詞,固有名詞,一般,*,*,* 18 | HIRAGANA,4787,4787,18269,名詞,固有名詞,人名,一般,*,* 19 | HIRAGANA,4791,4791,20474,名詞,固有名詞,地名,一般,*,* 20 | HIRAGANA,5687,5687,17786,感動詞,一般,*,*,*,* 21 | KATAKANA,5139,5139,10980,名詞,普通名詞,一般,*,*,* 22 | KATAKANA,5129,5129,14802,名詞,普通名詞,サ変可能,*,*,* 23 | KATAKANA,4785,4785,13451,名詞,固有名詞,一般,*,*,* 24 | KATAKANA,4787,4787,13759,名詞,固有名詞,人名,一般,*,* 25 | KATAKANA,4791,4791,14554,名詞,固有名詞,地名,一般,*,* 26 | KATAKANA,5687,5687,15272,感動詞,一般,*,*,*,* 27 | KANJINUMERIC,4794,4794,14170,名詞,数詞,*,*,*,* 28 | GREEK,5139,5139,11051,名詞,普通名詞,一般,*,*,* 29 | GREEK,4785,4785,13353,名詞,固有名詞,一般,*,*,* 30 | GREEK,4787,4787,13671,名詞,固有名詞,人名,一般,*,* 31 | GREEK,4791,4791,14862,名詞,固有名詞,地名,一般,*,* 32 | CYRILLIC,5139,5139,11140,名詞,普通名詞,一般,*,*,* 33 | CYRILLIC,4785,4785,13174,名詞,固有名詞,一般,*,*,* 34 | CYRILLIC,4787,4787,13495,名詞,固有名詞,人名,一般,*,* 35 | CYRILLIC,4791,4791,14700,名詞,固有名詞,地名,一般,*,* 36 | -------------------------------------------------------------------------------- /sudachipy/resources/unk.def: -------------------------------------------------------------------------------- 1 | DEFAULT,5968,5968,3857,補助記号,一般,*,*,*,* 2 | SPACE,5966,5966,6056,空白,*,*,*,*,* 3 | KANJI,5139,5139,14657,名詞,普通名詞,一般,*,*,* 4 | KANJI,5129,5129,17308,名詞,普通名詞,サ変可能,*,*,* 5 | KANJI,4785,4785,18181,名詞,固有名詞,一般,*,*,* 6 | KANJI,4787,4787,18086,名詞,固有名詞,人名,一般,*,* 7 | KANJI,4791,4791,19198,名詞,固有名詞,地名,一般,*,* 8 | SYMBOL,5129,5129,17094,名詞,普通名詞,サ変可能,*,*,* 9 | NUMERIC,4794,4794,12450,名詞,数詞,*,*,*,* 10 | ALPHA,5139,5139,11633,名詞,普通名詞,一般,*,*,* 11 | ALPHA,4785,4785,13620,名詞,固有名詞,一般,*,*,* 12 | ALPHA,4787,4787,14228,名詞,固有名詞,人名,一般,*,* 13 | 
ALPHA,4791,4791,15793,名詞,固有名詞,地名,一般,*,* 14 | ALPHA,5687,5687,15246,感動詞,一般,*,*,*,* 15 | HIRAGANA,5139,5139,16012,名詞,普通名詞,一般,*,*,* 16 | HIRAGANA,5129,5129,20012,名詞,普通名詞,サ変可能,*,*,* 17 | HIRAGANA,4785,4785,18282,名詞,固有名詞,一般,*,*,* 18 | HIRAGANA,4787,4787,18269,名詞,固有名詞,人名,一般,*,* 19 | HIRAGANA,4791,4791,20474,名詞,固有名詞,地名,一般,*,* 20 | HIRAGANA,5687,5687,17786,感動詞,一般,*,*,*,* 21 | KATAKANA,5139,5139,10980,名詞,普通名詞,一般,*,*,* 22 | KATAKANA,5129,5129,14802,名詞,普通名詞,サ変可能,*,*,* 23 | KATAKANA,4785,4785,13451,名詞,固有名詞,一般,*,*,* 24 | KATAKANA,4787,4787,13759,名詞,固有名詞,人名,一般,*,* 25 | KATAKANA,4791,4791,14554,名詞,固有名詞,地名,一般,*,* 26 | KATAKANA,5687,5687,15272,感動詞,一般,*,*,*,* 27 | KANJINUMERIC,4794,4794,14170,名詞,数詞,*,*,*,* 28 | GREEK,5139,5139,11051,名詞,普通名詞,一般,*,*,* 29 | GREEK,4785,4785,13353,名詞,固有名詞,一般,*,*,* 30 | GREEK,4787,4787,13671,名詞,固有名詞,人名,一般,*,* 31 | GREEK,4791,4791,14862,名詞,固有名詞,地名,一般,*,* 32 | CYRILLIC,5139,5139,11140,名詞,普通名詞,一般,*,*,* 33 | CYRILLIC,4785,4785,13174,名詞,固有名詞,一般,*,*,* 34 | CYRILLIC,4787,4787,13495,名詞,固有名詞,人名,一般,*,* 35 | CYRILLIC,4791,4791,14700,名詞,固有名詞,地名,一般,*,* 36 | -------------------------------------------------------------------------------- /sudachipy/plugin/oov/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import List 16 | 17 | from sudachipy import config 18 | 19 | from . import MeCabOovPlugin, OovProviderPlugin, SimpleOovPlugin 20 | 21 | 22 | def get_oov_plugin(json_obj) -> OovProviderPlugin: 23 | # In the future, users can define plugin by themselves 24 | try: 25 | if json_obj['class'] == 'sudachipy.plugin.oov.MeCabOovProviderPlugin': 26 | return MeCabOovPlugin(json_obj) 27 | if json_obj['class'] == 'sudachipy.plugin.oov.SimpleOovProviderPlugin': 28 | return SimpleOovPlugin(json_obj) 29 | raise ValueError('{} is invalid OovProviderPlugin class'.format(json_obj['class'])) 30 | except KeyError: 31 | raise ValueError('config file is invalid format') 32 | 33 | 34 | def get_oov_plugins() -> List[OovProviderPlugin]: 35 | key_word = 'oovProviderPlugin' 36 | if key_word not in config.settings: 37 | return [] 38 | ps = [] 39 | for obj in config.settings[key_word]: 40 | ps.append(get_oov_plugin(obj)) 41 | return ps 42 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/wordinfo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | class WordInfo: 17 | def __init__(self, 18 | surface, 19 | head_word_length, 20 | pos_id, 21 | normalized_form, 22 | dictionary_form_word_id, 23 | dictionary_form, 24 | reading_form, 25 | a_unit_split, 26 | b_unit_split, 27 | word_structure, 28 | synonym_group_ids): 29 | self.surface = surface 30 | self.head_word_length = head_word_length 31 | self.pos_id = pos_id 32 | self.normalized_form = normalized_form 33 | self.dictionary_form_word_id = dictionary_form_word_id 34 | self.dictionary_form = dictionary_form 35 | self.reading_form = reading_form 36 | self.a_unit_split = a_unit_split 37 | self.b_unit_split = b_unit_split 38 | self.word_structure = word_structure 39 | self.synonym_group_ids = synonym_group_ids 40 | 41 | def length(self): 42 | return self.head_word_length 43 | -------------------------------------------------------------------------------- /sudachipy/plugin/path_rewrite/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List 16 | 17 | from sudachipy import config 18 | 19 | from . 
import JoinKatakanaOovPlugin, JoinNumericPlugin, PathRewritePlugin 20 | 21 | 22 | def get_path_rewrite_plugin(json_obj) -> PathRewritePlugin: 23 | # In the future, users can define plugin by themselves 24 | try: 25 | if json_obj['class'] == 'sudachipy.plugin.path_rewrite.JoinNumericPlugin': 26 | return JoinNumericPlugin(json_obj) 27 | if json_obj['class'] == 'sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin': 28 | return JoinKatakanaOovPlugin(json_obj) 29 | raise ValueError('{} is invalid PathRewritePlugin class'.format(json_obj['class'])) 30 | except KeyError: 31 | raise ValueError('config file is invalid format') 32 | 33 | 34 | def get_path_rewrite_plugins() -> List[PathRewritePlugin]: 35 | if 'pathRewritePlugin' not in config.settings: 36 | return [] 37 | ps = [] 38 | for obj in config.settings['pathRewritePlugin']: 39 | ps.append(get_path_rewrite_plugin(obj)) 40 | return ps 41 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/jtypedbytebuffer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from io import BytesIO 16 | 17 | 18 | class JTypedByteBuffer(BytesIO): 19 | """ 20 | A interface of BytesIO to write dictionary 21 | """ 22 | 23 | __ENDIAN = 'little' 24 | 25 | @classmethod 26 | def from_bytes(cls, bytes_io): 27 | return cls(bytes_io.getvalue()) 28 | 29 | def write_int(self, int_, type_, signed=True): 30 | if type_ == 'byte': 31 | len_ = 1 32 | signed = False 33 | elif type_ == 'int': 34 | len_ = 4 35 | elif type_ == 'char': 36 | len_ = 2 37 | signed = False 38 | elif type_ == 'short': 39 | len_ = 2 40 | elif type_ == 'long': 41 | len_ = 8 42 | else: 43 | raise ValueError('{} is invalid type'.format(type_)) 44 | self.write(int_.to_bytes(len_, byteorder=self.__ENDIAN, signed=signed)) 45 | 46 | def write_str(self, text): 47 | self.write(text.encode('utf-16-le')) 48 | 49 | def clear(self): 50 | self.seek(0) 51 | self.truncate(0) 52 | -------------------------------------------------------------------------------- /sudachipy/plugin/oov/oov_provider_plugin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from abc import ABC, abstractmethod 16 | from typing import List 17 | 18 | from sudachipy.dictionarylib.grammar import Grammar 19 | from sudachipy.latticenode import LatticeNode 20 | from sudachipy.utf8inputtext import UTF8InputText 21 | 22 | 23 | class OovProviderPlugin(ABC): 24 | 25 | @abstractmethod 26 | def set_up(self, grammar: Grammar) -> None: 27 | raise NotImplementedError 28 | 29 | @abstractmethod 30 | def provide_oov(self, input_text: UTF8InputText, offset: int, has_other_words: bool) -> List[LatticeNode]: 31 | raise NotImplementedError 32 | 33 | def get_oov(self, input_text: UTF8InputText, offset: int, has_other_words: bool) -> List[LatticeNode]: 34 | nodes = self.provide_oov(input_text, offset, has_other_words) 35 | for node in nodes: 36 | node.set_begin(offset) 37 | node.set_end(offset + node.get_word_info().length()) 38 | return nodes 39 | 40 | @staticmethod 41 | def create_node() -> LatticeNode: 42 | node = LatticeNode() 43 | node.set_oov() 44 | return node 45 | -------------------------------------------------------------------------------- /sudachipy/plugin/input_text/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List 16 | 17 | from sudachipy import config 18 | 19 | from . 
import DefaultInputTextPlugin, InputTextPlugin, ProlongedSoundMarkInputTextPlugin 20 | 21 | 22 | def get_input_text_plugin(json_obj) -> InputTextPlugin: 23 | # In the future, users can define plugin by themselves 24 | try: 25 | if json_obj['class'] == 'sudachipy.plugin.input_text.DefaultInputTextPlugin': 26 | return DefaultInputTextPlugin() 27 | if json_obj['class'] == 'sudachipy.plugin.input_text.ProlongedSoundMarkInputTextPlugin': 28 | return ProlongedSoundMarkInputTextPlugin(json_obj) 29 | raise ValueError('{} is invalid InputTextPlugin class'.format(json_obj['class'])) 30 | except KeyError: 31 | raise ValueError('config file is invalid format') 32 | 33 | 34 | def get_input_text_plugins() -> List[InputTextPlugin]: 35 | key_word = 'inputTextPlugin' 36 | if key_word not in config.settings: 37 | return [] 38 | ps = [] 39 | for obj in config.settings[key_word]: 40 | ps.append(get_input_text_plugin(obj)) 41 | return ps 42 | -------------------------------------------------------------------------------- /tests/test_dictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import unittest 17 | 18 | from sudachipy import tokenizer 19 | from sudachipy.dictionary import Dictionary 20 | 21 | 22 | class TestDictionary(unittest.TestCase): 23 | 24 | def setUp(self): 25 | resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources') 26 | self.dict_ = Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir=resource_dir) 27 | 28 | def tearDown(self) -> None: 29 | self.dict_.close() 30 | 31 | def test_create(self): 32 | self.assertEqual(tokenizer.Tokenizer, type(self.dict_.create())) 33 | 34 | def test_get_part_of_speech_size(self): 35 | self.assertEqual(9, self.dict_.grammar.get_part_of_speech_size()) 36 | 37 | def test_get_part_of_speech_string(self): 38 | pos = self.dict_.grammar.get_part_of_speech_string(0) 39 | self.assertIsNotNone(pos) 40 | self.assertEqual('助動詞', pos[0]) 41 | 42 | # def test_creat_with_merging_settings 43 | 44 | # def test_creat_with_merging_null_ settings 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # IDE, Editor 104 | .idea/ 105 | .vscode/ 106 | 107 | # Sudachi dictionary 108 | *.dic 109 | -------------------------------------------------------------------------------- /tests/resources/joinnumeric/char.def: -------------------------------------------------------------------------------- 1 | # 2 | # Japanese charcter category map 3 | # 4 | # $Id: char.def 9 2012-12-12 04:13:15Z togiso $; 5 | # 6 | 7 | ################################################################################### 8 | # 9 | # CHARACTER CATEGORY DEFINITION 10 | # 11 | # CATEGORY_NAME INVOKE GROUP LENGTH 12 | # 13 | # - CATEGORY_NAME: Name of category. you have to define DEFAULT class. 
14 | # - INVOKE: 1/0: always invoke unknown word processing, evan when the word can be found in the lexicon 15 | # - GROUP: 1/0: make a new word by grouping the same chracter category 16 | # - LENGTH: n: 1 to n length new words are added 17 | # 18 | DEFAULT 0 1 0 # DEFAULT is a mandatory category! 19 | SPACE 0 1 0 20 | KANJI 0 0 2 21 | SYMBOL 1 1 0 22 | NUMERIC 1 1 0 23 | ALPHA 1 1 0 24 | HIRAGANA 0 1 2 25 | KATAKANA 1 1 2 26 | KANJINUMERIC 0 1 0 #change INVOKE 1->0 27 | GREEK 1 1 0 28 | CYRILLIC 1 1 0 29 | 30 | ################################################################################### 31 | # 32 | # CODE(UCS2) TO CATEGORY MAPPING 33 | # 34 | 35 | # SPACE 36 | 0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE 37 | 38 | # ASCII 39 | 0x0030..0x0039 NUMERIC #0-9 40 | 41 | # KANJI-NUMERIC (〇 一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆) 42 | 0x3007 KANJINUMERIC KANJI 43 | 0x4E00 KANJINUMERIC KANJI 44 | 0x4E8C KANJINUMERIC KANJI 45 | 0x4E09 KANJINUMERIC KANJI 46 | 0x56DB KANJINUMERIC KANJI 47 | 0x4E94 KANJINUMERIC KANJI 48 | 0x516D KANJINUMERIC KANJI 49 | 0x4E03 KANJINUMERIC KANJI 50 | 0x516B KANJINUMERIC KANJI 51 | 0x4E5D KANJINUMERIC KANJI 52 | 0x5341 KANJINUMERIC KANJI 53 | 0x767E KANJINUMERIC KANJI 54 | 0x5343 KANJINUMERIC KANJI 55 | 0x4E07 KANJINUMERIC KANJI 56 | 0x5104 KANJINUMERIC KANJI 57 | 0x5146 KANJINUMERIC KANJI 58 | -------------------------------------------------------------------------------- /sudachipy/dictionarylib/lexicon.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from typing import Iterator, List 17 | 18 | from .wordinfo import WordInfo 19 | 20 | 21 | class Lexicon(ABC): 22 | 23 | Itr = Iterator[List[int]] 24 | 25 | @abstractmethod 26 | def lookup(self, text: str, offset: int) -> Itr: # noqa: F821 27 | raise NotImplementedError 28 | 29 | @abstractmethod 30 | def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int: 31 | raise NotImplementedError 32 | 33 | @abstractmethod 34 | def get_left_id(self, word_id: int) -> int: 35 | raise NotImplementedError 36 | 37 | @abstractmethod 38 | def get_right_id(self, word_id: int) -> int: 39 | raise NotImplementedError 40 | 41 | @abstractmethod 42 | def get_cost(self, word_id: int) -> int: 43 | raise NotImplementedError 44 | 45 | @abstractmethod 46 | def get_word_info(self, word_id: int) -> 'WordInfo': 47 | raise NotImplementedError 48 | 49 | @abstractmethod 50 | def get_dictionary_id(self, word_id: int) -> int: 51 | raise NotImplementedError 52 | 53 | @abstractmethod 54 | def size(self) -> int: 55 | raise NotImplementedError 56 | -------------------------------------------------------------------------------- /tests/test_large_userdict.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import unittest 17 | 18 | from string import ascii_lowercase 19 | from itertools import product 20 | 21 | from sudachipy import dictionary 22 | 23 | 24 | class TestLargeUserDict(unittest.TestCase): 25 | 26 | def setUp(self): 27 | resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources') 28 | self.dict_ = dictionary.Dictionary(os.path.join(resource_dir, 'sudachi_large_user.json'), resource_dir) 29 | self.tokenizer_obj = self.dict_.create() 30 | 31 | def test_part_of_speech(self): 32 | ms = self.tokenizer_obj.tokenize('やまもも') 33 | self.assertEqual(1, len(ms)) 34 | m = ms[0] 35 | pid = m.part_of_speech_id() 36 | self.assertTrue(self.dict_.grammar.get_part_of_speech_size() > pid) 37 | 38 | # Exploit the cache space 39 | num = 0 40 | for combo in product(ascii_lowercase, repeat=3): 41 | if num > 1024: 42 | break 43 | lex = ''.join(combo) 44 | self.tokenizer_obj.tokenize(lex) 45 | num += 1 46 | 47 | ms = self.tokenizer_obj.tokenize('やまもも') 48 | self.assertEqual(pid, ms[0].part_of_speech_id()) 49 | -------------------------------------------------------------------------------- /tests/mock_grammar.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from unittest import mock 17 | 18 | from sudachipy.dictionarylib.charactercategory import CharacterCategory 19 | from sudachipy.dictionarylib.grammar import Grammar 20 | 21 | mocked_grammar = mock.Mock(spec=Grammar) 22 | mocked_grammar.get_part_of_speech_size.return_value = 0 23 | mocked_grammar.get_part_of_speech_string.return_value = None 24 | mocked_grammar.get_part_of_speech_id.return_value = 0 25 | mocked_grammar.get_connect_cost.return_value = 0 26 | # mocked_grammar.set_connect_cost.return_value = None 27 | mocked_grammar.get_bos_parameter.return_value = None 28 | mocked_grammar.get_eos_parameter.return_value = None 29 | 30 | 31 | def mocked_get_character_category(): 32 | cat = CharacterCategory() 33 | test_resources_dir = os.path.join( 34 | os.path.dirname(os.path.abspath(__file__)), 35 | os.pardir, 36 | 'sudachipy', 37 | 'resources') 38 | try: 39 | cat.read_character_definition(os.path.join(test_resources_dir, 'char.def')) 40 | except IOError as e: 41 | print(e) 42 | return cat 43 | 44 | 45 | mocked_grammar.get_character_category.side_effect = mocked_get_character_category 46 | 47 | 48 | mocked_grammar.set_character_category.return_value = None 49 | -------------------------------------------------------------------------------- /tests/dictionarylib/test_dictionaryheader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 
class TestDictionaryHeader(unittest.TestCase):
    """Header checks against the prebuilt test system dictionary."""

    def setUp(self):
        # Mirrors sudachipy.dictionary.Dictionary.read_system_dictionary.
        resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            'resources')
        dict_path = os.path.join(resources_dir, 'system.dic')
        with open(dict_path, 'rb') as system_dic:
            mapped = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
            self.header = DictionaryHeader.from_bytes(mapped, 0)

    def test_version(self):
        self.assertEqual(SYSTEM_DICT_VERSION_2, self.header.version)

    def test_create_time(self):
        self.assertTrue(self.header.create_time > 0)

    def test_description(self):
        self.assertEqual("the system dictionary for the unit tests", self.header.description)


if __name__ == '__main__':
    unittest.main()
# the first version of system dictionaries
SYSTEM_DICT_VERSION_1 = 0x7366d3f18bd111e7

# the second version of system dictionaries
SYSTEM_DICT_VERSION_2 = 0xce9f011a92394434

# the first version of user dictionaries
USER_DICT_VERSION_1 = 0xa50f31188bd211e7

# the second version of user dictionaries
USER_DICT_VERSION_2 = 0x9fdeb5a90168d868

# the third version of user dictionaries
USER_DICT_VERSION_3 = 0xca9811756ff64fb0


def is_dictionary(version):
    """True if *version* is any known dictionary magic number."""
    return is_system_dictionary(version) or is_user_dictionary(version)


def is_system_dictionary(version):
    """True if *version* denotes a system dictionary."""
    return version in (SYSTEM_DICT_VERSION_1, SYSTEM_DICT_VERSION_2)


def is_user_dictionary(version):
    """True if *version* denotes a user dictionary."""
    return version in (USER_DICT_VERSION_1, USER_DICT_VERSION_2, USER_DICT_VERSION_3)


def has_grammar(version):
    """True if dictionaries of this version embed a grammar section."""
    return is_system_dictionary(version) or version in (USER_DICT_VERSION_2, USER_DICT_VERSION_3)


def has_synonym_group_ids(version):
    """True if word-info records of this version carry synonym group ids."""
    return version in (SYSTEM_DICT_VERSION_2, USER_DICT_VERSION_3)
from setuptools import setup, find_packages

from distutils.extension import Extension

# Cython sources compiled to C extension modules.
extensions = [
    Extension('sudachipy.latticenode', ['sudachipy/latticenode.pyx']),
    Extension('sudachipy.lattice', ['sudachipy/lattice.pyx']),
    Extension('sudachipy.tokenizer', ['sudachipy/tokenizer.pyx']),
]

# FIX: the README was read with a bare open() whose file object was never
# closed; use a context manager so the handle is released deterministically.
with open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

setup(name="SudachiPy",
      use_scm_version=True,
      setup_requires=['setuptools_scm', 'cython'],
      description="Python version of Sudachi, the Japanese Morphological Analyzer",
      long_description=long_description,
      long_description_content_type="text/markdown",
      url="https://github.com/WorksApplications/SudachiPy",
      license="Apache-2.0",
      author="Works Applications",
      author_email="sudachi@worksap.co.jp",
      packages=find_packages(include=["sudachipy", "sudachipy.*"]),
      package_data={"": ["resources/*.json", "resources/*.dic", "resources/*.def"]},
      entry_points={
          "console_scripts": ["sudachipy=sudachipy.command_line:main"],
      },
      install_requires=[
          "sortedcontainers~=2.1.0",
          'dartsclone~=0.9.0',
      ],
      ext_modules=extensions,
      )
class InhibitConnectionPlugin(EditConnectionCostPlugin):
    """Edit-connection-cost plugin that forbids specific connections.

    Example settings:

    ``
    {
        {
            "class" : "sudachipy.plugin.connect_cost.InhibitConnectionPlugin",
            "inhibitedPair" : [ [ 0, 233 ], [435, 332] ]
        }
    }
    ``

    Attributes:
        _inhibit_pairs: pairs of ints; in each pair, the first value is the
            right-ID of the left node and the second value is the left-ID of
            the right node of the connection to inhibit.
    """

    def __init__(self):
        self._inhibit_pairs = []

    def set_up(self, grammar: Grammar) -> None:
        # Pairs come straight from the global settings, when configured.
        if 'inhibitedPair' in config.settings:
            self._inhibit_pairs = config.settings['inhibitedPair']

    def edit(self, grammar: Grammar) -> None:
        # Pairs shorter than two elements are silently ignored.
        for left, right, *_ in (p for p in self._inhibit_pairs if len(p) >= 2):
            self.inhibit_connection(grammar, left, right)
class WordParameterList(object):
    """Fixed-width (left-id, right-id, cost) triples viewed over a shared buffer."""

    ELEMENT_SIZE = 2 * 3            # bytes per entry: three 16-bit values
    ELEMENT_SIZE_AS_SHORT = 3       # shorts per entry

    def __init__(self, bytes_, offset):
        # Read the entry count at *offset*, then map the triple array as a
        # memoryview of signed shorts; the buffer position is restored after.
        saved_pos = bytes_.tell()
        bytes_.seek(offset)
        self.size = int.from_bytes(bytes_.read(4), 'little')
        start = bytes_.tell()
        end = start + self.size * self.ELEMENT_SIZE
        self._array_view = memoryview(bytes_)[start:end].cast('h')
        bytes_.seek(saved_pos)

    def __del__(self):
        # Release the view so the underlying mmap can be closed.
        self._array_view.release()

    def storage_size(self):
        """Bytes occupied on disk: 4-byte count plus the entries."""
        return 4 + self.ELEMENT_SIZE * self.size

    def get_size(self):
        return self.size

    def get_left_id(self, word_id):
        return self._array_view[word_id * self.ELEMENT_SIZE_AS_SHORT]

    def get_right_id(self, word_id):
        return self._array_view[word_id * self.ELEMENT_SIZE_AS_SHORT + 1]

    def get_cost(self, word_id):
        return self._array_view[word_id * self.ELEMENT_SIZE_AS_SHORT + 2]

    def set_cost(self, word_id, cost):
        # The backing buffer must be writable (e.g. mmap.ACCESS_COPY).
        self._array_view[word_id * self.ELEMENT_SIZE_AS_SHORT + 2] = cost
class SimpleOovPlugin(OovProviderPlugin):
    """OOV provider that emits one fixed-parameter node when nothing else matched."""

    def __init__(self, json_obj):
        self.left_id = json_obj['leftId']
        self.right_id = json_obj['rightId']
        self.cost = json_obj['cost']
        self.__oov_pos_strings = json_obj['oovPOS']
        self.oov_pos_id = -1  # resolved against the grammar in set_up()

    def set_up(self, grammar):
        self.oov_pos_id = grammar.get_part_of_speech_id(self.__oov_pos_strings)

    def provide_oov(self, input_text, offset, has_other_words):
        # Only kicks in when no dictionary word covers this position.
        if has_other_words:
            return []
        node = self.create_node()
        node.set_parameter(self.left_id, self.right_id, self.cost)
        length = input_text.get_word_candidate_length(offset)
        surface = input_text.get_substring(offset, offset + length)
        node.set_word_info(wordinfo.WordInfo(
            surface=surface, head_word_length=length, pos_id=self.oov_pos_id,
            normalized_form=surface, dictionary_form_word_id=-1,
            dictionary_form=surface, reading_form="",
            a_unit_split=[], b_unit_split=[], word_structure=[],
            synonym_group_ids=[]))
        return [node]
class ProlongedSoundMarkInputTextPlugin(InputTextPlugin):
    """Collapses runs of prolonged sound marks into a single symbol.

    A run of two or more characters listed under ``prolongedSoundMarks`` in
    the settings is replaced by one ``replacementSymbol`` (default 'ー')
    before tokenization.
    """

    def __init__(self, json_obj):
        # Code points treated as prolonged sound marks.
        self._psm_set = set()
        # The single character that replaces a run of marks.
        self._replace_symbol = 'ー'
        if not json_obj:
            return
        if 'prolongedSoundMarks' in json_obj:
            self._psm_set = set([ord(psm) for psm in json_obj['prolongedSoundMarks']])
        if 'replacementSymbol' in json_obj:
            self._replace_symbol = json_obj['replacementSymbol']

    def set_up(self) -> None:
        # No grammar-dependent preparation needed.
        pass

    def rewrite(self, builder: InputTextPlugin.Builder) -> None:
        """Replace each run of two or more marks with the replacement symbol.

        ``offset`` accumulates how many characters earlier replacements
        removed, so indices into the already-edited text are shifted before
        each ``builder.replace`` call.
        NOTE(review): the offset arithmetic assumes _replace_symbol is a
        single character — confirm against the settings schema.
        """
        text = builder.get_text()
        n = len(text)
        offset = 0
        is_psm = False       # currently inside a run of marks
        m_start_idx = n      # index where the current run started
        for i in range(n):
            cp = ord(text[i])
            if not is_psm and cp in self._psm_set:
                # A run begins here.
                is_psm = True
                m_start_idx = i
            elif is_psm and cp not in self._psm_set:
                # The run ended at i; collapse it only if it spanned >= 2 chars.
                if i - m_start_idx > 1:
                    builder.replace(m_start_idx - offset, i - offset, self._replace_symbol)
                    offset += i - m_start_idx - 1
                is_psm = False
        # A run may extend to the very end of the text.
        if is_psm and n - m_start_idx > 1:
            builder.replace(m_start_idx - offset, n - offset, self._replace_symbol)
class EditConnectionCostPlugin(ABC):
    """Base class for plugins that tweak the connection-cost matrix.

    Example settings:

    ``
    {
        {
            "class" : "sudachipy.plugin.connect_cost.SampleEditConnectionPlugin",
            "example" : "example setting"
        }
    }
    ``
    """

    @abstractmethod
    def set_up(self, grammar: Grammar) -> None:
        """Prepare the plugin.

        Args:
            grammar: grammar of the system dictionary
        """
        raise NotImplementedError

    @abstractmethod
    def edit(self, grammar: Grammar) -> None:
        """Apply this plugin's connection-cost edits.

        Args:
            grammar: grammar of the system dictionary
        """
        raise NotImplementedError

    @staticmethod
    def inhibit_connection(grammar: Grammar, left: int, right: int) -> None:
        """Mark one connection as forbidden in the grammar.

        Args:
            grammar: grammar of the system dictionary
            left: right-ID of the left node
            right: left-ID of the right node
        """
        grammar.set_connect_cost(left, right, Grammar.INHIBITED_CONNECTION)
class DictionaryHeader:
    """Fixed-size header at the top of a binary dictionary file.

    Layout: 8-byte version, 8-byte creation time, then a 256-byte
    NUL-padded UTF-8 description (272 bytes total).
    """

    __DESCRIPTION_SIZE = 256
    __STORAGE_SIZE = 8 + 8 + __DESCRIPTION_SIZE

    def __init__(self, version, create_time, description):
        self.version = version
        self.create_time = create_time
        self.description = description

    @classmethod
    def from_bytes(cls, bytes_, offset):
        """Parse a header from *bytes_* starting at *offset*."""
        version, create_time = struct.unpack_from("<2Q", bytes_, offset)
        offset += 16
        # The description is NUL-terminated within its fixed-size field.
        field = bytes_[offset:offset + cls.__DESCRIPTION_SIZE]
        terminator = field.find(0)
        if terminator < 0:
            terminator = len(field)
        description = field[:terminator].decode("utf-8")
        return cls(version, create_time, description)

    def to_bytes(self):
        """Serialize to the fixed 272-byte on-disk layout.

        Raises:
            ValueError: if the UTF-8 description exceeds 256 bytes.
        """
        out = JTypedByteBuffer(b'\x00' * (16 + self.__DESCRIPTION_SIZE))
        out.seek(0)
        out.write_int(self.version, 'long', signed=False)
        out.write_int(self.create_time, 'long')
        encoded = self.description.encode('utf-8')
        if len(encoded) > self.__DESCRIPTION_SIZE:
            raise ValueError('description is too long')
        out.write(encoded)
        return out.getvalue()

    def storage_size(self):
        return self.__STORAGE_SIZE

    def is_system_dictionary(self):
        return dictionaryversion.is_system_dictionary(self.version)

    def is_user_dictionary(self):
        return dictionaryversion.is_user_dictionary(self.version)
class UserDictionaryBuilder(DictionaryBuilder):
    """Dictionary builder variant for user dictionaries.

    POS ids and word ids are resolved against the system dictionary first;
    words defined in the user dictionary itself are flagged with bit 28.
    """

    def __init__(self, grammar, system_lexicon, *, logger=None):
        super().__init__(logger=logger)
        self.is_user_dictionary = True
        self.grammar = grammar
        self.system_lexicon = system_lexicon

    def build(self, lexicon_paths, matrix_input_stream, out_stream):
        """Build the user dictionary.

        NOTE: the signature differs from DictionaryBuilder.build
        (matrix_input_stream is unused) — a known LSP violation kept for
        compatibility.
        """
        self.logger.info('reading the source file...')
        for lexicon_path in lexicon_paths:
            with open(lexicon_path, 'r', encoding='utf-8') as source:
                self.build_lexicon(source)
        self.logger.info('{} words\n'.format(len(self.entries)))

        self.write_grammar(None, out_stream)
        self.write_lexicon(out_stream)

    def get_posid(self, strs):
        # Prefer the system grammar's POS id; otherwise allocate a new id
        # offset past the end of the system POS table.
        pos_id = self.grammar.get_part_of_speech_id(strs)
        if pos_id >= 0:
            return pos_id
        return super().get_posid(strs) + self.grammar.get_part_of_speech_size()

    def get_wordid(self, headword, pos_id, reading_form):
        wid = super().get_wordid(headword, pos_id, reading_form)
        if wid >= 0:
            # Defined in this user dictionary: tag with the user-dict flag.
            return wid | (1 << 28)
        return self.system_lexicon.get_word_id(headword, pos_id, reading_form)

    def check_wordid(self, wid):
        if wid >= (1 << 28):
            # User-dictionary id: validate the unflagged value.
            super().check_wordid(wid & ((1 << 28) - 1))
        elif wid < 0 or wid >= self.system_lexicon.size():
            raise ValueError('invalid word id')
class Morpheme:
    """A single morpheme: a thin, lazy view over one node of a MorphemeList."""

    def __init__(self, list_, index):
        self.word_info = None  # fetched on demand by get_word_info()
        self.list = list_
        self.index = index

    def __str__(self):
        return self.surface()

    def begin(self):
        return self.list.get_begin(self.index)

    def end(self):
        return self.list.get_end(self.index)

    def surface(self):
        return self.list.get_surface(self.index)

    def part_of_speech(self):
        return self.list.grammar.get_part_of_speech_string(self.get_word_info().pos_id)

    def part_of_speech_id(self):
        return self.get_word_info().pos_id

    def dictionary_form(self):
        return self.get_word_info().dictionary_form

    def normalized_form(self):
        return self.get_word_info().normalized_form

    def reading_form(self):
        return self.get_word_info().reading_form

    def split(self, mode):
        return self.list.split(mode, self.index, self.get_word_info())

    def is_oov(self):
        return self.list.is_oov(self.index)

    def word_id(self):
        return self.list.path[self.index].get_word_id()

    def dictionary_id(self):
        return self.list.path[self.index].get_dictionary_id()

    def synonym_group_ids(self):
        return self.get_word_info().synonym_group_ids

    def get_word_info(self):
        # Fetch once and cache on the instance.
        if not self.word_info:
            self.word_info = self.list.get_word_info(self.index)
        return self.word_info
# A UTF8InputText stand-in whose behavior is driven by the module globals
# `text` and `types` (per-character category sets), set via set_text() /
# set_category_type().
mocked_input_text = mock.Mock(spec=UTF8InputText)
text = ''
types = []


def set_text(text_: str) -> None:
    """Set the mocked text and reset the per-character category sets."""
    global text, types
    text = text_
    types = [set() for _ in text]


def set_category_type(begin: int, end: int, type_) -> None:
    """Attach *type_* to every character in [begin, end)."""
    for i in range(begin, end):
        types[i].add(type_)


# BUG FIX: get_text/get_original_text were bound with `.return_value = text`
# at import time, so they kept returning the initial empty string even after
# set_text() rebound the global. side_effect reads the current value instead.
mocked_input_text.get_text.side_effect = lambda: text
mocked_input_text.get_original_text.side_effect = lambda: text


def _mocked_get_substring(begin: int, end: int) -> str:
    return text[begin:end]


mocked_input_text.get_substring.side_effect = _mocked_get_substring


def _mocked_get_char_category_types(begin: int, end: int = None) -> set:
    # One index: that character's categories. A range: the intersection of
    # categories shared by every character in [begin, end).
    if end is None:
        return types[begin]
    continuous_category = deepcopy(types[begin])
    for i in range(begin + 1, end):
        continuous_category = continuous_category.intersection(types[i])
    return continuous_category


mocked_input_text.get_char_category_types.side_effect = _mocked_get_char_category_types


def _mocked_get_char_category_continuous_length(idx: int) -> int:
    # Length of the run starting at idx over which at least one category
    # is shared by all characters.
    continuous_category = deepcopy(types[idx])
    for i in range(idx + 1, len(text)):
        continuous_category = continuous_category.intersection(types[i])
        if not continuous_category:
            return i - idx
    return len(text) - idx


mocked_input_text.get_char_category_continuous_length.side_effect = _mocked_get_char_category_continuous_length


def _mocked_get_code_points_offset_length(idx: int, offset: int) -> int:
    # Code points map 1:1 to offsets in this mock.
    return offset


mocked_input_text.get_code_points_offset_length.side_effect = _mocked_get_code_points_offset_length
class BinaryDictionary(object):
    """A memory-mapped dictionary file: header, optional grammar, lexicon."""

    def __init__(self, bytes_: mmap.mmap, grammar: Grammar, header: DictionaryHeader, lexicon: DoubleArrayLexicon):
        self._bytes = bytes_
        self._grammar = grammar
        self._header = header
        self._lexicon = lexicon

    @staticmethod
    def _read_dictionary(filename, access=mmap.ACCESS_READ):
        """mmap *filename* and parse its sections.

        Returns:
            (mmap, grammar-or-None, header, lexicon)
        Raises:
            Exception: if the header carries an unknown version number.
        """
        with open(filename, 'rb') as dict_file:
            mapped = mmap.mmap(dict_file.fileno(), 0, access=access)
            position = 0
            header = DictionaryHeader.from_bytes(mapped, position)
            position += header.storage_size()
            if not is_dictionary(header.version):
                raise Exception('invalid dictionary version')
            grammar = None
            if has_grammar(header.version):
                grammar = Grammar(mapped, position)
                position += grammar.get_storage_size()
            lexicon = DoubleArrayLexicon(mapped, position, has_synonym_group_ids(header.version))
            return mapped, grammar, header, lexicon

    @classmethod
    def from_system_dictionary(cls, filename):
        parts = cls._read_dictionary(filename)
        if not parts[2].is_system_dictionary():
            raise IOError('invalid system dictionary')
        return cls(*parts)

    @classmethod
    def from_user_dictionary(cls, filename):
        # ACCESS_COPY so user-dictionary costs can be patched in memory.
        parts = cls._read_dictionary(filename, mmap.ACCESS_COPY)
        if not parts[2].is_user_dictionary():
            raise IOError('invalid user dictionary')
        return cls(*parts)

    def close(self):
        # Drop section objects (and their buffer views) before closing the map.
        del self._grammar
        del self._lexicon
        self._bytes.close()

    @property
    def grammar(self) -> Grammar:
        return self._grammar

    @property
    def header(self) -> DictionaryHeader:
        return self._header

    @property
    def lexicon(self) -> DoubleArrayLexicon:
        return self._lexicon
class WordInfoList(object):
    """Reads WordInfo records from the word-info section of a binary dictionary."""

    def __init__(self, bytes_, offset, word_size, has_synonym_gid):
        # bytes_: seekable mmap-like buffer holding the whole dictionary
        # offset: start of the per-word offset table
        # word_size: number of words in this lexicon
        # has_synonym_gid: whether records include a synonym-group-id array
        self.bytes = bytes_
        self.offset = offset
        self._word_size = word_size
        self.has_synonym_gid = has_synonym_gid

    def get_word_info(self, word_id):
        """Deserialize the WordInfo record for *word_id*.

        The buffer position is saved and restored so callers sharing the
        buffer are not disturbed.  Fields must be read in exactly this
        order; the record layout is sequential.
        """
        orig_pos = self.bytes.tell()
        index = self.word_id_to_offset(word_id)
        self.bytes.seek(index)
        surface = self.buffer_to_string()
        head_word_length = self.buffer_to_string_length()
        pos_id = int.from_bytes(self.bytes.read(2), 'little')
        normalized_form = self.buffer_to_string()
        if not normalized_form:
            # Empty on disk means "same as surface".
            normalized_form = surface
        dictionary_form_word_id = int.from_bytes(self.bytes.read(4), 'little', signed=True)
        reading_form = self.buffer_to_string()
        if not reading_form:
            reading_form = surface
        a_unit_split = self.buffer_to_int_array()
        b_unit_split = self.buffer_to_int_array()
        word_structure = self.buffer_to_int_array()

        synonym_gids = []
        if self.has_synonym_gid:
            synonym_gids = self.buffer_to_int_array()

        dictionary_form = surface
        if dictionary_form_word_id >= 0 and dictionary_form_word_id != word_id:
            # Recursive lookup to resolve the dictionary form's surface.
            wi = self.get_word_info(dictionary_form_word_id)
            dictionary_form = wi.surface

        self.bytes.seek(orig_pos)

        return WordInfo(surface, head_word_length, pos_id, normalized_form, dictionary_form_word_id,
                        dictionary_form, reading_form, a_unit_split, b_unit_split, word_structure, synonym_gids)

    def word_id_to_offset(self, word_id):
        # Offset table: one little-endian uint32 per word.
        i = self.offset + 4 * word_id
        return int.from_bytes(self.bytes[i:i + 4], 'little', signed=False)

    def buffer_to_string_length(self):
        # Variable-length encoding: one byte if < 128, otherwise two bytes
        # with the high bit of the first byte set.
        length = self.bytes.read_byte()
        if length < 128:
            return length
        low = self.bytes.read_byte()
        return ((length & 0x7F) << 8) | low

    def buffer_to_string(self):
        # Strings are stored as UTF-16-LE code units (2 bytes per unit).
        length = self.buffer_to_string_length()
        return self.bytes.read(2 * length).decode('utf-16-le')

    def buffer_to_int_array(self):
        # A 1-byte count followed by that many int32s
        # (native byte order per struct's default — presumably little-endian
        # on supported platforms; confirm against the dictionary writer).
        length = self.bytes.read_byte()
        _bytes = self.bytes.read(4 * length)
        return list(struct.unpack('{}i'.format(length), _bytes))

    def size(self):
        # Number of words, as recorded by the lexicon, not derived from the buffer.
        return self._word_size
import tokenizer 18 | 19 | 20 | class MorphemeList: 21 | 22 | @classmethod 23 | def empty(cls): 24 | return MorphemeList(None, None, None, []) 25 | 26 | def __init__(self, input_, grammar, lexicon, path): 27 | self.input_text = input_ 28 | self.grammar = grammar 29 | self.lexicon = lexicon 30 | self.path = path 31 | 32 | def __getitem__(self, index): 33 | n_morphs = len(self.path) 34 | if index >= n_morphs or index < -n_morphs: 35 | raise IndexError("Morpheme list index out of range") 36 | if index < 0: 37 | return morpheme.Morpheme(self, n_morphs + index) 38 | return morpheme.Morpheme(self, index) 39 | 40 | def __len__(self): 41 | return len(self.path) 42 | 43 | def __iter__(self): 44 | for index in range(len(self.path)): 45 | yield morpheme.Morpheme(self, index) 46 | return 47 | 48 | def __str__(self): 49 | return ''.join([mm.surface() for mm in self]) 50 | 51 | def get_begin(self, index): 52 | return self.input_text.get_original_index(self.path[index].get_begin()) 53 | 54 | def get_end(self, index): 55 | return self.input_text.get_original_index(self.path[index].get_end()) 56 | 57 | def get_surface(self, index): 58 | begin = self.get_begin(index) 59 | end = self.get_end(index) 60 | return self.input_text.get_original_text()[begin:end] 61 | 62 | def get_word_info(self, index): 63 | return self.path[index].get_word_info() 64 | 65 | def split(self, mode, index, wi): 66 | if mode is tokenizer.Tokenizer.SplitMode.A: 67 | word_ids = wi.a_unit_split 68 | elif mode is tokenizer.Tokenizer.SplitMode.B: 69 | word_ids = wi.b_unit_split 70 | else: 71 | return [self.__getitem__(index)] 72 | 73 | if len(word_ids) == 0 or len(word_ids) == 1: 74 | return [self.__getitem__(index)] 75 | 76 | offset = self.path[index].get_begin() 77 | nodes = [] 78 | for wid in word_ids: 79 | n = latticenode.LatticeNode(self.lexicon, 0, 0, 0, wid) 80 | n.set_begin(offset) 81 | offset += n.get_word_info().head_word_length 82 | n.set_end(offset) 83 | nodes.append(n) 84 | 85 | return 
MorphemeList(self.input_text, self.grammar, self.lexicon, nodes) 86 | 87 | def is_oov(self, index): 88 | return self.path[index].is_oov() 89 | 90 | def get_internal_cost(self): 91 | return self.path[-1].get_path_cost() - self.path[0].get_path_cost() 92 | 93 | def size(self): 94 | return len(self.path) 95 | -------------------------------------------------------------------------------- /tests/resources/dict/lex.csv: -------------------------------------------------------------------------------- 1 | た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,* 2 | に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 3 | に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 4 | 京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5 5 | 東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* 6 | 東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 7 | 東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* 8 | 行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 9 | 行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,* 10 | 都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,* 11 | アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,* 12 | アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,* 13 | アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* 14 | 0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,* 15 | 1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,* 16 | 2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,* 17 | 3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,* 18 | 4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,* 19 | 5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,* 20 | 6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,* 21 | 7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,* 22 | 8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,* 23 | 9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,* 24 | 〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,* 25 | 一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,* 26 | 二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,* 27 | 三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,* 28 | 四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,* 29 | 
五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,* 30 | 六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,* 31 | 七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,* 32 | 八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,* 33 | 九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,* 34 | 六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,* 35 | いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 36 | いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 37 | 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,2478,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* 38 | 特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,* 
class TestJoinKatakanaOOVPlugin(unittest.TestCase):
    """Tests for JoinKatakanaOovPlugin against the bundled test dictionary.

    The test lexicon (tests/resources/dict/lex.csv) contains アイ and アイウ
    as ordinary nouns and アイアイウ as a 名詞-固有名詞-地名-一般 entry, which
    the individual tests rely on.
    """

    def setUp(self):
        # build a dictionary and tokenizer from the bundled test resources
        resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
        self.dict_ = Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir)
        self.tokenizer = self.dict_.create()
        # construct the plugin from its entry in the pathRewritePlugin settings
        self.plugin = JoinKatakanaOovPlugin(settings['pathRewritePlugin'][1])

    def test_katakana_length(self):
        # アイ, アイウ in the dictionary
        # below the threshold the two known words are kept as two nodes;
        # at min_length 3 the whole run is joined into a single OOV node
        self.plugin._min_length = 0
        path = self.get_path('アイアイウ')
        self.assertEqual(2, len(path))

        self.plugin._min_length = 1
        path = self.get_path('アイアイウ')
        self.assertEqual(2, len(path))

        self.plugin._min_length = 2
        path = self.get_path('アイアイウ')
        self.assertEqual(2, len(path))

        self.plugin._min_length = 3
        path = self.get_path('アイアイウ')
        self.assertEqual(1, len(path))

    def test_pos(self):
        # アイアイウ is 名詞-固有名詞-地名-一般 in the dictionary
        # a known word covering the whole run must win over an OOV join
        self.plugin._min_length = 3
        path = self.get_path('アイアイウ')
        self.assertEqual(1, len(path))
        self.assertFalse(path[0].is_oov())
        self.assertEqual(['名詞', '固有名詞', '地名', '一般', '*', '*'],
                         self.dict_.grammar.get_part_of_speech_string(path[0].get_word_info().pos_id))

    def test_starts_with_middle(self):
        # the joinable run starts after a leading known word
        self.plugin._min_length = 3
        path = self.get_path('アイウアイアイウ')
        self.assertEqual(1, len(path))

    def test_starts_with_tail(self):
        # the joinable run ends at the tail of the input
        self.plugin._min_length = 3
        path = self.get_path('アイウアイウアイ')
        self.assertEqual(1, len(path))

    def test_with_nooovbow(self):
        # ァ is NOOOVBOW, so a join must not begin on it at the head of input
        self.plugin._min_length = 3
        path = self.get_path('ァアイアイウ')
        self.assertEqual(2, len(path))
        self.assertEqual('ァ', path[0].get_word_info().surface)

        path = self.get_path('アイウァアイウ')
        self.assertEqual(1, len(path))

    def get_path(self, text: str):
        """Tokenize ``text``, run the plugin on the best path, and return it."""
        input_ = UTF8InputTextBuilder(text, self.tokenizer._grammar).build()
        self.tokenizer._build_lattice(input_)
        path = self.tokenizer._lattice.get_best_path()
        self.plugin.rewrite(input_, path, self.tokenizer._lattice)
        self.tokenizer._lattice.clear()
        return path
class JoinKatakanaOovPlugin(PathRewritePlugin):
    """Path-rewrite plugin that joins adjacent katakana nodes into one OOV node.

    A join is triggered by any node that is OOV or shorter than
    ``_min_length`` code points and consists of katakana; the run of
    katakana nodes around it is then concatenated into a single node with
    the configured OOV part of speech.
    """

    def __init__(self, json_obj):
        # settings keys: 'oovPOS' (required, validated in set_up) and
        # optional 'minLength' (code points; default 1)
        self.__pos = json_obj['oovPOS']
        self._min_length = 1
        if 'minLength' in json_obj:
            self._min_length = json_obj['minLength']
        self.oov_pos_id = None

    def set_up(self, grammar):
        """Resolve the configured OOV POS to a grammar POS id.

        Raises ValueError when 'oovPOS' is missing/empty or not present in
        the grammar's POS table.
        """
        if not self.__pos:
            raise ValueError("oovPOS is undefined")
        self.oov_pos_id = grammar.get_part_of_speech_id(self.__pos)
        if self.oov_pos_id < 0:
            raise ValueError("oovPOS is invalid")

    def rewrite(self, text, path, lattice):
        """Scan ``path`` and splice each joinable katakana run into one node."""
        i = 0
        while True:
            if i >= len(path):
                break
            node = path[i]
            # only a node that is (OOV or shorter than min_length) AND
            # katakana can trigger a join
            if not (node.is_oov() or self.is_shorter(self._min_length, text, node)) or \
                    not self.is_katakana_node(text, node):
                i += 1
                continue
            # widen left: step back over katakana nodes, then step forward
            # once past the first non-katakana node (or clamp at 0)
            begin = i - 1
            while True:
                if begin < 0:
                    break
                if not self.is_katakana_node(text, path[begin]):
                    begin += 1
                    break
                begin -= 1
            begin = max(0, begin)
            # widen right: extend over consecutive katakana nodes
            end = i + 1
            while True:
                if end >= len(path):
                    break
                if not self.is_katakana_node(text, path[end]):
                    break
                end += 1
            pass
            # advance begin past nodes that may not start an OOV word
            # (NOOOVBOW characters)
            while begin != end and not self.can_oov_bow_node(text, path[begin]):
                begin += 1
            if (end - begin) > 1:
                # splice the run [begin, end) into a single OOV node
                self.concatenate_oov(path, begin, end, self.oov_pos_id, lattice)
                i = begin + 1  # skip next node, as we already know it is not a joinable katakana
            i += 1

    def is_katakana_node(self, text, node):
        """Whether every character spanned by ``node`` is in the KATAKANA category."""
        return CategoryType.KATAKANA in self.get_char_category_types(text, node)

    def is_one_char(self, text, node):
        """Whether ``node`` spans exactly one code point."""
        b = node.get_begin()
        return b + text.get_code_points_offset_length(b, 1) == node.get_end()

    def can_oov_bow_node(self, text, node):
        """Whether an OOV word may begin at ``node`` (not a NOOOVBOW character)."""
        return CategoryType.NOOOVBOW not in text.get_char_category_types(node.get_begin())

    @staticmethod
    def is_shorter(length: int, text: UTF8InputText, node: LatticeNode):
        """Whether ``node`` spans fewer than ``length`` code points."""
        return text.code_point_count(node.get_begin(), node.get_end()) < length
class UTF8InputText:
    """Tokenizer input held both as a (possibly normalized) string and as its
    UTF-8 byte sequence.

    ``offsets`` maps byte positions back to original-text offsets and
    ``byte_indexes`` maps byte positions to character indices of the
    modified text; the category lists are indexed per character/byte as
    built by the input-text builder.
    """

    def __init__(self, grammar, original_text, modified_text, bytes_, offsets, byte_indexes, char_categories, char_category_continuities, can_bow_list=None):
        # note: ``grammar`` is accepted for interface compatibility but not stored
        self.original_text = original_text
        self.modified_text = modified_text
        self.bytes = bytes_
        self.offsets = offsets
        self.byte_indexes = byte_indexes
        self.char_categories = char_categories
        self.char_category_continuities = char_category_continuities
        self.can_bow_list = can_bow_list

    def get_original_text(self):
        """The text as supplied by the caller, before normalization."""
        return self.original_text

    def get_text(self):
        """The modified (normalized) text."""
        return self.modified_text

    def get_byte_text(self):
        """The UTF-8 encoding of the modified text."""
        return self.bytes

    def get_substring(self, begin, end):
        """Substring of the modified text between byte offsets begin and end.

        Raises IndexError for a negative begin, an end past the byte length,
        or an inverted range.
        """
        if begin < 0:
            raise IndexError(begin)
        if end > len(self.bytes):
            raise IndexError(end)
        if begin > end:
            raise IndexError(end - begin)

        start = self.byte_indexes[begin]
        stop = self.byte_indexes[end]
        return self.modified_text[start:stop]

    def get_offset_text_length(self, index):
        """Character index of the modified text at byte offset ``index``."""
        return self.byte_indexes[index]

    def get_original_index(self, index):
        """Original-text offset corresponding to byte offset ``index``."""
        return self.offsets[index]

    def get_char_category_types(self, begin, end=None):
        """Category types at byte offset ``begin``, or the categories common
        to every character in [begin, end).

        Returns [] when the continuous category run starting at ``begin``
        does not reach ``end``.
        """
        if end is None:
            return self.char_categories[self.byte_indexes[begin]]
        if begin + self.get_char_category_continuous_length(begin) < end:
            return []
        lo = self.byte_indexes[begin]
        hi = self.byte_indexes[end]
        common = set(self.char_categories[lo])
        for pos in range(lo + 1, hi):
            common &= self.char_categories[pos]
        return common

    def get_char_category_continuous_length(self, index):
        """Remaining length (in bytes) of the category run starting at ``index``."""
        return self.char_category_continuities[index]

    def get_code_points_offset_length(self, index, code_point_offset):
        """Byte length covering ``code_point_offset`` code points from ``index``."""
        target = self.byte_indexes[index] + code_point_offset
        byte_length = 0
        for pos in range(index, len(self.bytes)):
            if self.byte_indexes[pos] >= target:
                break
            byte_length += 1
        return byte_length

    def can_bow(self, idx: int) -> bool:
        """Whether a word may begin at byte offset ``idx``.

        True only on a UTF-8 lead byte whose character is flagged as a
        possible beginning-of-word.
        """
        is_lead_byte = (self.bytes[idx] & 0xC0) != 0x80
        return is_lead_byte and self.can_bow_list[self.byte_indexes[idx]]

    def code_point_count(self, begin: int, end: int):
        """Number of code points between byte offsets begin and end."""
        return self.byte_indexes[end] - self.byte_indexes[begin]

    def get_word_candidate_length(self, index):
        """Byte length from ``index`` to the next possible word boundary."""
        total = len(self.bytes)
        for pos in range(index + 1, total):
            if self.can_bow(pos):
                return pos - index
        return total - index
class PathRewritePlugin(ABC):
    """Base class for plugins that rewrite the best lattice path after search.

    Subclasses implement ``set_up`` and ``rewrite``; the ``concatenate*``
    helpers splice a range of path nodes into a single replacement node.
    """

    @abstractmethod
    def set_up(self, grammar):
        """Prepare the plugin with the dictionary grammar."""
        raise NotImplementedError

    @abstractmethod
    def rewrite(self, text, path, lattice):
        """Rewrite ``path`` (a list of lattice nodes) in place."""
        raise NotImplementedError

    def concatenate(self, path, begin, end, lattice, normalized_form):
        """Replace path[begin:end] with one node merging the covered words.

        The merged node takes the POS of the first node; surface, dictionary
        form and reading are concatenations of the members.  When
        ``normalized_form`` is falsy it is built by concatenation as well.
        Raises IndexError when the range is empty or inverted.
        Returns the new node.
        """
        if begin >= end:
            raise IndexError("begin >= end")
        b = path[begin].get_begin()
        e = path[end - 1].get_end()
        pos_id = path[begin].get_word_info().pos_id
        surface = ""
        length = 0
        normalized_builder, dictionary_builder, reading_builder = "", "", ""
        for i in range(begin, end):
            info = path[i].get_word_info()
            surface += info.surface
            length += info.head_word_length
            if not normalized_form:
                normalized_builder += info.normalized_form
            dictionary_builder += info.dictionary_form
            reading_builder += info.reading_form

        # prefer the caller-supplied normalized form over the concatenation
        normalized_form = normalized_form if normalized_form else normalized_builder
        wi = WordInfo(surface=surface, head_word_length=length, pos_id=pos_id,
                      normalized_form=normalized_form, dictionary_form=dictionary_builder, dictionary_form_word_id=-1,
                      reading_form=reading_builder, a_unit_split=[], b_unit_split=[], word_structure=[], synonym_group_ids=[])

        node = lattice.create_node()
        node.set_range(b, e)
        node.set_word_info(wi)

        # splice the new node into the path in place of the merged range
        path[begin:end] = [node]
        return node

    def concatenate_oov(self, path, begin, end, pos_id, lattice):
        """Replace path[begin:end] with one OOV node of the given POS id.

        If the lattice already has a minimum-cost node spanning the same
        byte range, that node is reused instead of building a new one.
        Raises IndexError when the range is empty or inverted.
        Returns the spliced-in node.
        """
        if begin >= end:
            raise IndexError("begin >= end")
        b = path[begin].get_begin()
        e = path[end - 1].get_end()

        n = lattice.get_minimum_node(b, e)
        if n is not None:
            path[begin:end] = [n]
            return n

        surface = ""
        length = 0
        for i in range(begin, end):
            info = path[i].get_word_info()
            surface += info.surface
            length += info.head_word_length

        # OOV words use their surface as normalized and dictionary form
        wi = WordInfo(surface=surface, head_word_length=length, pos_id=pos_id,
                      normalized_form=surface, dictionary_form=surface, dictionary_form_word_id=-1,
                      reading_form="", a_unit_split=[], b_unit_split=[], word_structure=[], synonym_group_ids=[])

        node = lattice.create_node()
        node.set_range(b, e)
        node.set_word_info(wi)
        node.set_oov()

        path[begin:end] = [node]
        return node

    def get_char_category_types(self, text, node):
        """Category types common to all characters spanned by ``node``."""
        return text.get_char_category_types(node.get_begin(), node.get_end())
from .dictionarylib.wordinfo import WordInfo

# placeholder WordInfo returned for nodes that have no definition yet
__NULL_SURFACE = '(null)'
UNK =\
    WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
             __NULL_SURFACE, __NULL_SURFACE, [], [], [], [])

cdef class LatticeNode:
    """A node of the tokenization lattice.

    A node covers a byte range [begin, end) of the input and carries
    connection ids, a cost and a word id resolved through ``lexicon``.
    Constructed with no arguments it is an "undefined" node whose word
    info is the UNK placeholder until ``set_word_info`` is called.
    """

    def __init__(self, lexicon=None, left_id=None, right_id=None, cost=None, word_id=None):

        self.begin = 0
        self.end = 0
        self.word_id = 0
        self._is_oov = False
        self.best_previous_node = None
        self.is_connected_to_bos = False
        # overrides the lexicon lookup when set (see set_word_info)
        self.extra_word_info = None

        self._is_defined = True
        # all-None arguments mean an empty, undefined node
        if lexicon is left_id is right_id is cost is word_id is None:
            self._is_defined = False
            return
        self.lexicon = lexicon
        self.left_id = left_id
        self.right_id = right_id
        self.cost = cost
        self.word_id = word_id

    def set_parameter(self, left_id: int, right_id: int, cost: int) -> None:
        """Set the connection ids and word cost."""
        self.left_id = left_id
        self.right_id = right_id
        self.cost = cost

    def get_begin(self) -> int:
        return self.begin

    def set_begin(self, begin) -> None:
        self.begin = begin

    def get_end(self) -> int:
        return self.end

    def set_end(self, end) -> None:
        self.end = end

    def set_range(self, begin: int, end: int) -> None:
        """Set both byte offsets of the covered range at once."""
        self.begin = begin
        self.end = end

    def is_oov(self):
        """Whether this node represents an out-of-vocabulary word."""
        return self._is_oov

    def set_oov(self):
        self._is_oov = True

    def is_defined(self):
        return self._is_defined

    def set_defined(self):
        self._is_defined = True

    def get_word_info(self) -> WordInfo:
        """Return this node's WordInfo.

        Resolution order: UNK placeholder for undefined nodes, then any
        explicitly attached info, then the lexicon lookup by word id.
        """
        if not self._is_defined:
            return UNK
        if self.extra_word_info:
            return self.extra_word_info
        return self.lexicon.get_word_info(self.word_id)

    def set_word_info(self, word_info: WordInfo) -> None:
        """Attach explicit word info, overriding the lexicon, and mark defined."""
        self.extra_word_info = word_info
        self._is_defined = True

    def get_path_cost(self) -> int:
        return self.cost

    def get_left_id(self) -> int:
        return self.left_id

    def get_right_id(self) -> int:
        return self.right_id

    def get_word_id(self) -> int:
        return self.word_id

    def get_dictionary_id(self) -> int:
        """Id of the dictionary this word came from; -1 for undefined or
        explicitly attached (non-lexicon) word info."""
        if not self._is_defined or self.extra_word_info:
            return -1
        return self.lexicon.get_dictionary_id(self.word_id)  # self.word_id >> 28

    def __str__(self):
        surface = "(None)"
        if self.word_id >= 0 or self.extra_word_info:
            surface = self.get_word_info().surface

        return "{} {} {}({}) {} {} {}".format(
            self.get_begin(), self.get_end(), surface, self.word_id, self.left_id, self.right_id, self.cost
        )
class Grammar:
    """Part-of-speech table and connection-cost matrix of a binary dictionary.

    Parses the grammar section of ``bytes_`` starting at ``offset``:
    a POS table (each entry is ``_POS_DEPTH`` strings) followed by the
    left-id/right-id sizes and the 16-bit connection-cost matrix, which is
    exposed zero-copy as a memoryview over the underlying buffer.
    """

    # cost value meaning "these two nodes may never connect"
    INHIBITED_CONNECTION = 0x7fff

    def __init__(self, bytes_, offset):
        self._POS_DEPTH = 6
        self._BOS_PARAMETER = [0, 0, 0]
        self._EOS_PARAMETER = [0, 0, 0]

        self.char_category = None
        self.is_copied_connect_table = False

        # save/restore the caller's read position around the parse
        original_offset = bytes_.tell()
        bytes_.seek(offset)
        pos_size = self.bytes_get_short(bytes_)
        self.pos_list = []
        for i in range(pos_size):
            pos = []
            for j in range(self._POS_DEPTH):
                pos.append(self.bytes_get_string(bytes_))
            self.pos_list.append(pos)
        left_id_size = self.bytes_get_short(bytes_)
        right_id_size = self.bytes_get_short(bytes_)
        connect_table_offset = bytes_.tell()

        # header + 2 bytes per matrix cell
        self.storage_size = (connect_table_offset - offset) + 2 * left_id_size * right_id_size

        # zero-copy view onto the connection matrix; cast to 2-D shorts
        # unless the matrix is empty (cast would reject a zero-size shape)
        self._matrix_view = \
            memoryview(bytes_)[connect_table_offset: connect_table_offset + 2 * left_id_size * right_id_size]
        if left_id_size * right_id_size != 0:
            self._matrix_view = self._matrix_view.cast('h', shape=[left_id_size, right_id_size])
        bytes_.seek(original_offset)

    def get_storage_size(self):
        """Byte length of the grammar section, including the matrix."""
        return self.storage_size

    def get_part_of_speech_size(self):
        """Number of POS entries in the table."""
        return len(self.pos_list)

    def get_part_of_speech_string(self, pos_id):
        """The POS entry (list of strings) for ``pos_id``."""
        return self.pos_list[pos_id]

    def get_part_of_speech_id(self, pos):
        """Id of the POS entry equal to ``pos``, or -1 when absent."""
        return self.pos_list.index(pos) if pos in self.pos_list else -1

    def get_connect_cost(self, left: int, right: int) -> int:
        """ Returns connection cost of nodes

        Args:
            left: right-ID of left node
            right: left-ID of right node

        Returns:
            cost of connection

        """
        # note the [right, left] index order into the cast view; this matches
        # the on-disk layout of the connection matrix
        return self._matrix_view[right, left]

    def set_connect_cost(self, left: int, right: int, cost: int) -> None:
        """ Sets connection cost of nodes

        Note: bytes_ must be ACCESS_COPY mode

        Args:
            left: right-ID of left node
            right: left-ID of right node
            cost: cost of connection

        """
        self._matrix_view[right, left] = cost

    def get_bos_parameter(self):
        """[left_id, right_id, cost] of the beginning-of-sentence node."""
        return self._BOS_PARAMETER

    def get_eos_parameter(self):
        """[left_id, right_id, cost] of the end-of-sentence node."""
        return self._EOS_PARAMETER

    def get_character_category(self):
        return self.char_category

    def set_character_category(self, char_category):
        self.char_category = char_category

    @staticmethod
    def bytes_get_string(bytes_):
        """Read a length-prefixed UTF-16 string from the current position."""
        length = bytes_.read_byte()
        string = bytes_.read(2 * length)
        return string.decode('utf-16')

    @staticmethod
    def bytes_get_short(bytes_):
        """Read a 16-bit little-endian signed integer from the current position."""
        return int.from_bytes(bytes_.read(2), 'little', signed=True)

    def add_pos_list(self, grammar):
        """Append another grammar's POS entries (used when merging user dictionaries)."""
        self.pos_list.extend(grammar.pos_list)
class DefaultInputTextPlugin(InputTextPlugin):
    """Input-text plugin applying character replacement and NFKC normalization.

    Rules come from a ``rewrite.def`` file: single-character lines are
    exempted from normalization, two-column lines define verbatim
    replacements that take precedence over normalization.
    """

    def __init__(self):
        # characters exempt from NFKC normalization
        self.ignore_normalize_set = set()
        # first char of a replacement key -> longest key length starting with it
        self.key_lengths = {}
        # replacement key string -> replacement target string
        self.replace_char_map = {}

    def set_up(self) -> None:
        """Load the rewrite rules from the default resource directory."""
        rewrite_def = os.path.join(config.DEFAULT_RESOURCEDIR, "rewrite.def")
        if not rewrite_def:
            raise AttributeError("rewriteDef is not defined")
        self.read_rewrite_lists(rewrite_def)

    def rewrite(self, builder: InputTextPlugin.Builder) -> None:
        """Apply replacement rules, lowercasing and NFKC normalization.

        Replacements are recorded through ``builder.replace`` with offsets
        adjusted for earlier length-changing edits.
        """
        offset = 0
        next_offset = 0
        text = builder.get_text()

        i = -1
        while True:
            i += 1
            if i >= len(text):
                break
            textloop = False
            offset += next_offset
            next_offset = 0
            original = text[i]

            # 1. replace char without normalize
            # try the longest replacement key starting at i first
            max_length = min(self.key_lengths.get(original, 0), len(text) - i)
            for length in range(max_length, 0, -1):
                replace = self.replace_char_map.get(text[i:i + length])
                if replace:
                    builder.replace(i + offset, i + length + offset, replace)
                    next_offset += len(replace) - length
                    # skip over the consumed key (the while loop adds 1 more)
                    i += length - 1
                    textloop = True
                    break
            if textloop:
                continue

            # 2. normalize
            # 2-1. capital alphabet (not only Latin but Greek, Cyrillic, etc.) -> small
            lower = original.lower()
            if lower in self.ignore_normalize_set:
                if original == lower:
                    continue
                replace = lower
            else:
                # 2-2. normalize (except in ignoreNormalize)
                # e.g. full-width alphabet -> half-width / ligature / etc.
                replace = normalize("NFKC", lower)
            next_offset = len(replace) - 1
            if original != replace:
                builder.replace(i + offset, i + 1 + offset, replace)

    def read_rewrite_lists(self, rewrite_def):
        """Parse a rewrite definition file into the rule tables.

        Blank lines and '#' comments are skipped.  Raises RuntimeError on a
        malformed line, reporting its 1-based line number.
        """
        with open(rewrite_def, "r", encoding="utf-8") as f:
            # enumerate from 1 so diagnostics report human-readable line
            # numbers (enumerate(f) would be off by one)
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if (not line) or line.startswith("#"):
                    continue
                cols = line.split()

                # ignored normalize list
                if len(cols) == 1:
                    key = cols[0]
                    if len(key) != 1:
                        raise RuntimeError("{} is not character at line {}".format(key, line_no))
                    self.ignore_normalize_set.add(key)
                # replace char list
                elif len(cols) == 2:
                    if cols[0] in self.replace_char_map:
                        raise RuntimeError("{} is already defined at line {}".format(cols[0], line_no))
                    if self.key_lengths.get(cols[0][0], 0) < len(cols[0]):
                        self.key_lengths[cols[0][0]] = len(cols[0])
                    self.replace_char_map[cols[0]] = cols[1]
                else:
                    raise RuntimeError("invalid format at line {}".format(line_no))
class UndefinedDictionaryError(Exception):
    """Raised when a required dictionary cannot be resolved."""
    pass


class Dictionary:
    """Loads the system dictionary, plugins and user dictionaries, and
    creates Tokenizer instances that share those resources."""

    def __init__(self, config_path=None, resource_dir=None, dict_type=None):
        config.settings.set_up(config_path, resource_dir, dict_type)
        # populated below, in dependency order
        self.grammar = None
        self.lexicon = None
        self.header = None
        self.dictionaries = []
        self.input_text_plugins = []
        self.edit_connection_plugin = []
        self.oov_provider_plugins = []
        self.path_rewrite_plugins = []

        self._read_system_dictionary(config.settings.system_dict_path())

        # self.edit_connection_plugin = [InhibitConnectionPlugin()]
        # for p in self.edit_connection_plugin:
        #     p.set_up(self.grammar)
        #     p.edit(self.grammar)

        self._read_character_definition(config.settings.char_def_path())

        self.input_text_plugins = get_input_text_plugins()
        for plugin in self.input_text_plugins:
            plugin.set_up()

        self.oov_provider_plugins = get_oov_plugins()
        if not self.oov_provider_plugins:
            raise AttributeError("no OOV provider")
        for plugin in self.oov_provider_plugins:
            plugin.set_up(self.grammar)

        self.path_rewrite_plugins = get_path_rewrite_plugins()
        for plugin in self.path_rewrite_plugins:
            plugin.set_up(self.grammar)

        for user_dict_path in config.settings.user_dict_paths():
            self._read_user_dictionary(user_dict_path)

    def _read_system_dictionary(self, filename):
        """Load the binary system dictionary; it supplies the base grammar and lexicon."""
        if filename is None:
            raise ValueError("system dictionary is not specified")
        system_dict = BinaryDictionary.from_system_dictionary(filename)
        self.dictionaries.append(system_dict)
        self.grammar = system_dict.grammar
        self.lexicon = LexiconSet(system_dict.lexicon)

    def _read_user_dictionary(self, filename):
        """Load one user dictionary and merge it into the lexicon set."""
        if self.lexicon.is_full():
            raise ValueError('too many dictionaries')
        user_dict = BinaryDictionary.from_user_dictionary(filename)
        self.dictionaries.append(user_dict)
        user_lexicon = user_dict.lexicon
        # a throwaway tokenizer (no path-rewrite plugins) estimates word costs
        tokenizer_ = Tokenizer(self.grammar, self.lexicon, self.input_text_plugins, self.oov_provider_plugins, [])
        user_lexicon.calculate_cost(tokenizer_)
        self.lexicon.add(user_lexicon, self.grammar.get_part_of_speech_size())
        if user_dict.grammar:
            self.grammar.add_pos_list(user_dict.grammar)

    def _read_character_definition(self, filename):
        """Attach character-category definitions to the grammar (no-op without a grammar)."""
        if self.grammar is None:
            return
        char_category = dictionarylib.charactercategory.CharacterCategory()
        char_category.read_character_definition(filename)
        self.grammar.set_character_category(char_category)

    def close(self):
        """Drop grammar/lexicon references and close every loaded dictionary."""
        self.grammar = None
        self.lexicon = None
        for loaded in self.dictionaries:
            loaded.close()

    def create(self, mode=None):
        """Build a Tokenizer that shares this dictionary's resources."""
        return Tokenizer(
            self.grammar, self.lexicon, self.input_text_plugins, self.oov_provider_plugins, self.path_rewrite_plugins, mode=mode)
class DoubleArrayLexicon(Lexicon):
    """Lexicon backed by a darts-clone double-array trie over a memory-mapped
    binary dictionary."""

    # word costs are stored as signed 16-bit values
    __SIGNED_SHORT_MIN = -32768
    __SIGNED_SHORT_MAX = 32767
    # bonus applied per morpheme when estimating costs for user-dictionary words
    __USER_DICT_COST_PER_MORPH = -20

    trie = None
    word_id_table = None
    word_params = None

    def __init__(self, bytes_: mmap.mmap, offset: int, has_synonym_gid: bool):
        """Parse the lexicon section of the mapped dictionary starting at ``offset``.

        Sections are laid out back-to-back: trie size (4 bytes LE), trie array,
        word-ID table, word parameters, then word infos.
        """
        self.trie = DoubleArray()
        bytes_.seek(offset)
        size = int.from_bytes(bytes_.read(4), 'little')
        offset += 4

        # zero-copy view over the trie units (4 bytes per unit)
        array = memoryview(bytes_)[offset:offset + size * 4]
        self.trie.set_array(array, size)
        offset += self.trie.total_size()

        self.word_id_table = wordidtable.WordIdTable(bytes_, offset)
        offset += self.word_id_table.storage_size()

        self.word_params = wordparameterlist.WordParameterList(bytes_, offset)
        offset += self.word_params.storage_size()

        self.word_infos = wordinfolist.WordInfoList(bytes_, offset, self.word_params.get_size(), has_synonym_gid)

    def __del__(self):
        # explicitly drop the parameter list so its view into the mmap is released
        del self.word_params

    def lookup(self, text: bytes, offset: int) -> Lexicon.Itr:
        """Yield ``(word_id, end_offset)`` for every entry that is a prefix of
        ``text`` starting at ``offset``."""
        key = text[offset:]
        result = self.trie.common_prefix_search(key, length=len(key))
        for index, length in result:
            word_ids = self.word_id_table.get(index)
            length += offset  # convert match length back to an absolute offset
            for word_id in word_ids:
                yield (word_id, length)

    def get_left_id(self, word_id: int) -> int:
        return self.word_params.get_left_id(word_id)

    def get_right_id(self, word_id: int) -> int:
        return self.word_params.get_right_id(word_id)

    def get_cost(self, word_id: int) -> int:
        return self.word_params.get_cost(word_id)

    def get_word_info(self, word_id: int) -> 'WordInfo':  # noqa: F821
        return self.word_infos.get_word_info(word_id)

    def size(self) -> int:
        return self.word_params.size

    def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int:
        """Return the ID of the entry matching all three attributes, or -1.

        Fast path: surface candidates via trie lookup; fallback: full linear
        scan over every word info.
        """
        for wid, _ in self.lookup(headword.encode('utf-8'), 0):
            if self._compare_word_id(wid, headword, pos_id, reading_form):
                return wid

        for wid in range(self.word_infos.size()):
            if self._compare_word_id(wid, headword, pos_id, reading_form):
                return wid

        return -1

    def _compare_word_id(self, wid: int, headword: str, pos_id: int, reading_form: str) -> bool:
        # True when the stored entry matches surface, POS and reading exactly
        info = self.word_infos.get_word_info(wid)
        return info.surface == headword \
            and info.pos_id == pos_id \
            and info.reading_form == reading_form

    def get_dictionary_id(self, word_id: int) -> int:
        # a standalone DoubleArrayLexicon always belongs to dictionary 0
        return 0

    def calculate_cost(self, tokenizer) -> None:
        """Fill placeholder costs (SHORT_MIN) by tokenizing each entry's surface,
        clamping the estimate into the signed-short range."""
        for wid in range(self.word_params.size):
            if self.get_cost(wid) != self.__SIGNED_SHORT_MIN:
                continue
            surface = self.get_word_info(wid).surface
            ms = tokenizer.tokenize(surface, None)
            cost = ms.get_internal_cost() + self.__USER_DICT_COST_PER_MORPH * len(ms)
            cost = min(cost, self.__SIGNED_SHORT_MAX)
            cost = max(cost, self.__SIGNED_SHORT_MIN)
            self.word_params.set_cost(wid, cost)
class LexiconSet(Lexicon):
    """Compound lexicon merging one system lexicon with up to 15 user lexicons.

    Word IDs handed out by this class encode the dictionary index in the top
    4 bits (bits 28-31) and the per-dictionary word ID in the low 28 bits.
    """

    __MAX_DICTIONARIES = 16

    def __init__(self, system_lexicon: Lexicon):
        # index 0 is always the system lexicon; its POS offset is 0 by definition
        self.lexicons = [system_lexicon]
        self.pos_offsets = [0]

    def add(self, lexicon: Lexicon, pos_offset: int) -> None:
        """Append a user lexicon; pos_offset is where its user-defined POS IDs start."""
        if lexicon not in self.lexicons:
            self.lexicons.append(lexicon)
            self.pos_offsets.append(pos_offset)

    def is_full(self) -> bool:
        return len(self.lexicons) >= self.__MAX_DICTIONARIES

    def lookup(self, text: str, offset: int) -> Lexicon.Itr:
        if len(self.lexicons) == 1:
            # fast path: no user dictionaries, IDs need no re-encoding
            return self.lexicons[0].lookup(text, offset)
        return self.__lookup(text, offset)

    def __lookup(self, text: str, offset: int) -> Lexicon.Itr:
        # user dictionaries take precedence; the system lexicon (index 0) comes last
        indices = list(range(len(self.lexicons)))[1:] + [0]
        for dict_id in indices:
            pairs = self.lexicons[dict_id].lookup(text, offset)
            for pair in pairs:
                yield (self.build_word_id(dict_id, pair[0]), pair[1])

    def get_left_id(self, word_id: int) -> int:
        return self.lexicons[self.get_dictionary_id(word_id)]\
            .get_left_id(self.get_word_id1(word_id))

    def get_right_id(self, word_id: int) -> int:
        return self.lexicons[self.get_dictionary_id(word_id)]\
            .get_right_id(self.get_word_id1(word_id))

    def get_cost(self, word_id: int) -> int:
        return self.lexicons[self.get_dictionary_id(word_id)]\
            .get_cost(self.get_word_id1(word_id))

    # NOTE(review): lru_cache on an instance method keeps `self` alive for the
    # cache's lifetime; acceptable here because a LexiconSet lives as long as
    # the owning Dictionary.
    @lru_cache(1024)
    def get_word_info(self, word_id: int) -> 'WordInfo':  # noqa: F821
        """Return the WordInfo for a composite word ID, remapping user-defined
        POS IDs and split word IDs into this set's combined numbering."""
        dic_id = self.get_dictionary_id(word_id)
        winfo = self.lexicons[dic_id].get_word_info(self.get_word_id1(word_id))
        pos_id = winfo.pos_id
        if dic_id > 0 and pos_id >= self.pos_offsets[1]:  # user defined part-of-speech
            winfo.pos_id = winfo.pos_id - self.pos_offsets[1] + self.pos_offsets[dic_id]
        winfo.a_unit_split = self.convert_split(winfo.a_unit_split, dic_id)
        winfo.b_unit_split = self.convert_split(winfo.b_unit_split, dic_id)
        winfo.word_structure = self.convert_split(winfo.word_structure, dic_id)
        return winfo

    def get_dictionary_id(self, word_id: int) -> int:
        return word_id >> 28

    @staticmethod
    def get_word_id1(word_id: int) -> int:
        return 0x0FFFFFFF & word_id

    def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int:
        """Return the composite word ID matching all three attributes, or the
        system lexicon's result (-1 on miss) when no dictionary has it."""
        for dic_id in range(len(self.lexicons)):
            wid = self.lexicons[dic_id].get_word_id(headword, pos_id, reading_form)
            # BUG FIX: a found word has a non-negative ID (the per-dictionary
            # lookup returns -1 on miss). The original `wid <= 0` skipped every
            # real match except ID 0 and re-encoded the -1 failure sentinel
            # into a bogus composite ID.
            if wid >= 0:
                return self.build_word_id(dic_id, wid)
        return self.lexicons[0].get_word_id(headword, pos_id, reading_form)

    def build_word_id(self, dict_id, word_id):
        """Pack (dict_id, word_id) into a single composite word ID."""
        if word_id > 0x0FFFFFFF:
            raise AttributeError("word ID is too large: ", word_id)
        if dict_id > len(self.lexicons):
            raise AttributeError("dictionary ID is too large: ", dict_id)
        return (dict_id << 28) | word_id

    def size(self) -> int:
        return sum([lex.size() for lex in self.lexicons])

    def convert_split(self, split: List[int], dict_id: int) -> List[int]:
        """Re-tag split word IDs that point into a user dictionary so they
        reference `dict_id` within this set (mutates and returns `split`)."""
        for i in range(len(split)):
            if self.get_dictionary_id(split[i]) > 0:
                split[i] = self.build_word_id(dict_id, self.get_word_id1(split[i]))
        return split
class JoinNumericPlugin(PathRewritePlugin):
    """Path-rewrite plugin that merges consecutive numeric nodes (digits,
    kanji numerals, decimal points and thousands separators) into a single
    node, optionally replacing the surface with a normalized numeric form.
    """

    _numeric_pos_id = None      # POS ID of 名詞/数詞, resolved in set_up()
    _enable_normalize = True    # replace the surface with the parsed value

    def __init__(self, json_obj):
        """``json_obj``: this plugin's settings object from sudachi.json (may be falsy)."""
        self._NUMERIC_POS = ['名詞', '数詞', '*', '*', '*', '*']
        if not json_obj:
            return
        if 'joinKanjiNumeric' in json_obj:
            warnings.warn('joinKanjiNumeric is already nonsense key', SyntaxWarning)
        if 'enableNormalize' in json_obj:
            self._enable_normalize = json_obj['enableNormalize']

    def set_up(self, grammar):
        self._numeric_pos_id = grammar.get_part_of_speech_id(self._NUMERIC_POS)

    def rewrite(self, text, path, lattice):
        """Scan ``path``, feeding numeric-looking nodes into a NumericParser
        and concatenating each maximal well-formed run.

        ``begin_index`` marks where the current numeric run started (-1: none).
        ``comma_as_digit``/``period_as_digit`` are cleared after a parse error
        on that separator, and the scan restarts from the run's beginning.
        """
        begin_index = -1
        comma_as_digit = True
        period_as_digit = True
        parser = NumericParser()
        i = -1

        while i < len(path) - 1:
            i += 1
            node = path[i]
            types = self.get_char_category_types(text, node)
            s = node.get_word_info().normalized_form
            if CategoryType.NUMERIC in types or CategoryType.KANJINUMERIC in types or \
                    (period_as_digit and s == '.') or (comma_as_digit and s == ','):

                if begin_index < 0:
                    parser.clear()
                    begin_index = i

                for c in s:
                    if not parser.append(c):
                        if begin_index >= 0:
                            if parser.error_state == NumericParser.Error.COMMA:
                                comma_as_digit = False
                                i = begin_index - 1  # re-scan without ',' as a digit
                            elif parser.error_state == NumericParser.Error.POINT:
                                period_as_digit = False
                                i = begin_index - 1  # re-scan without '.' as a digit
                        begin_index = -1
                        break
                continue

            if begin_index >= 0:
                if parser.done():
                    self._concat(path, begin_index, i, lattice, parser)
                    i = begin_index + 1  # path shrank; resume right after the merged node
                else:
                    # a trailing separator is not part of the number: join up to it
                    ss = path[i - 1].get_word_info().normalized_form
                    if (parser.error_state == NumericParser.Error.COMMA and ss == ',') or \
                            (parser.error_state == NumericParser.Error.POINT and ss == '.'):
                        self._concat(path, begin_index, i - 1, lattice, parser)
                        i = begin_index + 2
                begin_index = -1
            # separators become usable again once a non-numeric node ends the run
            if not comma_as_digit and s != ',':
                comma_as_digit = True
            if not period_as_digit and s != '.':
                period_as_digit = True

        # flush a numeric run that extends to the end of the path
        if begin_index >= 0:
            if parser.done():
                self._concat(path, begin_index, len(path), lattice, parser)
            else:
                ss = path[-1].get_word_info().normalized_form
                if (parser.error_state == NumericParser.Error.COMMA and ss == ',') or \
                        (parser.error_state == NumericParser.Error.POINT and ss == '.'):
                    self._concat(path, begin_index, len(path) - 1, lattice, parser)

    def _concat(self, path, begin, end, lattice, parser) -> None:
        """Join ``path[begin:end]`` into one node when the run is a numeral."""
        if path[begin].get_word_info().pos_id != self._numeric_pos_id:
            return
        if self._enable_normalize:
            normalized_form = parser.get_normalized()
            # single unchanged node needs no rewrite
            if end - begin > 1 or normalized_form != path[begin].get_word_info().normalized_form:
                self.concatenate(path, begin, end, lattice, normalized_form)
            return
        if end - begin > 1:
            self.concatenate(path, begin, end, lattice, '')
class TestUserDictionaryBuilder(TestCase):
    """Builds a small user dictionary on disk and verifies its contents."""

    def setUp(self):
        """Create a temp dir and load the reference system dictionary."""
        self.test_dir = tempfile.mkdtemp()
        test_resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            'resources')
        self.dict_filename = os.path.join(test_resources_dir, 'system.dic')
        _, _, self.grammar, self.lexicon_set = \
            TestDictionaryBuilder.read_system_dictionary(self.dict_filename)
        self.logger = getLogger()
        self.logger.disabled = True

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_parseline_with_userdefined_POS(self):
        builder = UserDictionaryBuilder(self.grammar, self.lexicon_set, logger=self.logger)
        builder.parse_line('田中,0,0,0,田中,存在,しない,品詞,*,*,*,タナカ,田中,*,A,*,*,*,*'.split(','))
        # the previously unseen POS must be registered in the builder's POS table
        self.assertEqual(1, len(builder.pos_table.get_list()))

    def test_build(self):
        out_path = os.path.join(self.test_dir, 'output.txt')
        in_path = os.path.join(self.test_dir, 'input.txt')

        # lexicon_paths = [self.input_path]
        # matrix_input_stream = open(self.matrix_path, 'r')
        with open(in_path, 'w', encoding='utf-8') as wf:
            wf.write("東京都市,0,0,0,東京都市,名詞,固有名詞,地名,一般,*,*,ヒガシキョウトシ,東京都市,*,B,\"東,名詞,普通名詞,一般,*,*,*,ヒガシ/3/U1\",*,\"4/3/市,名詞,普通名詞,一般,*,*,*,シ\",*\n")
            wf.write('市,-1,-1,0,市,名詞,普通名詞,一般,*,*,*,シ,市,*,A,*,*,*,*\n')

        _, _, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(self.dict_filename)
        header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()), 'test')
        # FIX: manage the output stream with a context manager so the handle is
        # closed even when build() raises (the original leaked it on failure).
        with open(out_path, 'wb') as out_stream:
            out_stream.write(header.to_bytes())
            builder = UserDictionaryBuilder(grammar, lexicon_set, logger=self.logger)
            lexicon_paths = [in_path]
            builder.build(lexicon_paths, None, out_stream)

        buffers, header, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(out_path)
        lexicon = lexicon_set.lexicons[0]

        # header
        self.assertEqual(SYSTEM_DICT_VERSION_2, header.version)
        self.assertEqual('test', header.description)

        # lexicon
        self.assertEqual(0, lexicon.get_left_id(0))
        self.assertEqual(0, lexicon.get_cost(0))
        wi = lexicon.get_word_info(0)
        self.assertEqual('東京都市', wi.surface)
        self.assertEqual('東京都市', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシキョウトシ', wi.reading_form)
        self.assertEqual(3, wi.pos_id)
        self.assertEqual([4, 3, 1 | (1 << 28)], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        self.assertEqual([4, 3, 1 | (1 << 28)], wi.word_structure)
        lst = lexicon.lookup('東京都市'.encode('utf-8'), 0)
        self.assertEqual((0, len('東京都市'.encode('utf-8'))), lst.__next__())
        with self.assertRaises(StopIteration):
            lst.__next__()

        self.assertEqual(-1, lexicon.get_left_id(1))
        self.assertEqual(0, lexicon.get_cost(1))
        wi = lexicon.get_word_info(1)
        self.assertEqual('市', wi.surface)
        self.assertEqual('市', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('シ', wi.reading_form)
        self.assertEqual(4, wi.pos_id)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        lst = lexicon.lookup('東'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            lst.__next__()
14 | 15 | import json 16 | import os 17 | import warnings 18 | from importlib import import_module 19 | from importlib.util import find_spec 20 | from pathlib import Path 21 | from typing import List 22 | 23 | DEFAULT_RESOURCEDIR = Path(__file__).absolute().parent / 'resources' 24 | DEFAULT_SETTINGFILE = DEFAULT_RESOURCEDIR / 'sudachi.json' 25 | DEFAULT_RESOURCEDIR = DEFAULT_RESOURCEDIR.as_posix() 26 | DEFAULT_SETTINGFILE = DEFAULT_SETTINGFILE.as_posix() 27 | 28 | 29 | def get_absolute_dict_path(dict_type: str) -> str: 30 | pkg_path = Path(import_module('sudachidict_' + dict_type).__file__).parent 31 | dic_path = pkg_path / 'resources' / 'system.dic' 32 | return str(dic_path.absolute()) 33 | 34 | 35 | def to_absolute_resource_path(resource_dir: str, dict_path: str) -> str: 36 | if Path(dict_path).is_absolute(): 37 | return dict_path 38 | else: 39 | return os.path.join(resource_dir, dict_path) 40 | 41 | 42 | def find_dict_path(dict_type='core'): 43 | is_installed = find_spec('sudachidict_{}'.format(dict_type)) 44 | if is_installed: 45 | return get_absolute_dict_path(dict_type) 46 | else: 47 | raise ModuleNotFoundError( 48 | 'Package `sudachidict_{}` dose not exist. 
' 49 | 'You may install it with a command `$ pip install sudachidict_{}`'.format(dict_type, dict_type) 50 | ) 51 | 52 | 53 | class _Settings(object): 54 | 55 | DICT_PATH_KEY = 'systemDict' 56 | CHAR_DEF_KEY = 'characterDefinitionFile' 57 | USER_DICT_PATH_KEY = 'userDict' 58 | 59 | def __init__(self): 60 | self.__is_active = False 61 | self.__dict_ = None 62 | self.__config_path = None 63 | self.resource_dir = None 64 | 65 | def set_up(self, config_path=None, resource_dir=None, dict_type=None) -> None: 66 | config_path = config_path or DEFAULT_SETTINGFILE 67 | self.__config_path = config_path 68 | resource_dir = resource_dir or os.path.dirname(config_path) 69 | with open(config_path, 'r', encoding='utf-8') as f: 70 | self.__dict_ = json.load(f) 71 | self.__is_active = True 72 | self.resource_dir = resource_dir 73 | if dict_type is not None: 74 | if dict_type in ['small', 'core', 'full']: 75 | if self.DICT_PATH_KEY in self.__dict_ and self.__dict_[self.DICT_PATH_KEY] and \ 76 | 'sudachidict_{}'.format(dict_type) not in self.__dict_[self.DICT_PATH_KEY]: 77 | warnings.warn( 78 | 'Two system dictionaries may be specified. 
' 79 | 'The `sudachidict_{}` defined "dict_type" overrides those defined in the config file.'.format(dict_type) 80 | ) 81 | self.__dict_[self.DICT_PATH_KEY] = find_dict_path(dict_type=dict_type) 82 | else: 83 | raise ValueError('"dict_type" must be "small", "core", or "full".') 84 | else: 85 | if self.DICT_PATH_KEY not in self.__dict_ or not self.__dict_[self.DICT_PATH_KEY]: 86 | self.__dict_[self.DICT_PATH_KEY] = find_dict_path() 87 | 88 | def __setitem__(self, key, value): 89 | if not self.__is_active: 90 | self.set_up() 91 | self.__dict_[key] = value 92 | 93 | def __getitem__(self, key): 94 | if not self.__is_active: 95 | self.set_up() 96 | return self.__dict_[key] 97 | 98 | def keys(self): 99 | return self.__dict_.keys() 100 | 101 | def __contains__(self, item): 102 | return item in self.__dict_.keys() 103 | 104 | def system_dict_path(self) -> str: 105 | dict_path = self.__dict_[self.DICT_PATH_KEY] 106 | return to_absolute_resource_path(self.resource_dir, dict_path) 107 | 108 | def char_def_path(self) -> str: 109 | key = self.CHAR_DEF_KEY 110 | if key in self.__dict_: 111 | return to_absolute_resource_path(self.resource_dir, self.__dict_[key]) 112 | raise KeyError('`{}` not defined in setting file'.format(key)) 113 | 114 | def user_dict_paths(self) -> List[str]: 115 | key = self.USER_DICT_PATH_KEY 116 | if key in self.__dict_: 117 | return [to_absolute_resource_path(self.resource_dir, path) for path in self.__dict_[key]] 118 | return [] 119 | 120 | 121 | settings = _Settings() 122 | -------------------------------------------------------------------------------- /tests/dictionarylib/test_doublearraylexicon.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class TestDoubleArrayLexicon(unittest.TestCase):
    """Exercises DoubleArrayLexicon against the prebuilt tests/resources/system.dic."""

    # byte size of the grammar section that precedes the lexicon in system.dic
    __GRAMMAR_SIZE = 470

    def setUp(self):
        # Copied from sudachipy.dictionary.Dictionary.read_system_dictionary
        test_resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            'resources')
        filename = os.path.join(test_resources_dir, 'system.dic')
        with open(filename, 'rb') as system_dic:
            bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
            header = DictionaryHeader.from_bytes(bytes_, 0)
            if not header.is_system_dictionary():
                raise Exception('invalid system dictionary')
            # CONSISTENCY FIX: use the named class constant instead of a
            # duplicated magic number 470 for the grammar-section size.
            self.lexicon = DoubleArrayLexicon(bytes_, header.storage_size() + self.__GRAMMAR_SIZE, True)

    def test_lookup(self):
        res = self.lexicon.lookup('東京都'.encode('utf-8'), 0)
        self.assertEqual((4, 3), res.__next__())  # 東

        self.assertEqual((5, 6), res.__next__())  # 東京

        self.assertEqual((6, 9), res.__next__())  # 東京都

        with self.assertRaises(StopIteration):
            res.__next__()

        res = self.lexicon.lookup('東京都に'.encode('utf-8'), 9)
        self.assertEqual((1, 12), res.__next__())  # に(接続助詞)
        self.assertEqual((2, 12), res.__next__())  # に(格助詞)
        with self.assertRaises(StopIteration):
            res.__next__()

        res = self.lexicon.lookup('あれ'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            res.__next__()

    def test_parameters(self):
        # た
        self.assertEqual(1, self.lexicon.get_left_id(0))
        self.assertEqual(1, self.lexicon.get_right_id(0))
        self.assertEqual(8729, self.lexicon.get_cost(0))

        # 東京都
        self.assertEqual(6, self.lexicon.get_left_id(6))
        self.assertEqual(8, self.lexicon.get_right_id(6))
        self.assertEqual(5320, self.lexicon.get_cost(6))

        # 都
        self.assertEqual(8, self.lexicon.get_left_id(9))
        self.assertEqual(8, self.lexicon.get_right_id(9))
        self.assertEqual(2914, self.lexicon.get_cost(9))

    def test_wordinfo(self):
        # た
        wi = self.lexicon.get_word_info(0)
        self.assertEqual('た', wi.surface)
        self.assertEqual(3, wi.head_word_length)
        self.assertEqual(0, wi.pos_id)
        self.assertEqual('た', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('た', wi.dictionary_form)
        self.assertEqual('タ', wi.reading_form)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        self.assertEqual([], wi.word_structure)

        # 行っ
        wi = self.lexicon.get_word_info(8)
        self.assertEqual('行っ', wi.surface)
        self.assertEqual('行く', wi.normalized_form)
        self.assertEqual(7, wi.dictionary_form_word_id)
        self.assertEqual('行く', wi.dictionary_form)

        # 東京都
        wi = self.lexicon.get_word_info(6)
        self.assertEqual('東京都', wi.surface)
        self.assertEqual([5, 9], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        self.assertEqual([5, 9], wi.word_structure)
        self.assertEqual([], wi.synonym_group_ids)

    def test_wordinfo_with_longword(self):
        # 0123456789 * 30
        wi = self.lexicon.get_word_info(36)
        self.assertEqual(300, len(wi.surface))
        self.assertEqual(300, wi.head_word_length)
        self.assertEqual(300, len(wi.normalized_form))
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual(300, len(wi.dictionary_form))
        self.assertEqual(570, len(wi.reading_form))

    def test_size(self):
        self.assertEqual(39, self.lexicon.size())


if __name__ == '__main__':
    unittest.main()
class TestDefaultInputTextPlugin(unittest.TestCase):
    """Verifies DefaultInputTextPlugin normalization and its rewrite.def error handling."""

    # raw input mixing case, full-width forms, combining marks and compatibility chars
    original_text = "ÂBΓД㈱ガウ゛⼼Ⅲ"
    # expected output after lower-casing and NFKC normalization
    normalized_text = "âbγд(株)ガヴ⼼ⅲ"

    def setUp(self):
        self.builder = UTF8InputTextBuilder(self.original_text, mock_grammar.mocked_grammar)

        self.plugin = DefaultInputTextPlugin()

        try:
            self.plugin.set_up()
        except IOError:
            self.fail('no file')

        self.test_resources_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            'resources')

    def test_before_rewrite(self):
        # without rewrite() the builder must expose the input unchanged
        self.assertEqual(self.original_text, self.builder.get_original_text())
        self.assertEqual(self.original_text, self.builder.get_text())
        text = self.builder.build()
        self.assertEqual(self.original_text, text.get_original_text())
        self.assertEqual(self.original_text, text.get_text())
        bytes_ = text.get_byte_text()
        self.assertEqual(30, len(bytes_))
        expected = b'\xc3\x82\xef\xbc\xa2\xce\x93\xd0\x94\xe3\x88\xb1\xef\xbd\xb6\xef\xbe\x9e\xe3\x82\xa6\xe3\x82\x9b\xe2\xbc\xbc\xe2\x85\xa2'
        self.assertEqual(expected, bytes_)
        # byte offset -> original character index mapping
        self.assertEqual(0, text.get_original_index(0))
        self.assertEqual(0, text.get_original_index(1))
        self.assertEqual(1, text.get_original_index(2))
        self.assertEqual(1, text.get_original_index(4))
        self.assertEqual(3, text.get_original_index(8))
        self.assertEqual(5, text.get_original_index(12))
        self.assertEqual(9, text.get_original_index(24))
        self.assertEqual(9, text.get_original_index(26))

    def test_after_rewrite(self):
        self.assertEqual(self.original_text, self.builder.get_original_text())
        self.assertEqual(self.original_text, self.builder.get_text())
        self.plugin.rewrite(self.builder)
        text = self.builder.build()
        self.assertEqual(self.original_text, text.get_original_text())
        self.assertEqual(self.normalized_text, text.get_text())
        bytes_ = text.get_byte_text()
        self.assertEqual(24, len(bytes_))
        expected = b'\xc3\xa2\x62\xce\xb3\xd0\xb4\x28\xe6\xa0\xaa\x29\xe3\x82\xac\xe3\x83\xb4\xe2\xbc\xbc\xe2\x85\xb2'
        self.assertEqual(expected, bytes_)
        # normalized byte offsets must map back to original character indexes
        self.assertEqual(0, text.get_original_index(0))
        self.assertEqual(0, text.get_original_index(1))
        self.assertEqual(1, text.get_original_index(2))
        self.assertEqual(2, text.get_original_index(3))
        self.assertEqual(4, text.get_original_index(7))
        self.assertEqual(5, text.get_original_index(8))
        self.assertEqual(5, text.get_original_index(11))
        self.assertEqual(7, text.get_original_index(15))
        self.assertEqual(7, text.get_original_index(17))

    # def test_setup_with_null(self):

    def test_invalid_format_ignorelist(self):
        # a multi-character token on a one-column line is rejected
        plugin = DefaultInputTextPlugin()
        with self.assertRaises(RuntimeError) as cm:
            plugin.read_rewrite_lists(os.path.join(self.test_resources_dir, 'rewrite_error_ignorelist.def'))
        self.assertEqual('12 is not character at line 1', cm.exception.args[0])

    def test_invalid_format_replacelist(self):
        # more than two columns is a format error
        plugin = DefaultInputTextPlugin()
        with self.assertRaises(RuntimeError) as cm:
            plugin.read_rewrite_lists(os.path.join(self.test_resources_dir, 'rewrite_error_replacelist.def'))
        self.assertEqual('invalid format at line 1', cm.exception.args[0])

    def test_duplicated_lines_replacelist(self):
        # the same replacement key may only be defined once
        plugin = DefaultInputTextPlugin()
        with self.assertRaises(RuntimeError) as cm:
            plugin.read_rewrite_lists(os.path.join(self.test_resources_dir, 'rewrite_error_dup.def'))
        self.assertEqual('12 is already defined at line 2', cm.exception.args[0])


if __name__ == '__main__':
    unittest.main()
-------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import unittest 17 | 18 | from sudachipy import dictionary 19 | 20 | 21 | class TestTokenizer(unittest.TestCase): 22 | 23 | def setUp(self): 24 | resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources') 25 | self.dict_ = dictionary.Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir) 26 | self.tokenizer_obj = self.dict_.create() 27 | 28 | def test_tokenize_small_katanana_only(self): 29 | ms = self.tokenizer_obj.tokenize('ァ') 30 | self.assertEqual(1, len(ms)) 31 | 32 | def test_part_of_speech(self): 33 | ms = self.tokenizer_obj.tokenize('京都') 34 | self.assertEqual(1, len(ms)) 35 | m = ms[0] 36 | pid = m.part_of_speech_id() 37 | self.assertTrue(self.dict_.grammar.get_part_of_speech_size() > pid) 38 | pos = m.part_of_speech() 39 | self.assertEqual(pos, self.dict_.grammar.get_part_of_speech_string(pid)) 40 | 41 | def test_get_word_id(self): 42 | ms = self.tokenizer_obj.tokenize('京都') 43 | self.assertEqual(1, len(ms)) 44 | self.assertEqual(['名詞', '固有名詞', '地名', '一般', '*', '*'], ms[0].part_of_speech()) 45 | 46 | wid = ms[0].word_id() 47 | ms = self.tokenizer_obj.tokenize('ぴらる') 48 | 
self.assertEqual(1, len(ms)) 49 | self.assertNotEqual(wid, ms[0].word_id()) 50 | self.assertEqual(['名詞', '普通名詞', '一般', '*', '*', '*'], ms[0].part_of_speech()) 51 | 52 | ms = self.tokenizer_obj.tokenize('京') 53 | self.assertEqual(1, len(ms)) 54 | 55 | def test_get_dictionary_id(self): 56 | ms = self.tokenizer_obj.tokenize('京都') 57 | self.assertEqual(1, ms.size()) 58 | self.assertEqual(0, ms[0].dictionary_id()) 59 | 60 | ms = self.tokenizer_obj.tokenize('ぴらる') 61 | self.assertEqual(1, ms.size()) 62 | self.assertEqual(1, ms[0].dictionary_id()) 63 | 64 | ms = self.tokenizer_obj.tokenize('京') 65 | self.assertEqual(1, ms.size()) 66 | self.assertTrue(ms[0].dictionary_id() < 0) 67 | 68 | def test_get_synonym_group_ids(self): 69 | ms = self.tokenizer_obj.tokenize('京都') 70 | self.assertEqual(1, ms.size()) 71 | self.assertEqual([1, 5], ms[0].synonym_group_ids()) 72 | 73 | ms = self.tokenizer_obj.tokenize('ぴらる') 74 | self.assertEqual(1, ms.size()) 75 | self.assertEqual([], ms[0].synonym_group_ids()) 76 | 77 | ms = self.tokenizer_obj.tokenize('東京府') 78 | self.assertEqual(1, ms.size()) 79 | self.assertEqual([1, 3], ms[0].synonym_group_ids()) 80 | 81 | def test_tokenize_kanji_alphabet_word(self): 82 | self.assertEqual(len(self.tokenizer_obj.tokenize('特a')), 1) 83 | self.assertEqual(len(self.tokenizer_obj.tokenize('ab')), 1) 84 | self.assertEqual(len(self.tokenizer_obj.tokenize('特ab')), 2) 85 | 86 | def test_tokenizer_with_dots(self): 87 | ms = self.tokenizer_obj.tokenize('京都…') 88 | self.assertEqual(4, ms.size()) 89 | self.assertEqual(ms[1].surface(), '…') 90 | self.assertEqual(ms[1].normalized_form(), '.') 91 | self.assertEqual(ms[2].surface(), '') 92 | self.assertEqual(ms[2].normalized_form(), '.') 93 | self.assertEqual(ms[3].surface(), '') 94 | self.assertEqual(ms[3].normalized_form(), '.') 95 | 96 | def test_tokenizer_morpheme_split(self): 97 | from sudachipy import tokenizer 98 | ms = self.tokenizer_obj.tokenize('東京都', tokenizer.Tokenizer.SplitMode.C) 99 | 
self.assertEqual(1, ms.size()) 100 | self.assertEqual(ms[0].surface(), '東京都') 101 | 102 | ms_a = ms[0].split(tokenizer.Tokenizer.SplitMode.A) 103 | self.assertEqual(2, ms_a.size()) 104 | self.assertEqual(ms_a[0].surface(), '東京') 105 | self.assertEqual(ms_a[1].surface(), '都') 106 | 107 | def test_tokenizer_morpheme_list_range(self): 108 | from sudachipy import tokenizer 109 | ms = self.tokenizer_obj.tokenize('東京都', tokenizer.Tokenizer.SplitMode.A) 110 | self.assertEqual(2, ms.size()) 111 | self.assertEqual(ms[0].surface(), '東京') 112 | self.assertEqual(ms[1].surface(), '都') 113 | 114 | self.assertEqual(ms[-1].surface(), ms[1].surface()) 115 | self.assertEqual(ms[-2].surface(), ms[0].surface()) 116 | with self.assertRaises(IndexError) as cm: 117 | ms[2] 118 | with self.assertRaises(IndexError) as cm: 119 | ms[-3] 120 | 121 | 122 | if __name__ == '__main__': 123 | unittest.main() 124 | -------------------------------------------------------------------------------- /sudachipy/resources/char.def: -------------------------------------------------------------------------------- 1 | # 2 | # Japanese charcter category map 3 | # 4 | # $Id: char.def 9 2012-12-12 04:13:15Z togiso $; 5 | # 6 | 7 | ################################################################################### 8 | # 9 | # CHARACTER CATEGORY DEFINITION 10 | # 11 | # CATEGORY_NAME INVOKE GROUP LENGTH 12 | # 13 | # - CATEGORY_NAME: Name of category. you have to define DEFAULT class. 14 | # - INVOKE: 1/0: always invoke unknown word processing, evan when the word can be found in the lexicon 15 | # - GROUP: 1/0: make a new word by grouping the same chracter category 16 | # - LENGTH: n: 1 to n length new words are added 17 | # 18 | DEFAULT 0 1 0 # DEFAULT is a mandatory category! 
19 | SPACE 0 1 0 20 | KANJI 0 0 2 21 | SYMBOL 1 1 0 22 | NUMERIC 1 1 0 23 | ALPHA 1 1 0 24 | HIRAGANA 0 1 2 25 | KATAKANA 1 1 2 26 | KANJINUMERIC 0 1 0 #change INVOKE 1->0 27 | GREEK 1 1 0 28 | CYRILLIC 1 1 0 29 | 30 | ################################################################################### 31 | # 32 | # CODE(UCS2) TO CATEGORY MAPPING 33 | # 34 | 35 | # SPACE 36 | 0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE 37 | 0x000D SPACE 38 | 0x0009 SPACE 39 | 0x000B SPACE 40 | 0x000A SPACE 41 | 42 | # ASCII 43 | 0x0021..0x002F SYMBOL #!"#$%&'()*+,-./ 44 | 0x0030..0x0039 NUMERIC #0-9 45 | 0x003A..0x0040 SYMBOL #:;<=>?@ 46 | 0x0041..0x005A ALPHA #A-Z 47 | 0x005B..0x0060 SYMBOL #[\]^_` 48 | 0x0061..0x007A ALPHA #a-z 49 | 0x007B..0x007E SYMBOL #{|}~ 50 | 51 | # Latin 52 | 0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿ 53 | 0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö 54 | 0x00D7 SYMBOL # Latin 1 #× 55 | 0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö 56 | 0x00F7 SYMBOL # Latin 1 #÷ 57 | 0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ 58 | 0x0100..0x017F ALPHA # Latin Extended A 59 | 0x0180..0x0236 ALPHA # Latin Extended B 60 | 0x1E00..0x1EF9 ALPHA # Latin Extended Additional 61 | 62 | # CYRILLIC 63 | 0x0400..0x04F9 CYRILLIC #Ѐ->ӹ 64 | 0x0500..0x050F CYRILLIC # Cyrillic supplementary 65 | 66 | # GREEK 67 | 0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ 68 | 69 | # HIRAGANA 70 | 0x3041..0x309F HIRAGANA 71 | 72 | # KATAKANA 73 | #0x30A1..0x30FF KATAKANA 74 | 0x30A1..0x30FA KATAKANA 75 | 0x30FC..0x30FF KATAKANA 76 | 0x31F0..0x31FF KATAKANA # Small KU .. 
0x2E80..0x2EF3 KANJI # CJK Radicals Supplement
0x2300..0x23FF SYMBOL # Miscellaneous Technical
14 | 15 | import mmap 16 | import os 17 | import shutil 18 | import tempfile 19 | import unittest 20 | 21 | from sudachipy.dictionarylib.grammar import Grammar 22 | 23 | 24 | class TestGrammar(unittest.TestCase): 25 | 26 | alloc_size = 4096 27 | 28 | def setUp(self): 29 | storage = bytearray() 30 | self.build_partofspeech(storage) 31 | self.build_connect_table(storage) 32 | self.test_dir = tempfile.mkdtemp() 33 | f = os.path.join(self.test_dir, 'test_file.txt') 34 | with open(f, 'wb') as wf: 35 | wf.write(bytes(storage)) 36 | self.mmap = None 37 | with open(f, 'rb') as rf: 38 | self.mmap = mmap.mmap(rf.fileno(), 0, access=mmap.ACCESS_COPY) 39 | self.storage_size = self.mmap.size() 40 | offset = 0 41 | self.grammar = Grammar(self.mmap, offset) 42 | 43 | def tearDown(self): 44 | shutil.rmtree(self.test_dir) 45 | 46 | def test_storage_size(self): 47 | self.assertEqual(self.storage_size, self.grammar.storage_size) 48 | 49 | def test_get_partofspeech_string(self): 50 | self.assertEqual(6, len(self.grammar.get_part_of_speech_string(0))) 51 | self.assertEqual("BOS/EOS", self.grammar.get_part_of_speech_string(0)[0]) 52 | self.assertEqual("*", self.grammar.get_part_of_speech_string(0)[5]) 53 | 54 | self.assertEqual("一般", self.grammar.get_part_of_speech_string(1)[1]) 55 | self.assertEqual("*", self.grammar.get_part_of_speech_string(1)[5]) 56 | 57 | self.assertEqual("五段-サ行", self.grammar.get_part_of_speech_string(2)[4]) 58 | self.assertEqual("終止形-一般", self.grammar.get_part_of_speech_string(2)[5]) 59 | 60 | def test_get_connect_cost(self): 61 | self.assertEqual(0, self.grammar.get_connect_cost(0, 0)) 62 | self.assertEqual(-100, self.grammar.get_connect_cost(2, 1)) 63 | self.assertEqual(200, self.grammar.get_connect_cost(1, 2)) 64 | 65 | def test_set_connect_cost(self): 66 | self.grammar.set_connect_cost(0, 0, 300) 67 | self.assertEqual(300, self.grammar.get_connect_cost(0, 0)) 68 | 69 | def test_get_bos_parameters(self): 70 | self.assertEqual(0, 
    @staticmethod
    def build_partofspeech(storage):
        """Append a serialized 3-entry part-of-speech table to *storage*.

        Wire format (as read by Grammar): a little-endian int16 entry count,
        then for each entry six strings, each encoded as a 1-byte character
        count followed by the characters in UTF-16-LE.
        """
        storage.extend((3).to_bytes(2, byteorder='little', signed=True))  # number of part of speech

        # Entry 0: "BOS/EOS,*,*,*,*,*" pre-encoded (7 chars, then five 1-char "*" fields).
        storage.extend(b'\x07B\x00O\x00S\x00/\x00E\x00O\x00S\x00\x01*\x00\x01*\x00\x01*\x00\x01*\x00\x01*\x00')

        # Entry 1: "名刺,一般,*,*,*,*" (each field is length byte + UTF-16-LE text).
        storage.extend(b'\x02')
        storage.extend('名刺'.encode('utf-16-le'))
        storage.extend(b'\x02')
        storage.extend('一般'.encode('utf-16-le'))
        storage.extend(b'\x01*\x00\x01*\x00\x01*\x00\x01*\x00')

        # Entry 2: "動詞,一般,*,*,五段-サ行,終止形-一般".
        storage.extend(b'\x02')
        storage.extend('動詞'.encode('utf-16-le'))
        storage.extend(b'\x02')
        storage.extend('一般'.encode('utf-16-le'))
        storage.extend(b'\x01*\x00\x01*\x00\x05')
        storage.extend('五段-サ行'.encode('utf-16-le'))
        storage.extend(b'\x06')
        storage.extend('終止形-一般'.encode('utf-16-le'))
byteorder='little', signed=True)) # number of rightId 116 | storage.extend((-100).to_bytes(2, byteorder='little', signed=True)) # number of frightId 117 | 118 | storage.extend((-3000).to_bytes(2, byteorder='little', signed=True)) # number of rightId 119 | storage.extend((200).to_bytes(2, byteorder='little', signed=True)) # number of rightId 120 | storage.extend((2000).to_bytes(2, byteorder='little', signed=True)) # number of rightId 121 | 122 | 123 | if __name__ == '__main__': 124 | unittest.main() 125 | -------------------------------------------------------------------------------- /tests/test_switchdictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import json 16 | import os 17 | import shutil 18 | import tempfile 19 | import time 20 | from logging import getLogger 21 | from unittest import TestCase 22 | 23 | from sudachipy.dictionary import Dictionary 24 | from sudachipy.dictionarylib import SYSTEM_DICT_VERSION_2 25 | from sudachipy.dictionarylib.dictionarybuilder import DictionaryBuilder 26 | from sudachipy.dictionarylib.dictionaryheader import DictionaryHeader 27 | 28 | 29 | class TestSwitchDictionary(TestCase): 30 | 31 | def setUp(self): 32 | self.logger = getLogger() 33 | self.logger.disabled = True 34 | 35 | self.temp_dir = tempfile.mkdtemp() 36 | self.resource_dir = os.path.join(self.temp_dir, 'resources') 37 | os.makedirs(self.resource_dir) 38 | 39 | test_resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources') 40 | self.char_def_path = os.path.join(self.resource_dir, 'char.def') 41 | shutil.copy(os.path.join(test_resource_dir, 'char.def'), self.char_def_path) 42 | 43 | self.sudachi_json_path = os.path.join(self.resource_dir, 'sudachi.json') 44 | shutil.copy(os.path.join(test_resource_dir, 'sudachi.json'), self.sudachi_json_path) 45 | self._rewrite_json(self.sudachi_json_path, 'userDict', []) 46 | 47 | self.matrix_path = os.path.join(self.resource_dir, 'matrix.txt') 48 | with open(self.matrix_path, 'w', encoding='utf-8') as wf: 49 | wf.write('1 1\n0 0 200\n') 50 | 51 | small_lexs = ["島,0,0,0,島,名詞,普通名詞,一般,*,*,*,シマ,島,*,A,*,*,*"] 52 | core_lexs = ["徳島本町,0,0,0,徳島本町,名詞,固有名詞,地名,一般,*,*,トクシマホンチョウ,徳島本町,*,A,*,*,*,*"] 53 | notcore_lexs = ["徳島堰,0,0,0,徳島堰,名詞,固有名詞,一般,*,*,*,トクシマセギ,徳島堰,*,A,*,*,*"] 54 | 55 | small_lines = small_lexs 56 | core_lines = small_lexs + core_lexs 57 | full_lines = small_lexs + core_lexs + notcore_lexs 58 | 59 | self.small_txt_path = os.path.join(self.resource_dir, 'small.csv') 60 | self.core_txt_path = os.path.join(self.resource_dir, 'core.csv') 61 | self.full_txt_path = os.path.join(self.resource_dir, 'full.csv') 62 | 63 | self.small_dic_path = 
self._build_dictionary(self.small_txt_path, small_lines, 'small.dic') 64 | self.core_dic_path = self._build_dictionary(self.core_txt_path, core_lines, 'core.dic') 65 | self.full_dic_path = self._build_dictionary(self.full_txt_path, full_lines, 'full.dic') 66 | 67 | def tearDown(self): 68 | shutil.rmtree(self.temp_dir) 69 | 70 | @staticmethod 71 | def _rewrite_json(json_file_path, k, v): 72 | with open(json_file_path, 'r') as f: 73 | obj = json.load(f) 74 | obj[k] = v 75 | with open(json_file_path, 'w') as f: 76 | json.dump(obj, f, ensure_ascii=False, indent=4) 77 | 78 | def _build_dictionary(self, input_txt_path, lex_lines, dictionary_name): 79 | with open(input_txt_path, 'w', encoding='utf-8') as wf: 80 | wf.write("\n".join(lex_lines)) 81 | 82 | out_path = os.path.join(self.resource_dir, dictionary_name) 83 | out_stream = open(out_path, 'wb') 84 | lexicon_paths = [input_txt_path] 85 | matrix_input_stream = open(self.matrix_path, 'r', encoding='utf-8') 86 | 87 | header = DictionaryHeader(SYSTEM_DICT_VERSION_2, int(time.time()), 'test') 88 | out_stream.write(header.to_bytes()) 89 | builder = DictionaryBuilder(logger=self.logger) 90 | builder.build(lexicon_paths, matrix_input_stream, out_stream) 91 | out_stream.close() 92 | matrix_input_stream.close() 93 | 94 | return out_path 95 | 96 | def test_switch_dictionary(self): 97 | self._rewrite_json(self.sudachi_json_path, 'systemDict', 'small.dic') # relative path 98 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 99 | self.assertEqual(1, self.dict.lexicon.size()) 100 | self._rewrite_json(self.sudachi_json_path, 'systemDict', self.small_dic_path) # abstract path 101 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 102 | self.assertEqual(1, self.dict.lexicon.size()) 103 | 104 | self._rewrite_json(self.sudachi_json_path, 'systemDict', 'core.dic') 105 | self.dict = Dictionary(config_path=self.sudachi_json_path, 
resource_dir=self.resource_dir) 106 | self.assertEqual(2, self.dict.lexicon.size()) 107 | self._rewrite_json(self.sudachi_json_path, 'systemDict', self.core_dic_path) 108 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 109 | self.assertEqual(2, self.dict.lexicon.size()) 110 | 111 | self._rewrite_json(self.sudachi_json_path, 'systemDict', 'full.dic') 112 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 113 | self.assertEqual(3, self.dict.lexicon.size()) 114 | self._rewrite_json(self.sudachi_json_path, 'systemDict', self.full_dic_path) 115 | self.dict = Dictionary(config_path=self.sudachi_json_path, resource_dir=self.resource_dir) 116 | self.assertEqual(3, self.dict.lexicon.size()) 117 | -------------------------------------------------------------------------------- /sudachipy/utf8inputtextbuilder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . 
    def replace(self, begin, end, str_):
        """Replace modified_text[begin:end] with str_ and update the index map.

        modified_to_original maps every position of the modified text (plus a
        final sentinel entry) back to an index in the original text, so it
        always holds len(modified_text) + 1 entries.

        Raises:
            IndexError: if begin is negative, past the end of the text,
                or greater than end.
            AttributeError: if the replaced range is empty (begin == end).
        """
        if begin < 0:
            raise IndexError(begin)
        if begin > len(self.modified_text):
            raise IndexError("begin > length")
        if begin > end:
            raise IndexError("begin > end")
        if begin == end:
            raise AttributeError("begin == end")

        # Clamp end to the current text length; the range may overshoot.
        if end > len(self.modified_text):
            end = len(self.modified_text)

        # Splice the replacement between the untouched prefix and suffix.
        self.modified_text = str_.join([self.modified_text[:begin], self.modified_text[end:]])

        # Original-text indices of the replaced range's endpoints, captured
        # before the map is resized below.
        modified_begin = self.modified_to_original[begin]
        modified_end = self.modified_to_original[end]
        length = len(str_)
        # If the replacement is shorter than the replaced range, drop the
        # surplus map entries.
        if end - begin > length:
            del self.modified_to_original[begin + length:end]
        self.modified_to_original[begin] = modified_begin
        # Every subsequent replacement character maps to the original end
        # index: overwrite surviving slots, insert extras when the
        # replacement is longer than the replaced range.
        for i in range(1, length):
            if begin + i < end:
                self.modified_to_original[begin + i] = modified_end
            else:
                self.modified_to_original.insert(begin + i, modified_end)
= self.modified_to_original[-1] 77 | 78 | char_categories = self.get_char_category_types(modified_string_text) 79 | char_category_continuities = self.get_char_category_continuities(modified_string_text, length, char_categories) 80 | can_bow_list = self._build_can_bow_list(modified_string_text, char_categories) 81 | return utf8inputtext.UTF8InputText( 82 | self.grammar, self.original_text, modified_string_text, byte_text, 83 | offsets, byte_indexes, char_categories, char_category_continuities, can_bow_list) 84 | 85 | def get_char_category_types(self, text): 86 | return [self.grammar.get_character_category().get_category_types(ord(c)) for c in text] 87 | 88 | def get_char_category_continuities(self, text, byte_length, char_categories): 89 | if len(text) == 0: 90 | return [] 91 | char_category_continuities = [] 92 | i = 0 93 | while i < len(char_categories): 94 | next_ = i + self.get_char_category_continuous_length(char_categories, i) 95 | length = 0 96 | for j in range(i, next_): 97 | length += self.utf8_byte_length(ord(text[j])) 98 | for k in range(length, 0, -1): 99 | char_category_continuities.append(k) 100 | i = next_ 101 | return char_category_continuities 102 | 103 | def get_char_category_continuous_length(self, char_categories, offset): 104 | continuous_category = set(char_categories[offset]) 105 | for length in range(1, len(char_categories) - offset): 106 | continuous_category = continuous_category & char_categories[offset + length] 107 | if len(continuous_category) == 0: 108 | return length 109 | return len(char_categories) - offset 110 | 111 | def utf8_byte_length(self, cp): 112 | if cp < 0: 113 | return 0 114 | elif cp <= 0x7F: 115 | return 1 116 | elif cp <= 0x7FF: 117 | return 2 118 | elif cp <= 0xFFFF: 119 | return 3 120 | elif cp <= 0x10FFFF: 121 | return 4 122 | else: 123 | return 0 124 | 125 | def _build_can_bow_list(self, text, char_categories): 126 | if not text: 127 | return [] 128 | can_bow_list = [] 129 | for i, cat in 
enumerate(char_categories): 130 | if i == 0: 131 | can_bow_list.append(True) 132 | continue 133 | 134 | if CategoryType.ALPHA in cat or CategoryType.GREEK in cat or CategoryType.CYRILLIC in cat: 135 | types = cat & char_categories[i - 1] 136 | can_bow_list.append(not bool(types)) 137 | continue 138 | 139 | can_bow_list.append(True) 140 | 141 | return can_bow_list 142 | -------------------------------------------------------------------------------- /tests/plugin/test_prolongedsoundmarkinput.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Works Applications Co., Ltd. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class TestProlongedSoundMarkInputTextPlugin(TestCase):
    """Tests for ProlongedSoundMarkInputTextPlugin.

    The plugin collapses runs of prolonged sound marks ('ー', '〜', '〰')
    into a single mark; each test checks the normalized text, its UTF-8
    byte form, and the byte-offset -> original-character-index mapping.
    """

    def setUp(self) -> None:
        self.plugin = ProlongedSoundMarkInputTextPlugin(None)
        for mark in ('ー', '〜', '〰'):
            self.plugin._psm_set.add(ord(mark))

    def _rewrite(self, original):
        # Build an input text, apply the plugin, and return the built result.
        builder = UTF8InputTextBuilder(original, mocked_grammar)
        self.plugin.rewrite(builder)
        return builder.build()

    def _assert_rewritten(self, original, normalized, expected_bytes, index_map):
        # Shared assertion helper used by every test below.
        text = self._rewrite(original)
        self.assertEqual(original, text.original_text)
        self.assertEqual(normalized, text.get_text())
        byte_text = text.get_byte_text()
        self.assertEqual(len(expected_bytes), len(byte_text))
        self.assertEqual(expected_bytes, byte_text)
        for byte_offset, original_index in index_map:
            self.assertEqual(original_index, text.get_original_index(byte_offset))

    def test_combine_continuous_prolonged_sound_mark(self):
        self._assert_rewritten(
            'ゴーール', 'ゴール',
            b'\xe3\x82\xb4\xe3\x83\xbc\xe3\x83\xab',
            [(0, 0), (3, 1), (6, 3), (9, 4)])

    def test_combined_continuous_prolonged_sound_marks_at_end(self):
        self._assert_rewritten(
            'スーパーー', 'スーパー',
            b'\xe3\x82\xb9\xe3\x83\xbc\xe3\x83\x91\xe3\x83\xbc',
            [(0, 0), (3, 1), (6, 2), (9, 3), (12, 5)])

    def test_combine_continuous_prolonged_sound_marks_multi_times(self):
        self._assert_rewritten(
            'エーービーーーシーーーー', 'エービーシー',
            b'\xe3\x82\xa8\xe3\x83\xbc\xe3\x83\x93\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xbc',
            [(0, 0), (3, 1), (6, 3), (9, 4), (12, 7), (15, 8), (18, 12)])

    def test_combine_continuous_prolonged_sound_marks_multi_symbol_types(self):
        self._assert_rewritten(
            'エーービ〜〜〜シ〰〰〰〰', 'エービーシー',
            b'\xe3\x82\xa8\xe3\x83\xbc\xe3\x83\x93\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xbc',
            [(0, 0), (3, 1), (6, 3), (9, 4), (12, 7), (15, 8), (18, 12)])

    def test_combine_continuous_prolonged_sound_marks_multi_mixed_symbol_types(self):
        self._assert_rewritten(
            'エー〜ビ〜〰ーシ〰ー〰〜', 'エービーシー',
            b'\xe3\x82\xa8\xe3\x83\xbc\xe3\x83\x93\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xbc',
            [(0, 0), (3, 1), (6, 3), (9, 4), (12, 7), (15, 8), (18, 12)])
cdef class Lattice:
    """Viterbi lattice over a UTF-8 input text.

    ``end_lists[i]`` holds every node whose byte range ends at position ``i``;
    index 0 holds the single BOS node.  ``connect_node`` relaxes path costs
    left-to-right using the grammar's connection-cost matrix view.
    """

    def __init__(self, grammar: Grammar):
        # size: current input length in bytes; capacity: allocated end_lists length.
        self.size = 0
        self.capacity = 0


        self.end_lists = []
        self.grammar = grammar
        self.eos_params = grammar.get_eos_parameter()
        cdef LatticeNode bos_node = LatticeNode()
        bos_params = grammar.get_bos_parameter()
        # (left_id, right_id, cost) for the BOS node.
        bos_node.set_parameter(bos_params[0], bos_params[1], bos_params[2])
        bos_node.is_connected_to_bos = True
        self.end_lists.append([bos_node])
        # Matrix view over connection costs, indexed [left_id, right_id] below.
        self.connect_costs = self.grammar._matrix_view

    cpdef void resize(self, int size):
        # Grow the lattice if needed and place a fresh EOS node at `size`.
        if size > self.capacity:
            self.expand(size)
        self.size = size
        self.eos_node = LatticeNode()
        self.eos_node.set_parameter(self.eos_params[0], self.eos_params[1], self.eos_params[2])
        self.eos_node.begin = self.eos_node.end = size

    def clear(self) -> None:
        # Empty per-position node lists but keep capacity (and BOS at index 0).
        for i in range(1, self.size + 1):
            self.end_lists[i].clear()
        self.size = 0
        self.eos_node = None

    def expand(self, new_size: int) -> None:
        # Append empty lists so end_lists can be indexed up to new_size.
        expand_list = [[] for _ in range(self.size, new_size)]
        self.end_lists.extend(expand_list)
        self.capacity = new_size

    def get_nodes_with_end(self, end: int) -> List[LatticeNode]:
        """Return all nodes whose range ends at `end`."""
        return self.end_lists[end]

    def get_nodes(self, begin: int, end: int) -> List[LatticeNode]:
        """Return all nodes spanning exactly [begin, end)."""
        return [node for node in self.end_lists[end] if node.get_begin() == begin]

    def get_minimum_node(self, begin: int, end: int) -> Optional[LatticeNode]:
        """Return the node spanning [begin, end) with the lowest path cost,
        or None when no such node exists."""
        nodes = self.get_nodes(begin, end)
        if not nodes:
            return None
        min_arg = nodes[0]
        for node in nodes[1:]:
            if node.get_path_cost() < min_arg.get_path_cost():
                min_arg = node
        return min_arg

    cpdef void insert(self, int begin, int end, LatticeNode node):
        # Register the node and immediately relax its best path cost.
        self.end_lists[end].append(node)
        node.begin = begin
        node.end = end
        self.connect_node(node)

    def remove(self, begin: int, end: int, node: LatticeNode) -> None:
        # `begin` is unused here; kept for interface symmetry with insert().
        self.end_lists[end].remove(node)

    @staticmethod
    def create_node() -> LatticeNode:
        return LatticeNode()

    def has_previous_node(self, index: int) -> bool:
        """True when at least one node ends at `index`."""
        return bool(self.end_lists[index])

    cdef void connect_node(self, LatticeNode r_node):
        # Viterbi relaxation: choose the cheapest BOS-connected left
        # neighbour for r_node, or leave it disconnected if none exists.
        begin = r_node.begin
        r_node.total_cost = INT_MAX

        cdef LatticeNode l_node
        cdef int connect_cost
        for l_node in self.end_lists[begin]:
            if not l_node.is_connected_to_bos:
                continue
            connect_cost = self.connect_costs[r_node.left_id, l_node.right_id]

            # 0x7fff == Grammar.INHIBITED_CONNECTION:
            if connect_cost == 0x7fff:
                continue
            cost = l_node.total_cost + connect_cost
            if cost < r_node.total_cost:
                r_node.total_cost = cost
                r_node.best_previous_node = l_node

        r_node.is_connected_to_bos = r_node.best_previous_node is not None
        # Add the node's own word cost only after the best predecessor is fixed.
        r_node.total_cost += r_node.cost

    cdef void connect_eos_node(self):
        self.connect_node(self.eos_node)

    def get_best_path(self) -> List[LatticeNode]:
        """Return the minimum-cost path (excluding BOS/EOS), BOS-to-EOS order.

        Raises AttributeError when no path connects EOS back to BOS.
        """
        # self.connect_node(self.eos_node)
        if not self.eos_node.is_connected_to_bos:  # EOS node
            raise AttributeError("EOS is not connected to BOS")
        result = []
        # Walk best_previous_node links back to the BOS node, then reverse.
        node = self.eos_node.best_previous_node
        while node is not self.end_lists[0][0]:
            result.append(node)
            node = node.best_previous_node
        return list(reversed(result))

    def dump(self, logger):
        """Log every node (EOS first, then right-to-left) for debugging."""
        if logger.disabled:
            return
        index = 0
        for i in range(self.size + 1, -1, -1):
            # i > size selects the EOS node; otherwise nodes ending at i.
            r_nodes = self.end_lists[i] if i <= self.size else [self.eos_node]
            for r_node in r_nodes:
                surface = '(null)'
                pos = 'BOS/EOS'

                if r_node.is_defined():
                    wi = r_node.get_word_info()
                    surface = wi.surface
                    pos_id = wi.pos_id
                    pos = '(null)'
                    if pos_id >= 0:
                        pos = ','.join(self.grammar.get_part_of_speech_string(pos_id))

                # Connection costs from every possible left neighbour.
                costs = []
                for l_node in self.end_lists[r_node.get_begin()]:
                    cost = self.grammar.get_connect_cost(l_node.get_right_id(), r_node.get_left_id())
                    costs.append(str(cost))
                index += 1

                logger.info('%d: %d %d %s(%d) %s %d %d %d: %s' %
                            (index, r_node.get_begin(), r_node.get_end(),
                             surface, r_node.get_word_id(), pos, r_node.get_left_id(),
                             r_node.get_right_id(), r_node.get_path_cost(), ' '.join(costs)))
class CharacterCategory(object):
    """Maps Unicode code points to sets of category types.

    Ranges are read from a char.def file and compiled into a sorted,
    non-overlapping list so that lookups in get_category_types can use
    binary search.
    """

    class Range(object):
        """A half-open code-point interval [low, high) with its categories."""

        def __lt__(self, other):
            # Ordering by `high`; this is what the PriorityQueue in
            # _compile relies on.
            return self.high < other.high

        def __init__(self, low=0, high=0, categories=None):
            self.low = low
            self.high = high
            self.categories = categories or []

        def contains(self, cp):
            return self.low <= cp < self.high

        def containing_length(self, text):
            # Length of the prefix of `text` whose code points fall in this
            # range.  NOTE(review): the test is `c > self.high`, i.e. `high`
            # is treated as inclusive here, unlike contains() — confirm
            # whether that asymmetry is intentional before changing it.
            for i in range(len(text)):
                c = ord(text[i])
                if c < self.low or c > self.high:
                    return i
            return len(text)

        def lower(self, cp):
            # True when this whole range lies below cp.
            return self.high <= cp

        def higher(self, cp):
            # True when this whole range lies above cp.
            return self.low > cp

        def match(self, other):
            return self.low == other.low and self.high == other.high

    def __init__(self):
        self.range_list = []

    def _compile(self) -> None:
        """
        _compile transforms self.range_list to non overlapped range list
        to apply binary search in get_category_types
        :return:
        """
        # Sweep-line over range boundaries: `right_chain` holds ranges not
        # yet opened (sorted by low), `left_chain` orders the open ranges by
        # upcoming right boundary, `states` accumulates the categories of
        # all currently open ranges.
        self.range_list.sort(key=lambda x: x.high)
        self.range_list.sort(key=lambda x: x.low)
        new_range_list = []
        left_chain = PriorityQueue()
        right_chain = self.range_list
        states = []
        pivot = 0
        while True:
            if left_chain.empty():
                if not right_chain:
                    break
                right = right_chain.pop(0)
                left_chain.put(right)
                pivot = right.low
                states.extend(right.categories)
                continue
            left = left_chain.get()
            right = right_chain[0] if right_chain else None
            left_end = left.high
            right_begin = right.low if right else math.inf
            if left_end <= right_begin:
                # Next boundary is a closing edge: emit [pivot, left_end).
                new_range_list.append(self.Range(pivot, left_end, set(states)))
                pivot = left_end
                for cat in left.categories:
                    states.remove(cat)
                continue
            else:
                # Next boundary is an opening edge: emit [pivot, right_begin).
                new_range_list.append(self.Range(pivot, right_begin, set(states)))
                pivot = right_begin
                states.extend(right.categories)
                left_chain.put(right)
                left_chain.put(left)
                right_chain.pop(0)
        # Guard against an empty definition: the merge loop below indexes
        # new_range_list[0] and would raise IndexError otherwise.
        if not new_range_list:
            self.range_list = []
            return
        # Merge adjacent ranges that carry identical category sets.
        self.range_list = []
        _range = new_range_list[0]
        for irange in new_range_list[1:]:
            if irange.low == _range.high and irange.categories == _range.categories:
                _range = self.Range(_range.low, irange.high, _range.categories)
            else:
                self.range_list.append(_range)
                _range = irange
        self.range_list.append(_range)

    def get_category_types(self, code_point):
        """Return the category set for `code_point` via binary search over
        the compiled range list, falling back to {DEFAULT} when no range
        contains it."""
        begin = 0
        n = len(self.range_list)
        end = n
        pivot = (begin + end) // 2
        while 0 <= pivot < n:
            range_ = self.range_list[pivot]
            if range_.contains(code_point):
                return range_.categories
            if range_.lower(code_point):
                begin = pivot
            else:  # range_.higher(code_point)
                end = pivot
            new_pivot = (begin + end) // 2
            if new_pivot == pivot:
                break
            pivot = new_pivot
        return {categorytype.CategoryType.DEFAULT}

    def read_character_definition(self, char_def=None):
        """Populate range_list from a char.def file and compile it.

        :param char_def: path to the definition file; defaults to
            "char.def" in the current directory.
        :raises AttributeError: on malformed lines, invalid ranges, or
            unknown category names.
        """
        # `with` guarantees the file is closed on every exit path; the
        # previous version opened the file manually and had to call
        # f.close() by hand before each raise.
        path = char_def if char_def is not None else "char.def"
        with open(path, 'r', encoding="utf-8") as f:
            for i, line in enumerate(f.readlines()):
                line = line.rstrip()
                if re.fullmatch(r"\s*", line) or re.match("#", line):
                    continue
                cols = re.split(r"\s+", line)
                if len(cols) < 2:
                    raise AttributeError("invalid format at line {}".format(i))
                if not re.match("0x", cols[0]):
                    continue
                range_ = self.Range()
                # "0xLOW..0xHIGH" (inclusive) or a single "0xLOW".
                r = re.split("\\.\\.", cols[0])
                range_.low = int(r[0], 16)
                range_.high = range_.low + 1
                if len(r) > 1:
                    range_.high = int(r[1], 16) + 1
                if range_.low >= range_.high:
                    raise AttributeError("invalid range at line {}".format(i))
                for j in range(1, len(cols)):
                    if re.match("#", cols[j]) or cols[j] == '':
                        break
                    type_ = categorytype.CategoryType.get(cols[j])
                    if type_ is None:
                        raise AttributeError("{} is invalid type at line {}".format(cols[j], i))
                    range_.categories.append(type_)
                self.range_list.append(range_)
        self._compile()
class TestNumericParser(TestCase):
    """Unit tests for NumericParser normalization of Arabic/kanji numerals.

    Each test streams characters through the parser via parse() and checks
    the normalized result or, for rejected inputs, the recorded error state.
    """

    def setUp(self) -> None:
        self.parser = NumericParser()

    def test_digits(self):
        self.assertTrue(self.parse('1000'))
        self.assertEqual('1000', self.parser.get_normalized())
        self.parser.clear()

    def test_starts_with_zero(self):
        self.assertTrue(self.parse('001000'))
        self.assertEqual('001000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('〇一〇〇〇'))
        self.assertEqual('01000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('00.1000'))
        self.assertEqual('00.1', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('000'))
        self.assertEqual('000', self.parser.get_normalized())
        self.parser.clear()

    def test_use_small_unit(self):
        self.assertTrue(self.parse('二十七'))
        self.assertEqual('27', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七'))
        self.assertEqual('1327', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千十七'))
        self.assertEqual('1017', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七.〇五'))
        self.assertEqual('1327.05', self.parser.get_normalized())
        self.parser.clear()

        self.assertFalse(self.parse('三百二十百'))
        self.parser.clear()

    def test_use_large_unit(self):
        self.assertTrue(self.parse('1万'))
        self.assertEqual('10000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七万'))
        self.assertEqual('13270000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七万一四'))
        self.assertEqual('13270014', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('千三百二十七万一四.〇五'))
        self.assertEqual('13270014.05', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('三兆2千億千三百二十七万一四.〇五'))
        self.assertEqual('3200013270014.05', self.parser.get_normalized())
        self.parser.clear()

        self.assertFalse(self.parse('億万'))
        self.parser.clear()

    def test_float_with_unit(self):
        self.assertTrue(self.parse('1.5千'))
        self.assertEqual('1500', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('1.5百万'))
        self.assertEqual('1500000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('1.5百万1.5千20'))
        self.assertEqual('1501520', self.parser.get_normalized())
        self.parser.clear()

        self.assertFalse(self.parse('1.5千5百'))
        self.parser.clear()

        self.assertFalse(self.parse('1.5千500'))
        self.parser.clear()

    def test_log_numeric(self):
        self.assertTrue(self.parse('200000000000000000000万'))
        self.assertEqual('2000000000000000000000000', self.parser.get_normalized())
        self.parser.clear()

    def test_with_comma(self):
        self.assertTrue(self.parse('2,000,000'))
        self.assertEqual('2000000', self.parser.get_normalized())
        self.parser.clear()

        self.assertTrue(self.parse('259万2,300'))
        self.assertEqual('2592300', self.parser.get_normalized())
        self.parser.clear()

        self.assertFalse(self.parse('200,00,000'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse('2,4'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse('000,000'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse(',000'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse('256,55.1'))
        self.assertEqual(NumericParser.Error.COMMA, self.parser._error_state)
        self.parser.clear()

    def test_not_digit(self):
        self.assertFalse(self.parse('@@@'))
        self.parser.clear()

    def test_float_point(self):
        self.assertTrue(self.parse('6.0'))
        self.assertEqual('6', self.parser.get_normalized())
        self.parser.clear()

        # BUG FIX: these two checks read `self.parser.error_state`, which is
        # inconsistent with every other assertion in this class (they all
        # read the `_error_state` attribute, e.g. in test_with_comma).
        self.assertFalse(self.parse('6.'))
        self.assertEqual(NumericParser.Error.POINT, self.parser._error_state)
        self.parser.clear()

        self.assertFalse(self.parse('1.2.3'))
        self.assertEqual(NumericParser.Error.POINT, self.parser._error_state)
        self.parser.clear()

    def parse(self, text: str) -> bool:
        # Feed `text` one character at a time, mirroring the parser's
        # streaming append/done API.
        for c in text:
            if not self.parser.append(c):
                return False
        return self.parser.done()
class MeCabOovPlugin(OovProviderPlugin):
    """OOV provider configured MeCab-style from char.def and unk.def files.

    For each character category at the lookup position it emits OOV nodes
    built from the unk.def entries registered for that category.
    """

    class CategoryInfo:
        # Per-category behaviour from char.def columns: invoke even when
        # dictionary words exist (is_invoke), group the whole continuous
        # span into one word (is_group), and how many code-point prefixes
        # to emit (length).
        def __init__(self):
            self.type_ = None
            self.is_invoke = None
            self.is_group = None
            self.length = None

    class OOV:
        # Connection parameters and POS id for one unk.def entry.
        def __init__(self):
            self.left_id = -1
            self.right_id = -1
            self.cost = -1
            self.pos_id = None

    def __init__(self, json_obj=None):
        if json_obj:
            self.__chardef_filename = json_obj['charDef']
            self.__unkdef_filename = json_obj['unkDef']
        else:
            # Filenames may legitimately be missing; set_up reports this.
            self.__chardef_filename = None
            self.__unkdef_filename = None
        self.categories = {}
        self.oov_list = defaultdict(list)

    def set_up(self, grammar):
        """Load char.def and unk.def from the configured resource directory.

        :raises AttributeError: when either filename was not configured.
        """
        # BUG FIX: the filenames must be validated *before* os.path.join.
        # The previous `if not char_def` check ran on the joined path, which
        # is always a non-empty string -- and os.path.join raises TypeError
        # when the filename is None -- so the intended error never fired.
        if not self.__chardef_filename:
            raise AttributeError("charDef is not defined")
        char_def = os.path.join(config.settings.resource_dir, self.__chardef_filename)
        self.read_character_property(char_def)

        if not self.__unkdef_filename:
            raise AttributeError("unkDef is not defined")
        unk_def = os.path.join(config.settings.resource_dir, self.__unkdef_filename)
        self.read_oov(unk_def, grammar)

    def provide_oov(self, input_text, offset, has_other_words):
        """Return OOV lattice nodes for the categories found at `offset`."""
        length = input_text.get_char_category_continuous_length(offset)
        if length < 1:
            return []
        nodes = []
        for type_ in input_text.get_char_category_types(offset):
            if type_ not in self.categories:
                continue
            cinfo = self.categories[type_]
            llength = length
            if cinfo.type_ not in self.oov_list:
                continue
            oovs = self.oov_list[cinfo.type_]
            # Categories marked not-invoke only fire when nothing else matched.
            if not cinfo.is_invoke and has_other_words:
                continue
            if cinfo.is_group:
                # One node covering the whole continuous category span.
                s = input_text.get_substring(offset, offset + length)
                for oov in oovs:
                    nodes.append(self.get_oov_node(s, oov, length))
                llength -= 1
            # Additional nodes for prefixes of 1..length code points.
            for i in range(1, cinfo.length + 1):
                sublength = input_text.get_code_points_offset_length(offset, i)
                if sublength > llength:
                    break
                s = input_text.get_substring(offset, offset + sublength)
                for oov in oovs:
                    nodes.append(self.get_oov_node(s, oov, sublength))
        return nodes

    def get_oov_node(self, text, oov, length):
        """Build a lattice node for `text` using the entry's parameters."""
        node = self.create_node()
        node.set_parameter(oov.left_id, oov.right_id, oov.cost)
        info = wordinfo.WordInfo(surface=text, head_word_length=length, pos_id=oov.pos_id, normalized_form=text,
                                 dictionary_form_word_id=-1, dictionary_form=text, reading_form="",
                                 a_unit_split=[], b_unit_split=[], word_structure=[], synonym_group_ids=[])
        node.set_word_info(info)
        return node

    def read_character_property(self, char_def):
        """Parse char.def category lines into self.categories.

        :raises ValueError: on malformed, unknown, or duplicate categories.
        """
        with open(char_def, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, start=1):
                line = line.strip()
                # Skip blanks, comments, and code-point range lines ("0x...").
                if not line or line.startswith("#") or line.startswith("0x"):
                    continue
                cols = line.split()
                if len(cols) < 4:
                    raise ValueError("invalid format at line {}".format(i))
                try:
                    type_ = getattr(categorytype.CategoryType, cols[0])
                except AttributeError:
                    raise ValueError("`{}` is invalid type at line {}".format(cols[0], i))
                if type_ in self.categories:
                    raise ValueError("`{}` is already defined at line {}".format(cols[0], i))

                info = self.CategoryInfo()
                info.type_ = type_
                info.is_invoke = (cols[1] != "0")
                info.is_group = (cols[2] != "0")
                info.length = int(cols[3])
                self.categories[type_] = info

    def read_oov(self, unk_def, grammar):
        """Parse unk.def CSV entries into self.oov_list, keyed by category.

        :raises ValueError: on malformed lines or categories missing from
            char.def.
        """
        with open(unk_def, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                cols = line.split(",")
                if len(cols) < 10:
                    raise ValueError("invalid format at line {}".format(i))
                try:
                    type_ = getattr(categorytype.CategoryType, cols[0])
                except AttributeError:
                    raise ValueError("`{}` is invalid type at line {}".format(cols[0], i))
                if type_ not in self.categories:
                    raise ValueError("`{}` is undefined at line {}".format(cols[0], i))

                oov = self.OOV()
                oov.left_id = int(cols[1])
                oov.right_id = int(cols[2])
                oov.cost = int(cols[3])
                # Columns 4..9 form the six-part POS tag.
                pos = cols[4:10]
                oov.pos_id = grammar.get_part_of_speech_id(pos)
                self.oov_list[type_].append(oov)
class TestJoinNumericOOVPlugin(unittest.TestCase):
    """Integration tests for JoinNumericPlugin path rewriting.

    Builds a lattice for each input with the test dictionary, takes the
    best path, and checks how the plugin joins (and optionally normalizes)
    numeric nodes.
    """

    def setUp(self):
        # BUG FIX: removed a stray `pass` statement that was dead code at
        # the top of this method.
        resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
        self.dict_ = Dictionary(os.path.join(resource_dir, 'numeric_sudachi.json'), resource_dir)
        self.tokenizer = self.dict_.create()
        self.plugin = JoinNumericPlugin(None)
        self.plugin.set_up(self.dict_.grammar)

    def test_digit(self):
        path = self.get_path('123円20銭')
        self.assertEqual(4, len(path))
        self.assertEqual('123', path[0].get_word_info().surface)
        self.assertEqual('20', path[2].get_word_info().surface)

        path = self.get_path('080-121')
        self.assertEqual(3, len(path))
        self.assertEqual('080', path[0].get_word_info().surface)
        self.assertEqual('121', path[2].get_word_info().surface)

    def test_kanji_numeric(self):
        path = self.get_path('一二三万二千円')
        self.assertEqual(2, len(path))
        self.assertEqual('一二三万二千', path[0].get_word_info().surface)

        path = self.get_path('二百百')
        self.assertEqual(3, len(path))

    def test_normalize(self):
        self.plugin._enable_normalize = True
        path = self.get_path('一二三万二千円')
        self.assertEqual(2, len(path))
        self.assertEqual('1232000', path[0].get_word_info().normalized_form)

    def test_normalized_with_not_numeric(self):
        self.plugin._enable_normalize = True
        path = self.get_path('六三四')
        self.assertEqual(1, len(path))
        self.assertEqual('六三四', path[0].get_word_info().normalized_form)

    def test_point(self):
        self.plugin._enable_normalize = True

        path = self.get_path('1.002')
        self.assertEqual(1, len(path))
        self.assertEqual('1.002', path[0].get_word_info().normalized_form)

        path = self.get_path('.002')
        self.assertEqual(2, len(path))
        self.assertEqual('.', path[0].get_word_info().normalized_form)
        self.assertEqual('002', path[1].get_word_info().normalized_form)

        path = self.get_path('22.')
        self.assertEqual(2, len(path))
        self.assertEqual('22', path[0].get_word_info().normalized_form)
        self.assertEqual('.', path[1].get_word_info().normalized_form)

        path = self.get_path('22.節')
        self.assertEqual(3, len(path))
        self.assertEqual('22', path[0].get_word_info().normalized_form)
        self.assertEqual('.', path[1].get_word_info().normalized_form)

        path = self.get_path('.c')
        self.assertEqual(2, len(path))
        self.assertEqual('.', path[0].get_word_info().normalized_form)

        path = self.get_path('1.20.3')
        self.assertEqual(5, len(path))
        self.assertEqual('20', path[2].get_word_info().normalized_form)

        path = self.get_path('652...')
        self.assertEqual(4, len(path))
        self.assertEqual('652', path[0].get_word_info().normalized_form)

    def test_comma(self):
        self.plugin._enable_normalize = True

        path = self.get_path('2,000,000')
        self.assertEqual(1, len(path))
        self.assertEqual('2000000', path[0].get_word_info().normalized_form)

        path = self.get_path('2,00,000,000円')
        self.assertEqual(8, len(path))
        self.assertEqual('2', path[0].get_word_info().normalized_form)
        self.assertEqual(',', path[1].get_word_info().normalized_form)
        self.assertEqual('00', path[2].get_word_info().normalized_form)
        self.assertEqual(',', path[3].get_word_info().normalized_form)
        self.assertEqual('000', path[4].get_word_info().normalized_form)
        self.assertEqual(',', path[5].get_word_info().normalized_form)
        self.assertEqual('000', path[6].get_word_info().normalized_form)

        path = self.get_path(',')
        self.assertEqual(1, len(path))

        path = self.get_path('652,,,')
        self.assertEqual(4, len(path))
        self.assertEqual('652', path[0].get_word_info().normalized_form)

        path = self.get_path('256,5.50389')
        self.assertEqual(3, len(path))
        self.assertEqual('256', path[0].get_word_info().normalized_form)
        self.assertEqual('5.50389', path[2].get_word_info().normalized_form)

        path = self.get_path('256,550.389')
        self.assertEqual(1, len(path))
        self.assertEqual('256550.389', path[0].get_word_info().normalized_form)

    def test_single_node(self):
        self.plugin._enable_normalize = False
        path = self.get_path('猫三匹')
        self.assertEqual(3, len(path))
        self.assertEqual('三', path[1].get_word_info().normalized_form)

        self.plugin._enable_normalize = True
        path = self.get_path('猫三匹')
        self.assertEqual(3, len(path))
        self.assertEqual('3', path[1].get_word_info().normalized_form)

    def get_path(self, text: str):
        # Tokenize `text`, apply the rewrite plugin to the best path, and
        # return the rewritten path; the lattice is cleared for the next call.
        input_ = UTF8InputTextBuilder(text, self.tokenizer._grammar).build()
        self.tokenizer._build_lattice(input_)
        path = self.tokenizer._lattice.get_best_path()
        self.plugin.rewrite(input_, path, self.tokenizer._lattice)
        self.tokenizer._lattice.clear()
        return path


if __name__ == '__main__':
    unittest.main()