├── .github
├── FUNDING.yml
├── macos-build.sh
└── workflows
│ ├── actions
│ ├── build-manylinux-aarch64
│ │ └── action.yml
│ └── build-manylinux
│ │ └── action.yml
│ ├── entrypoint.sh
│ ├── manylinux1.yml
│ ├── osx.yml
│ ├── test_manylinux.yml
│ └── windows.yml
├── .gitignore
├── CITATION.cff
├── Dockerfile
├── LICENSE
├── LICENSE.mecab
├── README.md
├── fugashi.png
├── fugashi
├── __init__.py
├── cli.py
├── fugashi.pyx
├── include
│ └── mecab
│ │ └── mecab.h
├── mecab.pxd
└── tests
│ ├── test_basic.py
│ ├── test_ipadic.py
│ └── test_nbest.py
├── fugashi_util.py
├── pyproject.toml
└── setup.py
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: polm
2 |
--------------------------------------------------------------------------------
/.github/macos-build.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build a universal (arm64 + x86_64) libmecab on a macOS runner, then build
# fugashi wheels with cibuildwheel. Run from the repository root.
set -euo pipefail

FLAGS="--enable-utf8-only"
X86_TRIPLET=x86_64-apple-macos10.9
ARM_TRIPLET=arm64-apple-macos11

git clone --depth=1 https://github.com/taku910/mecab.git
cd mecab/mecab

# Clean up leftovers from any previous (cached) run.
rm -rf src/.libs-arm64 src/.libs-x86_64 src/.libs.combined

# Build the arm64 slice. $FLAGS is deliberately unquoted so a single
# variable can carry multiple configure flags.
# (Fixed: the host triplet previously had a trailing space inside the quotes.)
./configure $FLAGS --host="arm-apple-darwin22.1.0" CXX="clang++ -target $ARM_TRIPLET" CC="clang"

make clean
# nproc doesn't exist on the macOS runner; use sysctl instead
make -j"$(sysctl -n hw.logicalcpu_max)"

mv src/.libs src/.libs-arm64

# Build the x86_64 slice.
./configure $FLAGS --host="x86_64-apple-darwin22.1.0" CXX="clang++ -target $X86_TRIPLET" CC="clang"

make clean
make -j"$(sysctl -n hw.logicalcpu_max)"

mv src/.libs src/.libs-x86_64

# Merge the two builds into universal binaries with lipo.
rm -rf src/.libs.combined
mkdir src/.libs.combined

lipo -create src/.libs-arm64/libmecab.2.dylib src/.libs-x86_64/libmecab.2.dylib -output src/.libs.combined/libmecab.2.dylib

lipo -create src/.libs-arm64/libmecab.a src/.libs-x86_64/libmecab.a -output src/.libs.combined/libmecab.a

cp src/.libs-arm64/libmecab.lai src/.libs.combined/libmecab.lai

# lipo the remaining objects/binaries pairwise by filename.
# (Was `ls ... | while read`; a glob loop is safe against word-splitting.)
for artifact in src/.libs-arm64/*.o src/.libs-arm64/mecab*; do
    echo "$artifact"
    lipo -create "$artifact" "src/.libs-x86_64/$(basename "$artifact")" -output "src/.libs.combined/$(basename "$artifact")"
done

cd src/.libs.combined
ln -s ../libmecab.la libmecab.la
ln -s libmecab.2.dylib libmecab.dylib
cd ../..
mv src/.libs.combined src/.libs

sudo make install
cd ../..

python -m pip install --upgrade pip
python -m pip install cibuildwheel==2.23.3

python -m cibuildwheel --platform macos --archs x86_64,arm64,universal2 --output-dir dist
--------------------------------------------------------------------------------
/.github/workflows/actions/build-manylinux-aarch64/action.yml:
--------------------------------------------------------------------------------
1 | name: build linux aarch64 wheels with manylinux docker image
2 | runs:
3 | using: 'docker'
4 | image: docker://quay.io/pypa/manylinux2014_aarch64
5 | args:
6 | - .github/workflows/entrypoint.sh
7 |
--------------------------------------------------------------------------------
/.github/workflows/actions/build-manylinux/action.yml:
--------------------------------------------------------------------------------
1 | name: build wheels with manylinux docker image
2 | runs:
3 | using: 'docker'
4 | image: docker://quay.io/pypa/manylinux2014_x86_64
5 | args:
6 | - .github/workflows/entrypoint.sh
7 |
--------------------------------------------------------------------------------
/.github/workflows/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Install mecab, then build wheels
# Runs inside the pypa manylinux docker image: installs MeCab from source,
# builds one wheel per CPython version, then vendors libmecab into each
# wheel with auditwheel.
set -e

# install MeCab
# TODO specify the commit used here
git clone --depth=1 https://github.com/taku910/mecab.git
cd mecab/mecab
if [ "$(uname -m)" == "aarch64" ]
then
    # autoconf's platform guess is unreliable under QEMU emulation,
    # so pass the build triplet explicitly.
    ./configure --enable-utf8-only --build=aarch64-unknown-linux-gnu
else
    ./configure --enable-utf8-only
fi
make
make install

# Hack so the freshly installed libmecab is found at build/repair time.
# see here:
# https://github.com/RalfG/python-wheels-manylinux-build/issues/26
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/

# Build one wheel per supported CPython version.
Python="cp39-cp39 cp310-cp310 cp311-cp311 cp312-cp312 cp313-cp313"
for PYVER in $Python; do
    # build the wheels
    "/opt/python/$PYVER/bin/pip" wheel /github/workspace -w /github/workspace/wheels || { echo "Failed while building $PYVER wheel"; exit 1; }
done

# fix the wheels (bundles libs)
for wheel in /github/workspace/wheels/*.whl; do
    if [ "$(uname -m)" == "aarch64" ]
    then
        auditwheel repair "$wheel" --plat manylinux2014_aarch64 -w /github/workspace/manylinux-aarch64-wheels
    else
        auditwheel repair "$wheel" --plat manylinux2014_x86_64 -w /github/workspace/manylinux2014-wheels
    fi
done

echo "Built wheels:"
if [ "$(uname -m)" == "aarch64" ]
then
    ls /github/workspace/manylinux-aarch64-wheels
else
    ls /github/workspace/manylinux2014-wheels
fi
--------------------------------------------------------------------------------
/.github/workflows/manylinux1.yml:
--------------------------------------------------------------------------------
1 | name: Build manylinux1 wheels
2 |
3 | on: [push]
4 |
5 | jobs:
6 | build_sdist:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - name: Set up Python
11 | uses: actions/setup-python@v5
12 | with:
13 | python-version: '>=3.9 <3.14'
14 | - name: install MeCab
15 | run: |
16 | git clone --depth=1 https://github.com/taku910/mecab.git
17 | cd mecab/mecab
18 | ./configure --enable-utf8-only
19 | make
20 | sudo make install
21 | cd ../..
22 | - name: build sdist
23 | run: |
24 | python -m pip install --upgrade pip
25 | pip install twine build
26 | python -m build
27 | - name: upload to pypi if tagged
28 | if: startsWith(github.ref, 'refs/tags')
29 | env:
30 | TWINE_USERNAME: __token__
31 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
32 | run: |
33 | twine upload dist/fugashi*.tar.gz
34 |
35 | build_linux:
36 | runs-on: ubuntu-latest
37 | steps:
38 | - uses: actions/checkout@v4
39 | - name: Set up Python
40 | uses: actions/setup-python@v5
41 | with:
42 | python-version: '>=3.9 <3.14'
43 | - name: build array of wheels
44 | uses: ./.github/workflows/actions/build-manylinux/
45 | - name: Upload Wheels
46 | uses: actions/upload-artifact@v4
47 | with:
48 | name: manylinux2014-wheels
49 | path: manylinux2014-wheels
50 | - name: Publish to PyPI if tagged
51 | if: startsWith(github.ref, 'refs/tags')
52 | env:
53 | TWINE_USERNAME: __token__
54 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
55 | run: |
56 | python --version
57 | pip --version
58 | python -m pip install --upgrade pip
59 | pip install twine
60 | twine upload manylinux2014-wheels/fugashi*whl
61 |
62 | build_linux-aarch64:
63 | runs-on: ubuntu-latest
64 | steps:
65 | - uses: actions/checkout@v4
66 | - name: Set up Python
67 | uses: actions/setup-python@v5
68 | with:
69 | python-version: '>=3.9 <3.14'
70 | - name: Set up QEMU
71 | id: qemu
72 | uses: docker/setup-qemu-action@v1
73 | - uses: ./.github/workflows/actions/build-manylinux-aarch64/
74 | - name: Upload Wheels
75 | uses: actions/upload-artifact@v4
76 | with:
77 | name: manylinux-aarch64-wheels
78 | path: manylinux-aarch64-wheels
79 | - name: Publish to PyPI if tagged
80 | if: startsWith(github.ref, 'refs/tags')
81 | env:
82 | TWINE_USERNAME: __token__
83 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
84 | run: |
85 | python --version
86 | pip --version
87 | python -m pip install --upgrade pip
88 | pip install twine
89 | twine upload manylinux-aarch64-wheels/fugashi*whl
90 |
--------------------------------------------------------------------------------
/.github/workflows/osx.yml:
--------------------------------------------------------------------------------
1 | name: Build OSX wheels
2 |
3 | env:
4 | CIBW_ARCHS_MACOS: "x86_64"
5 |
6 | on: [push]
7 |
8 | jobs:
9 | build_osx:
10 | runs-on: macos-latest
11 | steps:
12 | - uses: actions/checkout@v4
13 | - name: Set up python
14 | uses: actions/setup-python@v5
15 | with:
16 | python-version: '>=3.8 <3.14'
17 | - name: Download and build MeCab
18 | shell: bash
19 | run: |
20 | .github/macos-build.sh
21 |
22 | - name: Upload Wheels
23 | uses: actions/upload-artifact@v4
24 | with:
25 | name: osx-wheels
26 | path: dist
27 | - name: Publish to PyPI if tagged
28 | if: startsWith(github.ref, 'refs/tags')
29 | env:
30 | TWINE_USERNAME: __token__
31 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
32 | run: |
33 | pip install twine
34 | twine upload dist/fugashi*
35 |
36 |
--------------------------------------------------------------------------------
/.github/workflows/test_manylinux.yml:
--------------------------------------------------------------------------------
1 | name: test-manylinux
2 |
3 | on:
4 | push:
5 |
6 | jobs:
7 | test_linux:
8 | runs-on: ubuntu-latest
9 | strategy:
10 | matrix:
11 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
12 | include:
13 | - python-version: '3.9'
14 | py-short: '39'
15 | py-short2: '39'
16 | - python-version: '3.10'
17 | py-short: '310'
18 | py-short2: '310'
19 | - python-version: '3.11'
20 | py-short: 311
21 | py-short2: 311
22 | - python-version: '3.12'
23 | py-short: 312
24 | py-short2: 312
25 | - python-version: '3.13'
26 | py-short: 313
27 | py-short2: 313
28 | env:
29 | PYTHON: /opt/python/cp${{ matrix.py-short }}-cp${{ matrix.py-short2 }}/bin/python
30 | steps:
31 | - uses: actions/checkout@v3
32 | - run: docker build -t fugashi .
33 | - name: setup and test
34 | run: docker run -v $(pwd):/workdir -w /workdir fugashi sh -c "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/ && $PYTHON -m pip install cython pytest wheel unidic-lite ipadic && $PYTHON -m pip install -e . && $PYTHON -m pytest"
35 |
--------------------------------------------------------------------------------
/.github/workflows/windows.yml:
--------------------------------------------------------------------------------
1 | name: Build Python Windows wheels
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | create:
8 |
9 |
10 | jobs:
11 | build_windows:
12 | runs-on: windows-latest
13 | strategy:
14 | max-parallel: 5
15 | matrix:
16 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
17 | include:
18 | - python-version: '3.9'
19 | py-short: '39'
20 | py-short2: '39'
21 | - python-version: '3.10'
22 | py-short: '310'
23 | py-short2: '310'
24 | - python-version: '3.11'
25 | py-short: 311
26 | py-short2: 311
27 | - python-version: '3.12'
28 | py-short: 312
29 | py-short2: 312
30 | - python-version: '3.13'
31 | py-short: 313
32 | py-short2: 313
33 | steps:
34 | - uses: actions/checkout@v4
35 | - name: Set up Python ${{ matrix.python-version }}
36 | uses: actions/setup-python@v5
37 | with:
38 | python-version: ${{ matrix.python-version }}
39 | - name: Cache mecab
40 | id: cache-mecab
41 | uses: actions/cache@v4
42 | with:
43 | path: C:/mecab
44 | key: mecab-win-build
45 | - name: Download MeCab Win and Unzip it
46 | if: steps.cache-mecab.outputs.cache-hit != 'true'
47 | shell: bash
48 | run: |
49 | curl -LO "https://github.com/chezou/mecab/releases/download/mecab-0.996-msvc-5/mecab-msvc-x64.zip"
50 | unzip -o "mecab-msvc-x64.zip" -d c:/mecab
51 | - name: Install dependencies
52 | run: |
53 | python -m pip install --upgrade pip setuptools
54 | pip install build delvewheel setuptools-scm
55 | - name: Build wheel
56 | run: |
57 | python -m build --wheel
58 | env:
59 | FUGASHI_NO_BUNDLE_DLL: 1
60 | - name: Repair wheel
61 | run: |
62 | python -m delvewheel repair --add-path=C:/mecab ./dist/fugashi-*.whl
63 | - name: Upload Wheel
64 | uses: actions/upload-artifact@v4
65 | with:
66 | name: win-wheels-${{ matrix.python-version }}
67 | path: wheelhouse
68 | - name: Check wheels
69 | shell: bash
70 | run: |
71 | ls -la
72 | VERSION=$(python -m setuptools_scm)
73 | pip install "wheelhouse/fugashi-${VERSION}-cp${{ matrix.py-short }}-cp${{ matrix.py-short2 }}-win_amd64.whl"
74 | - name: Publish to PyPI if tagged
75 | if: startsWith(github.ref, 'refs/tags')
76 | env:
77 | TWINE_USERNAME: __token__
78 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
79 | shell: bash
80 | run: |
81 | pip install twine
82 | twine upload wheelhouse/fugashi*
83 |
84 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # Environments
101 | .env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | # mkdocs documentation
117 | /site
118 |
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 |
124 | # Pyre type checker
125 | .pyre/
126 |
127 | /fugashi/fugashi.c
128 | /fugashi/libmecab.dll
129 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | preferred-citation:
3 | type: article
4 | message: "If you use fugashi in research, it would be appreciated if you cite this paper."
5 | authors:
6 | - family-names: "McCann"
7 | given-names: "Paul"
8 | orcid: "https://orcid.org/0000-0003-3376-8772"
9 | title: "fugashi, a Tool for Tokenizing Japanese in Python"
10 | doi: "10.18653/v1/2020.nlposs-1.7"
11 | journal: "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)"
12 | year: 2020
13 | month: 11
14 | start: 44
15 | end: 51
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM quay.io/pypa/manylinux2014_x86_64
2 |
3 | RUN git clone --depth=1 https://github.com/taku910/mecab.git && \
4 | cd mecab/mecab && \
5 | ./configure --enable-utf8-only && \
6 | make && \
7 | make install
8 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Paul O'Leary McCann
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/LICENSE.mecab:
--------------------------------------------------------------------------------
1 | Copyright (c) 2001-2008, Taku Kudo
2 | Copyright (c) 2004-2008, Nippon Telegraph and Telephone Corporation
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without modification, are
6 | permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above
9 | copyright notice, this list of conditions and the
10 | following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above
13 | copyright notice, this list of conditions and the
14 | following disclaimer in the documentation and/or other
15 | materials provided with the distribution.
16 |
17 | * Neither the name of the Nippon Telegraph and Telegraph Corporation
18 | nor the names of its contributors may be used to endorse or
19 | promote products derived from this software without specific
20 | prior written permission.
21 |
22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
23 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
28 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://fugashi.streamlit.app)
2 | [](https://pypi.org/project/fugashi/)
3 | 
4 | [](https://pypi.org/project/fugashi/)
5 | 
6 |
7 | # fugashi
8 |
9 |
10 |
11 | fugashi is a Cython wrapper for [MeCab](https://taku910.github.io/mecab/), a
12 | Japanese tokenizer and morphological analysis tool. Wheels are provided for
13 | Linux, OSX (Intel), and Win64, and UniDic is [easy to install](#installing-a-dictionary).
14 |
15 | **issueを英語で書く必要はありません。**
16 |
17 | Check out the [interactive demo][], see the [blog post](https://www.dampfkraft.com/nlp/fugashi.html) for background
18 | on why fugashi exists and some of the design decisions, or see [this
19 | guide][guide] for a basic introduction to Japanese tokenization.
20 |
21 | [guide]: https://www.dampfkraft.com/nlp/how-to-tokenize-japanese.html
22 | [interactive demo]: https://fugashi.streamlit.app
23 |
24 | If you are on a platform for which wheels are not provided, you'll need to
25 | install MeCab first. It's recommended you install [from
26 | source](https://github.com/taku910/mecab). If you need to build from source on
27 | Windows, [@chezou's fork](https://github.com/chezou/mecab) is recommended; see
28 | [issue #44](https://github.com/polm/fugashi/issues/44#issuecomment-954426115)
29 | for an explanation of the problems with the official repo.
30 |
31 | Known platforms without wheels:
32 |
33 | - musl-based distros like alpine [#77](https://github.com/polm/fugashi/issues/77)
34 | - PowerPC
35 | - Windows 32bit
36 |
37 | ## Usage
38 |
39 | ```python
40 | from fugashi import Tagger
41 |
42 | tagger = Tagger('-Owakati')
43 | text = "麩菓子は、麩を主材料とした日本の菓子。"
44 | tagger.parse(text)
45 | # => '麩 菓子 は 、 麩 を 主材 料 と し た 日本 の 菓子 。'
46 | for word in tagger(text):
47 | print(word, word.feature.lemma, word.pos, sep='\t')
48 | # "feature" is the Unidic feature data as a named tuple
49 | ```
50 |
51 | ## Installing a Dictionary
52 |
53 | fugashi requires a dictionary. [UniDic](https://unidic.ninjal.ac.jp/) is
54 | recommended, and two easy-to-install versions are provided.
55 |
56 | - [unidic-lite](https://github.com/polm/unidic-lite), a slightly modified version 2.1.2 of Unidic (from 2013) that's relatively small
57 | - [unidic](https://github.com/polm/unidic-py), the latest UniDic 3.1.0, which is 770MB on disk and requires a separate download step
58 |
59 | If you just want to make sure things work you can start with `unidic-lite`, but
60 | for more serious processing `unidic` is recommended. For production use you'll
61 | generally want to generate your own dictionary too; for details see the [MeCab
62 | documentation](https://taku910.github.io/mecab/learn.html).
63 |
64 | To get either of these dictionaries, you can install them directly using `pip`
65 | or do the below:
66 |
67 | ```sh
68 | pip install 'fugashi[unidic-lite]'
69 |
70 | # The full version of UniDic requires a separate download step
71 | pip install 'fugashi[unidic]'
72 | python -m unidic download
73 | ```
74 |
75 | For more information on the different MeCab dictionaries available, see [this article](https://www.dampfkraft.com/nlp/japanese-tokenizer-dictionaries.html).
76 |
77 | ## Dictionary Use
78 |
79 | fugashi is written with the assumption you'll use Unidic to process Japanese,
80 | but it supports arbitrary dictionaries.
81 |
82 | If you're using a dictionary besides Unidic you can use the GenericTagger like this:
83 |
84 | ```python
85 | from fugashi import GenericTagger
86 | tagger = GenericTagger()
87 |
88 | # parse can be used as normal
89 | tagger.parse('something')
90 | # features from the dictionary can be accessed by field numbers
91 | for word in tagger(text):
92 | print(word.surface, word.feature[0])
93 | ```
94 |
95 | You can also create a dictionary wrapper to get feature information as a named tuple.
96 |
97 | ```python
98 | from fugashi import GenericTagger, create_feature_wrapper
99 | CustomFeatures = create_feature_wrapper('CustomFeatures', 'alpha beta gamma')
100 | tagger = GenericTagger(wrapper=CustomFeatures)
101 | for word in tagger.parseToNodeList(text):
102 | print(word.surface, word.feature.alpha)
103 | ```
104 |
105 | ## Citation
106 |
107 | If you use fugashi in research, it would be appreciated if you cite this paper. You can read it at [the ACL Anthology](https://www.aclweb.org/anthology/2020.nlposs-1.7/) or [on Arxiv](https://arxiv.org/abs/2010.06858).
108 |
109 | @inproceedings{mccann-2020-fugashi,
110 | title = "fugashi, a Tool for Tokenizing {J}apanese in Python",
111 | author = "McCann, Paul",
112 | booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)",
113 | month = nov,
114 | year = "2020",
115 | address = "Online",
116 | publisher = "Association for Computational Linguistics",
117 | url = "https://www.aclweb.org/anthology/2020.nlposs-1.7",
118 | pages = "44--51",
119 | abstract = "Recent years have seen an increase in the number of large-scale multilingual NLP projects. However, even in such projects, languages with special processing requirements are often excluded. One such language is Japanese. Japanese is written without spaces, tokenization is non-trivial, and while high quality open source tokenizers exist they can be hard to use and lack English documentation. This paper introduces fugashi, a MeCab wrapper for Python, and gives an introduction to tokenizing Japanese.",
120 | }
121 |
122 | ## Alternatives
123 |
124 | If you have a problem with fugashi feel free to open an issue. However, there
125 | are some cases where it might be better to use a different library.
126 |
127 | - If you don't want to deal with installing MeCab at all, try [SudachiPy](https://github.com/WorksApplications/sudachi.rs).
128 | - If you need to work with Korean, try [pymecab-ko](https://github.com/NoUnique/pymecab-ko) or [KoNLPy](https://konlpy.org/en/latest/).
129 |
130 | ## License and Copyright Notice
131 |
132 | fugashi is released under the terms of the [MIT license](./LICENSE). Please
133 | copy it far and wide.
134 |
135 | fugashi is a wrapper for MeCab, and fugashi wheels include MeCab binaries.
136 | MeCab is copyrighted free software by Taku Kudo `<taku@chasen.org>` and Nippon
137 | Telegraph and Telephone Corporation, and is redistributed under the [BSD
138 | License](./LICENSE.mecab).
139 |
--------------------------------------------------------------------------------
/fugashi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polm/fugashi/60594f930d49bcfb1d4d9c72d310bfa3301444d9/fugashi.png
--------------------------------------------------------------------------------
/fugashi/__init__.py:
--------------------------------------------------------------------------------
1 | from .fugashi import *
2 |
--------------------------------------------------------------------------------
/fugashi/cli.py:
--------------------------------------------------------------------------------
1 | from fugashi import GenericTagger, Tagger, build_dictionary
2 | import sys
3 | import fileinput
4 |
5 |
def main():
    """Command-line wrapper around fugashi for quick testing.

    Mirrors the mecab binary: every line read from stdin is treated as a
    single sentence. Any command-line arguments are forwarded to the tagger.
    """
    tagger_args = " ".join(sys.argv[1:])

    # Prefer the GenericTagger so alternative dictionaries work, then fall
    # back to the Unidic Tagger (e.g. for the pip-installed unidic).
    try:
        tagger = GenericTagger(tagger_args, quiet=True)
    except RuntimeError:
        tagger = Tagger(tagger_args)

    # The explicit empty list forces fileinput to read stdin only, instead
    # of treating the tagger arguments in sys.argv as file names.
    for sentence in fileinput.input([]):
        print(tagger.parse(sentence.strip()))
24 |
25 |
def info():
    """Print configuration info for the dictionaries the tagger loaded."""
    tagger_args = " ".join(sys.argv[1:])
    # Same fallback strategy as main(): generic first, then Unidic.
    try:
        tagger = GenericTagger(tagger_args, quiet=True)
    except RuntimeError:
        tagger = Tagger(tagger_args)
    # TODO get the fugashi version here too
    print("Fugashi dictionary info:")
    print("-----")
    for entry in tagger.dictionary_info:
        for field in ("version", "size", "charset", "filename"):
            print((field + ":").ljust(10), entry[field])
        print("-----")
40 |
41 |
def build_dict():
    """EXPERIMENTAL A wrapper for MeCab's user dictionary building command.

    This also defaults to utf8.
    """
    # TODO simplify using pip-installed dictionaries as base
    command = "{} -f utf8 -t utf8 {}".format(sys.argv[0], " ".join(sys.argv[1:]))
    print(command)
    build_dictionary(command)
51 |
--------------------------------------------------------------------------------
/fugashi/fugashi.pyx:
--------------------------------------------------------------------------------
1 | #cython: language_level=3
2 | from fugashi.mecab cimport (mecab_new, mecab_sparse_tostr2, mecab_t, mecab_node_t,
3 | mecab_sparse_tonode, mecab_nbest_sparse_tostr,
4 | mecab_dictionary_info_t, mecab_dictionary_info,
5 | mecab_model_new, mecab_strerror, mecab_dict_index,
6 | mecab_nbest_init, mecab_nbest_next_tonode)
7 | from collections import namedtuple
8 | import os
9 | import csv
10 | import shlex
11 | import sys
12 | from libc.stdlib cimport malloc, free
13 |
14 | # field names can be found in the dicrc file distributed with Unidic or here:
15 | # https://unidic.ninjal.ac.jp/faq
16 |
17 | # 2.1.2 src schema
18 | UnidicFeatures17 = namedtuple('UnidicFeatures17',
19 | ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron '
20 | 'orthBase pronBase goshu iType iForm fType fForm').split(' '))
21 |
22 | # 2.1.2 bin schema
23 | # The unidic-mecab-2.1.2_bin distribution adds kana accent fields.
24 | UnidicFeatures26 = namedtuple('UnidicFeatures26',
25 | ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron '
26 | 'orthBase pronBase goshu iType iForm fType fForm '
27 | 'kana kanaBase form formBase iConType fConType aType '
28 | 'aConType aModeType').split(' '))
29 |
30 | # schema used in 2.2.0, 2.3.0
31 | UnidicFeatures29 = namedtuple('UnidicFeatures29', 'pos1 pos2 pos3 pos4 cType '
32 | 'cForm lForm lemma orth pron orthBase pronBase goshu iType iForm fType '
33 | 'fForm iConType fConType type kana kanaBase form formBase aType aConType '
34 | 'aModType lid lemma_id'.split(' '))
35 |
36 | cdef class Node:
37 | """Generic Nodes are modeled after the data returned from MeCab.
38 |
39 | Some data is in a strict format using enums, but most useful data is in the
40 | feature string, which is an untokenized CSV string."""
41 | cdef const mecab_node_t* c_node
42 | cdef str _surface
43 | cdef str _ws
44 | cdef object features
45 | cdef object wrapper
46 |
47 | def __init__(self):
48 | pass
49 |
50 | def __repr__(self):
51 | if self.stat == 0 or self.stat == 1:
52 | return self.surface
53 | elif self.stat == 2:
54 | return ''
55 | elif self.stat == 3:
56 | return ''
57 | else:
58 | return self.surface
59 |
60 |
61 | @property
62 | def surface(self):
63 | if self._surface is None:
64 | pass
65 | return self._surface
66 |
67 | @surface.setter
68 | def surface(self, ss):
69 | self._surface = ss
70 |
71 | @property
72 | def feature(self):
73 | if self.features is None:
74 | self.set_feature(self.c_node.feature)
75 | return self.features
76 |
77 | @property
78 | def feature_raw(self):
79 | return self.c_node.feature.decode('utf-8')
80 |
81 | @property
82 | def length(self):
83 | return self.c_node.length
84 |
85 | @property
86 | def rlength(self):
87 | return self.c_node.rlength
88 |
89 | @property
90 | def posid(self):
91 | return self.c_node.posid
92 |
93 | @property
94 | def char_type(self):
95 | return self.c_node.char_type
96 |
97 | @property
98 | def stat(self):
99 | return self.c_node.stat
100 |
101 | @property
102 | def is_unk(self):
103 | return self.stat == 1
104 |
105 | @property
106 | def white_space(self):
107 | if self._ws is None:
108 | return ''
109 | return self._ws
110 |
111 | @white_space.setter
112 | def white_space(self, ws):
113 | self._ws = ws
114 |
115 | cdef list pad_none(self, list fields):
116 | try:
117 | d = len(self.wrapper._fields) - len(fields)
118 | except AttributeError:
119 | d = 0
120 | return fields + [None] * d
121 |
122 | cdef void set_feature(self, bytes feature):
123 | raw = feature.decode('utf-8')
124 | if '"' in raw:
125 | # This happens when a field contains commas. In Unidic this only
126 | # happens for the "aType" field used for accent data, and then only
127 | # a minority of the time.
128 | fields = next(csv.reader([raw]))
129 | else:
130 | fields = raw.split(',')
131 | fields = self.pad_none(fields)
132 | self.features = self.wrapper(*fields)
133 |
    @staticmethod
    cdef Node wrap(const mecab_node_t* c_node, object wrapper):
        # Factory used by the Tagger: bypasses __init__ via __new__ and
        # attaches the borrowed C node pointer plus the feature wrapper.
        cdef Node node = Node.__new__(Node)
        node.c_node = c_node
        node.wrapper = wrapper

        return node
141 |
cdef class UnidicNode(Node):
    """A Unidic specific node type.

    At present this just adds a convenience function to get the four-field POS
    value.
    """

    @property
    def pos(self):
        """The POS value built from the first four feature fields, joined
        with commas."""
        return "{},{},{},{}".format(*self.feature[:4])

    @staticmethod
    cdef UnidicNode wrap(const mecab_node_t* c_node, object wrapper):
        # This has to be copied from the base node to change the type
        cdef UnidicNode node = UnidicNode.__new__(UnidicNode)
        node.c_node = c_node
        node.wrapper = wrapper

        return node
161 |
def make_tuple(*args):
    """Take variable number of args, return tuple.

    The tuple constructor actually has a different type signature than the
    namedtuple constructor. This is a wrapper to give it the same interface.
    """
    # *args already arrives as a tuple, so it can be returned directly.
    return args
169 |
# Template shown when MeCab fails to initialize. get_detailed_error() appends
# the arguments and MeCab's own error output below the final separator line.
FAILMESSAGE = """
Failed initializing MeCab. Please see the README for possible solutions:

https://github.com/polm/fugashi

If you are still having trouble, please file an issue here, and include the
ERROR DETAILS below:

https://github.com/polm/fugashi/issues

issueを英語で書く必要はありません。

------------------- ERROR DETAILS ------------------------"""
183 |
cdef str get_error_details(int argc, char** argv):
    """Instantiate a Model to get output from MeCab.

    Due to an upstream bug, errors in Tagger initialization don't give useful
    error output."""
    # NOTE(review): the model is created only for its side effect of setting
    # MeCab's global error state (read back via mecab_strerror(NULL)) and is
    # never destroyed. This leaks, but it only runs on the failure path just
    # before a RuntimeError is raised — confirm before "fixing".
    model = mecab_model_new(argc, argv)
    return mecab_strerror(NULL).decode('utf-8')
191 |
cdef str get_detailed_error(list args, int argc, char** argv):
    """Generate guide to solving initialization errors."""
    # Assemble the template, the arguments used, MeCab's own error output,
    # and a closing separator, each on its own line.
    sections = [
        FAILMESSAGE,
        "arguments: " + str(args),
        get_error_details(argc, argv),
        '----------------------------------------------------------',
    ]
    return "\n".join(sections) + "\n"
199 |
200 |
cdef class GenericTagger:
    """Generic Tagger, supports any dictionary.

    By default dictionary features are wrapped in a tuple. If you want you can
    provide a namedtuple or similar container for them as an argument to the
    constructor.
    """

    # NOTE(review): c_tagger is never destroyed (no __dealloc__ calling
    # mecab_destroy); confirm whether the pxd exposes mecab_destroy before
    # adding cleanup.
    cdef mecab_t* c_tagger
    cdef object wrapper
    # Maps hash(bytes) -> interned str, to avoid re-decoding and re-allocating
    # identical surface/whitespace strings across calls.
    cdef dict _cache

    def __init__(self, args='', wrapper=make_tuple, quiet=False):
        """Create a tagger.

        args: command-line style option string passed through shlex to MeCab.
        wrapper: callable used to wrap parsed feature fields.
        quiet: if True, raise a short error message instead of the detailed
            diagnostic when initialization fails.
        """
        # The first argument is ignored because in the MeCab binary the argc
        # and argv for the process are used here.
        args = [b'fugashi', b'-C'] + [bytes(arg, 'utf-8') for arg in shlex.split(args)]
        cdef int argc = len(args)
        cdef char** argv = malloc(argc * sizeof(char*))
        if argv == NULL:
            # Bug fix: malloc failure was previously unchecked.
            raise MemoryError("Failed to allocate argv for MeCab")
        # argv entries borrow the internal buffers of the bytes objects in
        # args, which stay alive until after mecab_new returns.
        for ii, arg in enumerate(args):
            argv[ii] = arg

        self.c_tagger = mecab_new(argc, argv)
        if self.c_tagger == NULL:
            # In theory mecab_strerror should return an error string from MeCab
            # It doesn't seem to work and just returns b'' though, so this will
            # have to do.
            msg = "Failed initializing MeCab"
            if not quiet:
                msg = get_detailed_error(args, argc, argv)
            free(argv)
            raise RuntimeError(msg)
        free(argv)
        self.wrapper = wrapper
        self._cache = {}

    def __call__(self, text):
        """Wrapper for parseToNodeList."""
        return self.parseToNodeList(text)

    def parse(self, str text):
        """Parse the input and return MeCab's standard text output as a str."""
        btext = bytes(text, 'utf-8')
        out = mecab_sparse_tostr2(self.c_tagger, btext, len(btext)).decode('utf-8')
        # MeCab always adds a newline, and in wakati mode it adds a space.
        # The reason for this is unclear but may be for terminal use.
        # It's never helpful, so remove it.
        return out.rstrip()

    cdef wrap(self, const mecab_node_t* node):
        # This function just exists so subclasses can override the node type.
        return Node.wrap(node, self.wrapper)

    def parseToNodeList(self, text):
        """Parse the input and return it as a list of wrapped nodes."""
        bstr = bytes(text, 'utf-8')
        cdef const mecab_node_t* node = mecab_sparse_tonode(self.c_tagger, bstr)

        # A nodelist always contains one each of BOS and EOS (beginning/end of
        # sentence) nodes. Since they have no information on them and MeCab
        # doesn't do any kind of sentence tokenization they're not useful in
        # the output and will be removed here.

        # Note that on the command line this behavior is different, and each
        # line is treated as a sentence.

        out = []
        while node.next:
            node = node.next
            if node.stat == 3: # eos node
                return out
            nn = self.wrap(node)

            # TODO maybe add an option to this function that doesn't cache the
            # surface. Not caching here is faster but means node surfaces are
            # invalidated on the next call of this function.

            # In theory the input string should be re-usable, but it's hard to
            # track ownership of it in Python properly.

            # avoid new string allocations
            # TODO try lru cache instead of intern (reason: good to age stuff out)
            surf = node.surface[:node.length]
            shash = hash(surf)
            if shash not in self._cache:
                self._cache[shash] = sys.intern(surf.decode("utf-8"))
            nn.surface = self._cache[shash]

            # do the same for whitespace
            nodelen = node.rlength - node.length
            pnode = node.prev
            ws = pnode.surface[pnode.length : pnode.length + nodelen]
            wshash = hash(ws)
            if wshash not in self._cache:
                self._cache[wshash] = sys.intern(ws.decode("utf-8"))
            nn.white_space = self._cache[wshash]

            out.append(nn)
        # Bug fix: previously this fell off the loop and implicitly returned
        # None when no EOS node was seen; return the collected nodes instead.
        return out

    def nbest(self, text, num=10):
        """Return the n-best possible tokenizations of the input, giving the
        output as a single string.
        """

        cstr = bytes(text, 'utf-8')
        out = mecab_nbest_sparse_tostr(self.c_tagger, num, cstr).decode('utf-8')
        return out.rstrip()

    def nbestToNodeList(self, text, num=10):
        """Return the n-best possible tokenizations of the input, giving each
        as a list of nodes.
        """

        cstr = bytes(text, 'utf-8')
        # Bug fix: this check previously lived inside an assert, so the
        # side-effecting init call could be stripped by optimization and the
        # failure surfaced as an AssertionError.
        if not mecab_nbest_init(self.c_tagger, cstr):
            raise RuntimeError("Error at mecab_nbest_init")

        ret = []
        for path in range(num):
            node = mecab_nbest_next_tonode(self.c_tagger)
            if not node:
                # this happens if there aren't enough paths
                break
            out = []
            while node.next:
                node = node.next
                if node.stat == 3:
                    break
                nn = self.wrap(node)
                surf = node.surface[:node.length]
                shash = hash(surf)

                if shash not in self._cache:
                    self._cache[shash] = sys.intern(surf.decode("utf-8"))
                nn.surface = self._cache[shash]
                # NOTE(review): unlike parseToNodeList, whitespace is not
                # cached here, so node.white_space stays '' — confirm whether
                # that is intentional.
                out.append(nn)

            ret.append(out)

        return ret

    @property
    def dictionary_info(self):
        """Get info on the dictionaries of the Tagger.

        This only exposes basic information. The C API has functions for more
        sophisticated access, though it's not clear how useful they are.

        The dictionary info structs will be returned as a list of dictionaries.
        If you have only the system dictionary that'll be the only dictionary,
        but if you specify user dictionaries they'll also be present.
        """
        infos = []
        cdef mecab_dictionary_info_t* dictinfo = mecab_dictionary_info(self.c_tagger)
        while dictinfo:
            info = {}
            info['filename'] = dictinfo.filename.decode('utf-8')
            info['charset'] = dictinfo.charset.decode('utf-8')
            info['size'] = dictinfo.size
            # Note this is generally not used reliably
            info['version'] = dictinfo.version
            dictinfo = dictinfo.next
            infos.append(info)
        return infos
364 |
def try_import_unidic():
    """Import unidic or unidic lite if available. Return dicdir."""
    for modname in ("unidic", "unidic_lite"):
        try:
            return __import__(modname).DICDIR
        except ImportError:
            # Not installed; try the next candidate.
            continue
    # Neither dictionary package is available. This is OK, just give up.
    return None
377 |
cdef class Tagger(GenericTagger):
    """Default tagger. Detects the correct Unidic feature format.

    Unidic 2.1.2 (17 field) and 2.2, 2.3 format (29 field) are supported.
    """

    def __init__(self, arg=''):
        # Prefer a pip-installed UniDic (unidic or unidic-lite) when present.
        unidicdir = try_import_unidic()
        if unidicdir:
            mecabrc = os.path.join(unidicdir, 'mecabrc')
            arg = '-r "{}" -d "{}" '.format(mecabrc, unidicdir) + arg

        super().__init__(arg)

        # Probe the dictionary with a sample word and pick the feature
        # wrapper matching the number of fields it produces.
        fields = self.parseToNodeList("日本")[0].feature_raw.split(',')

        wrappers = {
            17: UnidicFeatures17,
            26: UnidicFeatures26,
            29: UnidicFeatures29,
        }
        wrapper = wrappers.get(len(fields))
        if wrapper is None:
            raise RuntimeError("Unknown dictionary format, use a GenericTagger.")
        self.wrapper = wrapper

    # This needs to be overridden to change the node type.
    cdef wrap(self, const mecab_node_t* node):
        return UnidicNode.wrap(node, self.wrapper)
407 |
def create_feature_wrapper(name, fields, default=None):
    """Create a namedtuple based wrapper for dictionary features.

    Fields not supplied at construction time are filled with `default`
    (None by default), since in most cases unks will have fewer fields.

    name: name of the generated namedtuple type.
    fields: sequence of field names.
    default: value used for fields omitted at construction time.

    The resulting type can be used as the wrapper argument to GenericTagger to
    support new dictionaries.
    """
    # Bug fix: the `default` argument was previously ignored and None was
    # always used as the fill value.
    return namedtuple(name, fields, defaults=(default,) * len(fields))
418 |
def build_dictionary(args):
    """Run the MeCab dictionary indexer with command-line style arguments.

    args: option string, parsed with shlex and passed to mecab_dict_index.
    Returns the indexer's integer status code (previously computed but
    never returned).
    """
    args = [bytes(arg, 'utf-8') for arg in shlex.split(args)]
    cdef int argc = len(args)
    cdef char** argv = malloc(argc * sizeof(char*))
    if argv == NULL:
        # Bug fix: malloc failure was previously unchecked.
        raise MemoryError("Failed to allocate argv for mecab_dict_index")
    try:
        # argv borrows the buffers of the bytes objects in args, which stay
        # alive for the duration of the call.
        for ii, arg in enumerate(args):
            argv[ii] = arg
        out = mecab_dict_index(argc, argv)
    finally:
        # Bug fix: argv is now freed even if mecab_dict_index raises.
        free(argv)
    return out
427 |
428 |
--------------------------------------------------------------------------------
/fugashi/include/mecab/mecab.h:
--------------------------------------------------------------------------------
1 | /*
2 | MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
3 |
4 | Copyright(C) 2001-2011 Taku Kudo
5 | Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 | */
7 | #ifndef MECAB_MECAB_H_
8 | #define MECAB_MECAB_H_
9 |
10 | /* C/C++ common data structures */
11 |
12 | /**
13 | * DictionaryInfo structure
14 | */
15 | struct mecab_dictionary_info_t {
16 | /**
17 | * filename of dictionary
18 | * On Windows, filename is stored in UTF-8 encoding
19 | */
20 | const char *filename;
21 |
22 | /**
23 | * character set of the dictionary. e.g., "SHIFT-JIS", "UTF-8"
24 | */
25 | const char *charset;
26 |
27 | /**
28 | * How many words are registered in this dictionary.
29 | */
30 | unsigned int size;
31 |
32 | /**
33 | * dictionary type
34 | * this value should be MECAB_USR_DIC, MECAB_SYS_DIC, or MECAB_UNK_DIC.
35 | */
36 | int type;
37 |
38 | /**
39 | * left attributes size
40 | */
41 | unsigned int lsize;
42 |
43 | /**
44 | * right attributes size
45 | */
46 | unsigned int rsize;
47 |
48 | /**
49 | * version of this dictionary
50 | */
51 | unsigned short version;
52 |
53 | /**
54 | * pointer to the next dictionary info.
55 | */
56 | struct mecab_dictionary_info_t *next;
57 | };
58 |
59 | /**
60 | * Path structure
61 | */
62 | struct mecab_path_t {
63 | /**
64 | * pointer to the right node
65 | */
66 | struct mecab_node_t* rnode;
67 |
68 | /**
69 | * pointer to the next right path
70 | */
71 | struct mecab_path_t* rnext;
72 |
73 | /**
74 | * pointer to the left node
75 | */
76 | struct mecab_node_t* lnode;
77 |
78 | /**
79 | * pointer to the next left path
80 | */
81 |
82 | struct mecab_path_t* lnext;
83 |
84 | /**
85 | * local cost
86 | */
87 | int cost;
88 |
89 | /**
90 | * marginal probability
91 | */
92 | float prob;
93 | };
94 |
95 | /**
96 | * Node structure
97 | */
98 | struct mecab_node_t {
99 | /**
100 | * pointer to the previous node.
101 | */
102 | struct mecab_node_t *prev;
103 |
104 | /**
105 | * pointer to the next node.
106 | */
107 | struct mecab_node_t *next;
108 |
109 | /**
110 | * pointer to the node which ends at the same position.
111 | */
112 | struct mecab_node_t *enext;
113 |
114 | /**
115 | * pointer to the node which starts at the same position.
116 | */
117 | struct mecab_node_t *bnext;
118 |
119 | /**
120 | * pointer to the right path.
121 | * this value is NULL if MECAB_ONE_BEST mode.
122 | */
123 | struct mecab_path_t *rpath;
124 |
125 | /**
126 | * pointer to the right path.
127 | * this value is NULL if MECAB_ONE_BEST mode.
128 | */
129 | struct mecab_path_t *lpath;
130 |
131 | /**
132 | * surface string.
133 | * this value is not 0 terminated.
134 | * You can get the length with length/rlength members.
135 | */
136 | const char *surface;
137 |
138 | /**
139 | * feature string
140 | */
141 | const char *feature;
142 |
143 | /**
144 | * unique node id
145 | */
146 | unsigned int id;
147 |
148 | /**
149 | * length of the surface form.
150 | */
151 | unsigned short length;
152 |
153 | /**
154 | * length of the surface form including white space before the morph.
155 | */
156 | unsigned short rlength;
157 |
158 | /**
159 | * right attribute id
160 | */
161 | unsigned short rcAttr;
162 |
163 | /**
164 | * left attribute id
165 | */
166 | unsigned short lcAttr;
167 |
168 | /**
169 | * unique part of speech id. This value is defined in "pos.def" file.
170 | */
171 | unsigned short posid;
172 |
173 | /**
174 | * character type
175 | */
176 | unsigned char char_type;
177 |
178 | /**
179 | * status of this model.
180 | * This value is MECAB_NOR_NODE, MECAB_UNK_NODE, MECAB_BOS_NODE, MECAB_EOS_NODE, or MECAB_EON_NODE.
181 | */
182 | unsigned char stat;
183 |
184 | /**
185 | * set 1 if this node is best node.
186 | */
187 | unsigned char isbest;
188 |
189 | /**
190 | * forward accumulative log summation.
191 | * This value is only available when MECAB_MARGINAL_PROB is passed.
192 | */
193 | float alpha;
194 |
195 | /**
196 | * backward accumulative log summation.
197 | * This value is only available when MECAB_MARGINAL_PROB is passed.
198 | */
199 | float beta;
200 |
201 | /**
202 | * marginal probability.
203 | * This value is only available when MECAB_MARGINAL_PROB is passed.
204 | */
205 | float prob;
206 |
207 | /**
208 | * word cost.
209 | */
210 | short wcost;
211 |
212 | /**
213 | * best accumulative cost from bos node to this node.
214 | */
215 | long cost;
216 | };
217 |
218 | /**
219 | * Parameters for MeCab::Node::stat
220 | */
221 | enum {
222 | /**
223 | * Normal node defined in the dictionary.
224 | */
225 | MECAB_NOR_NODE = 0,
226 | /**
227 | * Unknown node not defined in the dictionary.
228 | */
229 | MECAB_UNK_NODE = 1,
230 | /**
231 | * Virtual node representing a beginning of the sentence.
232 | */
233 | MECAB_BOS_NODE = 2,
234 | /**
235 | * Virtual node representing a end of the sentence.
236 | */
237 | MECAB_EOS_NODE = 3,
238 |
239 | /**
240 | * Virtual node representing a end of the N-best enumeration.
241 | */
242 | MECAB_EON_NODE = 4
243 | };
244 |
245 | /**
246 | * Parameters for MeCab::DictionaryInfo::type
247 | */
248 | enum {
249 | /**
250 | * This is a system dictionary.
251 | */
252 | MECAB_SYS_DIC = 0,
253 |
254 | /**
255 | * This is a user dictionary.
256 | */
257 | MECAB_USR_DIC = 1,
258 |
259 | /**
260 | * This is a unknown word dictionary.
261 | */
262 | MECAB_UNK_DIC = 2
263 | };
264 |
265 | /**
266 | * Parameters for MeCab::Lattice::request_type
267 | */
268 | enum {
269 | /**
270 | * One best result is obtained (default mode)
271 | */
272 | MECAB_ONE_BEST = 1,
273 | /**
274 | * Set this flag if you want to obtain N best results.
275 | */
276 | MECAB_NBEST = 2,
277 | /**
278 | * Set this flag if you want to enable a partial parsing mode.
279 | * When this flag is set, the input |sentence| needs to be written
280 | * in partial parsing format.
281 | */
282 | MECAB_PARTIAL = 4,
283 | /**
284 | * Set this flag if you want to obtain marginal probabilities.
285 | * Marginal probability is set in MeCab::Node::prob.
286 | * The parsing speed will get 3-5 times slower than the default mode.
287 | */
288 | MECAB_MARGINAL_PROB = 8,
289 | /**
290 | * Set this flag if you want to obtain alternative results.
291 | * Not implemented.
292 | */
293 | MECAB_ALTERNATIVE = 16,
294 | /**
295 | * When this flag is set, the result linked-list (Node::next/prev)
296 | * traverses all nodes in the lattice.
297 | */
298 | MECAB_ALL_MORPHS = 32,
299 |
300 | /**
301 | * When this flag is set, tagger internally copies the body of passed
302 | * sentence into internal buffer.
303 | */
304 | MECAB_ALLOCATE_SENTENCE = 64
305 | };
306 |
307 | /**
308 | * Parameters for MeCab::Lattice::boundary_constraint_type
309 | */
310 | enum {
311 | /**
312 | * The token boundary is not specified.
313 | */
314 | MECAB_ANY_BOUNDARY = 0,
315 |
316 | /**
317 | * The position is a strong token boundary.
318 | */
319 | MECAB_TOKEN_BOUNDARY = 1,
320 |
321 | /**
322 | * The position is not a token boundary.
323 | */
324 | MECAB_INSIDE_TOKEN = 2
325 | };
326 |
/* C interface */
/* Restored: the #include targets below were lost to angle-bracket
   stripping in a bad text conversion; restored from the upstream
   mecab.h. */
#ifdef __cplusplus
#include <cstdio>
#else
#include <stdio.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

#ifdef _WIN32
#include <windows.h>
#  ifdef DLL_EXPORT
#    define MECAB_DLL_EXTERN __declspec(dllexport)
#    define MECAB_DLL_CLASS_EXTERN __declspec(dllexport)
#  else
#    define MECAB_DLL_EXTERN __declspec(dllimport)
#  endif
#endif

#ifndef MECAB_DLL_EXTERN
#  define MECAB_DLL_EXTERN extern
#endif

#ifndef MECAB_DLL_CLASS_EXTERN
#  define MECAB_DLL_CLASS_EXTERN
#endif
355 |
356 | typedef struct mecab_t mecab_t;
357 | typedef struct mecab_model_t mecab_model_t;
358 | typedef struct mecab_lattice_t mecab_lattice_t;
359 | typedef struct mecab_dictionary_info_t mecab_dictionary_info_t;
360 | typedef struct mecab_node_t mecab_node_t;
361 | typedef struct mecab_path_t mecab_path_t;
362 |
363 | #ifndef SWIG
364 | /* C interface */
365 |
366 | /* old mecab interface */
367 | /**
368 | * C wrapper of MeCab::Tagger::create(argc, argv)
369 | */
370 | MECAB_DLL_EXTERN mecab_t* mecab_new(int argc, char **argv);
371 |
372 | /**
373 | * C wrapper of MeCab::Tagger::create(arg)
374 | */
375 | MECAB_DLL_EXTERN mecab_t* mecab_new2(const char *arg);
376 |
377 | /**
378 | * C wrapper of MeCab::Tagger::version()
379 | */
380 | MECAB_DLL_EXTERN const char* mecab_version();
381 |
382 | /**
383 | * C wrapper of MeCab::getLastError()
384 | */
385 | MECAB_DLL_EXTERN const char* mecab_strerror(mecab_t *mecab);
386 |
387 | /**
388 | * C wrapper of MeCab::deleteTagger(tagger)
389 | */
390 | MECAB_DLL_EXTERN void mecab_destroy(mecab_t *mecab);
391 |
392 | /**
393 | * C wrapper of MeCab::Tagger:set_partial()
394 | */
395 | MECAB_DLL_EXTERN int mecab_get_partial(mecab_t *mecab);
396 |
397 | /**
398 | * C wrapper of MeCab::Tagger::partial()
399 | */
400 | MECAB_DLL_EXTERN void mecab_set_partial(mecab_t *mecab, int partial);
401 |
402 | /**
403 | * C wrapper of MeCab::Tagger::theta()
404 | */
405 | MECAB_DLL_EXTERN float mecab_get_theta(mecab_t *mecab);
406 |
407 | /**
408 | * C wrapper of MeCab::Tagger::set_theta()
409 | */
410 | MECAB_DLL_EXTERN void mecab_set_theta(mecab_t *mecab, float theta);
411 |
412 | /**
413 | * C wrapper of MeCab::Tagger::lattice_level()
414 | */
415 | MECAB_DLL_EXTERN int mecab_get_lattice_level(mecab_t *mecab);
416 |
417 | /**
418 | * C wrapper of MeCab::Tagger::set_lattice_level()
419 | */
420 | MECAB_DLL_EXTERN void mecab_set_lattice_level(mecab_t *mecab, int level);
421 |
422 | /**
423 | * C wrapper of MeCab::Tagger::all_morphs()
424 | */
425 | MECAB_DLL_EXTERN int mecab_get_all_morphs(mecab_t *mecab);
426 |
427 | /**
428 | * C wrapper of MeCab::Tagger::set_all_moprhs()
429 | */
430 | MECAB_DLL_EXTERN void mecab_set_all_morphs(mecab_t *mecab, int all_morphs);
431 |
432 | /**
433 | * C wrapper of MeCab::Tagger::parse(MeCab::Lattice *lattice)
434 | */
435 | MECAB_DLL_EXTERN int mecab_parse_lattice(mecab_t *mecab, mecab_lattice_t *lattice);
436 |
437 | /**
438 | * C wrapper of MeCab::Tagger::parse(const char *str)
439 | */
440 | MECAB_DLL_EXTERN const char* mecab_sparse_tostr(mecab_t *mecab, const char *str);
441 |
442 | /**
443 | * C wrapper of MeCab::Tagger::parse(const char *str, size_t len)
444 | */
445 | MECAB_DLL_EXTERN const char* mecab_sparse_tostr2(mecab_t *mecab, const char *str, size_t len);
446 |
447 | /**
448 | * C wrapper of MeCab::Tagger::parse(const char *str, char *ostr, size_t olen)
449 | */
450 | MECAB_DLL_EXTERN char* mecab_sparse_tostr3(mecab_t *mecab, const char *str, size_t len,
451 | char *ostr, size_t olen);
452 |
453 | /**
454 | * C wrapper of MeCab::Tagger::parseToNode(const char *str)
455 | */
456 | MECAB_DLL_EXTERN const mecab_node_t* mecab_sparse_tonode(mecab_t *mecab, const char*);
457 |
458 | /**
459 | * C wrapper of MeCab::Tagger::parseToNode(const char *str, size_t len)
460 | */
461 | MECAB_DLL_EXTERN const mecab_node_t* mecab_sparse_tonode2(mecab_t *mecab, const char*, size_t);
462 |
463 | /**
464 | * C wrapper of MeCab::Tagger::parseNBest(size_t N, const char *str)
465 | */
466 | MECAB_DLL_EXTERN const char* mecab_nbest_sparse_tostr(mecab_t *mecab, size_t N, const char *str);
467 |
468 | /**
469 | * C wrapper of MeCab::Tagger::parseNBest(size_t N, const char *str, size_t len)
470 | */
471 | MECAB_DLL_EXTERN const char* mecab_nbest_sparse_tostr2(mecab_t *mecab, size_t N,
472 | const char *str, size_t len);
473 |
474 | /**
475 | * C wrapper of MeCab::Tagger::parseNBest(size_t N, const char *str, char *ostr, size_t olen)
476 | */
477 | MECAB_DLL_EXTERN char* mecab_nbest_sparse_tostr3(mecab_t *mecab, size_t N,
478 | const char *str, size_t len,
479 | char *ostr, size_t olen);
480 |
481 | /**
482 | * C wrapper of MeCab::Tagger::parseNBestInit(const char *str)
483 | */
484 | MECAB_DLL_EXTERN int mecab_nbest_init(mecab_t *mecab, const char *str);
485 |
486 | /**
487 | * C wrapper of MeCab::Tagger::parseNBestInit(const char *str, size_t len)
488 | */
489 | MECAB_DLL_EXTERN int mecab_nbest_init2(mecab_t *mecab, const char *str, size_t len);
490 |
491 | /**
492 | * C wrapper of MeCab::Tagger::next()
493 | */
494 | MECAB_DLL_EXTERN const char* mecab_nbest_next_tostr(mecab_t *mecab);
495 |
496 | /**
497 | * C wrapper of MeCab::Tagger::next(char *ostr, size_t olen)
498 | */
499 | MECAB_DLL_EXTERN char* mecab_nbest_next_tostr2(mecab_t *mecab, char *ostr, size_t olen);
500 |
501 | /**
502 | * C wrapper of MeCab::Tagger::nextNode()
503 | */
504 | MECAB_DLL_EXTERN const mecab_node_t* mecab_nbest_next_tonode(mecab_t *mecab);
505 |
506 | /**
507 | * C wrapper of MeCab::Tagger::formatNode(const Node *node)
508 | */
509 | MECAB_DLL_EXTERN const char* mecab_format_node(mecab_t *mecab, const mecab_node_t *node);
510 |
511 | /**
512 | * C wrapper of MeCab::Tagger::dictionary_info()
513 | */
514 | MECAB_DLL_EXTERN const mecab_dictionary_info_t* mecab_dictionary_info(mecab_t *mecab);
515 |
516 | /* lattice interface */
517 | /**
518 | * C wrapper of MeCab::createLattice()
519 | */
520 | MECAB_DLL_EXTERN mecab_lattice_t *mecab_lattice_new();
521 |
522 | /**
523 | * C wrapper of MeCab::deleteLattice(lattice)
524 | */
525 | MECAB_DLL_EXTERN void mecab_lattice_destroy(mecab_lattice_t *lattice);
526 |
527 | /**
528 | * C wrapper of MeCab::Lattice::clear()
529 | */
530 | MECAB_DLL_EXTERN void mecab_lattice_clear(mecab_lattice_t *lattice);
531 |
532 | /**
533 | * C wrapper of MeCab::Lattice::is_available()
534 | */
535 |
536 | MECAB_DLL_EXTERN int mecab_lattice_is_available(mecab_lattice_t *lattice);
537 |
538 | /**
539 | * C wrapper of MeCab::Lattice::bos_node()
540 | */
541 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_get_bos_node(mecab_lattice_t *lattice);
542 |
543 | /**
544 | * C wrapper of MeCab::Lattice::eos_node()
545 | */
546 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_get_eos_node(mecab_lattice_t *lattice);
547 |
548 | /**
549 | * C wrapper of MeCab::Lattice::begin_nodes()
550 | */
551 |
552 | MECAB_DLL_EXTERN mecab_node_t **mecab_lattice_get_all_begin_nodes(mecab_lattice_t *lattice);
553 | /**
554 | * C wrapper of MeCab::Lattice::end_nodes()
555 | */
556 | MECAB_DLL_EXTERN mecab_node_t **mecab_lattice_get_all_end_nodes(mecab_lattice_t *lattice);
557 |
558 | /**
559 | * C wrapper of MeCab::Lattice::begin_nodes(pos)
560 | */
561 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_get_begin_nodes(mecab_lattice_t *lattice, size_t pos);
562 |
563 | /**
564 | * C wrapper of MeCab::Lattice::end_nodes(pos)
565 | */
566 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_get_end_nodes(mecab_lattice_t *lattice, size_t pos);
567 |
568 | /**
569 | * C wrapper of MeCab::Lattice::sentence()
570 | */
571 | MECAB_DLL_EXTERN const char *mecab_lattice_get_sentence(mecab_lattice_t *lattice);
572 |
573 | /**
574 | * C wrapper of MeCab::Lattice::set_sentence(sentence)
575 | */
576 | MECAB_DLL_EXTERN void mecab_lattice_set_sentence(mecab_lattice_t *lattice, const char *sentence);
577 |
578 | /**
579 | * C wrapper of MeCab::Lattice::set_sentence(sentence, len)
580 | */
581 |
582 | MECAB_DLL_EXTERN void mecab_lattice_set_sentence2(mecab_lattice_t *lattice, const char *sentence, size_t len);
583 |
584 | /**
585 | * C wrapper of MeCab::Lattice::size()
586 | */
587 | MECAB_DLL_EXTERN size_t mecab_lattice_get_size(mecab_lattice_t *lattice);
588 |
589 | /**
590 | * C wrapper of MeCab::Lattice::Z()
591 | */
592 | MECAB_DLL_EXTERN double mecab_lattice_get_z(mecab_lattice_t *lattice);
593 |
594 | /**
595 | * C wrapper of MeCab::Lattice::set_Z()
596 | */
597 | MECAB_DLL_EXTERN void mecab_lattice_set_z(mecab_lattice_t *lattice, double Z);
598 |
599 | /**
600 | * C wrapper of MeCab::Lattice::theta()
601 | */
602 | MECAB_DLL_EXTERN double mecab_lattice_get_theta(mecab_lattice_t *lattice);
603 |
604 | /**
605 | * C wrapper of MeCab::Lattice::set_theta()
606 | */
607 |
608 | MECAB_DLL_EXTERN void mecab_lattice_set_theta(mecab_lattice_t *lattice, double theta);
609 |
610 | /**
611 | * C wrapper of MeCab::Lattice::next()
612 | */
613 | MECAB_DLL_EXTERN int mecab_lattice_next(mecab_lattice_t *lattice);
614 |
615 | /**
616 | * C wrapper of MeCab::Lattice::request_type()
617 | */
618 | MECAB_DLL_EXTERN int mecab_lattice_get_request_type(mecab_lattice_t *lattice);
619 |
620 | /**
621 | * C wrapper of MeCab::Lattice::has_request_type()
622 | */
623 | MECAB_DLL_EXTERN int mecab_lattice_has_request_type(mecab_lattice_t *lattice, int request_type);
624 |
625 | /**
626 | * C wrapper of MeCab::Lattice::set_request_type()
627 | */
628 | MECAB_DLL_EXTERN void mecab_lattice_set_request_type(mecab_lattice_t *lattice, int request_type);
629 |
630 | /**
631 | * C wrapper of MeCab::Lattice::add_request_type()
632 | */
633 |
634 | MECAB_DLL_EXTERN void mecab_lattice_add_request_type(mecab_lattice_t *lattice, int request_type);
635 |
636 | /**
637 | * C wrapper of MeCab::Lattice::remove_request_type()
638 | */
639 | MECAB_DLL_EXTERN void mecab_lattice_remove_request_type(mecab_lattice_t *lattice, int request_type);
640 |
641 | /**
642 | * C wrapper of MeCab::Lattice::newNode();
643 | */
644 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_new_node(mecab_lattice_t *lattice);
645 |
646 | /**
647 | * C wrapper of MeCab::Lattice::toString()
648 | */
649 | MECAB_DLL_EXTERN const char *mecab_lattice_tostr(mecab_lattice_t *lattice);
650 |
651 | /**
652 | * C wrapper of MeCab::Lattice::toString(buf, size)
653 | */
654 | MECAB_DLL_EXTERN const char *mecab_lattice_tostr2(mecab_lattice_t *lattice, char *buf, size_t size);
655 |
656 | /**
657 | * C wrapper of MeCab::Lattice::enumNBestAsString(N)
658 | */
659 | MECAB_DLL_EXTERN const char *mecab_lattice_nbest_tostr(mecab_lattice_t *lattice, size_t N);
660 |
661 | /**
662 | * C wrapper of MeCab::Lattice::enumNBestAsString(N, buf, size)
663 | */
664 |
665 | MECAB_DLL_EXTERN const char *mecab_lattice_nbest_tostr2(mecab_lattice_t *lattice, size_t N, char *buf, size_t size);
666 |
667 | /**
668 | * C wrapper of MeCab::Lattice::has_constraint()
669 | */
670 | MECAB_DLL_EXTERN int mecab_lattice_has_constraint(mecab_lattice_t *lattice);
671 |
672 | /**
673 | * C wrapper of MeCab::Lattice::boundary_constraint(pos)
674 | */
675 | MECAB_DLL_EXTERN int mecab_lattice_get_boundary_constraint(mecab_lattice_t *lattice, size_t pos);
676 |
677 |
678 | /**
679 | * C wrapper of MeCab::Lattice::feature_constraint(pos)
680 | */
681 | MECAB_DLL_EXTERN const char *mecab_lattice_get_feature_constraint(mecab_lattice_t *lattice, size_t pos);
682 |
683 | /**
684 | * C wrapper of MeCab::Lattice::boundary_constraint(pos, type)
685 | */
686 | MECAB_DLL_EXTERN void mecab_lattice_set_boundary_constraint(mecab_lattice_t *lattice, size_t pos, int boundary_type);
687 |
688 | /**
689 | * C wrapper of MeCab::Lattice::set_feature_constraint(begin_pos, end_pos, feature)
690 | */
691 | MECAB_DLL_EXTERN void mecab_lattice_set_feature_constraint(mecab_lattice_t *lattice, size_t begin_pos, size_t end_pos, const char *feature);
692 |
693 | /**
694 | * C wrapper of MeCab::Lattice::set_result(result);
695 | */
696 | MECAB_DLL_EXTERN void mecab_lattice_set_result(mecab_lattice_t *lattice, const char *result);
697 |
698 | /**
699 | * C wrapper of MeCab::Lattice::what()
700 | */
701 | MECAB_DLL_EXTERN const char *mecab_lattice_strerror(mecab_lattice_t *lattice);
702 |
703 |
704 | /* model interface */
705 | /**
706 | * C wapper of MeCab::Model::create(argc, argv)
707 | */
708 | MECAB_DLL_EXTERN mecab_model_t *mecab_model_new(int argc, char **argv);
709 |
710 | /**
711 | * C wapper of MeCab::Model::create(arg)
712 | */
713 | MECAB_DLL_EXTERN mecab_model_t *mecab_model_new2(const char *arg);
714 |
715 | /**
716 | * C wapper of MeCab::deleteModel(model)
717 | */
718 |
719 | MECAB_DLL_EXTERN void mecab_model_destroy(mecab_model_t *model);
720 |
721 | /**
722 | * C wapper of MeCab::Model::createTagger()
723 | */
724 | MECAB_DLL_EXTERN mecab_t *mecab_model_new_tagger(mecab_model_t *model);
725 |
726 | /**
727 | * C wapper of MeCab::Model::createLattice()
728 | */
729 | MECAB_DLL_EXTERN mecab_lattice_t *mecab_model_new_lattice(mecab_model_t *model);
730 |
731 | /**
732 | * C wrapper of MeCab::Model::swap()
733 | */
734 | MECAB_DLL_EXTERN int mecab_model_swap(mecab_model_t *model, mecab_model_t *new_model);
735 |
736 | /**
737 | * C wapper of MeCab::Model::dictionary_info()
738 | */
739 | MECAB_DLL_EXTERN const mecab_dictionary_info_t* mecab_model_dictionary_info(mecab_model_t *model);
740 |
741 | /**
742 | * C wrapper of MeCab::Model::transition_cost()
743 | */
744 | MECAB_DLL_EXTERN int mecab_model_transition_cost(mecab_model_t *model,
745 | unsigned short rcAttr,
746 | unsigned short lcAttr);
747 |
748 | /**
749 | * C wrapper of MeCab::Model::lookup()
750 | */
751 | MECAB_DLL_EXTERN mecab_node_t *mecab_model_lookup(mecab_model_t *model,
752 | const char *begin,
753 | const char *end,
754 | mecab_lattice_t *lattice);
755 |
756 | /* static functions */
757 | MECAB_DLL_EXTERN int mecab_do(int argc, char **argv);
758 | MECAB_DLL_EXTERN int mecab_dict_index(int argc, char **argv);
759 | MECAB_DLL_EXTERN int mecab_dict_gen(int argc, char **argv);
760 | MECAB_DLL_EXTERN int mecab_cost_train(int argc, char **argv);
761 | MECAB_DLL_EXTERN int mecab_system_eval(int argc, char **argv);
762 | MECAB_DLL_EXTERN int mecab_test_gen(int argc, char **argv);
763 | #endif
764 |
765 | #ifdef __cplusplus
766 | }
767 | #endif
768 |
769 | /* C++ interface */
770 | #ifdef __cplusplus
771 |
772 | namespace MeCab {
typedef struct mecab_dictionary_info_t DictionaryInfo;
typedef struct mecab_path_t Path;
typedef struct mecab_node_t Node;

/* Restored: the template parameter list was lost to angle-bracket
   stripping in a bad text conversion; restored from the upstream
   mecab.h. */
template <typename N, typename P> class Allocator;
class Tagger;
779 |
780 | /**
781 |  * Lattice class
782 |  */
783 | class MECAB_DLL_CLASS_EXTERN Lattice {
784 |  public:
785 |   /**
786 |    * Clear all internal lattice data.
787 |    */
788 |   virtual void clear() = 0;
789 |
790 |   /**
791 |    * Return true if result object is available.
792 |    * @return boolean
793 |    */
794 |   virtual bool is_available() const = 0;
795 |
796 |   /**
797 |    * Return bos (begin of sentence) node.
798 |    * You can obtain all nodes via "for (const Node *node = lattice->bos_node(); node; node = node->next) {}"
799 |    * @return bos node object
800 |    */
801 |   virtual Node *bos_node() const = 0;
802 |
803 |   /**
804 |    * Return eos (end of sentence) node.
805 |    * @return eos node object
806 |    */
807 |   virtual Node *eos_node() const = 0;
808 |
809 | #ifndef SWIG
810 |   /**
811 |    * This method is used internally.
812 |    */
813 |   virtual Node **begin_nodes() const = 0;
814 |
815 |   /**
816 |    * This method is used internally.
817 |    */
818 |   virtual Node **end_nodes() const = 0;
819 | #endif
820 |
821 |   /**
822 |    * Return node linked list ending at |pos|.
823 |    * You can obtain all nodes via "for (const Node *node = lattice->end_nodes(pos); node; node = node->enext) {}"
824 |    * @param pos position of nodes. 0 <= pos < size()
825 |    * @return node linked list
826 |    */
827 |   virtual Node *end_nodes(size_t pos) const = 0;
828 |
829 |   /**
830 |    * Return node linked list starting at |pos|.
831 |    * You can obtain all nodes via "for (const Node *node = lattice->begin_nodes(pos); node; node = node->bnext) {}"
832 |    * @param pos position of nodes. 0 <= pos < size()
833 |    * @return node linked list
834 |    */
835 |   virtual Node *begin_nodes(size_t pos) const = 0;
836 |
837 |   /**
838 |    * Return sentence.
839 |    * If MECAB_NBEST or MECAB_PARTIAL mode is off, the returned pointer is the same as the one set by set_sentence().
840 |    * @return sentence
841 |    */
842 |   virtual const char *sentence() const = 0;
843 |
844 |   /**
845 |    * Set sentence. This method does not take the ownership of the object.
846 |    * @param sentence sentence
847 |    */
848 |   virtual void set_sentence(const char *sentence) = 0;
849 |
850 | #ifndef SWIG
851 |   /**
852 |    * Set sentence. This method does not take the ownership of the object.
853 |    * @param sentence sentence
854 |    * @param len length of the sentence
855 |    */
856 |   virtual void set_sentence(const char *sentence, size_t len) = 0;
857 | #endif
858 |
859 |   /**
860 |    * Return sentence size.
861 |    * @return sentence size
862 |    */
863 |   virtual size_t size() const = 0;
864 |
865 |   /**
866 |    * Set normalization factor of CRF.
867 |    * @param Z new normalization factor.
868 |    */
869 |   virtual void set_Z(double Z) = 0;
870 |
871 |   /**
872 |    * return normalization factor of CRF.
873 |    * @return normalization factor.
874 |    */
875 |   virtual double Z() const = 0;
876 |
877 |   /**
878 |    * Set temperature parameter theta.
879 |    * @param theta temperature parameter.
880 |    */
881 |   virtual void set_theta(float theta) = 0;
882 |
883 |   /**
884 |    * Return temperature parameter theta.
885 |    * @return temperature parameter.
886 |    */
887 |   virtual float theta() const = 0;
888 |
889 |   /**
890 |    * Obtain next-best result. The internal linked list structure is updated.
891 |    * You should set MECAB_NBEST request_type in advance.
892 |    * Return false if no more results are available or request_type is invalid.
893 |    * @return boolean
894 |    */
895 |   virtual bool next() = 0;
896 |
897 |   /**
898 |    * Return the current request type.
899 |    * @return request type
900 |    */
901 |   virtual int request_type() const = 0;
902 |
903 |   /**
904 |    * Return true if the object has a specified request type.
905 |    * @return boolean
906 |    */
907 |   virtual bool has_request_type(int request_type) const = 0;
908 |
909 |   /**
910 |    * Set request type.
911 |    * @param request_type new request type assigned
912 |    */
913 |   virtual void set_request_type(int request_type) = 0;
914 |
915 |   /**
916 |    * Add request type.
917 |    * @param request_type new request type added
918 |    */
919 |   virtual void add_request_type(int request_type) = 0;
920 |
921 |   /**
922 |    * Remove request type.
923 |    * @param request_type new request type removed
924 |    */
925 |   virtual void remove_request_type(int request_type) = 0;
926 |
927 | #ifndef SWIG
928 |   /**
929 |    * This method is used internally.
930 |    */
931 |   virtual Allocator<Node, Path> *allocator() const = 0;  // restored '<Node, Path>' template arguments (stripped in conversion)
932 | #endif
933 |
934 |   /**
935 |    * Return new node. The Lattice object has the ownership of the node.
936 |    * @return new node object
937 |    */
938 |   virtual Node *newNode() = 0;
939 |
940 |   /**
941 |    * Return string representation of the lattice.
942 |    * Returned object is managed by this instance. When clear/set_sentence() method
943 |    * is called, the returned buffer is initialized.
944 |    * @return string representation of the lattice
945 |    */
946 |   virtual const char *toString() = 0;
947 |
948 |   /**
949 |    * Return string representation of the node.
950 |    * Returned object is managed by this instance. When clear/set_sentence() method
951 |    * is called, the returned buffer is initialized.
952 |    * @return string representation of the node
953 |    * @param node node object
954 |    */
955 |   virtual const char *toString(const Node *node) = 0;
956 |
957 |   /**
958 |    * Return string representation of the N-best results.
959 |    * Returned object is managed by this instance. When clear/set_sentence() method
960 |    * is called, the returned buffer is initialized.
961 |    * @return string representation of the node
962 |    * @param N how many results you want to obtain
963 |    */
964 |   virtual const char *enumNBestAsString(size_t N) = 0;
965 |
966 | #ifndef SWIG
967 |   /**
968 |    * Return string representation of the lattice.
969 |    * Result is saved in the specified buffer.
970 |    * @param buf output buffer
971 |    * @param size output buffer size
972 |    * @return string representation of the lattice
973 |    */
974 |   virtual const char *toString(char *buf, size_t size) = 0;
975 |
976 |   /**
977 |    * Return string representation of the node.
978 |    * Result is saved in the specified buffer.
979 |    * @param node node object
980 |    * @param buf output buffer
981 |    * @param size output buffer size
982 |    * @return string representation of the lattice
983 |    */
984 |   virtual const char *toString(const Node *node,
985 |                                char *buf, size_t size) = 0;
986 |
987 |   /**
988 |    * Return string representation of the N-best result.
989 |    * Result is saved in the specified buffer.
990 |    * @param N how many results you want to obtain
991 |    * @param buf output buffer
992 |    * @param size output buffer size
993 |    * @return string representation of the lattice
994 |    */
995 |   virtual const char *enumNBestAsString(size_t N, char *buf, size_t size) = 0;
996 | #endif
997 |
998 |   /**
999 |    * Returns true if any parsing constraint is set
1000 |    */
1001 |   virtual bool has_constraint() const = 0;
1002 |
1003 |   /**
1004 |    * Returns the boundary constraint at the position.
1005 |    * @param pos the position of constraint
1006 |    * @return boundary constraint type
1007 |    */
1008 |   virtual int boundary_constraint(size_t pos) const = 0;
1009 |
1010 |   /**
1011 |    * Returns the token constraint at the position.
1012 |    * @param pos the beginning position of constraint.
1013 |    * @return constrained node starting at the position.
1014 |    */
1015 |   virtual const char *feature_constraint(size_t pos) const = 0;
1016 |
1017 |   /**
1018 |    * Set parsing constraint for partial parsing mode.
1019 |    * @param pos the position of the boundary
1020 |    * @param boundary_constraint_type the type of boundary
1021 |    */
1022 |   virtual void set_boundary_constraint(size_t pos,
1023 |                                        int boundary_constraint_type) = 0;
1024 |
1025 |   /**
1026 |    * Set parsing constraint for partial parsing mode.
1027 |    * @param begin_pos the starting position of the constrained token.
1028 |    * @param end_pos the ending position of the constrained token.
1029 |    * @param feature the feature of the constrained token.
1030 |    */
1031 |   virtual void set_feature_constraint(
1032 |       size_t begin_pos, size_t end_pos,
1033 |       const char *feature) = 0;
1034 |
1035 |   /**
1036 |    * Set golden parsing results for unittesting.
1037 |    * @param result the parsing result written in the standard mecab output.
1038 |    */
1039 |   virtual void set_result(const char *result) = 0;
1040 |
1041 |   /**
1042 |    * Return error string.
1043 |    * @return error string
1044 |    */
1045 |   virtual const char *what() const = 0;
1046 |
1047 |   /**
1048 |    * Set error string. given string is copied to the internal buffer.
1049 |    * @param str new error string
1050 |    */
1051 |   virtual void set_what(const char *str) = 0;
1052 |
1053 | #ifndef SWIG
1054 |   /**
1055 |    * Create new Lattice object
1056 |    * @return new Lattice object
1057 |    */
1058 |   static Lattice *create();
1059 | #endif
1060 |
1061 |   virtual ~Lattice() {}
1062 | };
1063 |
1064 | /**
1065 |  * Model class
1066 |  */
1067 | class MECAB_DLL_CLASS_EXTERN Model {
1068 |  public:
1069 |   /**
1070 |    * Return DictionaryInfo linked list.
1071 |    * @return DictionaryInfo linked list
1072 |    */
1073 |   virtual const DictionaryInfo *dictionary_info() const = 0;
1074 |
1075 |   /**
1076 |    * Return transition cost from rcAttr to lcAttr.
1077 |    * @return transition cost
1078 |    */
1079 |   virtual int transition_cost(unsigned short rcAttr,
1080 |                               unsigned short lcAttr) const = 0;
1081 |
1082 |   /**
1083 |    * perform common prefix search from the range [begin, end).
1084 |    * |lattice| takes the ownership of return value.
1085 |    * @return node linked list.
1086 |    */
1087 |   virtual Node *lookup(const char *begin, const char *end,
1088 |                        Lattice *lattice) const = 0;
1089 |
1090 |   /**
1091 |    * Create a new Tagger object.
1092 |    * All returned tagger object shares this model object as a parsing model.
1093 |    * Never delete this model object before deleting tagger object.
1094 |    * @return new Tagger object
1095 |    */
1096 |   virtual Tagger *createTagger() const = 0;
1097 |
1098 |   /**
1099 |    * Create a new Lattice object.
1100 |    * @return new Lattice object
1101 |    */
1102 |   virtual Lattice *createLattice() const = 0;
1103 |
1104 |   /**
1105 |    * Swap the instance with |model|.
1106 |    * The ownership of |model| always moves to this instance,
1107 |    * meaning that passed |model| will no longer be accessible after calling this method.
1108 |    * return true if new model is swapped successfully.
1109 |    * This method is thread safe. All taggers created by
1110 |    * Model::createTagger() method will also be updated asynchronously.
1111 |    * No need to stop the parsing thread explicitly before swapping model object.
1112 |    * @return boolean
1113 |    * @param model new model which is going to be swapped with the current model.
1114 |    */
1115 |   virtual bool swap(Model *model) = 0;
1116 |
1117 |   /**
1118 |    * Return a version string
1119 |    * @return version string
1120 |    */
1121 |   static const char *version();
1122 |
1123 |   virtual ~Model() {}
1124 |
1125 | #ifndef SWIG
1126 |   /**
1127 |    * Factory method to create a new Model with a specified main's argc/argv-style parameters.
1128 |    * Return NULL if new model cannot be initialized. Use MeCab::getLastError() to obtain the
1129 |    * cause of the errors.
1130 |    * @return new Model object
1131 |    * @param argc number of parameters
1132 |    * @param argv parameter list
1133 |    */
1134 |   static Model* create(int argc, char **argv);
1135 |
1136 |   /**
1137 |    * Factory method to create a new Model with a string parameter representation, i.e.,
1138 |    * "-d /user/local/mecab/dic/ipadic -Ochasen".
1139 |    * Return NULL if new model cannot be initialized. Use MeCab::getLastError() to obtain the
1140 |    * cause of the errors.
1141 |    * @return new Model object
1142 |    * @param arg single string representation of the argument.
1143 |    */
1144 |   static Model* create(const char *arg);
1145 | #endif
1146 | };
1147 |
1148 | /**
1149 |  * Tagger class
1150 |  */
1151 | class MECAB_DLL_CLASS_EXTERN Tagger {
1152 |  public:
1153 |   /**
1154 |    * Handy static method.
1155 |    * Return true if lattice is parsed successfully.
1156 |    * This function is equivalent to
1157 |    * {
1158 |    *   Tagger *tagger = model.createTagger();
1159 |    *   const bool result = tagger->parse(lattice);
1160 |    *   delete tagger;
1161 |    *   return result;
1162 |    * }
1163 |    * @return boolean
1164 |    */
1165 |   static bool parse(const Model &model, Lattice *lattice);
1166 |
1167 |   /**
1168 |    * Parse lattice object.
1169 |    * Return true if lattice is parsed successfully.
1170 |    * A sentence must be set to the lattice with Lattice::set_sentence() before calling this method.
1171 |    * Parsed node object can be obtained with Lattice::bos_node().
1172 |    * This method is thread safe.
1173 |    * @param lattice lattice object
1174 |    * @return boolean
1175 |    */
1176 |   virtual bool parse(Lattice *lattice) const = 0;
1177 |
1178 |   /**
1179 |    * Parse given sentence and return parsed result as string.
1180 |    * You should not delete the returned string. The returned buffer
1181 |    * is overwritten when parse method is called again.
1182 |    * This method is NOT thread safe.
1183 |    * @param str sentence
1184 |    * @return parsed result
1185 |    */
1186 |   virtual const char* parse(const char *str) = 0;
1187 |
1188 |   /**
1189 |    * Parse given sentence and return Node object.
1190 |    * You should not delete the returned node object. The returned buffer
1191 |    * is overwritten when parse method is called again.
1192 |    * You can traverse all nodes via Node::next member.
1193 |    * This method is NOT thread safe.
1194 |    * @param str sentence
1195 |    * @return bos node object
1196 |    */
1197 |   virtual const Node* parseToNode(const char *str) = 0;
1198 |
1199 |   /**
1200 |    * Parse given sentence and obtain N-best results as a string format.
1201 |    * Currently, N must be 1 <= N <= 512 due to the limitation of the buffer size.
1202 |    * You should not delete the returned string. The returned buffer
1203 |    * is overwritten when parse method is called again.
1204 |    * This method is DEPRECATED. Use Lattice class.
1205 |    * @param N how many results you want to obtain
1206 |    * @param str sentence
1207 |    * @return parsed result
1208 |    */
1209 |   virtual const char* parseNBest(size_t N, const char *str) = 0;
1210 |
1211 |   /**
1212 |    * Initialize N-best enumeration with a sentence.
1213 |    * Return true if initialization finishes successfully.
1214 |    * N-best result is obtained by calling next() or nextNode() in sequence.
1215 |    * This method is NOT thread safe.
1216 |    * This method is DEPRECATED. Use Lattice class.
1217 |    * @param str sentence
1218 |    * @return boolean
1219 |    */
1220 |   virtual bool parseNBestInit(const char *str) = 0;
1221 |
1222 |   /**
1223 |    * Return next-best parsed result. You must call parseNBestInit() in advance.
1224 |    * Return NULL if no more result is available.
1225 |    * This method is NOT thread safe.
1226 |    * This method is DEPRECATED. Use Lattice class.
1227 |    * @return node object
1228 |    */
1229 |   virtual const Node* nextNode() = 0;
1230 |
1231 |   /**
1232 |    * Return next-best parsed result. You must call parseNBestInit() in advance.
1233 |    * Return NULL if no more result is available.
1234 |    * This method is NOT thread safe.
1235 |    * This method is DEPRECATED. Use Lattice class.
1236 |    * @return parsed result
1237 |    */
1238 |   virtual const char* next() = 0;
1239 |
1240 |   /**
1241 |    * Return formatted node object. The format is specified with
1242 |    * --unk-format, --bos-format, --eos-format, and --eon-format respectively.
1243 |    * You should not delete the returned string. The returned buffer
1244 |    * is overwritten when parse method is called again.
1245 |    * This method is NOT thread safe.
1246 |    * This method is DEPRECATED. Use Lattice class.
1247 |    * @param node node object.
1248 |    * @return parsed result
1249 |    */
1250 |   virtual const char* formatNode(const Node *node) = 0;
1251 |
1252 | #ifndef SWIG
1253 |   /**
1254 |    * The same as parse() method, but input length and output buffer are passed.
1255 |    * Return parsed result as string. The result pointer is the same as |ostr|.
1256 |    * Return NULL, if parsed result string cannot be stored within |olen| bytes.
1257 |    * @param str sentence
1258 |    * @param len sentence length
1259 |    * @param ostr output buffer
1260 |    * @param olen output buffer length
1261 |    * @return parsed result
1262 |    */
1263 |   virtual const char* parse(const char *str, size_t len, char *ostr, size_t olen) = 0;
1264 |
1265 |   /**
1266 |    * The same as parse() method, but input length can be passed.
1267 |    * @param str sentence
1268 |    * @param len sentence length
1269 |    * @return parsed result
1270 |    */
1271 |   virtual const char* parse(const char *str, size_t len) = 0;
1272 |
1273 |   /**
1274 |    * The same as parseToNode(), but input length can be passed.
1275 |    * @param str sentence
1276 |    * @param len sentence length
1277 |    * @return node object
1278 |    */
1279 |   virtual const Node* parseToNode(const char *str, size_t len) = 0;
1280 |
1281 |   /**
1282 |    * The same as parseNBest(), but input length can be passed.
1283 |    * @param N how many results you want to obtain
1284 |    * @param str sentence
1285 |    * @param len sentence length
1286 |    * @return parsed result
1287 |    */
1288 |   virtual const char* parseNBest(size_t N, const char *str, size_t len) = 0;
1289 |
1290 |   /**
1291 |    * The same as parseNBestInit(), but input length can be passed.
1292 |    * @param str sentence
1293 |    * @param len sentence length
1294 |    * @return boolean
1295 |    * @return parsed result
1296 |    */
1297 |   virtual bool parseNBestInit(const char *str, size_t len) = 0;
1298 |
1299 |   /**
1300 |    * The same as next(), but output buffer can be passed.
1301 |    * Return NULL if more than |olen| buffer is required to store output string.
1302 |    * @param ostr output buffer
1303 |    * @param olen output buffer length
1304 |    * @return parsed result
1305 |    */
1306 |   virtual const char* next(char *ostr , size_t olen) = 0;
1307 |
1308 |   /**
1309 |    * The same as parseNBest(), but input length and output buffer can be passed.
1310 |    * Return NULL if more than |olen| buffer is required to store output string.
1311 |    * @param N how many results you want to obtain
1312 |    * @param str input sentence
1313 |    * @param len input sentence length
1314 |    * @param ostr output buffer
1315 |    * @param olen output buffer length
1316 |    * @return parsed result
1317 |    */
1318 |   virtual const char* parseNBest(size_t N, const char *str,
1319 |                                  size_t len, char *ostr, size_t olen) = 0;
1320 |
1321 |   /**
1322 |    * The same as formatNode(), but output buffer can be passed.
1323 |    * Return NULL if more than |olen| buffer is required to store output string.
1324 |    * @param node node object
1325 |    * @param ostr output buffer
1326 |    * @param olen output buffer length
1327 |    * @return parsed result
1328 |    */
1329 |   virtual const char* formatNode(const Node *node, char *ostr, size_t olen) = 0;
1330 | #endif
1331 |
1332 |   /**
1333 |    * Set request type.
1334 |    * This method is DEPRECATED. Use Lattice::set_request_type(MECAB_PARTIAL).
1335 |    * @param request_type new request type assigned
1336 |    */
1337 |   virtual void set_request_type(int request_type) = 0;
1338 |
1339 |   /**
1340 |    * Return the current request type.
1341 |    * This method is DEPRECATED. Use Lattice class.
1342 |    * @return request type
1343 |    */
1344 |   virtual int request_type() const = 0;
1345 |
1346 |   /**
1347 |    * Return true if partial parsing mode is on.
1348 |    * This method is DEPRECATED. Use Lattice::has_request_type(MECAB_PARTIAL).
1349 |    * @return boolean
1350 |    */
1351 |   virtual bool partial() const = 0;
1352 |
1353 |   /**
1354 |    * set partial parsing mode.
1355 |    * This method is DEPRECATED. Use Lattice::add_request_type(MECAB_PARTIAL) or Lattice::remove_request_type(MECAB_PARTIAL)
1356 |    * @param partial partial mode
1357 |    */
1358 |   virtual void set_partial(bool partial) = 0;
1359 |
1360 |   /**
1361 |    * Return lattice level.
1362 |    * This method is DEPRECATED. Use Lattice::*_request_type()
1363 |    * @return int lattice level
1364 |    */
1365 |   virtual int lattice_level() const = 0;
1366 |
1367 |   /**
1368 |    * Set lattice level.
1369 |    * This method is DEPRECATED. Use Lattice::*_request_type()
1370 |    * @param level lattice level
1371 |    */
1372 |   virtual void set_lattice_level(int level) = 0;
1373 |
1374 |   /**
1375 |    * Return true if all morphs output mode is on.
1376 |    * This method is DEPRECATED. Use Lattice::has_request_type(MECAB_ALL_MORPHS).
1377 |    * @return boolean
1378 |    */
1379 |   virtual bool all_morphs() const = 0;
1380 |
1381 |   /**
1382 |    * set all-morphs output mode.
1383 |    * This method is DEPRECATED. Use Lattice::add_request_type(MECAB_ALL_MORPHS) or Lattice::remove_request_type(MECAB_ALL_MORPHS)
1384 |    * @param all_morphs
1385 |    */
1386 |   virtual void set_all_morphs(bool all_morphs) = 0;
1387 |
1388 |   /**
1389 |    * Set temperature parameter theta.
1390 |    * @param theta temperature parameter.
1391 |    */
1392 |   virtual void set_theta(float theta) = 0;
1393 |
1394 |   /**
1395 |    * Return temperature parameter theta.
1396 |    * @return temperature parameter.
1397 |    */
1398 |   virtual float theta() const = 0;
1399 |
1400 |   /**
1401 |    * Return DictionaryInfo linked list.
1402 |    * @return DictionaryInfo linked list
1403 |    */
1404 |   virtual const DictionaryInfo* dictionary_info() const = 0;
1405 |
1406 |   /**
1407 |    * Return error string.
1408 |    * @return error string
1409 |    */
1410 |   virtual const char* what() const = 0;
1411 |
1412 |   virtual ~Tagger() {}
1413 |
1414 | #ifndef SWIG
1415 |   /**
1416 |    * Factory method to create a new Tagger with a specified main's argc/argv-style parameters.
1417 |    * Return NULL if new model cannot be initialized. Use MeCab::getLastError() to obtain the
1418 |    * cause of the errors.
1419 |    * @return new Tagger object
1420 |    * @param argc number of parameters
1421 |    * @param argv parameter list
1422 |    */
1423 |   static Tagger *create(int argc, char **argv);
1424 |
1425 |   /**
1426 |    * Factory method to create a new Tagger with a string parameter representation, i.e.,
1427 |    * "-d /user/local/mecab/dic/ipadic -Ochasen".
1428 |    * Return NULL if new model cannot be initialized. Use MeCab::getLastError() to obtain the
1429 |    * cause of the errors.
1430 |    * @return new Tagger object
1431 |    * @param arg single string representation of the argument.
1432 |    */
1433 |   static Tagger *create(const char *arg);
1434 | #endif
1435 |
1436 |   /**
1437 |    * Return a version string
1438 |    * @return version string
1439 |    */
1440 |   static const char *version();
1441 | };
1442 |
1443 | #ifndef SWIG
1444 | /**
1445 |  * Alias of Lattice::create()
1446 |  */
1447 | MECAB_DLL_EXTERN Lattice *createLattice();
1448 |
1449 | /**
1450 |  * Alias of Model::create(argc, argv)
1451 |  */
1452 | MECAB_DLL_EXTERN Model *createModel(int argc, char **argv);
1453 |
1454 | /**
1455 |  * Alias of Model::create(arg)
1456 |  */
1457 | MECAB_DLL_EXTERN Model *createModel(const char *arg);
1458 |
1459 | /**
1460 |  * Alias of Tagger::create(argc, argv)
1461 |  */
1462 | MECAB_DLL_EXTERN Tagger *createTagger(int argc, char **argv);
1463 |
1464 | /**
1465 |  * Alias of Tagger::create(arg)
1466 |  */
1467 | MECAB_DLL_EXTERN Tagger *createTagger(const char *arg);
1468 |
1469 | /**
1470 |  * delete Lattice object.
1471 |  * This method calls "delete lattice".
1472 |  * In some environment, e.g., MS-Windows, an object allocated inside a DLL must be deleted in the same DLL too.
1473 |  * @param lattice lattice object
1474 |  */
1475 | MECAB_DLL_EXTERN void deleteLattice(Lattice *lattice);
1476 |
1477 |
1478 | /**
1479 |  * delete Model object.
1480 |  * This method calls "delete model".
1481 |  * In some environment, e.g., MS-Windows, an object allocated inside a DLL must be deleted in the same DLL too.
1482 |  * @param model model object
1483 |  */
1484 | MECAB_DLL_EXTERN void deleteModel(Model *model);
1485 |
1486 | /**
1487 |  * delete Tagger object.
1488 |  * This method calls "delete tagger".
1489 |  * In some environment, e.g., MS-Windows, an object allocated inside a DLL must be deleted in the same DLL too.
1490 |  * @param tagger tagger object
1491 |  */
1492 | MECAB_DLL_EXTERN void deleteTagger(Tagger *tagger);
1493 |
1494 | /**
1495 |  * Return last error string.
1496 |  * @return error string
1497 |  */
1498 | MECAB_DLL_EXTERN const char* getLastError();
1499 |
1500 | /**
1501 |  * An alias of getLastError.
1502 |  * It is kept for backward compatibility.
1503 |  * @return error string
1504 |  */
1505 | MECAB_DLL_EXTERN const char* getTaggerError();
1506 | #endif
1507 | }
1508 | #endif
1509 | #endif /* MECAB_MECAB_H_ */
1510 |
--------------------------------------------------------------------------------
/fugashi/mecab.pxd:
--------------------------------------------------------------------------------
1 | cdef extern from "mecab.h":
2 |     cdef struct mecab_dictionary_info_t:  # metadata for one loaded dictionary; chained via .next
3 |         char *filename
4 |         char *charset
5 |         unsigned int size
6 |         unsigned short version
7 |         mecab_dictionary_info_t* next
8 |
9 |     cdef struct mecab_node_t:  # one node in MeCab's output (surface + feature strings)
10 |         mecab_node_t *prev
11 |         mecab_node_t *next
12 |         const char *surface
13 |         const char *feature
14 |         unsigned int id
15 |         unsigned short length
16 |         unsigned short rlength
17 |         unsigned short posid
18 |         unsigned char char_type
19 |         unsigned char stat
20 |
21 |     cdef struct mecab_model_t:  # opaque handle
22 |         pass
23 |
24 |     cdef struct mecab_t:  # opaque tagger handle
25 |         pass
26 |
27 |     cdef mecab_t* mecab_new(int argc, char **argv)
28 |     cdef mecab_model_t* mecab_model_new(int argc, char **argv)
29 |     cdef const char* mecab_sparse_tostr2(mecab_t *mecab, const char *str, size_t len)
30 |     cdef const mecab_node_t* mecab_sparse_tonode(mecab_t *mecab, const char *str)
31 |     cdef const mecab_dictionary_info_t* mecab_dictionary_info(mecab_t *mecab)
32 |
33 |     cdef char* mecab_nbest_sparse_tostr(mecab_t *mecab, size_t N, const char *str)
34 |     cdef int mecab_nbest_init(mecab_t *mecab, const char *str)
35 |     cdef const char* mecab_strerror(mecab_t *mecab)
36 |
37 |     cdef int mecab_dict_index(int argc, char**argv)
38 |
39 |     cdef int mecab_nbest_init(mecab_t* mecab, const char* str)  # NOTE(review): duplicate of the declaration on line 34 — harmless but could be removed
40 |     cdef const mecab_node_t* mecab_nbest_next_tonode(mecab_t* mecab)
--------------------------------------------------------------------------------
/fugashi/tests/test_basic.py:
--------------------------------------------------------------------------------
1 | ## NOTE: These tests are written against the 2.1.2 binary distribution of Unidic.
2 |
3 | from fugashi import Tagger, UnidicFeatures17
4 | import pytest
5 |
6 | # (input, expected -Owakati output) pairs
7 | WAKATI_TESTS = (
8 |     ("すももももももももの内", 'すもも も もも も もも の 内'),
9 |     ("日本語ですよ", '日本 語 です よ'),
10 |     ("深海魚は、深海に生息する魚類の総称。", '深海 魚 は 、 深海 に 生息 する 魚類 の 総称 。'),
11 | )
12 |
13 | # (input, expected token surface list) pairs
14 | TOKENIZER_TESTS = (
15 |     ('あなたは新米の魔女。', ['あなた', 'は', '新米', 'の', '魔女', '。']),
16 |     ('パートナーである猫と共に、見知らぬ町へやってきたばかりです。', ['パートナー', 'で', 'ある', '猫', 'と', '共', 'に', '、', '見知ら', 'ぬ', '町', 'へ', 'やっ', 'て', 'き', 'た', 'ばかり', 'です', '。']),
17 | )
18 |
19 | # (input, expected 2-best wakati output, newline-separated) pairs
20 | NBEST_TESTS = (
21 |     ('外国人参政権', '外国 人参 政権 \n外国 人 参政 権'),
22 |     ("深海魚は、深海に生息する魚類の総称。", '深海 魚 は 、 深海 に 生息 する 魚類 の 総称 。 \n深 海魚 は 、 深海 に 生息 する 魚類 の 総称 。'),
23 |     ("東京都の大人気ない主材料", '東京 都 の 大 人気 ない 主材 料 \n東京 都 の 大 人気 ない 主 材料')
24 | )
25 |
26 | # (input, expected joined part-of-speech strings per token) pairs
27 | POS_TESTS = (
28 |     ('日本語', ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*']),
29 | )
30 |
31 | # (input, expected aType accent field per token) pairs — aType may contain commas
32 | ACCENT_TESTS = (
33 |     ('稻村に行きました', ['0,2', '*', '0', '*', '*']),
34 | )
35 |
36 | # Last number is token index of white space
37 | WHITE_SPACE_TESTS = (
38 |     ("これは 半角スペースです", " ", 2),
39 |     ("これは\tタブ文字です", "\t", 2),
40 |     ("これは\n改行文字です", "\n", 2),
41 |     ("これは\n\t 複数種類の空白文字です", "\n\t ", 2),
42 |     ("これは\n\t 複数種類の空白文字です", "\n\t ", 2),  # NOTE(review): exact duplicate of the previous case — intentional? 
43 |     ("\tタブ文字で始まる文字列", "\t", 0),
44 | )
41 | @pytest.mark.parametrize('text,wakati', WAKATI_TESTS)
42 | def test_wakati(text, wakati):
43 | tagger = Tagger('-Owakati')
44 | assert tagger.parse(text) == wakati
45 |
46 | @pytest.mark.parametrize('text,saved', TOKENIZER_TESTS)
47 | def test_tokens(text, saved):
48 | # testing the token objects is tricky, so instead just check surfaces
49 | #TODO: maybe save serialized nodes to compare?
50 | tagger = Tagger()
51 | tokens = [str(tok) for tok in tagger(text)]
52 | assert tokens == saved
53 |
54 | @pytest.mark.parametrize('text,saved', NBEST_TESTS)
55 | def test_nbest(text, saved):
56 | tagger = Tagger('-Owakati')
57 | assert tagger.nbest(text, 2) == saved
58 |
59 | @pytest.mark.parametrize('text,saved', NBEST_TESTS)
60 | def test_nbest_nodes(text, saved):
61 | tagger = Tagger()
62 | # parse adds a space to the end of each line
63 | saved = [ss.strip() for ss in saved.split("\n")]
64 | res = tagger.nbestToNodeList(text, 2)
65 | out = [" ".join([nn.surface for nn in nodes]) for nodes in res]
66 | assert out == saved
67 |
68 | def test_invalid_args():
69 | # Invalid args will give a NULL pointer for the Tagger object
70 | # don't try to use the null object!
71 | with pytest.raises(RuntimeError):
72 | tagger = Tagger('-fail')
73 |
74 | @pytest.mark.parametrize('text,tags', POS_TESTS)
75 | def test_pos(text, tags):
76 | # There should be a pos property when using the default tagger
77 | tagger = Tagger()
78 | tags_ = [tok.pos for tok in tagger(text)]
79 | assert tags == tags_
80 |
81 | @pytest.mark.parametrize('text,accent', ACCENT_TESTS)
82 | def test_accent(text, accent):
83 | # This checks for correct handling of feature fields containing commas as reported in #13
84 | tagger = Tagger()
85 | tokens = tagger(text)
86 | # Skip if UnidicFeatures17 is used because it doesn't have 'atype' attribute
87 | if tokens and isinstance(tokens[0].feature, UnidicFeatures17):
88 | pytest.skip()
89 | accent_ = [tok.feature.aType for tok in tokens]
90 | assert accent_ == accent
91 |
92 | def test_clobber():
93 | # Check that memory isn't clobbered by repeated parse calls
94 | tagger = Tagger()
95 | nodes1 = tagger("a\tb c d")
96 | nodes2 = tagger("x y z !")
97 |
98 | assert "a b c d".split() == [nn.surface for nn in nodes1]
99 | assert ["", "\t", " ", " "] == [nn.white_space for nn in nodes1]
100 |
101 | @pytest.mark.parametrize("text,space,idx", WHITE_SPACE_TESTS)
102 | def test_white_space(text, space, idx):
103 | tagger = Tagger()
104 | nodes = tagger.parseToNodeList(text)
105 |
106 | assert nodes[idx].white_space == space
107 |
--------------------------------------------------------------------------------
/fugashi/tests/test_ipadic.py:
--------------------------------------------------------------------------------
1 | # This is a small test to make sure ipadic is usable
2 | from fugashi import GenericTagger
3 | import pytest
4 | import ipadic
5 |
6 | # (input, expected -Owakati output) — note IPAdic segments more coarsely
7 | # than Unidic (e.g. 日本語 stays one token here).
8 | WAKATI_TESTS = (
9 |     ("すももももももももの内", 'すもも も もも も もも の 内'),
10 |     ("日本語ですよ", '日本語 です よ'),
11 |     ("深海魚は、深海に生息する魚類の総称。", '深海魚 は 、 深海 に 生息 する 魚類 の 総称 。'),
12 | )
11 |
12 | @pytest.mark.parametrize('text,wakati', WAKATI_TESTS)
13 | def test_wakati(text, wakati):
14 | tagger = GenericTagger(ipadic.MECAB_ARGS + ' -Owakati')
15 | assert tagger.parse(text) == wakati
16 |
--------------------------------------------------------------------------------
/fugashi/tests/test_nbest.py:
--------------------------------------------------------------------------------
1 | from fugashi import Tagger
2 | import string
3 | import pytest
4 |
5 | # NOTE: The bulk test is written against unidic-3.1.0+2021-08-31, fed with
6 | # corpus cc-100, accessible at:
7 | # https://data.statmt.org/cc-100/ja.txt.xz
8 | path_to_jatxt = "ja.txt"  # local path to the corpus; not checked in to the repo
9 |
10 | @pytest.mark.skip(reason="This test requires too much data to run in CI.")
11 | def test_bulk():
12 |     """Stress-test nbest parsing over a large corpus.
13 |
14 |     For every line, request 10 best paths and record (a) lines where fewer
15 |     than 10 paths came back and (b) paths whose concatenated surfaces do not
16 |     reproduce the input line. Progress is printed at doubling intervals.
17 |     """
18 |     tagger = Tagger()
19 |
20 |     insufficient_paths = []      # line numbers with < 10 nbest paths
21 |     incomplete_hypothesis = []   # (line, path) pairs that don't recover the input
22 |
23 |     def print_result(i):
24 |         # Summarize both failure lists at checkpoint `i`.
25 |         if insufficient_paths:
26 |             print(f"{i} - Not enough paths (counts = {len(insufficient_paths)}): {', '.join(insufficient_paths)}")
27 |         else:
28 |             print(f"{i} - All lines parsed with enough paths.")
29 |
30 |         if incomplete_hypothesis:
31 |             print(f"{i} - Original line not recovered (counts = {len(incomplete_hypothesis)}): {', '.join(f'{entry[0]}:{entry[1]}' for entry in incomplete_hypothesis)}")
32 |         else:
33 |             print(f"{i} - All lines recovered")
34 |
35 |     replace_chars = string.whitespace + '\x00'
36 |     log_interval = 65536  # doubles after each progress report
37 |     with open(path_to_jatxt, 'r', encoding='utf8') as fin:
38 |         for i, line in enumerate(fin):
39 |
40 |             # Tagger ignores whitespace and stops parsing at '\x00'.
41 |             # This preprocessing is done for the completeness criteria
42 |             for c in replace_chars:
43 |                 line = line.replace(c, '')
44 |
45 |             if not line:
46 |                 continue
47 |
48 |             paths = tagger.nbestToNodeList(line, 10)
49 |             if len(paths) != 10:
50 |                 insufficient_paths.append(str(i))
51 |
52 |             # Every hypothesis should concatenate back to the input line.
53 |             for j, p in enumerate(paths):
54 |                 if ''.join(w.surface for w in p) != line:
55 |                     incomplete_hypothesis.append((i,j))
56 |
57 |             if i >= log_interval:
58 |                 log_interval*=2
59 |                 print_result(i)
60 |
61 |     print_result('Final')
62 |
63 | if __name__ == '__main__':
64 |     test_bulk()
57 |
--------------------------------------------------------------------------------
/fugashi_util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 | import subprocess
4 |
5 |
def mecab_config(com="mecab-config"):
    """Query the mecab-config script for build parameters.

    Args:
        com: name or path of the mecab-config executable.

    Returns:
        A (config_lines, data_files) tuple. config_lines holds, in order,
        the include directory, the library path flags, and the library
        flags as reported by mecab-config; there are never extra data
        files to bundle on this path.

    Raises:
        OSError / subprocess.CalledProcessError when the executable is
        missing or fails, which check_libmecab treats as "try the next
        strategy".
    """
    # check_output always returns bytes unless told otherwise, so decode
    # directly via the encoding parameter instead of the old Python-2-era
    # isinstance(output, str) dance.
    output = subprocess.check_output(
        [com, "--inc-dir", "--libs-only-L", "--libs-only-l"],
        encoding="utf-8",
    )
    return output.split("\n"), []
13 |
14 |
def mecab_config_windows():
    """Hard-coded build parameters for a standard Windows MeCab install.

    Returns a (mecab_details, data_files) tuple on Windows, or None on
    other platforms so check_libmecab falls through to the next strategy.
    """
    if os.name != "nt":
        return None

    win_mecab_dir = r"C:\mecab"
    # The DLL can live apart from the SDK on some installs, so track the
    # binary directory separately even though they match here.
    win_bin_dir = win_mecab_dir
    details = (win_mecab_dir, win_mecab_dir, "libmecab")
    dlls = ["{}\\libmecab.dll".format(win_bin_dir)]
    return details, dlls
25 |
26 |
def mecab_config_cygwin():
    """Fetch prebuilt Cygwin MeCab binaries and report build parameters.

    On Cygwin this clones Koichi Yasuoka's prebuilt mecab-cygwin repo into
    the build directory and returns a (mecab_details, data_files) tuple.
    On any other platform it returns None so check_libmecab moves on to
    the next strategy.
    """
    # Bail out before touching the filesystem on non-Cygwin hosts.
    # Previously os.chdir ran first and `rep` was referenced while
    # undefined, so non-Cygwin platforms only worked because the caller
    # swallowed the resulting exception.
    if not platform.system().startswith("CYGWIN"):
        return None

    os.chdir("build/mecab")
    rep = "mecab-cygwin64" if platform.machine() == "x86_64" else "mecab-cygwin32"
    subprocess.run(
        ["git", "clone", "--depth=1", "https://github.com/KoichiYasuoka/" + rep]
    )
    mecab_details = (
        "build/mecab/" + rep + "/include",
        "build/mecab/" + rep + "/lib",
        "mecab stdc++",
    )
    return mecab_details, []
41 |
42 |
def check_libmecab():
    """Get MeCab build parameters.

    Where available the mecab-config script is used; if it's not available
    the parameters are figured out per-platform (Windows default install
    paths, or prebuilt Cygwin binaries).

    Returns:
        The (config_lines, data_files) tuple from the first strategy
        that succeeds.

    Raises:
        RuntimeError: if no strategy could determine the parameters.
    """

    configs = [
        mecab_config_windows,
        mecab_config,
        mecab_config_cygwin,
    ]

    # A few scripts will use a build directory. Save where we start so we can
    # reset the directory after each build step.
    cwd = os.getcwd()
    os.makedirs("build/mecab", exist_ok=True)
    for config in configs:
        try:
            out = config()
            os.chdir(cwd)
            if out:
                return out
        except Exception:
            # Failure is normal, typically just a different platform;
            # restore the cwd and try the next strategy. `Exception` rather
            # than a bare except so KeyboardInterrupt/SystemExit propagate.
            os.chdir(cwd)
    raise RuntimeError("Could not configure working env. Have you installed MeCab?")
69 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["Cython~=3.0.11", "setuptools>=77", "setuptools-scm>=8"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "fugashi"
7 | description = "Cython MeCab wrapper for fast, pythonic Japanese tokenization."
8 | readme = "README.md"
9 | requires-python = ">=3.9"
10 | license = "MIT AND BSD-3-Clause"
11 | license-files = ["LICENSE", "LICENSE.mecab"]
12 | authors = [{ name = "Paul O'Leary McCann", email = "polm@dampfkraft.com" }]
13 | classifiers = [
14 | "Environment :: Console",
15 | "Intended Audience :: Developers",
16 | "Intended Audience :: Science/Research",
17 | "Natural Language :: Japanese",
18 | "Operating System :: POSIX :: Linux",
19 | "Operating System :: MacOS :: MacOS X",
20 | "Programming Language :: Cython",
21 | "Programming Language :: Python :: 3",
22 | "Topic :: Text Processing :: Linguistic",
23 | ]
24 | dynamic = ["version"]
25 |
26 | [project.optional-dependencies]
27 | unidic = ["unidic"]
28 | unidic-lite = ["unidic-lite"]
29 |
30 | [project.scripts]
31 | fugashi = "fugashi.cli:main"
32 | fugashi-info = "fugashi.cli:info"
33 | fugashi-build-dict = "fugashi.cli:build_dict"
34 |
35 | [project.urls]
36 | source = "https://github.com/polm/fugashi"
37 | funding = "https://github.com/sponsors/polm"
38 |
39 | [tool.setuptools]
40 | include-package-data = false
41 |
42 | [tool.setuptools.packages.find]
43 | exclude = ["fugashi.tests*"]
44 |
45 | [tool.setuptools_scm]
46 |
47 | [tool.pytest.ini_options]
48 | testpaths = ["fugashi/tests"]
49 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os
import pathlib
import sys

from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext as _build_ext

# This is a side effect of how build works, see:
# https://github.com/pypa/setuptools/discussions/3134
sys.path.append(str(pathlib.Path(__file__).parent))
from fugashi_util import check_libmecab

# get the build parameters
output, dll_files = check_libmecab()

# pad the list in case something's missing; the configuration strategies
# return fewer than five fields, and empty strings make the .split() calls
# below yield empty lists for the missing slots.
mecab_config = list(output) + ([""] * 5)
include_dirs = mecab_config[0].split()
library_dirs = mecab_config[1].split()
libraries = mecab_config[2].split()
extra_objects = mecab_config[3].split()
extra_link_args = mecab_config[4].split()


# Windows DLL related prep.
# By default the DLL will be bundled on windows, but you can turn it off with
# an env var.
bundle_dll = False
fugashi_package_files = []
# Unset or "0" means "do bundle"; any other value disables bundling.
should_bundle = os.environ.get("FUGASHI_NO_BUNDLE_DLL", "") in ("", "0")
if sys.platform == "win32" and should_bundle:
    bundle_dll = True
    # package_data expects names relative to the package dir, not full paths
    fugashi_package_files = [pathlib.Path(i).name for i in dll_files]
34 |
35 |
class build_ext(_build_ext):
    """Custom behavior for build_ext.

    This is only run when bundling DLLs on Windows, which requires copying
    files around."""

    def run(self):
        if bundle_dll:
            # Editable installs keep the package inside the source tree;
            # regular builds stage it under the build directory.
            if self.editable_mode:
                base = pathlib.Path(__file__).parent
            else:
                base = pathlib.Path(self.build_lib)
            target = base / "fugashi"
            for dll in dll_files:
                self.copy_file(dll, target)
        return super().run()
51 |
52 |
# The wrapper is a single Cython extension module linked against libmecab,
# using the parameters discovered by check_libmecab above.
extensions = Extension(
    "fugashi.fugashi",
    ["fugashi/fugashi.pyx"],
    libraries=libraries,
    library_dirs=library_dirs,
    include_dirs=include_dirs,
    extra_objects=extra_objects,
    extra_link_args=extra_link_args,
)

# Most project metadata lives in pyproject.toml; only the extension build
# details and the Windows DLL bundling hooks are configured here.
setup(
    ext_modules=[extensions],
    cmdclass={"build_ext": build_ext},
    package_data={"fugashi": fugashi_package_files},
)
68 |
--------------------------------------------------------------------------------