├── .github ├── FUNDING.yml ├── macos-build.sh └── workflows │ ├── actions │ ├── build-manylinux-aarch64 │ │ └── action.yml │ └── build-manylinux │ │ └── action.yml │ ├── entrypoint.sh │ ├── manylinux1.yml │ ├── osx.yml │ ├── test_manylinux.yml │ └── windows.yml ├── .gitignore ├── CITATION.cff ├── Dockerfile ├── LICENSE ├── LICENSE.mecab ├── README.md ├── fugashi.png ├── fugashi ├── __init__.py ├── cli.py ├── fugashi.pyx ├── include │ └── mecab │ │ └── mecab.h ├── mecab.pxd └── tests │ ├── test_basic.py │ ├── test_ipadic.py │ └── test_nbest.py ├── fugashi_util.py ├── pyproject.toml └── setup.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: polm 2 | -------------------------------------------------------------------------------- /.github/macos-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FLAGS="--enable-utf8-only" 4 | X86_TRIPLET=x86_64-apple-macos10.9 5 | ARM_TRIPLET=arm64-apple-macos11 6 | 7 | 8 | git clone --depth=1 https://github.com/taku910/mecab.git 9 | cd mecab/mecab 10 | 11 | rm -rf src/.libs-arm64 src/.libs-x86_64 src/.libs.combined 12 | 13 | ./configure $FLAGS --host="arm-apple-darwin22.1.0 " CXX="clang++ -target $ARM_TRIPLET" CC="clang" 14 | 15 | make clean 16 | # nproc doesnt exist on the runner 17 | make -j$(sysctl -n hw.logicalcpu_max) 18 | 19 | mv src/.libs src/.libs-arm64 20 | 21 | ./configure $FLAGS --host="x86_64-apple-darwin22.1.0 " CXX="clang++ -target $X86_TRIPLET" CC="clang" 22 | 23 | make clean 24 | make -j$(sysctl -n hw.logicalcpu_max) 25 | 26 | mv src/.libs src/.libs-x86_64 27 | 28 | rm -rf src/.libs.combined 29 | mkdir src/.libs.combined 30 | 31 | lipo -create src/.libs-arm64/libmecab.2.dylib src/.libs-x86_64/libmecab.2.dylib -output src/.libs.combined/libmecab.2.dylib 32 | 33 | lipo -create src/.libs-arm64/libmecab.a src/.libs-x86_64/libmecab.a -output src/.libs.combined/libmecab.a 34 | 35 | cp src/.libs-arm64/libmecab.lai src/.libs.combined/libmecab.lai 36 | 37 | ls src/.libs-arm64/*.o src/.libs-arm64/mecab* | while read line; do 38 | echo $line 39 | lipo -create $line src/.libs-x86_64/$(basename $line) -output src/.libs.combined/$(basename $line) 40 | done 41 | 42 | cd src/.libs.combined 43 | ln -s ../libmecab.la libmecab.la 44 | ln -s libmecab.2.dylib libmecab.dylib 45 | cd ../.. 46 | mv src/.libs.combined src/.libs 47 | 48 | sudo make install 49 | cd ../.. 
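# At this point lipo has merged the arm64 and x86_64 builds into a universal
# libmecab and it has been installed system-wide; cibuildwheel below builds
# the macOS wheels against that install and bundles the library into them.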
50 | 51 | python -m pip install --upgrade pip 52 | python -m pip install cibuildwheel==2.23.3 53 | 54 | python -m cibuildwheel --platform macos --archs x86_64,arm64,universal2 --output-dir dist 55 | -------------------------------------------------------------------------------- /.github/workflows/actions/build-manylinux-aarch64/action.yml: -------------------------------------------------------------------------------- 1 | name: build linux aarch64 wheels with manylinux docker image 2 | runs: 3 | using: 'docker' 4 | image: docker://quay.io/pypa/manylinux2014_aarch64 5 | args: 6 | - .github/workflows/entrypoint.sh 7 | -------------------------------------------------------------------------------- /.github/workflows/actions/build-manylinux/action.yml: -------------------------------------------------------------------------------- 1 | name: build wheels with manylinux docker image 2 | runs: 3 | using: 'docker' 4 | image: docker://quay.io/pypa/manylinux2014_x86_64 5 | args: 6 | - .github/workflows/entrypoint.sh 7 | -------------------------------------------------------------------------------- /.github/workflows/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Install mecab, then build wheels 3 | set -e 4 | 5 | # install MeCab 6 | # TODO specify the commit used here 7 | git clone --depth=1 https://github.com/taku910/mecab.git 8 | cd mecab/mecab 9 | if [ "$(uname -m)" == "aarch64" ] 10 | then 11 | ./configure --enable-utf8-only --build=aarch64-unknown-linux-gnu 12 | else 13 | ./configure --enable-utf8-only 14 | fi 15 | make 16 | make install 17 | 18 | # Hack 19 | # see here: 20 | # https://github.com/RalfG/python-wheels-manylinux-build/issues/26 21 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/ 22 | 23 | # Build the wheels 24 | Python="cp39-cp39 cp310-cp310 cp311-cp311 cp312-cp312 cp313-cp313" 25 | for PYVER in $Python; do 26 | # build the wheels 27 | /opt/python/$PYVER/bin/pip wheel /github/workspace -w /github/workspace/wheels || { echo "Failed while buiding $PYVER wheel"; exit 1; } 28 | done 29 | 30 | # fix the wheels (bundles libs) 31 | for wheel in /github/workspace/wheels/*.whl; do 32 | if [ "$(uname -m)" == "aarch64" ] 33 | then 34 | auditwheel repair "$wheel" --plat manylinux2014_aarch64 -w /github/workspace/manylinux-aarch64-wheels 35 | else 36 | auditwheel repair "$wheel" --plat manylinux2014_x86_64 -w /github/workspace/manylinux2014-wheels 37 | fi 38 | done 39 | 40 | echo "Built wheels:" 41 | if [ "$(uname -m)" == "aarch64" ] 42 | then 43 | ls /github/workspace/manylinux-aarch64-wheels 44 | else 45 | ls /github/workspace/manylinux2014-wheels 46 | fi 47 | -------------------------------------------------------------------------------- /.github/workflows/manylinux1.yml: -------------------------------------------------------------------------------- 1 | name: Build manylinux1 wheels 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build_sdist: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Set up Python 11 | uses: actions/setup-python@v5 12 | with: 13 | python-version: '>=3.9 <3.14' 14 | - name: install MeCab 15 | run: | 16 | git clone --depth=1 https://github.com/taku910/mecab.git 17 | cd mecab/mecab 18 | ./configure --enable-utf8-only 19 | make 20 | sudo make install 21 | cd ../.. 
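      # Note: only the sdist built in this job is uploaded; the binary wheels
      # are produced by the manylinux container actions under
      # .github/workflows/actions/ (used by the jobs below).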
22 | - name: build sdist 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install twine build 26 | python -m build 27 | - name: upload to pypi if tagged 28 | if: startsWith(github.ref, 'refs/tags') 29 | env: 30 | TWINE_USERNAME: __token__ 31 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 32 | run: | 33 | twine upload dist/fugashi*.tar.gz 34 | 35 | build_linux: 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v4 39 | - name: Set up Python 40 | uses: actions/setup-python@v5 41 | with: 42 | python-version: '>=3.9 <3.14' 43 | - name: build array of wheels 44 | uses: ./.github/workflows/actions/build-manylinux/ 45 | - name: Upload Wheels 46 | uses: actions/upload-artifact@v4 47 | with: 48 | name: manylinux2014-wheels 49 | path: manylinux2014-wheels 50 | - name: Publish to PyPI if tagged 51 | if: startsWith(github.ref, 'refs/tags') 52 | env: 53 | TWINE_USERNAME: __token__ 54 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 55 | run: | 56 | python --version 57 | pip --version 58 | python -m pip install --upgrade pip 59 | pip install twine 60 | twine upload manylinux2014-wheels/fugashi*whl 61 | 62 | build_linux-aarch64: 63 | runs-on: ubuntu-latest 64 | steps: 65 | - uses: actions/checkout@v4 66 | - name: Set up Python 67 | uses: actions/setup-python@v5 68 | with: 69 | python-version: '>=3.9 <3.14' 70 | - name: Set up QEMU 71 | id: qemu 72 | uses: docker/setup-qemu-action@v1 73 | - uses: ./.github/workflows/actions/build-manylinux-aarch64/ 74 | - name: Upload Wheels 75 | uses: actions/upload-artifact@v4 76 | with: 77 | name: manylinux-aarch64-wheels 78 | path: manylinux-aarch64-wheels 79 | - name: Publish to PyPI if tagged 80 | if: startsWith(github.ref, 'refs/tags') 81 | env: 82 | TWINE_USERNAME: __token__ 83 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 84 | run: | 85 | python --version 86 | pip --version 87 | python -m pip install --upgrade pip 88 | pip install twine 89 | twine upload manylinux-aarch64-wheels/fugashi*whl 90 | -------------------------------------------------------------------------------- /.github/workflows/osx.yml: -------------------------------------------------------------------------------- 1 | name: Build OSX wheels 2 | 3 | env: 4 | CIBW_ARCHS_MACOS: "x86_64" 5 | 6 | on: [push] 7 | 8 | jobs: 9 | build_osx: 10 | runs-on: macos-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up python 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: '>=3.8 <3.14' 17 | - name: Download and build MeCab 18 | shell: bash 19 | run: | 20 | .github/macos-build.sh 21 | 22 | - name: Upload Wheels 23 | uses: actions/upload-artifact@v4 24 | with: 25 | name: osx-wheels 26 | path: dist 27 | - name: Publish to PyPI if tagged 28 | if: startsWith(github.ref, 'refs/tags') 29 | env: 30 | TWINE_USERNAME: __token__ 31 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 32 | run: | 33 | pip install twine 34 | twine upload dist/fugashi* 35 | 36 | -------------------------------------------------------------------------------- /.github/workflows/test_manylinux.yml: -------------------------------------------------------------------------------- 1 | name: test-manylinux 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | test_linux: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 12 | include: 13 | - python-version: '3.9' 14 | py-short: '39' 15 | py-short2: '39' 16 | - python-version: '3.10' 17 | py-short: '310' 18 | py-short2: '310' 19 | - python-version: '3.11' 20 | py-short: 311 21 | 
py-short2: 311 22 | - python-version: '3.12' 23 | py-short: 312 24 | py-short2: 312 25 | - python-version: '3.13' 26 | py-short: 313 27 | py-short2: 313 28 | env: 29 | PYTHON: /opt/python/cp${{ matrix.py-short }}-cp${{ matrix.py-short2 }}/bin/python 30 | steps: 31 | - uses: actions/checkout@v3 32 | - run: docker build -t fugashi . 33 | - name: setup and test 34 | run: docker run -v $(pwd):/workdir -w /workdir fugashi sh -c "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/ && $PYTHON -m pip install cython pytest wheel unidic-lite ipadic && $PYTHON -m pip install -e . && $PYTHON -m pytest" 35 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Build Python Windows wheels 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | create: 8 | 9 | 10 | jobs: 11 | build_windows: 12 | runs-on: windows-latest 13 | strategy: 14 | max-parallel: 5 15 | matrix: 16 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 17 | include: 18 | - python-version: '3.9' 19 | py-short: '39' 20 | py-short2: '39' 21 | - python-version: '3.10' 22 | py-short: '310' 23 | py-short2: '310' 24 | - python-version: '3.11' 25 | py-short: 311 26 | py-short2: 311 27 | - python-version: '3.12' 28 | py-short: 312 29 | py-short2: 312 30 | - python-version: '3.13' 31 | py-short: 313 32 | py-short2: 313 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Set up Python ${{ matrix.python-version }} 36 | uses: actions/setup-python@v5 37 | with: 38 | python-version: ${{ matrix.python-version }} 39 | - name: Cache mecab 40 | id: cache-mecab 41 | uses: actions/cache@v4 42 | with: 43 | path: C:/mecab 44 | key: mecab-win-build 45 | - name: Download MeCab Win and Unzip it 46 | if: steps.cache-mecab.outputs.cache-hit != 'true' 47 | shell: bash 48 | run: | 49 | curl -LO "https://github.com/chezou/mecab/releases/download/mecab-0.996-msvc-5/mecab-msvc-x64.zip" 50 | unzip -o "mecab-msvc-x64.zip" -d c:/mecab 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip setuptools 54 | pip install build delvewheel setuptools-scm 55 | - name: Build wheel 56 | run: | 57 | python -m build --wheel 58 | env: 59 | FUGASHI_NO_BUNDLE_DLL: 1 60 | - name: Repair wheel 61 | run: | 62 | python -m delvewheel repair --add-path=C:/mecab ./dist/fugashi-*.whl 63 | - name: Upload Wheel 64 | uses: actions/upload-artifact@v4 65 | with: 66 | name: win-wheels-${{ matrix.python-version }} 67 | path: wheelhouse 68 | - name: Check wheels 69 | shell: bash 70 | run: | 71 | ls -la 72 | VERSION=$(python -m setuptools_scm) 73 | pip install "wheelhouse/fugashi-${VERSION}-cp${{ matrix.py-short }}-cp${{ matrix.py-short2 }}-win_amd64.whl" 74 | - name: Publish to PyPI if tagged 75 | if: startsWith(github.ref, 'refs/tags') 76 | env: 77 | TWINE_USERNAME: __token__ 78 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 79 | shell: bash 80 | run: | 81 | pip install twine 82 | twine upload wheelhouse/fugashi* 83 | 84 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | 
wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | /fugashi/fugashi.c 128 | /fugashi/libmecab.dll 129 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | preferred-citation: 3 | type: article 4 | message: "If you use fugashi in research, it would be appreciated if you cite this paper."
5 | authors: 6 | - family-names: "McCann" 7 | given-names: "Paul" 8 | orcid: "https://orcid.org/0000-0003-3376-8772" 9 | title: "fugashi, a Tool for Tokenizing Japanese in Python" 10 | doi: "10.18653/v1/2020.nlposs-1.7" 11 | journal: "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)" 12 | year: 2020 13 | month: 11 14 | start: 44 15 | end: 51 16 | 17 | 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux2014_x86_64 2 | 3 | RUN git clone --depth=1 https://github.com/taku910/mecab.git && \ 4 | cd mecab/mecab && \ 5 | ./configure --enable-utf8-only && \ 6 | make && \ 7 | make install 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Paul O'Leary McCann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE.mecab: -------------------------------------------------------------------------------- 1 | Copyright (c) 2001-2008, Taku Kudo 2 | Copyright (c) 2004-2008, Nippon Telegraph and Telephone Corporation 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are 6 | permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above 9 | copyright notice, this list of conditions and the 10 | following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the 14 | following disclaimer in the documentation and/or other 15 | materials provided with the distribution. 16 | 17 | * Neither the name of the Nippon Telegraph and Telegraph Corporation 18 | nor the names of its contributors may be used to endorse or 19 | promote products derived from this software without specific 20 | prior written permission. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 23 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 24 | PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 28 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Open in Streamlit](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://fugashi.streamlit.app) 2 | [![Current PyPI packages](https://badge.fury.io/py/fugashi.svg)](https://pypi.org/project/fugashi/) 3 | ![Test Status](https://github.com/polm/fugashi/workflows/test-manylinux/badge.svg) 4 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/fugashi)](https://pypi.org/project/fugashi/) 5 | ![Supported Platforms](https://img.shields.io/badge/platforms-linux%20macosx%20windows-blue) 6 | 7 | # fugashi 8 | 9 | fugashi by Irasutoya 10 | 11 | fugashi is a Cython wrapper for [MeCab](https://taku910.github.io/mecab/), a 12 | Japanese tokenizer and morphological analysis tool. Wheels are provided for 13 | Linux, OSX (Intel), and Win64, and UniDic is [easy to install](#installing-a-dictionary). 14 | 15 | **issueを英語で書く必要はありません。** 16 | 17 | Check out the [interactive demo][], see the [blog post](https://www.dampfkraft.com/nlp/fugashi.html) for background 18 | on why fugashi exists and some of the design decisions, or see [this 19 | guide][guide] for a basic introduction to Japanese tokenization. 20 | 21 | [guide]: https://www.dampfkraft.com/nlp/how-to-tokenize-japanese.html 22 | [interactive demo]: https://fugashi.streamlit.app 23 | 24 | If you are on a platform for which wheels are not provided, you'll need to 25 | install MeCab first. It's recommended you install [from 26 | source](https://github.com/taku910/mecab). If you need to build from source on 27 | Windows, [@chezou's fork](https://github.com/chezou/mecab) is recommended; see 28 | [issue #44](https://github.com/polm/fugashi/issues/44#issuecomment-954426115) 29 | for an explanation of the problems with the official repo. 30 | 31 | Known platforms without wheels: 32 | 33 | - musl-based distros like alpine [#77](https://github.com/polm/fugashi/issues/77) 34 | - PowerPC 35 | - Windows 32bit 36 | 37 | ## Usage 38 | 39 | ```python 40 | from fugashi import Tagger 41 | 42 | tagger = Tagger('-Owakati') 43 | text = "麩菓子は、麩を主材料とした日本の菓子。" 44 | tagger.parse(text) 45 | # => '麩 菓子 は 、 麩 を 主材 料 と し た 日本 の 菓子 。' 46 | for word in tagger(text): 47 | print(word, word.feature.lemma, word.pos, sep='\t') 48 | # "feature" is the Unidic feature data as a named tuple 49 | ``` 50 | 51 | ## Installing a Dictionary 52 | 53 | fugashi requires a dictionary. [UniDic](https://unidic.ninjal.ac.jp/) is 54 | recommended, and two easy-to-install versions are provided. 
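The two options are listed just below. Once one of them is installed, you can confirm which dictionary a `Tagger` actually loaded; a minimal sketch using the `dictionary_info` property (each entry is a plain dict whose fields come from MeCab's dictionary info struct, and the `info` helper in `fugashi/cli.py` prints the same data):

```python
from fugashi import Tagger

tagger = Tagger()
for info in tagger.dictionary_info:
    # keys: 'filename', 'charset', 'size' (number of entries), 'version'
    print(info["filename"], info["charset"], info["size"])
```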
55 | 56 | - [unidic-lite](https://github.com/polm/unidic-lite), a slightly modified version 2.1.2 of Unidic (from 2013) that's relatively small 57 | - [unidic](https://github.com/polm/unidic-py), the latest UniDic 3.1.0, which is 770MB on disk and requires a separate download step 58 | 59 | If you just want to make sure things work you can start with `unidic-lite`, but 60 | for more serious processing `unidic` is recommended. For production use you'll 61 | generally want to generate your own dictionary too; for details see the [MeCab 62 | documentation](https://taku910.github.io/mecab/learn.html). 63 | 64 | To get either of these dictionaries, you can install them directly using `pip` 65 | or do the below: 66 | 67 | ```sh 68 | pip install 'fugashi[unidic-lite]' 69 | 70 | # The full version of UniDic requires a separate download step 71 | pip install 'fugashi[unidic]' 72 | python -m unidic download 73 | ``` 74 | 75 | For more information on the different MeCab dictionaries available, see [this article](https://www.dampfkraft.com/nlp/japanese-tokenizer-dictionaries.html). 76 | 77 | ## Dictionary Use 78 | 79 | fugashi is written with the assumption you'll use Unidic to process Japanese, 80 | but it supports arbitrary dictionaries. 81 | 82 | If you're using a dictionary besides Unidic you can use the GenericTagger like this: 83 | 84 | ```python 85 | from fugashi import GenericTagger 86 | tagger = GenericTagger() 87 | 88 | # parse can be used as normal 89 | tagger.parse('something') 90 | # features from the dictionary can be accessed by field numbers 91 | for word in tagger(text): 92 | print(word.surface, word.feature[0]) 93 | ``` 94 | 95 | You can also create a dictionary wrapper to get feature information as a named tuple. 96 | 97 | ```python 98 | from fugashi import GenericTagger, create_feature_wrapper 99 | CustomFeatures = create_feature_wrapper('CustomFeatures', 'alpha beta gamma') 100 | tagger = GenericTagger(wrapper=CustomFeatures) 101 | for word in tagger.parseToNodeList(text): 102 | print(word.surface, word.feature.alpha) 103 | ``` 104 | 105 | ## Citation 106 | 107 | If you use fugashi in research, it would be appreciated if you cite this paper. You can read it at [the ACL Anthology](https://www.aclweb.org/anthology/2020.nlposs-1.7/) or [on Arxiv](https://arxiv.org/abs/2010.06858). 108 | 109 | @inproceedings{mccann-2020-fugashi, 110 | title = "fugashi, a Tool for Tokenizing {J}apanese in Python", 111 | author = "McCann, Paul", 112 | booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)", 113 | month = nov, 114 | year = "2020", 115 | address = "Online", 116 | publisher = "Association for Computational Linguistics", 117 | url = "https://www.aclweb.org/anthology/2020.nlposs-1.7", 118 | pages = "44--51", 119 | abstract = "Recent years have seen an increase in the number of large-scale multilingual NLP projects. However, even in such projects, languages with special processing requirements are often excluded. One such language is Japanese. Japanese is written without spaces, tokenization is non-trivial, and while high quality open source tokenizers exist they can be hard to use and lack English documentation. This paper introduces fugashi, a MeCab wrapper for Python, and gives an introduction to tokenizing Japanese.", 120 | } 121 | 122 | ## Alternatives 123 | 124 | If you have a problem with fugashi feel free to open an issue. However, there 125 | are some cases where it might be better to use a different library. 
126 | 127 | - If you don't want to deal with installing MeCab at all, try [SudachiPy](https://github.com/WorksApplications/sudachi.rs). 128 | - If you need to work with Korean, try [pymecab-ko](https://github.com/NoUnique/pymecab-ko) or [KoNLPy](https://konlpy.org/en/latest/). 129 | 130 | ## License and Copyright Notice 131 | 132 | fugashi is released under the terms of the [MIT license](./LICENSE). Please 133 | copy it far and wide. 134 | 135 | fugashi is a wrapper for MeCab, and fugashi wheels include MeCab binaries. 136 | MeCab is copyrighted free software by Taku Kudo `` and Nippon 137 | Telegraph and Telephone Corporation, and is redistributed under the [BSD 138 | License](./LICENSE.mecab). 139 | -------------------------------------------------------------------------------- /fugashi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polm/fugashi/60594f930d49bcfb1d4d9c72d310bfa3301444d9/fugashi.png -------------------------------------------------------------------------------- /fugashi/__init__.py: -------------------------------------------------------------------------------- 1 | from .fugashi import * 2 | -------------------------------------------------------------------------------- /fugashi/cli.py: -------------------------------------------------------------------------------- 1 | from fugashi import GenericTagger, Tagger, build_dictionary 2 | import sys 3 | import fileinput 4 | 5 | 6 | def main(): 7 | """ 8 | This is a simple wrapper for fugashi so you can test it from the command line. 9 | Like the mecab binary, it treats each line of stdin as one sentence. You can 10 | pass tagger arguments here too. 11 | """ 12 | args = " ".join(sys.argv[1:]) 13 | 14 | # This should work if you specify a different dictionary, 15 | # but it should also work with the pip unidic. 16 | # Try the GenericTagger and then try the Unidic tagger. 17 | try: 18 | tagger = GenericTagger(args, quiet=True) 19 | except RuntimeError: 20 | tagger = Tagger(args) 21 | 22 | for line in fileinput.input([]): 23 | print(tagger.parse(line.strip())) 24 | 25 | 26 | def info(): 27 | """Print configuration info.""" 28 | args = " ".join(sys.argv[1:]) 29 | try: 30 | tagger = GenericTagger(args, quiet=True) 31 | except RuntimeError: 32 | tagger = Tagger(args) 33 | # TODO get the fugashi version here too 34 | print("Fugashi dictionary info:") 35 | print("-----") 36 | for di in tagger.dictionary_info: 37 | for field in "version size charset filename".split(): 38 | print((field + ":").ljust(10), di[field]) 39 | print("-----") 40 | 41 | 42 | def build_dict(): 43 | """EXPERIMENTAL A wrapper for MeCab's user dictionary building command. 44 | 45 | This also defaults to utf8. 
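    A hypothetical invocation (the paths are illustrative, and this assumes
    the console script is installed as fugashi-build-dict):

        fugashi-build-dict -d /path/to/base/dic -u user.dic user_entries.csv

    Everything after the injected charset flags is passed to MeCab's
    mecab-dict-index, so its usual options (-d for the base dicdir, -u for
    the output user dictionary) apply.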
46 | """ 47 | # TODO simplify using pip-installed dictionaries as base 48 | args = sys.argv[0] + " -f utf8 -t utf8 " + " ".join(sys.argv[1:]) 49 | print(args) 50 | build_dictionary(args) 51 | -------------------------------------------------------------------------------- /fugashi/fugashi.pyx: -------------------------------------------------------------------------------- 1 | #cython: language_level=3 2 | from fugashi.mecab cimport (mecab_new, mecab_sparse_tostr2, mecab_t, mecab_node_t, 3 | mecab_sparse_tonode, mecab_nbest_sparse_tostr, 4 | mecab_dictionary_info_t, mecab_dictionary_info, 5 | mecab_model_new, mecab_strerror, mecab_dict_index, 6 | mecab_nbest_init, mecab_nbest_next_tonode) 7 | from collections import namedtuple 8 | import os 9 | import csv 10 | import shlex 11 | import sys 12 | from libc.stdlib cimport malloc, free 13 | 14 | # field names can be found in the dicrc file distributed with Unidic or here: 15 | # https://unidic.ninjal.ac.jp/faq 16 | 17 | # 2.1.2 src schema 18 | UnidicFeatures17 = namedtuple('UnidicFeatures17', 19 | ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron ' 20 | 'orthBase pronBase goshu iType iForm fType fForm').split(' ')) 21 | 22 | # 2.1.2 bin schema 23 | # The unidic-mecab-2.1.2_bin distribution adds kana accent fields. 24 | UnidicFeatures26 = namedtuple('UnidicFeatures26', 25 | ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron ' 26 | 'orthBase pronBase goshu iType iForm fType fForm ' 27 | 'kana kanaBase form formBase iConType fConType aType ' 28 | 'aConType aModeType').split(' ')) 29 | 30 | # schema used in 2.2.0, 2.3.0 31 | UnidicFeatures29 = namedtuple('UnidicFeatures29', 'pos1 pos2 pos3 pos4 cType ' 32 | 'cForm lForm lemma orth pron orthBase pronBase goshu iType iForm fType ' 33 | 'fForm iConType fConType type kana kanaBase form formBase aType aConType ' 34 | 'aModType lid lemma_id'.split(' ')) 35 | 36 | cdef class Node: 37 | """Generic Nodes are modeled after the data returned from MeCab. 
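    Each Node corresponds to a single token in the input; whitespace that
    appeared before the token is not part of surface but is available
    through the white_space property.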
38 | 39 | Some data is in a strict format using enums, but most useful data is in the 40 | feature string, which is an untokenized CSV string.""" 41 | cdef const mecab_node_t* c_node 42 | cdef str _surface 43 | cdef str _ws 44 | cdef object features 45 | cdef object wrapper 46 | 47 | def __init__(self): 48 | pass 49 | 50 | def __repr__(self): 51 | if self.stat == 0 or self.stat == 1: 52 | return self.surface 53 | elif self.stat == 2: 54 | return '' 55 | elif self.stat == 3: 56 | return '' 57 | else: 58 | return self.surface 59 | 60 | 61 | @property 62 | def surface(self): 63 | if self._surface is None: 64 | pass 65 | return self._surface 66 | 67 | @surface.setter 68 | def surface(self, ss): 69 | self._surface = ss 70 | 71 | @property 72 | def feature(self): 73 | if self.features is None: 74 | self.set_feature(self.c_node.feature) 75 | return self.features 76 | 77 | @property 78 | def feature_raw(self): 79 | return self.c_node.feature.decode('utf-8') 80 | 81 | @property 82 | def length(self): 83 | return self.c_node.length 84 | 85 | @property 86 | def rlength(self): 87 | return self.c_node.rlength 88 | 89 | @property 90 | def posid(self): 91 | return self.c_node.posid 92 | 93 | @property 94 | def char_type(self): 95 | return self.c_node.char_type 96 | 97 | @property 98 | def stat(self): 99 | return self.c_node.stat 100 | 101 | @property 102 | def is_unk(self): 103 | return self.stat == 1 104 | 105 | @property 106 | def white_space(self): 107 | if self._ws is None: 108 | return '' 109 | return self._ws 110 | 111 | @white_space.setter 112 | def white_space(self, ws): 113 | self._ws = ws 114 | 115 | cdef list pad_none(self, list fields): 116 | try: 117 | d = len(self.wrapper._fields) - len(fields) 118 | except AttributeError: 119 | d = 0 120 | return fields + [None] * d 121 | 122 | cdef void set_feature(self, bytes feature): 123 | raw = feature.decode('utf-8') 124 | if '"' in raw: 125 | # This happens when a field contains commas. In Unidic this only 126 | # happens for the "aType" field used for accent data, and then only 127 | # a minority of the time. 128 | fields = next(csv.reader([raw])) 129 | else: 130 | fields = raw.split(',') 131 | fields = self.pad_none(fields) 132 | self.features = self.wrapper(*fields) 133 | 134 | @staticmethod 135 | cdef Node wrap(const mecab_node_t* c_node, object wrapper): 136 | cdef Node node = Node.__new__(Node) 137 | node.c_node = c_node 138 | node.wrapper = wrapper 139 | 140 | return node 141 | 142 | cdef class UnidicNode(Node): 143 | """A Unidic specific node type. 144 | 145 | At present this just adds a convenience function to get the four-field POS 146 | value. 147 | """ 148 | 149 | @property 150 | def pos(self): 151 | return "{},{},{},{}".format(*self.feature[:4]) 152 | 153 | @staticmethod 154 | cdef UnidicNode wrap(const mecab_node_t* c_node, object wrapper): 155 | # This has to be copied from the base node to change the type 156 | cdef UnidicNode node = UnidicNode.__new__(UnidicNode) 157 | node.c_node = c_node 158 | node.wrapper = wrapper 159 | 160 | return node 161 | 162 | def make_tuple(*args): 163 | """Take variable number of args, return tuple. 164 | 165 | The tuple constructor actually has a different type signature than the 166 | namedtuple constructor. This is a wrapper to give it the same interface. 167 | """ 168 | return tuple(args) 169 | 170 | FAILMESSAGE = """ 171 | Failed initializing MeCab. 
Please see the README for possible solutions: 172 | 173 | https://github.com/polm/fugashi 174 | 175 | If you are still having trouble, please file an issue here, and include the 176 | ERROR DETAILS below: 177 | 178 | https://github.com/polm/fugashi/issues 179 | 180 | issueを英語で書く必要はありません。 181 | 182 | ------------------- ERROR DETAILS ------------------------""" 183 | 184 | cdef str get_error_details(int argc, char** argv): 185 | """Instantiate a Model to get output from MeCab. 186 | 187 | Due to an upstream bug, errors in Tagger intialization don't give useful 188 | error output.""" 189 | model = mecab_model_new(argc, argv) 190 | return mecab_strerror(NULL).decode('utf-8') 191 | 192 | cdef str get_detailed_error(list args, int argc, char** argv): 193 | """Generate guide to solving initialization errors.""" 194 | msg = FAILMESSAGE + "\n" 195 | msg += "arguments: " + str(args) + "\n" 196 | msg += get_error_details(argc, argv) + "\n" 197 | msg += '----------------------------------------------------------\n' 198 | return msg 199 | 200 | 201 | cdef class GenericTagger: 202 | """Generic Tagger, supports any dictionary. 203 | 204 | By default dictionary features are wrapped in a tuple. If you want you can 205 | provide a namedtuple or similar container for them as an argument to the 206 | constructor. 207 | """ 208 | 209 | cdef mecab_t* c_tagger 210 | cdef object wrapper 211 | cdef dict _cache 212 | 213 | def __init__(self, args='', wrapper=make_tuple, quiet=False): 214 | # The first argument is ignored because in the MeCab binary the argc 215 | # and argv for the process are used here. 216 | args = [b'fugashi', b'-C'] + [bytes(arg, 'utf-8') for arg in shlex.split(args)] 217 | cdef int argc = len(args) 218 | cdef char** argv = malloc(argc * sizeof(char*)) 219 | for ii, arg in enumerate(args): 220 | argv[ii] = arg 221 | 222 | self.c_tagger = mecab_new(argc, argv) 223 | if self.c_tagger == NULL: 224 | # In theory mecab_strerror should return an error string from MeCab 225 | # It doesn't seem to work and just returns b'' though, so this will 226 | # have to do. 227 | msg = "Failed initializing MeCab" 228 | if not quiet: 229 | msg = get_detailed_error(args, argc, argv) 230 | free(argv) 231 | raise RuntimeError(msg) 232 | free(argv) 233 | self.wrapper = wrapper 234 | self._cache = {} 235 | 236 | def __call__(self, text): 237 | """Wrapper for parseToNodeList.""" 238 | return self.parseToNodeList(text) 239 | 240 | def parse(self, str text): 241 | btext = bytes(text, 'utf-8') 242 | out = mecab_sparse_tostr2(self.c_tagger, btext, len(btext)).decode('utf-8') 243 | # MeCab always adds a newline, and in wakati mode it adds a space. 244 | # The reason for this is unclear but may be for terminal use. 245 | # It's never helpful, so remove it. 246 | return out.rstrip() 247 | 248 | cdef wrap(self, const mecab_node_t* node): 249 | # This function just exists so subclasses can override the node type. 250 | return Node.wrap(node, self.wrapper) 251 | 252 | def parseToNodeList(self, text): 253 | # cstr = bytes(text, 'utf-8') 254 | bstr = bytes(text, 'utf-8') 255 | cdef const mecab_node_t* node = mecab_sparse_tonode(self.c_tagger, bstr) 256 | 257 | # A nodelist always contains one each of BOS and EOS (beginning/end of 258 | # sentence) nodes. Since they have no information on them and MeCab 259 | # doesn't do any kind of sentence tokenization they're not useful in 260 | # the output and will be removed here. 
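        # (In mecab.h these are MECAB_BOS_NODE, stat == 2, and MECAB_EOS_NODE,
        # stat == 3; the stat check inside the loop below is what drops the
        # EOS node.)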
261 | 262 | # Node that on the command line this behavior is different, and each 263 | # line is treated as a sentence. 264 | 265 | out = [] 266 | while node.next: 267 | node = node.next 268 | if node.stat == 3: # eos node 269 | return out 270 | nn = self.wrap(node) 271 | 272 | # TODO maybe add an option to this function that doesn't cache the 273 | # surface. Not caching here is faster but means node surfaces are 274 | # invalidated on the next call of this function. 275 | 276 | # In theory the input string should be re-usable, but it's hard to 277 | # track ownership of it in Python properly. 278 | 279 | # avoid new string allocations 280 | # TODO try lru cache instead of intern (reason: good to age stuff out) 281 | surf = node.surface[:node.length] 282 | shash = hash(surf) 283 | if shash not in self._cache: 284 | self._cache[shash] = sys.intern(surf.decode("utf-8")) 285 | nn.surface = self._cache[shash] 286 | 287 | # do the same for whitespace 288 | nodelen = node.rlength - node.length 289 | pnode = node.prev 290 | ws = pnode.surface[pnode.length : pnode.length + nodelen] 291 | wshash = hash(ws) 292 | if wshash not in self._cache: 293 | self._cache[wshash] = sys.intern(ws.decode("utf-8")) 294 | nn.white_space = self._cache[wshash] 295 | 296 | out.append(nn) 297 | 298 | def nbest(self, text, num=10): 299 | """Return the n-best possible tokenizations of the input, giving the 300 | output as a single string. 301 | """ 302 | 303 | cstr = bytes(text, 'utf-8') 304 | out = mecab_nbest_sparse_tostr(self.c_tagger, num, cstr).decode('utf-8') 305 | return out.rstrip() 306 | 307 | def nbestToNodeList(self, text, num=10): 308 | """Return the n-best possible tokenizations of the input, giving each 309 | as a list of nodes. 310 | """ 311 | 312 | cstr = bytes(text, 'utf-8') 313 | assert mecab_nbest_init(self.c_tagger, cstr), ( 314 | "Error at mecab_nbest_init" 315 | ) 316 | 317 | ret = [] 318 | for path in range(num): 319 | node = mecab_nbest_next_tonode(self.c_tagger) 320 | if not node: 321 | # this happens if there aren't enough paths 322 | break 323 | out = [] 324 | while node.next: 325 | node = node.next 326 | if node.stat == 3: 327 | break 328 | nn = self.wrap(node) 329 | surf = node.surface[:node.length] 330 | shash = hash(surf) 331 | 332 | if shash not in self._cache: 333 | self._cache[shash] = sys.intern(surf.decode("utf-8")) 334 | nn.surface = self._cache[shash] 335 | out.append(nn) 336 | 337 | ret.append(out) 338 | 339 | return ret 340 | 341 | @property 342 | def dictionary_info(self): 343 | """Get info on the dictionaries of the Tagger. 344 | 345 | This only exposes basic information. The C API has functions for more 346 | sophisticated access, though it's not clear how useful they are. 347 | 348 | The dictionary info structs will be returned as a list of dictionaries. 349 | If you have only the system dictionary that'll be the only dictionary, 350 | but if you specify user dictionaries they'll also be present. 351 | """ 352 | infos = [] 353 | cdef mecab_dictionary_info_t* dictinfo = mecab_dictionary_info(self.c_tagger) 354 | while dictinfo: 355 | info = {} 356 | info['filename'] = dictinfo.filename.decode('utf-8') 357 | info['charset'] = dictinfo.charset.decode('utf-8') 358 | info['size'] = dictinfo.size 359 | # Note this is generally not used reliably 360 | info['version'] = dictinfo.version 361 | dictinfo = dictinfo.next 362 | infos.append(info) 363 | return infos 364 | 365 | def try_import_unidic(): 366 | """Import unidic or unidic lite if available. 
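    The full unidic package is preferred; unidic_lite is only used as a
    fallback.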
Return dicdir.""" 367 | try: 368 | import unidic 369 | return unidic.DICDIR 370 | except ImportError: 371 | try: 372 | import unidic_lite 373 | return unidic_lite.DICDIR 374 | except ImportError: 375 | # This is OK, just give up. 376 | return 377 | 378 | cdef class Tagger(GenericTagger): 379 | """Default tagger. Detects the correct Unidic feature format. 380 | 381 | Unidic 2.1.2 (17 field) and 2.2, 2.3 format (29 field) are supported. 382 | """ 383 | 384 | def __init__(self, arg=''): 385 | # Use pip installed unidic if available 386 | unidicdir = try_import_unidic() 387 | if unidicdir: 388 | mecabrc = os.path.join(unidicdir, 'mecabrc') 389 | arg = '-r "{}" -d "{}" '.format(mecabrc, unidicdir) + arg 390 | 391 | super().__init__(arg) 392 | 393 | fields = self.parseToNodeList("日本")[0].feature_raw.split(',') 394 | 395 | if len(fields) == 17: 396 | self.wrapper = UnidicFeatures17 397 | elif len(fields) == 26: 398 | self.wrapper = UnidicFeatures26 399 | elif len(fields) == 29: 400 | self.wrapper = UnidicFeatures29 401 | else: 402 | raise RuntimeError("Unknown dictionary format, use a GenericTagger.") 403 | 404 | # This needs to be overridden to change the node type. 405 | cdef wrap(self, const mecab_node_t* node): 406 | return UnidicNode.wrap(node, self.wrapper) 407 | 408 | def create_feature_wrapper(name, fields, default=None): 409 | """Create a namedtuple based wrapper for dictionary features. 410 | 411 | This sets the default values for the namedtuple to None since in most cases 412 | unks will have fewer fields. 413 | 414 | The resulting type can be used as the wrapper argument to GenericTagger to 415 | support new dictionaries. 416 | """ 417 | return namedtuple(name, fields, defaults=(None,) * len(fields)) 418 | 419 | def build_dictionary(args): 420 | args = [bytes(arg, 'utf-8') for arg in shlex.split(args)] 421 | cdef int argc = len(args) 422 | cdef char** argv = malloc(argc * sizeof(char*)) 423 | for ii, arg in enumerate(args): 424 | argv[ii] = arg 425 | out = mecab_dict_index(argc, argv) 426 | free(argv) 427 | 428 | -------------------------------------------------------------------------------- /fugashi/include/mecab/mecab.h: -------------------------------------------------------------------------------- 1 | /* 2 | MeCab -- Yet Another Part-of-Speech and Morphological Analyzer 3 | 4 | Copyright(C) 2001-2011 Taku Kudo 5 | Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation 6 | */ 7 | #ifndef MECAB_MECAB_H_ 8 | #define MECAB_MECAB_H_ 9 | 10 | /* C/C++ common data structures */ 11 | 12 | /** 13 | * DictionaryInfo structure 14 | */ 15 | struct mecab_dictionary_info_t { 16 | /** 17 | * filename of dictionary 18 | * On Windows, filename is stored in UTF-8 encoding 19 | */ 20 | const char *filename; 21 | 22 | /** 23 | * character set of the dictionary. e.g., "SHIFT-JIS", "UTF-8" 24 | */ 25 | const char *charset; 26 | 27 | /** 28 | * How many words are registered in this dictionary. 29 | */ 30 | unsigned int size; 31 | 32 | /** 33 | * dictionary type 34 | * this value should be MECAB_USR_DIC, MECAB_SYS_DIC, or MECAB_UNK_DIC. 35 | */ 36 | int type; 37 | 38 | /** 39 | * left attributes size 40 | */ 41 | unsigned int lsize; 42 | 43 | /** 44 | * right attributes size 45 | */ 46 | unsigned int rsize; 47 | 48 | /** 49 | * version of this dictionary 50 | */ 51 | unsigned short version; 52 | 53 | /** 54 | * pointer to the next dictionary info. 
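   * fugashi's Tagger.dictionary_info follows this linked list and returns
   * one dict per dictionary (the system dictionary plus any user
   * dictionaries).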
55 | */ 56 | struct mecab_dictionary_info_t *next; 57 | }; 58 | 59 | /** 60 | * Path structure 61 | */ 62 | struct mecab_path_t { 63 | /** 64 | * pointer to the right node 65 | */ 66 | struct mecab_node_t* rnode; 67 | 68 | /** 69 | * pointer to the next right path 70 | */ 71 | struct mecab_path_t* rnext; 72 | 73 | /** 74 | * pointer to the left node 75 | */ 76 | struct mecab_node_t* lnode; 77 | 78 | /** 79 | * pointer to the next left path 80 | */ 81 | 82 | struct mecab_path_t* lnext; 83 | 84 | /** 85 | * local cost 86 | */ 87 | int cost; 88 | 89 | /** 90 | * marginal probability 91 | */ 92 | float prob; 93 | }; 94 | 95 | /** 96 | * Node structure 97 | */ 98 | struct mecab_node_t { 99 | /** 100 | * pointer to the previous node. 101 | */ 102 | struct mecab_node_t *prev; 103 | 104 | /** 105 | * pointer to the next node. 106 | */ 107 | struct mecab_node_t *next; 108 | 109 | /** 110 | * pointer to the node which ends at the same position. 111 | */ 112 | struct mecab_node_t *enext; 113 | 114 | /** 115 | * pointer to the node which starts at the same position. 116 | */ 117 | struct mecab_node_t *bnext; 118 | 119 | /** 120 | * pointer to the right path. 121 | * this value is NULL if MECAB_ONE_BEST mode. 122 | */ 123 | struct mecab_path_t *rpath; 124 | 125 | /** 126 | * pointer to the right path. 127 | * this value is NULL if MECAB_ONE_BEST mode. 128 | */ 129 | struct mecab_path_t *lpath; 130 | 131 | /** 132 | * surface string. 133 | * this value is not 0 terminated. 134 | * You can get the length with length/rlength members. 135 | */ 136 | const char *surface; 137 | 138 | /** 139 | * feature string 140 | */ 141 | const char *feature; 142 | 143 | /** 144 | * unique node id 145 | */ 146 | unsigned int id; 147 | 148 | /** 149 | * length of the surface form. 150 | */ 151 | unsigned short length; 152 | 153 | /** 154 | * length of the surface form including white space before the morph. 155 | */ 156 | unsigned short rlength; 157 | 158 | /** 159 | * right attribute id 160 | */ 161 | unsigned short rcAttr; 162 | 163 | /** 164 | * left attribute id 165 | */ 166 | unsigned short lcAttr; 167 | 168 | /** 169 | * unique part of speech id. This value is defined in "pos.def" file. 170 | */ 171 | unsigned short posid; 172 | 173 | /** 174 | * character type 175 | */ 176 | unsigned char char_type; 177 | 178 | /** 179 | * status of this model. 180 | * This value is MECAB_NOR_NODE, MECAB_UNK_NODE, MECAB_BOS_NODE, MECAB_EOS_NODE, or MECAB_EON_NODE. 181 | */ 182 | unsigned char stat; 183 | 184 | /** 185 | * set 1 if this node is best node. 186 | */ 187 | unsigned char isbest; 188 | 189 | /** 190 | * forward accumulative log summation. 191 | * This value is only available when MECAB_MARGINAL_PROB is passed. 192 | */ 193 | float alpha; 194 | 195 | /** 196 | * backward accumulative log summation. 197 | * This value is only available when MECAB_MARGINAL_PROB is passed. 198 | */ 199 | float beta; 200 | 201 | /** 202 | * marginal probability. 203 | * This value is only available when MECAB_MARGINAL_PROB is passed. 204 | */ 205 | float prob; 206 | 207 | /** 208 | * word cost. 209 | */ 210 | short wcost; 211 | 212 | /** 213 | * best accumulative cost from bos node to this node. 214 | */ 215 | long cost; 216 | }; 217 | 218 | /** 219 | * Parameters for MeCab::Node::stat 220 | */ 221 | enum { 222 | /** 223 | * Normal node defined in the dictionary. 224 | */ 225 | MECAB_NOR_NODE = 0, 226 | /** 227 | * Unknown node not defined in the dictionary. 
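   * fugashi exposes this as Node.is_unk (stat == 1).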
228 | */ 229 | MECAB_UNK_NODE = 1, 230 | /** 231 | * Virtual node representing a beginning of the sentence. 232 | */ 233 | MECAB_BOS_NODE = 2, 234 | /** 235 | * Virtual node representing a end of the sentence. 236 | */ 237 | MECAB_EOS_NODE = 3, 238 | 239 | /** 240 | * Virtual node representing a end of the N-best enumeration. 241 | */ 242 | MECAB_EON_NODE = 4 243 | }; 244 | 245 | /** 246 | * Parameters for MeCab::DictionaryInfo::type 247 | */ 248 | enum { 249 | /** 250 | * This is a system dictionary. 251 | */ 252 | MECAB_SYS_DIC = 0, 253 | 254 | /** 255 | * This is a user dictionary. 256 | */ 257 | MECAB_USR_DIC = 1, 258 | 259 | /** 260 | * This is a unknown word dictionary. 261 | */ 262 | MECAB_UNK_DIC = 2 263 | }; 264 | 265 | /** 266 | * Parameters for MeCab::Lattice::request_type 267 | */ 268 | enum { 269 | /** 270 | * One best result is obtained (default mode) 271 | */ 272 | MECAB_ONE_BEST = 1, 273 | /** 274 | * Set this flag if you want to obtain N best results. 275 | */ 276 | MECAB_NBEST = 2, 277 | /** 278 | * Set this flag if you want to enable a partial parsing mode. 279 | * When this flag is set, the input |sentence| needs to be written 280 | * in partial parsing format. 281 | */ 282 | MECAB_PARTIAL = 4, 283 | /** 284 | * Set this flag if you want to obtain marginal probabilities. 285 | * Marginal probability is set in MeCab::Node::prob. 286 | * The parsing speed will get 3-5 times slower than the default mode. 287 | */ 288 | MECAB_MARGINAL_PROB = 8, 289 | /** 290 | * Set this flag if you want to obtain alternative results. 291 | * Not implemented. 292 | */ 293 | MECAB_ALTERNATIVE = 16, 294 | /** 295 | * When this flag is set, the result linked-list (Node::next/prev) 296 | * traverses all nodes in the lattice. 297 | */ 298 | MECAB_ALL_MORPHS = 32, 299 | 300 | /** 301 | * When this flag is set, tagger internally copies the body of passed 302 | * sentence into internal buffer. 303 | */ 304 | MECAB_ALLOCATE_SENTENCE = 64 305 | }; 306 | 307 | /** 308 | * Parameters for MeCab::Lattice::boundary_constraint_type 309 | */ 310 | enum { 311 | /** 312 | * The token boundary is not specified. 313 | */ 314 | MECAB_ANY_BOUNDARY = 0, 315 | 316 | /** 317 | * The position is a strong token boundary. 318 | */ 319 | MECAB_TOKEN_BOUNDARY = 1, 320 | 321 | /** 322 | * The position is not a token boundary. 
323 | */ 324 | MECAB_INSIDE_TOKEN = 2 325 | }; 326 | 327 | /* C interface */ 328 | #ifdef __cplusplus 329 | #include 330 | #else 331 | #include 332 | #endif 333 | 334 | #ifdef __cplusplus 335 | extern "C" { 336 | #endif 337 | 338 | #ifdef _WIN32 339 | #include 340 | # ifdef DLL_EXPORT 341 | # define MECAB_DLL_EXTERN __declspec(dllexport) 342 | # define MECAB_DLL_CLASS_EXTERN __declspec(dllexport) 343 | # else 344 | # define MECAB_DLL_EXTERN __declspec(dllimport) 345 | # endif 346 | #endif 347 | 348 | #ifndef MECAB_DLL_EXTERN 349 | # define MECAB_DLL_EXTERN extern 350 | #endif 351 | 352 | #ifndef MECAB_DLL_CLASS_EXTERN 353 | # define MECAB_DLL_CLASS_EXTERN 354 | #endif 355 | 356 | typedef struct mecab_t mecab_t; 357 | typedef struct mecab_model_t mecab_model_t; 358 | typedef struct mecab_lattice_t mecab_lattice_t; 359 | typedef struct mecab_dictionary_info_t mecab_dictionary_info_t; 360 | typedef struct mecab_node_t mecab_node_t; 361 | typedef struct mecab_path_t mecab_path_t; 362 | 363 | #ifndef SWIG 364 | /* C interface */ 365 | 366 | /* old mecab interface */ 367 | /** 368 | * C wrapper of MeCab::Tagger::create(argc, argv) 369 | */ 370 | MECAB_DLL_EXTERN mecab_t* mecab_new(int argc, char **argv); 371 | 372 | /** 373 | * C wrapper of MeCab::Tagger::create(arg) 374 | */ 375 | MECAB_DLL_EXTERN mecab_t* mecab_new2(const char *arg); 376 | 377 | /** 378 | * C wrapper of MeCab::Tagger::version() 379 | */ 380 | MECAB_DLL_EXTERN const char* mecab_version(); 381 | 382 | /** 383 | * C wrapper of MeCab::getLastError() 384 | */ 385 | MECAB_DLL_EXTERN const char* mecab_strerror(mecab_t *mecab); 386 | 387 | /** 388 | * C wrapper of MeCab::deleteTagger(tagger) 389 | */ 390 | MECAB_DLL_EXTERN void mecab_destroy(mecab_t *mecab); 391 | 392 | /** 393 | * C wrapper of MeCab::Tagger:set_partial() 394 | */ 395 | MECAB_DLL_EXTERN int mecab_get_partial(mecab_t *mecab); 396 | 397 | /** 398 | * C wrapper of MeCab::Tagger::partial() 399 | */ 400 | MECAB_DLL_EXTERN void mecab_set_partial(mecab_t *mecab, int partial); 401 | 402 | /** 403 | * C wrapper of MeCab::Tagger::theta() 404 | */ 405 | MECAB_DLL_EXTERN float mecab_get_theta(mecab_t *mecab); 406 | 407 | /** 408 | * C wrapper of MeCab::Tagger::set_theta() 409 | */ 410 | MECAB_DLL_EXTERN void mecab_set_theta(mecab_t *mecab, float theta); 411 | 412 | /** 413 | * C wrapper of MeCab::Tagger::lattice_level() 414 | */ 415 | MECAB_DLL_EXTERN int mecab_get_lattice_level(mecab_t *mecab); 416 | 417 | /** 418 | * C wrapper of MeCab::Tagger::set_lattice_level() 419 | */ 420 | MECAB_DLL_EXTERN void mecab_set_lattice_level(mecab_t *mecab, int level); 421 | 422 | /** 423 | * C wrapper of MeCab::Tagger::all_morphs() 424 | */ 425 | MECAB_DLL_EXTERN int mecab_get_all_morphs(mecab_t *mecab); 426 | 427 | /** 428 | * C wrapper of MeCab::Tagger::set_all_moprhs() 429 | */ 430 | MECAB_DLL_EXTERN void mecab_set_all_morphs(mecab_t *mecab, int all_morphs); 431 | 432 | /** 433 | * C wrapper of MeCab::Tagger::parse(MeCab::Lattice *lattice) 434 | */ 435 | MECAB_DLL_EXTERN int mecab_parse_lattice(mecab_t *mecab, mecab_lattice_t *lattice); 436 | 437 | /** 438 | * C wrapper of MeCab::Tagger::parse(const char *str) 439 | */ 440 | MECAB_DLL_EXTERN const char* mecab_sparse_tostr(mecab_t *mecab, const char *str); 441 | 442 | /** 443 | * C wrapper of MeCab::Tagger::parse(const char *str, size_t len) 444 | */ 445 | MECAB_DLL_EXTERN const char* mecab_sparse_tostr2(mecab_t *mecab, const char *str, size_t len); 446 | 447 | /** 448 | * C wrapper of MeCab::Tagger::parse(const char *str, char *ostr, size_t 
olen) 449 | */ 450 | MECAB_DLL_EXTERN char* mecab_sparse_tostr3(mecab_t *mecab, const char *str, size_t len, 451 | char *ostr, size_t olen); 452 | 453 | /** 454 | * C wrapper of MeCab::Tagger::parseToNode(const char *str) 455 | */ 456 | MECAB_DLL_EXTERN const mecab_node_t* mecab_sparse_tonode(mecab_t *mecab, const char*); 457 | 458 | /** 459 | * C wrapper of MeCab::Tagger::parseToNode(const char *str, size_t len) 460 | */ 461 | MECAB_DLL_EXTERN const mecab_node_t* mecab_sparse_tonode2(mecab_t *mecab, const char*, size_t); 462 | 463 | /** 464 | * C wrapper of MeCab::Tagger::parseNBest(size_t N, const char *str) 465 | */ 466 | MECAB_DLL_EXTERN const char* mecab_nbest_sparse_tostr(mecab_t *mecab, size_t N, const char *str); 467 | 468 | /** 469 | * C wrapper of MeCab::Tagger::parseNBest(size_t N, const char *str, size_t len) 470 | */ 471 | MECAB_DLL_EXTERN const char* mecab_nbest_sparse_tostr2(mecab_t *mecab, size_t N, 472 | const char *str, size_t len); 473 | 474 | /** 475 | * C wrapper of MeCab::Tagger::parseNBest(size_t N, const char *str, char *ostr, size_t olen) 476 | */ 477 | MECAB_DLL_EXTERN char* mecab_nbest_sparse_tostr3(mecab_t *mecab, size_t N, 478 | const char *str, size_t len, 479 | char *ostr, size_t olen); 480 | 481 | /** 482 | * C wrapper of MeCab::Tagger::parseNBestInit(const char *str) 483 | */ 484 | MECAB_DLL_EXTERN int mecab_nbest_init(mecab_t *mecab, const char *str); 485 | 486 | /** 487 | * C wrapper of MeCab::Tagger::parseNBestInit(const char *str, size_t len) 488 | */ 489 | MECAB_DLL_EXTERN int mecab_nbest_init2(mecab_t *mecab, const char *str, size_t len); 490 | 491 | /** 492 | * C wrapper of MeCab::Tagger::next() 493 | */ 494 | MECAB_DLL_EXTERN const char* mecab_nbest_next_tostr(mecab_t *mecab); 495 | 496 | /** 497 | * C wrapper of MeCab::Tagger::next(char *ostr, size_t olen) 498 | */ 499 | MECAB_DLL_EXTERN char* mecab_nbest_next_tostr2(mecab_t *mecab, char *ostr, size_t olen); 500 | 501 | /** 502 | * C wrapper of MeCab::Tagger::nextNode() 503 | */ 504 | MECAB_DLL_EXTERN const mecab_node_t* mecab_nbest_next_tonode(mecab_t *mecab); 505 | 506 | /** 507 | * C wrapper of MeCab::Tagger::formatNode(const Node *node) 508 | */ 509 | MECAB_DLL_EXTERN const char* mecab_format_node(mecab_t *mecab, const mecab_node_t *node); 510 | 511 | /** 512 | * C wrapper of MeCab::Tagger::dictionary_info() 513 | */ 514 | MECAB_DLL_EXTERN const mecab_dictionary_info_t* mecab_dictionary_info(mecab_t *mecab); 515 | 516 | /* lattice interface */ 517 | /** 518 | * C wrapper of MeCab::createLattice() 519 | */ 520 | MECAB_DLL_EXTERN mecab_lattice_t *mecab_lattice_new(); 521 | 522 | /** 523 | * C wrapper of MeCab::deleteLattice(lattice) 524 | */ 525 | MECAB_DLL_EXTERN void mecab_lattice_destroy(mecab_lattice_t *lattice); 526 | 527 | /** 528 | * C wrapper of MeCab::Lattice::clear() 529 | */ 530 | MECAB_DLL_EXTERN void mecab_lattice_clear(mecab_lattice_t *lattice); 531 | 532 | /** 533 | * C wrapper of MeCab::Lattice::is_available() 534 | */ 535 | 536 | MECAB_DLL_EXTERN int mecab_lattice_is_available(mecab_lattice_t *lattice); 537 | 538 | /** 539 | * C wrapper of MeCab::Lattice::bos_node() 540 | */ 541 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_get_bos_node(mecab_lattice_t *lattice); 542 | 543 | /** 544 | * C wrapper of MeCab::Lattice::eos_node() 545 | */ 546 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_get_eos_node(mecab_lattice_t *lattice); 547 | 548 | /** 549 | * C wrapper of MeCab::Lattice::begin_nodes() 550 | */ 551 | 552 | MECAB_DLL_EXTERN mecab_node_t 
**mecab_lattice_get_all_begin_nodes(mecab_lattice_t *lattice); 553 | /** 554 | * C wrapper of MeCab::Lattice::end_nodes() 555 | */ 556 | MECAB_DLL_EXTERN mecab_node_t **mecab_lattice_get_all_end_nodes(mecab_lattice_t *lattice); 557 | 558 | /** 559 | * C wrapper of MeCab::Lattice::begin_nodes(pos) 560 | */ 561 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_get_begin_nodes(mecab_lattice_t *lattice, size_t pos); 562 | 563 | /** 564 | * C wrapper of MeCab::Lattice::end_nodes(pos) 565 | */ 566 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_get_end_nodes(mecab_lattice_t *lattice, size_t pos); 567 | 568 | /** 569 | * C wrapper of MeCab::Lattice::sentence() 570 | */ 571 | MECAB_DLL_EXTERN const char *mecab_lattice_get_sentence(mecab_lattice_t *lattice); 572 | 573 | /** 574 | * C wrapper of MeCab::Lattice::set_sentence(sentence) 575 | */ 576 | MECAB_DLL_EXTERN void mecab_lattice_set_sentence(mecab_lattice_t *lattice, const char *sentence); 577 | 578 | /** 579 | * C wrapper of MeCab::Lattice::set_sentence(sentence, len) 580 | */ 581 | 582 | MECAB_DLL_EXTERN void mecab_lattice_set_sentence2(mecab_lattice_t *lattice, const char *sentence, size_t len); 583 | 584 | /** 585 | * C wrapper of MeCab::Lattice::size() 586 | */ 587 | MECAB_DLL_EXTERN size_t mecab_lattice_get_size(mecab_lattice_t *lattice); 588 | 589 | /** 590 | * C wrapper of MeCab::Lattice::Z() 591 | */ 592 | MECAB_DLL_EXTERN double mecab_lattice_get_z(mecab_lattice_t *lattice); 593 | 594 | /** 595 | * C wrapper of MeCab::Lattice::set_Z() 596 | */ 597 | MECAB_DLL_EXTERN void mecab_lattice_set_z(mecab_lattice_t *lattice, double Z); 598 | 599 | /** 600 | * C wrapper of MeCab::Lattice::theta() 601 | */ 602 | MECAB_DLL_EXTERN double mecab_lattice_get_theta(mecab_lattice_t *lattice); 603 | 604 | /** 605 | * C wrapper of MeCab::Lattice::set_theta() 606 | */ 607 | 608 | MECAB_DLL_EXTERN void mecab_lattice_set_theta(mecab_lattice_t *lattice, double theta); 609 | 610 | /** 611 | * C wrapper of MeCab::Lattice::next() 612 | */ 613 | MECAB_DLL_EXTERN int mecab_lattice_next(mecab_lattice_t *lattice); 614 | 615 | /** 616 | * C wrapper of MeCab::Lattice::request_type() 617 | */ 618 | MECAB_DLL_EXTERN int mecab_lattice_get_request_type(mecab_lattice_t *lattice); 619 | 620 | /** 621 | * C wrapper of MeCab::Lattice::has_request_type() 622 | */ 623 | MECAB_DLL_EXTERN int mecab_lattice_has_request_type(mecab_lattice_t *lattice, int request_type); 624 | 625 | /** 626 | * C wrapper of MeCab::Lattice::set_request_type() 627 | */ 628 | MECAB_DLL_EXTERN void mecab_lattice_set_request_type(mecab_lattice_t *lattice, int request_type); 629 | 630 | /** 631 | * C wrapper of MeCab::Lattice::add_request_type() 632 | */ 633 | 634 | MECAB_DLL_EXTERN void mecab_lattice_add_request_type(mecab_lattice_t *lattice, int request_type); 635 | 636 | /** 637 | * C wrapper of MeCab::Lattice::remove_request_type() 638 | */ 639 | MECAB_DLL_EXTERN void mecab_lattice_remove_request_type(mecab_lattice_t *lattice, int request_type); 640 | 641 | /** 642 | * C wrapper of MeCab::Lattice::newNode(); 643 | */ 644 | MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_new_node(mecab_lattice_t *lattice); 645 | 646 | /** 647 | * C wrapper of MeCab::Lattice::toString() 648 | */ 649 | MECAB_DLL_EXTERN const char *mecab_lattice_tostr(mecab_lattice_t *lattice); 650 | 651 | /** 652 | * C wrapper of MeCab::Lattice::toString(buf, size) 653 | */ 654 | MECAB_DLL_EXTERN const char *mecab_lattice_tostr2(mecab_lattice_t *lattice, char *buf, size_t size); 655 | 656 | /** 657 | * C wrapper of 
MeCab::Lattice::enumNBestAsString(N) 658 | */ 659 | MECAB_DLL_EXTERN const char *mecab_lattice_nbest_tostr(mecab_lattice_t *lattice, size_t N); 660 | 661 | /** 662 | * C wrapper of MeCab::Lattice::enumNBestAsString(N, buf, size) 663 | */ 664 | 665 | MECAB_DLL_EXTERN const char *mecab_lattice_nbest_tostr2(mecab_lattice_t *lattice, size_t N, char *buf, size_t size); 666 | 667 | /** 668 | * C wrapper of MeCab::Lattice::has_constraint() 669 | */ 670 | MECAB_DLL_EXTERN int mecab_lattice_has_constraint(mecab_lattice_t *lattice); 671 | 672 | /** 673 | * C wrapper of MeCab::Lattice::boundary_constraint(pos) 674 | */ 675 | MECAB_DLL_EXTERN int mecab_lattice_get_boundary_constraint(mecab_lattice_t *lattice, size_t pos); 676 | 677 | 678 | /** 679 | * C wrapper of MeCab::Lattice::feature_constraint(pos) 680 | */ 681 | MECAB_DLL_EXTERN const char *mecab_lattice_get_feature_constraint(mecab_lattice_t *lattice, size_t pos); 682 | 683 | /** 684 | * C wrapper of MeCab::Lattice::boundary_constraint(pos, type) 685 | */ 686 | MECAB_DLL_EXTERN void mecab_lattice_set_boundary_constraint(mecab_lattice_t *lattice, size_t pos, int boundary_type); 687 | 688 | /** 689 | * C wrapper of MeCab::Lattice::set_feature_constraint(begin_pos, end_pos, feature) 690 | */ 691 | MECAB_DLL_EXTERN void mecab_lattice_set_feature_constraint(mecab_lattice_t *lattice, size_t begin_pos, size_t end_pos, const char *feature); 692 | 693 | /** 694 | * C wrapper of MeCab::Lattice::set_result(result); 695 | */ 696 | MECAB_DLL_EXTERN void mecab_lattice_set_result(mecab_lattice_t *lattice, const char *result); 697 | 698 | /** 699 | * C wrapper of MeCab::Lattice::what() 700 | */ 701 | MECAB_DLL_EXTERN const char *mecab_lattice_strerror(mecab_lattice_t *lattice); 702 | 703 | 704 | /* model interface */ 705 | /** 706 | * C wapper of MeCab::Model::create(argc, argv) 707 | */ 708 | MECAB_DLL_EXTERN mecab_model_t *mecab_model_new(int argc, char **argv); 709 | 710 | /** 711 | * C wapper of MeCab::Model::create(arg) 712 | */ 713 | MECAB_DLL_EXTERN mecab_model_t *mecab_model_new2(const char *arg); 714 | 715 | /** 716 | * C wapper of MeCab::deleteModel(model) 717 | */ 718 | 719 | MECAB_DLL_EXTERN void mecab_model_destroy(mecab_model_t *model); 720 | 721 | /** 722 | * C wapper of MeCab::Model::createTagger() 723 | */ 724 | MECAB_DLL_EXTERN mecab_t *mecab_model_new_tagger(mecab_model_t *model); 725 | 726 | /** 727 | * C wapper of MeCab::Model::createLattice() 728 | */ 729 | MECAB_DLL_EXTERN mecab_lattice_t *mecab_model_new_lattice(mecab_model_t *model); 730 | 731 | /** 732 | * C wrapper of MeCab::Model::swap() 733 | */ 734 | MECAB_DLL_EXTERN int mecab_model_swap(mecab_model_t *model, mecab_model_t *new_model); 735 | 736 | /** 737 | * C wapper of MeCab::Model::dictionary_info() 738 | */ 739 | MECAB_DLL_EXTERN const mecab_dictionary_info_t* mecab_model_dictionary_info(mecab_model_t *model); 740 | 741 | /** 742 | * C wrapper of MeCab::Model::transition_cost() 743 | */ 744 | MECAB_DLL_EXTERN int mecab_model_transition_cost(mecab_model_t *model, 745 | unsigned short rcAttr, 746 | unsigned short lcAttr); 747 | 748 | /** 749 | * C wrapper of MeCab::Model::lookup() 750 | */ 751 | MECAB_DLL_EXTERN mecab_node_t *mecab_model_lookup(mecab_model_t *model, 752 | const char *begin, 753 | const char *end, 754 | mecab_lattice_t *lattice); 755 | 756 | /* static functions */ 757 | MECAB_DLL_EXTERN int mecab_do(int argc, char **argv); 758 | MECAB_DLL_EXTERN int mecab_dict_index(int argc, char **argv); 759 | MECAB_DLL_EXTERN int mecab_dict_gen(int argc, char **argv); 760 | 
MECAB_DLL_EXTERN int mecab_cost_train(int argc, char **argv); 761 | MECAB_DLL_EXTERN int mecab_system_eval(int argc, char **argv); 762 | MECAB_DLL_EXTERN int mecab_test_gen(int argc, char **argv); 763 | #endif 764 | 765 | #ifdef __cplusplus 766 | } 767 | #endif 768 | 769 | /* C++ interface */ 770 | #ifdef __cplusplus 771 | 772 | namespace MeCab { 773 | typedef struct mecab_dictionary_info_t DictionaryInfo; 774 | typedef struct mecab_path_t Path; 775 | typedef struct mecab_node_t Node; 776 | 777 | template class Allocator; 778 | class Tagger; 779 | 780 | /** 781 | * Lattice class 782 | */ 783 | class MECAB_DLL_CLASS_EXTERN Lattice { 784 | public: 785 | /** 786 | * Clear all internal lattice data. 787 | */ 788 | virtual void clear() = 0; 789 | 790 | /** 791 | * Return true if result object is available. 792 | * @return boolean 793 | */ 794 | virtual bool is_available() const = 0; 795 | 796 | /** 797 | * Return bos (begin of sentence) node. 798 | * You can obtain all nodes via "for (const Node *node = lattice->bos_node(); node; node = node->next) {}" 799 | * @return bos node object 800 | */ 801 | virtual Node *bos_node() const = 0; 802 | 803 | /** 804 | * Return eos (end of sentence) node. 805 | * @return eos node object 806 | */ 807 | virtual Node *eos_node() const = 0; 808 | 809 | #ifndef SWIG 810 | /** 811 | * This method is used internally. 812 | */ 813 | virtual Node **begin_nodes() const = 0; 814 | 815 | /** 816 | * This method is used internally. 817 | */ 818 | virtual Node **end_nodes() const = 0; 819 | #endif 820 | 821 | /** 822 | * Return node linked list ending at |pos|. 823 | * You can obtain all nodes via "for (const Node *node = lattice->end_nodes(pos); node; node = node->enext) {}" 824 | * @param pos position of nodes. 0 <= pos < size() 825 | * @return node linked list 826 | */ 827 | virtual Node *end_nodes(size_t pos) const = 0; 828 | 829 | /** 830 | * Return node linked list starting at |pos|. 831 | * You can obtain all nodes via "for (const Node *node = lattice->begin_nodes(pos); node; node = node->bnext) {}" 832 | * @param pos position of nodes. 0 <= pos < size() 833 | * @return node linked list 834 | */ 835 | virtual Node *begin_nodes(size_t pos) const = 0; 836 | 837 | /** 838 | * Return sentence. 839 | * If MECAB_NBEST or MECAB_PARTIAL mode is off, the returned poiner is the same as the one set by set_sentence(). 840 | * @return sentence 841 | */ 842 | virtual const char *sentence() const = 0; 843 | 844 | /** 845 | * Set sentence. This method does not take the ownership of the object. 846 | * @param sentence sentence 847 | */ 848 | virtual void set_sentence(const char *sentence) = 0; 849 | 850 | #ifndef SWIG 851 | /** 852 | * Set sentence. This method does not take the ownership of the object. 853 | * @param sentence sentence 854 | * @param len length of the sentence 855 | */ 856 | virtual void set_sentence(const char *sentence, size_t len) = 0; 857 | #endif 858 | 859 | /** 860 | * Return sentence size. 861 | * @return sentence size 862 | */ 863 | virtual size_t size() const = 0; 864 | 865 | /** 866 | * Set normalization factor of CRF. 867 | * @param Z new normalization factor. 868 | */ 869 | virtual void set_Z(double Z) = 0; 870 | 871 | /** 872 | * return normalization factor of CRF. 873 | * @return normalization factor. 874 | */ 875 | virtual double Z() const = 0; 876 | 877 | /** 878 | * Set temparature parameter theta. 879 | * @param theta temparature parameter. 
880 | */ 881 | virtual void set_theta(float theta) = 0; 882 | 883 | /** 884 | * Return temparature parameter theta. 885 | * @return temparature parameter. 886 | */ 887 | virtual float theta() const = 0; 888 | 889 | /** 890 | * Obtain next-best result. The internal linked list structure is updated. 891 | * You should set MECAB_NBEST reques_type in advance. 892 | * Return false if no more results are available or request_type is invalid. 893 | * @return boolean 894 | */ 895 | virtual bool next() = 0; 896 | 897 | /** 898 | * Return the current request type. 899 | * @return request type 900 | */ 901 | virtual int request_type() const = 0; 902 | 903 | /** 904 | * Return true if the object has a specified request type. 905 | * @return boolean 906 | */ 907 | virtual bool has_request_type(int request_type) const = 0; 908 | 909 | /** 910 | * Set request type. 911 | * @param request_type new request type assigned 912 | */ 913 | virtual void set_request_type(int request_type) = 0; 914 | 915 | /** 916 | * Add request type. 917 | * @param request_type new request type added 918 | */ 919 | virtual void add_request_type(int request_type) = 0; 920 | 921 | /** 922 | * Remove request type. 923 | * @param request_type new request type removed 924 | */ 925 | virtual void remove_request_type(int request_type) = 0; 926 | 927 | #ifndef SWIG 928 | /** 929 | * This method is used internally. 930 | */ 931 | virtual Allocator *allocator() const = 0; 932 | #endif 933 | 934 | /** 935 | * Return new node. Lattice objects has the ownership of the node. 936 | * @return new node object 937 | */ 938 | virtual Node *newNode() = 0; 939 | 940 | /** 941 | * Return string representation of the lattice. 942 | * Returned object is managed by this instance. When clear/set_sentence() method 943 | * is called, the returned buffer is initialized. 944 | * @return string representation of the lattice 945 | */ 946 | virtual const char *toString() = 0; 947 | 948 | /** 949 | * Return string representation of the node. 950 | * Returned object is managed by this instance. When clear/set_sentence() method 951 | * is called, the returned buffer is initialized. 952 | * @return string representation of the node 953 | * @param node node object 954 | */ 955 | virtual const char *toString(const Node *node) = 0; 956 | 957 | /** 958 | * Return string representation of the N-best results. 959 | * Returned object is managed by this instance. When clear/set_sentence() method 960 | * is called, the returned buffer is initialized. 961 | * @return string representation of the node 962 | * @param N how many results you want to obtain 963 | */ 964 | virtual const char *enumNBestAsString(size_t N) = 0; 965 | 966 | #ifndef SWIG 967 | /** 968 | * Return string representation of the lattice. 969 | * Result is saved in the specified buffer. 970 | * @param buf output buffer 971 | * @param size output buffer size 972 | * @return string representation of the lattice 973 | */ 974 | virtual const char *toString(char *buf, size_t size) = 0; 975 | 976 | /** 977 | * Return string representation of the node. 978 | * Result is saved in the specified buffer. 979 | * @param node node object 980 | * @param buf output buffer 981 | * @param size output buffer size 982 | * @return string representation of the lattice 983 | */ 984 | virtual const char *toString(const Node *node, 985 | char *buf, size_t size) = 0; 986 | 987 | /** 988 | * Return string representation of the N-best result. 989 | * Result is saved in the specified. 
990 | * @param N how many results you want to obtain 991 | * @param buf output buffer 992 | * @param size output buffer size 993 | * @return string representation of the lattice 994 | */ 995 | virtual const char *enumNBestAsString(size_t N, char *buf, size_t size) = 0; 996 | #endif 997 | 998 | /** 999 | * Returns true if any parsing constraint is set 1000 | */ 1001 | virtual bool has_constraint() const = 0; 1002 | 1003 | /** 1004 | * Returns the boundary constraint at the position. 1005 | * @param pos the position of constraint 1006 | * @return boundary constraint type 1007 | */ 1008 | virtual int boundary_constraint(size_t pos) const = 0; 1009 | 1010 | /** 1011 | * Returns the token constraint at the position. 1012 | * @param pos the beginning position of constraint. 1013 | * @return constrained node starting at the position. 1014 | */ 1015 | virtual const char *feature_constraint(size_t pos) const = 0; 1016 | 1017 | /** 1018 | * Set parsing constraint for partial parsing mode. 1019 | * @param pos the position of the boundary 1020 | * @param boundary_constraint_type the type of boundary 1021 | */ 1022 | virtual void set_boundary_constraint(size_t pos, 1023 | int boundary_constraint_type) = 0; 1024 | 1025 | /** 1026 | * Set parsing constraint for partial parsing mode. 1027 | * @param begin_pos the starting position of the constrained token. 1028 | * @param end_pos the the ending position of the constrained token. 1029 | * @param feature the feature of the constrained token. 1030 | */ 1031 | virtual void set_feature_constraint( 1032 | size_t begin_pos, size_t end_pos, 1033 | const char *feature) = 0; 1034 | 1035 | /** 1036 | * Set golden parsing results for unittesting. 1037 | * @param result the parsing result written in the standard mecab output. 1038 | */ 1039 | virtual void set_result(const char *result) = 0; 1040 | 1041 | /** 1042 | * Return error string. 1043 | * @return error string 1044 | */ 1045 | virtual const char *what() const = 0; 1046 | 1047 | /** 1048 | * Set error string. given string is copied to the internal buffer. 1049 | * @param str new error string 1050 | */ 1051 | virtual void set_what(const char *str) = 0; 1052 | 1053 | #ifndef SWIG 1054 | /** 1055 | * Create new Lattice object 1056 | * @return new Lattice object 1057 | */ 1058 | static Lattice *create(); 1059 | #endif 1060 | 1061 | virtual ~Lattice() {} 1062 | }; 1063 | 1064 | /** 1065 | * Model class 1066 | */ 1067 | class MECAB_DLL_CLASS_EXTERN Model { 1068 | public: 1069 | /** 1070 | * Return DictionaryInfo linked list. 1071 | * @return DictionaryInfo linked list 1072 | */ 1073 | virtual const DictionaryInfo *dictionary_info() const = 0; 1074 | 1075 | /** 1076 | * Return transtion cost from rcAttr to lcAttr. 1077 | * @return transtion cost 1078 | */ 1079 | virtual int transition_cost(unsigned short rcAttr, 1080 | unsigned short lcAttr) const = 0; 1081 | 1082 | /** 1083 | * perform common prefix search from the range [begin, end). 1084 | * |lattice| takes the ownership of return value. 1085 | * @return node linked list. 1086 | */ 1087 | virtual Node *lookup(const char *begin, const char *end, 1088 | Lattice *lattice) const = 0; 1089 | 1090 | /** 1091 | * Create a new Tagger object. 1092 | * All returned tagger object shares this model object as a parsing model. 1093 | * Never delete this model object before deleting tagger object. 1094 | * @return new Tagger object 1095 | */ 1096 | virtual Tagger *createTagger() const = 0; 1097 | 1098 | /** 1099 | * Create a new Lattice object. 
1100 | * @return new Lattice object 1101 | */ 1102 | virtual Lattice *createLattice() const = 0; 1103 | 1104 | /** 1105 | * Swap the instance with |model|. 1106 | * The ownership of |model| always moves to this instance, 1107 | * meaning that passed |model| will no longer be accessible after calling this method. 1108 | * return true if new model is swapped successfully. 1109 | * This method is thread safe. All taggers created by 1110 | * Model::createTagger() method will also be updated asynchronously. 1111 | * No need to stop the parsing thread excplicitly before swapping model object. 1112 | * @return boolean 1113 | * @param model new model which is going to be swapped with the current model. 1114 | */ 1115 | virtual bool swap(Model *model) = 0; 1116 | 1117 | /** 1118 | * Return a version string 1119 | * @return version string 1120 | */ 1121 | static const char *version(); 1122 | 1123 | virtual ~Model() {} 1124 | 1125 | #ifndef SWIG 1126 | /** 1127 | * Factory method to create a new Model with a specified main's argc/argv-style parameters. 1128 | * Return NULL if new model cannot be initialized. Use MeCab::getLastError() to obtain the 1129 | * cause of the errors. 1130 | * @return new Model object 1131 | * @param argc number of parameters 1132 | * @param argv parameter list 1133 | */ 1134 | static Model* create(int argc, char **argv); 1135 | 1136 | /** 1137 | * Factory method to create a new Model with a string parameter representation, i.e., 1138 | * "-d /user/local/mecab/dic/ipadic -Ochasen". 1139 | * Return NULL if new model cannot be initialized. Use MeCab::getLastError() to obtain the 1140 | * cause of the errors. 1141 | * @return new Model object 1142 | * @param arg single string representation of the argment. 1143 | */ 1144 | static Model* create(const char *arg); 1145 | #endif 1146 | }; 1147 | 1148 | /** 1149 | * Tagger class 1150 | */ 1151 | class MECAB_DLL_CLASS_EXTERN Tagger { 1152 | public: 1153 | /** 1154 | * Handy static method. 1155 | * Return true if lattice is parsed successfully. 1156 | * This function is equivalent to 1157 | * { 1158 | * Tagger *tagger = model.createModel(); 1159 | * cosnt bool result = tagger->parse(lattice); 1160 | * delete tagger; 1161 | * return result; 1162 | * } 1163 | * @return boolean 1164 | */ 1165 | static bool parse(const Model &model, Lattice *lattice); 1166 | 1167 | /** 1168 | * Parse lattice object. 1169 | * Return true if lattice is parsed successfully. 1170 | * A sentence must be set to the lattice with Lattice:set_sentence object before calling this method. 1171 | * Parsed node object can be obtained with Lattice:bos_node. 1172 | * This method is thread safe. 1173 | * @return lattice lattice object 1174 | * @return boolean 1175 | */ 1176 | virtual bool parse(Lattice *lattice) const = 0; 1177 | 1178 | /** 1179 | * Parse given sentence and return parsed result as string. 1180 | * You should not delete the returned string. The returned buffer 1181 | * is overwritten when parse method is called again. 1182 | * This method is NOT thread safe. 1183 | * @param str sentence 1184 | * @return parsed result 1185 | */ 1186 | virtual const char* parse(const char *str) = 0; 1187 | 1188 | /** 1189 | * Parse given sentence and return Node object. 1190 | * You should not delete the returned node object. The returned buffer 1191 | * is overwritten when parse method is called again. 1192 | * You can traverse all nodes via Node::next member. 1193 | * This method is NOT thread safe. 
1194 | * @param str sentence 1195 | * @return bos node object 1196 | */ 1197 | virtual const Node* parseToNode(const char *str) = 0; 1198 | 1199 | /** 1200 | * Parse given sentence and obtain N-best results as a string format. 1201 | * Currently, N must be 1 <= N <= 512 due to the limitation of the buffer size. 1202 | * You should not delete the returned string. The returned buffer 1203 | * is overwritten when parse method is called again. 1204 | * This method is DEPRECATED. Use Lattice class. 1205 | * @param N how many results you want to obtain 1206 | * @param str sentence 1207 | * @return parsed result 1208 | */ 1209 | virtual const char* parseNBest(size_t N, const char *str) = 0; 1210 | 1211 | /** 1212 | * Initialize N-best enumeration with a sentence. 1213 | * Return true if initialization finishes successfully. 1214 | * N-best result is obtained by calling next() or nextNode() in sequence. 1215 | * This method is NOT thread safe. 1216 | * This method is DEPRECATED. Use Lattice class. 1217 | * @param str sentence 1218 | * @return boolean 1219 | */ 1220 | virtual bool parseNBestInit(const char *str) = 0; 1221 | 1222 | /** 1223 | * Return next-best parsed result. You must call parseNBestInit() in advance. 1224 | * Return NULL if no more reuslt is available. 1225 | * This method is NOT thread safe. 1226 | * This method is DEPRECATED. Use Lattice class. 1227 | * @return node object 1228 | */ 1229 | virtual const Node* nextNode() = 0; 1230 | 1231 | /** 1232 | * Return next-best parsed result. You must call parseNBestInit() in advance. 1233 | * Return NULL if no more reuslt is available. 1234 | * This method is NOT thread safe. 1235 | * This method is DEPRECATED. Use Lattice class. 1236 | * @return parsed result 1237 | */ 1238 | virtual const char* next() = 0; 1239 | 1240 | /** 1241 | * Return formatted node object. The format is specified with 1242 | * --unk-format, --bos-format, --eos-format, and --eon-format respectively. 1243 | * You should not delete the returned string. The returned buffer 1244 | * is overwritten when parse method is called again. 1245 | * This method is NOT thread safe. 1246 | * This method is DEPRECATED. Use Lattice class. 1247 | * @param node node object. 1248 | * @return parsed result 1249 | */ 1250 | virtual const char* formatNode(const Node *node) = 0; 1251 | 1252 | #ifndef SWIG 1253 | /** 1254 | * The same as parse() method, but input length and output buffer are passed. 1255 | * Return parsed result as string. The result pointer is the same as |ostr|. 1256 | * Return NULL, if parsed result string cannot be stored within |olen| bytes. 1257 | * @param str sentence 1258 | * @param len sentence length 1259 | * @param ostr output buffer 1260 | * @param olen output buffer length 1261 | * @return parsed result 1262 | */ 1263 | virtual const char* parse(const char *str, size_t len, char *ostr, size_t olen) = 0; 1264 | 1265 | /** 1266 | * The same as parse() method, but input length can be passed. 1267 | * @param str sentence 1268 | * @param len sentence length 1269 | * @return parsed result 1270 | */ 1271 | virtual const char* parse(const char *str, size_t len) = 0; 1272 | 1273 | /** 1274 | * The same as parseToNode(), but input lenth can be passed. 1275 | * @param str sentence 1276 | * @param len sentence length 1277 | * @return node object 1278 | */ 1279 | virtual const Node* parseToNode(const char *str, size_t len) = 0; 1280 | 1281 | /** 1282 | * The same as parseNBest(), but input length can be passed. 
1283 | * @param N how many results you want to obtain 1284 | * @param str sentence 1285 | * @param len sentence length 1286 | * @return parsed result 1287 | */ 1288 | virtual const char* parseNBest(size_t N, const char *str, size_t len) = 0; 1289 | 1290 | /** 1291 | * The same as parseNBestInit(), but input length can be passed. 1292 | * @param str sentence 1293 | * @param len sentence length 1294 | * @return boolean 1295 | * @return parsed result 1296 | */ 1297 | virtual bool parseNBestInit(const char *str, size_t len) = 0; 1298 | 1299 | /** 1300 | * The same as next(), but output buffer can be passed. 1301 | * Return NULL if more than |olen| buffer is required to store output string. 1302 | * @param ostr output buffer 1303 | * @param olen output buffer length 1304 | * @return parsed result 1305 | */ 1306 | virtual const char* next(char *ostr , size_t olen) = 0; 1307 | 1308 | /** 1309 | * The same as parseNBest(), but input length and output buffer can be passed. 1310 | * Return NULL if more than |olen| buffer is required to store output string. 1311 | * @param N how many results you want to obtain 1312 | * @param str input sentence 1313 | * @param len input sentence length 1314 | * @param ostr output buffer 1315 | * @param olen output buffer length 1316 | * @return parsed result 1317 | */ 1318 | virtual const char* parseNBest(size_t N, const char *str, 1319 | size_t len, char *ostr, size_t olen) = 0; 1320 | 1321 | /** 1322 | * The same as formatNode(), but output buffer can be passed. 1323 | * Return NULL if more than |olen| buffer is required to store output string. 1324 | * @param node node object 1325 | * @param ostr output buffer 1326 | * @param olen output buffer length 1327 | * @return parsed result 1328 | */ 1329 | virtual const char* formatNode(const Node *node, char *ostr, size_t olen) = 0; 1330 | #endif 1331 | 1332 | /** 1333 | * Set request type. 1334 | * This method is DEPRECATED. Use Lattice::set_request_type(MECAB_PARTIAL). 1335 | * @param request_type new request type assigned 1336 | */ 1337 | virtual void set_request_type(int request_type) = 0; 1338 | 1339 | /** 1340 | * Return the current request type. 1341 | * This method is DEPRECATED. Use Lattice class. 1342 | * @return request type 1343 | */ 1344 | virtual int request_type() const = 0; 1345 | 1346 | /** 1347 | * Return true if partial parsing mode is on. 1348 | * This method is DEPRECATED. Use Lattice::has_request_type(MECAB_PARTIAL). 1349 | * @return boolean 1350 | */ 1351 | virtual bool partial() const = 0; 1352 | 1353 | /** 1354 | * set partial parsing mode. 1355 | * This method is DEPRECATED. Use Lattice::add_request_type(MECAB_PARTIAL) or Lattice::remove_request_type(MECAB_PARTIAL) 1356 | * @param partial partial mode 1357 | */ 1358 | virtual void set_partial(bool partial) = 0; 1359 | 1360 | /** 1361 | * Return lattice level. 1362 | * This method is DEPRECATED. Use Lattice::*_request_type() 1363 | * @return int lattice level 1364 | */ 1365 | virtual int lattice_level() const = 0; 1366 | 1367 | /** 1368 | * Set lattice level. 1369 | * This method is DEPRECATED. Use Lattice::*_request_type() 1370 | * @param level lattice level 1371 | */ 1372 | virtual void set_lattice_level(int level) = 0; 1373 | 1374 | /** 1375 | * Return true if all morphs output mode is on. 1376 | * This method is DEPRECATED. Use Lattice::has_request_type(MECAB_ALL_MORPHS). 1377 | * @return boolean 1378 | */ 1379 | virtual bool all_morphs() const = 0; 1380 | 1381 | /** 1382 | * set all-morphs output mode. 1383 | * This method is DEPRECATED. 
Use Lattice::add_request_type(MECAB_ALL_MORPHS) or Lattice::remove_request_type(MECAB_ALL_MORPHS) 1384 | * @param all_morphs 1385 | */ 1386 | virtual void set_all_morphs(bool all_morphs) = 0; 1387 | 1388 | /** 1389 | * Set temparature parameter theta. 1390 | * @param theta temparature parameter. 1391 | */ 1392 | virtual void set_theta(float theta) = 0; 1393 | 1394 | /** 1395 | * Return temparature parameter theta. 1396 | * @return temparature parameter. 1397 | */ 1398 | virtual float theta() const = 0; 1399 | 1400 | /** 1401 | * Return DictionaryInfo linked list. 1402 | * @return DictionaryInfo linked list 1403 | */ 1404 | virtual const DictionaryInfo* dictionary_info() const = 0; 1405 | 1406 | /** 1407 | * Return error string. 1408 | * @return error string 1409 | */ 1410 | virtual const char* what() const = 0; 1411 | 1412 | virtual ~Tagger() {} 1413 | 1414 | #ifndef SWIG 1415 | /** 1416 | * Factory method to create a new Tagger with a specified main's argc/argv-style parameters. 1417 | * Return NULL if new model cannot be initialized. Use MeCab::getLastError() to obtain the 1418 | * cause of the errors. 1419 | * @return new Tagger object 1420 | * @param argc number of parameters 1421 | * @param argv parameter list 1422 | */ 1423 | static Tagger *create(int argc, char **argv); 1424 | 1425 | /** 1426 | * Factory method to create a new Tagger with a string parameter representation, i.e., 1427 | * "-d /user/local/mecab/dic/ipadic -Ochasen". 1428 | * Return NULL if new model cannot be initialized. Use MeCab::getLastError() to obtain the 1429 | * cause of the errors. 1430 | * @return new Model object 1431 | * @param arg single string representation of the argment. 1432 | */ 1433 | static Tagger *create(const char *arg); 1434 | #endif 1435 | 1436 | /** 1437 | * Return a version string 1438 | * @return version string 1439 | */ 1440 | static const char *version(); 1441 | }; 1442 | 1443 | #ifndef SWIG 1444 | /** 1445 | * Alias of Lattice::create() 1446 | */ 1447 | MECAB_DLL_EXTERN Lattice *createLattice(); 1448 | 1449 | /** 1450 | * Alias of Mode::create(argc, argv) 1451 | */ 1452 | MECAB_DLL_EXTERN Model *createModel(int argc, char **argv); 1453 | 1454 | /** 1455 | * Alias of Mode::create(arg) 1456 | */ 1457 | MECAB_DLL_EXTERN Model *createModel(const char *arg); 1458 | 1459 | /** 1460 | * Alias of Tagger::create(argc, argv) 1461 | */ 1462 | MECAB_DLL_EXTERN Tagger *createTagger(int argc, char **argv); 1463 | 1464 | /** 1465 | * Alias of Tagger::create(arg) 1466 | */ 1467 | MECAB_DLL_EXTERN Tagger *createTagger(const char *arg); 1468 | 1469 | /** 1470 | * delete Lattice object. 1471 | * This method calles "delete lattice". 1472 | * In some environment, e.g., MS-Windows, an object allocated inside a DLL must be deleted in the same DLL too. 1473 | * @param lattice lattice object 1474 | */ 1475 | MECAB_DLL_EXTERN void deleteLattice(Lattice *lattice); 1476 | 1477 | 1478 | /** 1479 | * delete Model object. 1480 | * This method calles "delete model". 1481 | * In some environment, e.g., MS-Windows, an object allocated inside a DLL must be deleted in the same DLL too. 1482 | * @param model model object 1483 | */ 1484 | MECAB_DLL_EXTERN void deleteModel(Model *model); 1485 | 1486 | /** 1487 | * delete Tagger object. 1488 | * This method calles "delete tagger". 1489 | * In some environment, e.g., MS-Windows, an object allocated inside a DLL must be deleted in the same DLL too. 
1490 | * @param tagger tagger object 1491 | */ 1492 | MECAB_DLL_EXTERN void deleteTagger(Tagger *tagger); 1493 | 1494 | /** 1495 | * Return last error string. 1496 | * @return error string 1497 | */ 1498 | MECAB_DLL_EXTERN const char* getLastError(); 1499 | 1500 | /** 1501 | * An alias of getLastError. 1502 | * It is kept for backward compatibility. 1503 | * @return error string 1504 | */ 1505 | MECAB_DLL_EXTERN const char* getTaggerError(); 1506 | #endif 1507 | } 1508 | #endif 1509 | #endif /* MECAB_MECAB_H_ */ 1510 | -------------------------------------------------------------------------------- /fugashi/mecab.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "mecab.h": 2 | cdef struct mecab_dictionary_info_t: 3 | char *filename 4 | char *charset 5 | unsigned int size 6 | unsigned short version 7 | mecab_dictionary_info_t* next 8 | 9 | cdef struct mecab_node_t: 10 | mecab_node_t *prev 11 | mecab_node_t *next 12 | const char *surface 13 | const char *feature 14 | unsigned int id 15 | unsigned short length 16 | unsigned short rlength 17 | unsigned short posid 18 | unsigned char char_type 19 | unsigned char stat 20 | 21 | cdef struct mecab_model_t: 22 | pass 23 | 24 | cdef struct mecab_t: 25 | pass 26 | 27 | cdef mecab_t* mecab_new(int argc, char **argv) 28 | cdef mecab_model_t* mecab_model_new(int argc, char **argv) 29 | cdef const char* mecab_sparse_tostr2(mecab_t *mecab, const char *str, size_t len) 30 | cdef const mecab_node_t* mecab_sparse_tonode(mecab_t *mecab, const char *str) 31 | cdef const mecab_dictionary_info_t* mecab_dictionary_info(mecab_t *mecab) 32 | 33 | cdef char* mecab_nbest_sparse_tostr(mecab_t *mecab, size_t N, const char *str) 34 | cdef int mecab_nbest_init(mecab_t *mecab, const char *str) 35 | cdef const char* mecab_strerror(mecab_t *mecab) 36 | 37 | cdef int mecab_dict_index(int argc, char**argv) 38 | 39 | cdef int mecab_nbest_init(mecab_t* mecab, const char* str) 40 | cdef const mecab_node_t* mecab_nbest_next_tonode(mecab_t* mecab) 41 | -------------------------------------------------------------------------------- /fugashi/tests/test_basic.py: -------------------------------------------------------------------------------- 1 | ## NOTE: These tests are written against the 2.1.2 binary distribution of Unidic.
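## A rough sketch of the API these tests exercise (assumes a UniDic dictionary
## such as unidic or unidic-lite is installed), for orientation only:
##
##     from fugashi import Tagger
##     Tagger('-Owakati').parse("日本語ですよ")              # -> '日本 語 です よ'
##     [tok.surface for tok in Tagger()("日本語ですよ")]     # word objects with .surface, .pos, .feature
##
## The expected strings in the test tables below were produced with this same API.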
2 | 3 | from fugashi import Tagger, UnidicFeatures17 4 | import pytest 5 | 6 | WAKATI_TESTS = ( 7 | ("すももももももももの内", 'すもも も もも も もも の 内'), 8 | ("日本語ですよ", '日本 語 です よ'), 9 | ("深海魚は、深海に生息する魚類の総称。", '深海 魚 は 、 深海 に 生息 する 魚類 の 総称 。'), 10 | ) 11 | 12 | TOKENIZER_TESTS = ( 13 | ('あなたは新米の魔女。', ['あなた', 'は', '新米', 'の', '魔女', '。']), 14 | ('パートナーである猫と共に、見知らぬ町へやってきたばかりです。', ['パートナー', 'で', 'ある', '猫', 'と', '共', 'に', '、', '見知ら', 'ぬ', '町', 'へ', 'やっ', 'て', 'き', 'た', 'ばかり', 'です', '。']), 15 | ) 16 | 17 | NBEST_TESTS = ( 18 | ('外国人参政権', '外国 人参 政権 \n外国 人 参政 権'), 19 | ("深海魚は、深海に生息する魚類の総称。", '深海 魚 は 、 深海 に 生息 する 魚類 の 総称 。 \n深 海魚 は 、 深海 に 生息 する 魚類 の 総称 。'), 20 | ("東京都の大人気ない主材料", '東京 都 の 大 人気 ない 主材 料 \n東京 都 の 大 人気 ない 主 材料') 21 | ) 22 | 23 | POS_TESTS = ( 24 | ('日本語', ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*']), 25 | ) 26 | 27 | ACCENT_TESTS = ( 28 | ('稻村に行きました', ['0,2', '*', '0', '*', '*']), 29 | ) 30 | 31 | # Last number is token index of white space 32 | WHITE_SPACE_TESTS = ( 33 | ("これは 半角スペースです", " ", 2), 34 | ("これは\tタブ文字です", "\t", 2), 35 | ("これは\n改行文字です", "\n", 2), 36 | ("これは\n\t 複数種類の空白文字です", "\n\t ", 2), 37 | ("これは\n\t 複数種類の空白文字です", "\n\t ", 2), 38 | ("\tタブ文字で始まる文字列", "\t", 0), 39 | ) 40 | 41 | @pytest.mark.parametrize('text,wakati', WAKATI_TESTS) 42 | def test_wakati(text, wakati): 43 | tagger = Tagger('-Owakati') 44 | assert tagger.parse(text) == wakati 45 | 46 | @pytest.mark.parametrize('text,saved', TOKENIZER_TESTS) 47 | def test_tokens(text, saved): 48 | # testing the token objects is tricky, so instead just check surfaces 49 | #TODO: maybe save serialized nodes to compare? 50 | tagger = Tagger() 51 | tokens = [str(tok) for tok in tagger(text)] 52 | assert tokens == saved 53 | 54 | @pytest.mark.parametrize('text,saved', NBEST_TESTS) 55 | def test_nbest(text, saved): 56 | tagger = Tagger('-Owakati') 57 | assert tagger.nbest(text, 2) == saved 58 | 59 | @pytest.mark.parametrize('text,saved', NBEST_TESTS) 60 | def test_nbest_nodes(text, saved): 61 | tagger = Tagger() 62 | # parse adds a space to the end of each line 63 | saved = [ss.strip() for ss in saved.split("\n")] 64 | res = tagger.nbestToNodeList(text, 2) 65 | out = [" ".join([nn.surface for nn in nodes]) for nodes in res] 66 | assert out == saved 67 | 68 | def test_invalid_args(): 69 | # Invalid args will give a NULL pointer for the Tagger object 70 | # don't try to use the null object! 
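    # fugashi raises a RuntimeError at construction time instead of handing
    # back an unusable Tagger, which is what this test relies on.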
71 | with pytest.raises(RuntimeError): 72 | tagger = Tagger('-fail') 73 | 74 | @pytest.mark.parametrize('text,tags', POS_TESTS) 75 | def test_pos(text, tags): 76 | # There should be a pos property when using the default tagger 77 | tagger = Tagger() 78 | tags_ = [tok.pos for tok in tagger(text)] 79 | assert tags == tags_ 80 | 81 | @pytest.mark.parametrize('text,accent', ACCENT_TESTS) 82 | def test_accent(text, accent): 83 | # This checks for correct handling of feature fields containing commas as reported in #13 84 | tagger = Tagger() 85 | tokens = tagger(text) 86 | # Skip if UnidicFeatures17 is used because it doesn't have 'atype' attribute 87 | if tokens and isinstance(tokens[0].feature, UnidicFeatures17): 88 | pytest.skip() 89 | accent_ = [tok.feature.aType for tok in tokens] 90 | assert accent_ == accent 91 | 92 | def test_clobber(): 93 | # Check that memory isn't clobbered by repeated parse calls 94 | tagger = Tagger() 95 | nodes1 = tagger("a\tb c d") 96 | nodes2 = tagger("x y z !") 97 | 98 | assert "a b c d".split() == [nn.surface for nn in nodes1] 99 | assert ["", "\t", " ", " "] == [nn.white_space for nn in nodes1] 100 | 101 | @pytest.mark.parametrize("text,space,idx", WHITE_SPACE_TESTS) 102 | def test_white_space(text, space, idx): 103 | tagger = Tagger() 104 | nodes = tagger.parseToNodeList(text) 105 | 106 | assert nodes[idx].white_space == space 107 | -------------------------------------------------------------------------------- /fugashi/tests/test_ipadic.py: -------------------------------------------------------------------------------- 1 | # This is a small test to make sure ipadic is usable 2 | from fugashi import GenericTagger 3 | import pytest 4 | import ipadic 5 | 6 | WAKATI_TESTS = ( 7 | ("すももももももももの内", 'すもも も もも も もも の 内'), 8 | ("日本語ですよ", '日本語 です よ'), 9 | ("深海魚は、深海に生息する魚類の総称。", '深海魚 は 、 深海 に 生息 する 魚類 の 総称 。'), 10 | ) 11 | 12 | @pytest.mark.parametrize('text,wakati', WAKATI_TESTS) 13 | def test_wakati(text, wakati): 14 | tagger = GenericTagger(ipadic.MECAB_ARGS + ' -Owakati') 15 | assert tagger.parse(text) == wakati 16 | -------------------------------------------------------------------------------- /fugashi/tests/test_nbest.py: -------------------------------------------------------------------------------- 1 | from fugashi import Tagger 2 | import string 3 | import pytest 4 | 5 | # NOTE: The bulk test is written against unidic-3.1.0+2021-08-31, fed with 6 | # corpus cc-100, accessible at: 7 | # https://data.statmt.org/cc-100/ja.txt.xz 8 | path_to_jatxt = "ja.txt" 9 | 10 | @pytest.mark.skip(reason="This test requires too much data to run in CI.") 11 | def test_bulk(): 12 | tagger = Tagger() 13 | 14 | insufficient_paths = [] 15 | incomplete_hypothesis = [] 16 | 17 | def print_result(i): 18 | if insufficient_paths: 19 | print(f"{i} - Not enough paths (counts = {len(insufficient_paths)}): {', '.join(insufficient_paths)}") 20 | else: 21 | print(f"{i} - All lines parsed with enough paths.") 22 | 23 | if incomplete_hypothesis: 24 | print(f"{i} - Original line not recovered (counts = {len(incomplete_hypothesis)}): {', '.join(f'{entry[0]}:{entry[1]}' for entry in incomplete_hypothesis)}") 25 | else: 26 | print(f"{i} - All lines recovered") 27 | 28 | replace_chars = string.whitespace + '\x00' 29 | log_interval = 65536 30 | with open(path_to_jatxt, 'r', encoding='utf8') as fin: 31 | for i, line in enumerate(fin): 32 | 33 | # Tagger ignores whitespace and stops parsing at '\x00'. 
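            # (token surfaces carry no whitespace and parsing stops at the NUL,
            # so a raw line containing either could never be recovered exactly
            # by joining surfaces below)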
34 | # This preprocessing is done for the completeness criteria 35 | for c in replace_chars: 36 | line = line.replace(c, '') 37 | 38 | if not line: 39 | continue 40 | 41 | paths = tagger.nbestToNodeList(line, 10) 42 | if len(paths) != 10: 43 | insufficient_paths.append(str(i)) 44 | 45 | for j, p in enumerate(paths): 46 | if ''.join(w.surface for w in p) != line: 47 | incomplete_hypothesis.append((i,j)) 48 | 49 | if i >= log_interval: 50 | log_interval*=2 51 | print_result(i) 52 | 53 | print_result('Final') 54 | 55 | if __name__ == '__main__': 56 | test_bulk() 57 | -------------------------------------------------------------------------------- /fugashi_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | 5 | 6 | def mecab_config(com="mecab-config"): 7 | output = subprocess.check_output( 8 | [com, "--inc-dir", "--libs-only-L", "--libs-only-l"] 9 | ) 10 | if not isinstance(output, str): 11 | output = output.decode("utf-8") 12 | return output.split("\n"), [] 13 | 14 | 15 | def mecab_config_windows(): 16 | ## Windows 17 | if not os.name == "nt": 18 | return 19 | 20 | win_mecab_dir = r"C:\mecab" 21 | win_bin_dir = win_mecab_dir # this is separate from the sdk dir on some installs 22 | mecab_details = (win_mecab_dir, win_mecab_dir, "libmecab") 23 | data_files = ["{}\\libmecab.dll".format(win_bin_dir)] 24 | return mecab_details, data_files 25 | 26 | 27 | def mecab_config_cygwin(): 28 | ## Cygwin 29 | os.chdir("build/mecab") 30 | if platform.system().startswith("CYGWIN"): 31 | rep = "mecab-cygwin64" if platform.machine() == "x86_64" else "mecab-cygwin32" 32 | subprocess.run( 33 | ["git", "clone", "--depth=1", "https://github.com/KoichiYasuoka/" + rep] 34 | ) 35 | mecab_details = ( 36 | "build/mecab/" + rep + "/include", 37 | "build/mecab/" + rep + "/lib", 38 | "mecab stdc++", 39 | ) 40 | return mecab_details, [] 41 | 42 | 43 | def check_libmecab(): 44 | """Get MeCab build parameters. 45 | 46 | Where available the mecab-config script is used, but if it's not available the 47 | parameters will be figured out in a platform-specific way (hardcoded paths on Windows, a prebuilt MeCab on Cygwin).""" 48 | 49 | configs = [ 50 | mecab_config_windows, 51 | mecab_config, 52 | mecab_config_cygwin, 53 | ] 54 | 55 | # A few scripts will use a build directory. Save where we start so we can 56 | # reset the directory after each build step. 57 | cwd = os.getcwd() 58 | os.makedirs("build/mecab", exist_ok=True) 59 | for config in configs: 60 | try: 61 | out = config() 62 | os.chdir(cwd) 63 | if out: 64 | return out 65 | except: 66 | # failure is normal, typically just a different platform 67 | os.chdir(cwd) 68 | raise RuntimeError("Could not configure working env. Have you installed MeCab?") 69 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["Cython~=3.0.11", "setuptools>=77", "setuptools-scm>=8"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "fugashi" 7 | description = "Cython MeCab wrapper for fast, pythonic Japanese tokenization."
8 | readme = "README.md" 9 | requires-python = ">=3.9" 10 | license = "MIT AND BSD-3-Clause" 11 | license-files = ["LICENSE", "LICENSE.mecab"] 12 | authors = [{ name = "Paul O'Leary McCann", email = "polm@dampfkraft.com" }] 13 | classifiers = [ 14 | "Environment :: Console", 15 | "Intended Audience :: Developers", 16 | "Intended Audience :: Science/Research", 17 | "Natural Language :: Japanese", 18 | "Operating System :: POSIX :: Linux", 19 | "Operating System :: MacOS :: MacOS X", 20 | "Programming Language :: Cython", 21 | "Programming Language :: Python :: 3", 22 | "Topic :: Text Processing :: Linguistic", 23 | ] 24 | dynamic = ["version"] 25 | 26 | [project.optional-dependencies] 27 | unidic = ["unidic"] 28 | unidic-lite = ["unidic-lite"] 29 | 30 | [project.scripts] 31 | fugashi = "fugashi.cli:main" 32 | fugashi-info = "fugashi.cli:info" 33 | fugashi-build-dict = "fugashi.cli:build_dict" 34 | 35 | [project.urls] 36 | source = "https://github.com/polm/fugashi" 37 | funding = "https://github.com/sponsors/polm" 38 | 39 | [tool.setuptools] 40 | include-package-data = false 41 | 42 | [tool.setuptools.packages.find] 43 | exclude = ["fugashi.tests*"] 44 | 45 | [tool.setuptools_scm] 46 | 47 | [tool.pytest.ini_options] 48 | testpaths = ["fugashi/tests"] 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import sys 4 | 5 | from setuptools import Extension, setup 6 | from setuptools.command.build_ext import build_ext as _build_ext 7 | 8 | # This is a side effect of how build works, see: 9 | # https://github.com/pypa/setuptools/discussions/3134 10 | sys.path.append(str(pathlib.Path(__file__).parent)) 11 | from fugashi_util import check_libmecab 12 | 13 | # get the build parameters 14 | output, dll_files = check_libmecab() 15 | 16 | # pad the list in case something's missing 17 | mecab_config = list(output) + ([""] * 5) 18 | include_dirs = mecab_config[0].split() 19 | library_dirs = mecab_config[1].split() 20 | libraries = mecab_config[2].split() 21 | extra_objects = mecab_config[3].split() 22 | extra_link_args = mecab_config[4].split() 23 | 24 | 25 | # Windows DLL related prep. 26 | # By default the DLL will be bundled on windows, but you can turn it off with 27 | # an env var. 28 | bundle_dll = False 29 | fugashi_package_files = [] 30 | should_bundle = os.environ.get("FUGASHI_NO_BUNDLE_DLL", "") in ("", "0") 31 | if sys.platform == "win32" and should_bundle: 32 | bundle_dll = True 33 | fugashi_package_files = [pathlib.Path(i).name for i in dll_files] 34 | 35 | 36 | class build_ext(_build_ext): 37 | """Custom behavior for build_ext. 
38 | 39 | This is only run when bundling DLLs on Windows, which requires copying 40 | files around.""" 41 | 42 | def run(self): 43 | if bundle_dll: 44 | if self.editable_mode: 45 | fugashi_dir = pathlib.Path(__file__).parent / "fugashi" 46 | else: 47 | fugashi_dir = pathlib.Path(self.build_lib) / "fugashi" 48 | for i in dll_files: 49 | self.copy_file(i, fugashi_dir) 50 | return super().run() 51 | 52 | 53 | extensions = Extension( 54 | "fugashi.fugashi", 55 | ["fugashi/fugashi.pyx"], 56 | libraries=libraries, 57 | library_dirs=library_dirs, 58 | include_dirs=include_dirs, 59 | extra_objects=extra_objects, 60 | extra_link_args=extra_link_args, 61 | ) 62 | 63 | setup( 64 | ext_modules=[extensions], 65 | cmdclass={"build_ext": build_ext}, 66 | package_data={"fugashi": fugashi_package_files}, 67 | ) 68 | --------------------------------------------------------------------------------