├── .github ├── dependabot.yml └── workflows │ ├── build.yml │ ├── codeql-analysis.yml │ ├── dependabot-auto-merge.yml │ ├── docs-requirements.yml │ ├── lint.yml │ ├── publish.yml │ ├── pylock.yml │ ├── release.yml │ ├── test-example.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── AUTHORS.md ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── assets ├── KNP.sublime-syntax ├── logo-original.png ├── logo-wide.xcf └── logo.xcf ├── docs ├── Makefile ├── _static │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon.ico │ ├── logo-wide.png │ └── logo.png ├── authors.md ├── cli │ └── index.md ├── conf.py ├── contributing │ └── index.md ├── format │ └── index.md ├── index.md ├── installation │ └── index.md ├── make.bat ├── reference │ ├── index.md │ ├── rhoknp.cli.cli.md │ ├── rhoknp.cli.md │ ├── rhoknp.cli.serve.md │ ├── rhoknp.cli.show.md │ ├── rhoknp.cli.stats.md │ ├── rhoknp.cohesion.argument.md │ ├── rhoknp.cohesion.coreference.md │ ├── rhoknp.cohesion.discourse.md │ ├── rhoknp.cohesion.exophora.md │ ├── rhoknp.cohesion.md │ ├── rhoknp.cohesion.pas.md │ ├── rhoknp.cohesion.predicate.md │ ├── rhoknp.cohesion.rel.md │ ├── rhoknp.md │ ├── rhoknp.processors.jumanpp.md │ ├── rhoknp.processors.knp.md │ ├── rhoknp.processors.kwja.md │ ├── rhoknp.processors.md │ ├── rhoknp.processors.processor.md │ ├── rhoknp.processors.senter.md │ ├── rhoknp.props.dependency.md │ ├── rhoknp.props.feature.md │ ├── rhoknp.props.md │ ├── rhoknp.props.memo.md │ ├── rhoknp.props.named_entity.md │ ├── rhoknp.props.semantics.md │ ├── rhoknp.units.base_phrase.md │ ├── rhoknp.units.clause.md │ ├── rhoknp.units.document.md │ ├── rhoknp.units.md │ ├── rhoknp.units.morpheme.md │ ├── rhoknp.units.phrase.md │ ├── rhoknp.units.sentence.md │ ├── rhoknp.units.unit.md │ ├── rhoknp.utils.md │ └── rhoknp.utils.reader.md └── requirements.txt ├── examples ├── README.md ├── apply_jumanpp.py ├── apply_knp.py ├── apply_kwja.py ├── load_jumanpp.py ├── 
load_knp.py ├── use_coreference_resolution.py ├── use_dependency_parsing.py ├── use_discourse_relation_analysis.py ├── use_morphological_analysis.py ├── use_named_entity_recognition.py └── use_predicate_argument_structure_analysis.py ├── pyproject.toml ├── src └── rhoknp │ ├── __init__.py │ ├── cli │ ├── __init__.py │ ├── cat.py │ ├── cli.py │ ├── serve.py │ ├── show.py │ ├── static │ │ ├── css │ │ │ └── style.css │ │ ├── images │ │ │ ├── apple-touch-icon.png │ │ │ └── favicon.ico │ │ └── js │ │ │ └── script.js │ ├── stats.py │ └── templates │ │ ├── base.jinja2 │ │ ├── components │ │ ├── dependency_parsing.jinja2 │ │ ├── discourse_parsing.jinja2 │ │ ├── error.jinja2 │ │ ├── form.jinja2 │ │ ├── hide_all_button.jinja2 │ │ ├── morphological_analysis.jinja2 │ │ ├── named_entity_recognition.jinja2 │ │ ├── navbar.jinja2 │ │ ├── raw_input.jinja2 │ │ ├── raw_output.jinja2 │ │ ├── show_all_button.jinja2 │ │ ├── typo_correction.jinja2 │ │ └── word_splitting.jinja2 │ │ ├── jumanpp.jinja2 │ │ ├── knp.jinja2 │ │ └── kwja.jinja2 │ ├── cohesion │ ├── __init__.py │ ├── argument.py │ ├── coreference.py │ ├── discourse.py │ ├── exophora.py │ ├── pas.py │ ├── predicate.py │ └── rel.py │ ├── processors │ ├── __init__.py │ ├── jumanpp.py │ ├── knp.py │ ├── kwja.py │ ├── processor.py │ └── senter.py │ ├── props │ ├── __init__.py │ ├── dependency.py │ ├── feature.py │ ├── memo.py │ ├── named_entity.py │ └── semantics.py │ ├── py.typed │ ├── units │ ├── __init__.py │ ├── base_phrase.py │ ├── clause.py │ ├── document.py │ ├── morpheme.py │ ├── phrase.py │ ├── sentence.py │ └── unit.py │ └── utils │ ├── __init__.py │ ├── comment.py │ └── reader.py ├── tests ├── bin │ ├── jumanpp-mock.sh │ ├── knp-mock.sh │ └── kwja-mock.sh ├── cli │ ├── test_cat.py │ ├── test_cli.py │ ├── test_serve.py │ ├── test_show.py │ └── test_stats.py ├── cohesion │ ├── test_argument.py │ ├── test_coreference.py │ ├── test_discourse.py │ ├── test_exophora.py │ ├── test_pas.py │ ├── test_predicate.py │ └── test_rel.py 
├── data │ ├── w201106-0000060050.knp │ ├── w201106-0000060560.knp │ ├── w201106-0000060877.knp │ ├── w201106-0000074273.knp │ └── wiki00100176.knp ├── processors │ ├── test_jumanpp.py │ ├── test_knp.py │ ├── test_kwja.py │ └── test_regex_senter.py ├── props │ ├── test_features.py │ ├── test_memo.py │ ├── test_named_entity.py │ └── test_semantics.py ├── units │ ├── test_base_phrase.py │ ├── test_clause.py │ ├── test_document.py │ ├── test_morpheme.py │ ├── test_phrase.py │ └── test_sentence.py └── utils │ ├── test_comment.py │ └── test_reader.py └── uv.lock /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "uv" 9 | directory: "/" 10 | schedule: 11 | interval: "monthly" 12 | timezone: "Asia/Tokyo" 13 | groups: 14 | dependencies: 15 | patterns: 16 | - "*" 17 | target-branch: "develop" 18 | ignore: 19 | - dependency-name: "*" 20 | update-types: ["version-update:semver-major"] 21 | 22 | - package-ecosystem: "github-actions" 23 | # Workflow files stored in the 24 | # default location of `.github/workflows` 25 | directory: "/" 26 | schedule: 27 | interval: "monthly" 28 | timezone: "Asia/Tokyo" 29 | target-branch: "develop" 30 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | name: Build the project 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | max-parallel: 18 11 | fail-fast: false 12 | matrix: 13 | os: 
[ubuntu-latest, macos-latest, windows-latest] 14 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v5 20 | id: setup-python 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install uv 24 | uses: astral-sh/setup-uv@v6 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Build package 28 | run: uv build -o dist 29 | - name: Install virtualenv and create virtual environment 30 | run: | 31 | uv tool install virtualenv 32 | virtualenv -p ${{ matrix.python-version }} .venv 33 | - name: Install rhoknp from wheel (non-Windows) 34 | if: ${{ matrix.os != 'windows-latest' }} 35 | run: | 36 | source .venv/bin/activate 37 | wheelFile=$(ls dist/*.whl) 38 | pip install "${wheelFile}[cli]" 39 | - name: Install rhoknp from wheel (Windows) 40 | if: ${{ matrix.os == 'windows-latest' }} 41 | run: | 42 | .\.venv\Scripts\Activate 43 | $wheelFile = (Get-ChildItem -Path dist -Filter *.whl).FullName 44 | pip install "${wheelFile}[cli]" 45 | shell: pwsh 46 | - name: Run rhoknp (non-Windows) 47 | if: ${{ matrix.os != 'windows-latest' }} 48 | run: | 49 | source .venv/bin/activate 50 | rhoknp --version 51 | rhoknp --help 52 | - name: Run rhoknp (Windows) 53 | if: ${{ matrix.os == 'windows-latest' }} 54 | run: | 55 | .\.venv\Scripts\Activate 56 | rhoknp --version 57 | rhoknp --help 58 | shell: pwsh 59 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 
6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: ["main"] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: ["main"] 20 | schedule: 21 | - cron: "37 23 * * 0" 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: ["python"] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v4 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v3 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 56 | # If this step fails, then you should remove it and run the build manually (see below) 57 | - name: Autobuild 58 | uses: github/codeql-action/autobuild@v3 59 | 60 | # ℹ️ Command-line programs to run using the OS shell. 
61 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 62 | 63 | # If the Autobuild fails above, remove it and uncomment the following three lines. 64 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 65 | 66 | # - run: | 67 | # echo "Run, Build Application using script" 68 | # ./location_of_script_within_repo/buildscript.sh 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v3 72 | with: 73 | category: "/language:${{matrix.language}}" 74 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-auto-merge.yml: -------------------------------------------------------------------------------- 1 | # https://docs.github.com/en/code-security/dependabot/working-with-dependabot/automating-dependabot-with-github-actions#enable-auto-merge-on-a-pull-request 2 | name: Dependabot auto-merge 3 | on: pull_request_target 4 | 5 | permissions: 6 | pull-requests: write 7 | contents: write 8 | 9 | jobs: 10 | dependabot: 11 | runs-on: ubuntu-latest 12 | if: ${{ github.actor == 'dependabot[bot]' }} 13 | steps: 14 | - name: Dependabot metadata 15 | id: metadata 16 | uses: dependabot/fetch-metadata@v2 17 | with: 18 | github-token: "${{ secrets.GITHUB_TOKEN }}" 19 | - name: Wait for tests to pass 20 | uses: lewagon/wait-on-check-action@v1.3.4 21 | with: 22 | ref: ${{ github.event.pull_request.head.sha }} 23 | # running-workflow-name: "Test" # this condition does not work 24 | check-regexp: Run tests with pytest.* 25 | repo-token: ${{ secrets.GITHUB_TOKEN }} 26 | wait-interval: 60 # Check every 60 seconds 27 | - name: Enable auto-merge for Dependabot PRs 28 | if: ${{ steps.metadata.outputs.update-type != 'version-update:semver-major' }} 29 | run: gh pr merge --auto --merge "$PR_URL" 30 | env: 31 | PR_URL: ${{github.event.pull_request.html_url}} 32 | GITHUB_TOKEN: 
${{secrets.GITHUB_TOKEN}} 33 | -------------------------------------------------------------------------------- /.github/workflows/docs-requirements.yml: -------------------------------------------------------------------------------- 1 | name: Generate `docs/requirements.txt` 2 | 3 | on: 4 | push: 5 | paths: 6 | - "pyproject.toml" 7 | - "uv.lock" 8 | 9 | jobs: 10 | generate-requirements: 11 | name: Generate `docs/requirements.txt` from pyproject.toml 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Install uv 16 | uses: astral-sh/setup-uv@v6 17 | - name: Export requirements.txt 18 | run: uv export --only-group docs --no-annotate --no-hashes -o docs/requirements.txt 19 | - name: Commit and push changes 20 | uses: stefanzweifel/git-auto-commit-action@v5 21 | with: 22 | commit_message: update docs/requirements.txt 23 | # Optional glob pattern of files which should be added to the commit 24 | file_pattern: docs/requirements.txt 25 | # Optional. Prevents the shell from expanding filenames. 
26 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html 27 | disable_globbing: true 28 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout repository 10 | uses: actions/checkout@v4 11 | - name: Set up Python 3.9 12 | uses: actions/setup-python@v5 13 | with: 14 | python-version: "3.9" 15 | - name: Run linters 16 | run: | 17 | pipx install pre-commit 18 | pre-commit run --all-files 19 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*.*.*" 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-publish: 11 | runs-on: ubuntu-latest 12 | # https://github.com/pypa/gh-action-pypi-publish?tab=readme-ov-file#trusted-publishing 13 | environment: 14 | name: pypi 15 | url: https://pypi.org/p/rhoknp 16 | permissions: 17 | id-token: write 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Install uv 21 | uses: astral-sh/setup-uv@v6 22 | - name: Build package 23 | run: uv build -o dist 24 | - name: Publish package 25 | uses: pypa/gh-action-pypi-publish@release/v1 26 | with: 27 | verbose: true 28 | -------------------------------------------------------------------------------- /.github/workflows/pylock.yml: -------------------------------------------------------------------------------- 1 | name: Generate pylock.toml 2 | 3 | on: 4 | push: 5 | paths: 6 | - "pyproject.toml" 7 | - "uv.lock" 8 | 9 | jobs: 10 | generate-pylock: 11 | name: Generate pylock.toml from pyproject.toml 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Install uv 16 | 
uses: astral-sh/setup-uv@v6 17 | - name: Export pylock.toml 18 | run: uv export -o pylock.toml 19 | - name: Commit and push changes 20 | uses: stefanzweifel/git-auto-commit-action@v5 21 | with: 22 | commit_message: update pylock.toml 23 | # Optional glob pattern of files which should be added to the commit 24 | file_pattern: pylock.toml 25 | # Optional. Prevents the shell from expanding filenames. 26 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html 27 | disable_globbing: true 28 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*.*.*" 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | - name: Create release 17 | id: create_release 18 | uses: softprops/action-gh-release@v2 19 | with: 20 | tag_name: ${{ github.ref_name }} 21 | draft: false 22 | prerelease: false 23 | -------------------------------------------------------------------------------- /.github/workflows/test-example.yml: -------------------------------------------------------------------------------- 1 | name: TestExample 2 | 3 | on: 4 | schedule: 5 | - cron: "0 3 */16 * *" # Runs at 3:00 UTC on the 1st and 17th of every month. 
6 | workflow_dispatch: 7 | 8 | jobs: 9 | test-example: 10 | name: Run tests for examples 11 | container: kunlp/jumanpp-knp:ubuntu22.04 12 | runs-on: ubuntu-22.04 13 | strategy: 14 | max-parallel: 5 15 | fail-fast: false 16 | matrix: 17 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v4 21 | - name: Install required apt packages 22 | run: | 23 | export DEBIAN_FRONTEND=noninteractive 24 | apt-get update -yq 25 | apt-get install -yq curl build-essential libsqlite3-dev libffi-dev 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install uv 31 | uses: astral-sh/setup-uv@v6 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | - name: Install dependencies 35 | run: | 36 | uv sync --group test --extra cli --no-cache 37 | - name: Install KWJA 38 | # KWJA does not support Python 3.13 39 | if: ${{ matrix.python-version != '3.13' && matrix.python-version != '3.13t' }} 40 | run: | 41 | uv tool install kwja 42 | - name: Run tests for all files under examples/apply_*.py 43 | shell: bash 44 | run: | 45 | for example in examples/apply_*.py; do 46 | if [[ -f "${example}" ]]; then 47 | echo "Running tests for ${example}" 48 | uv run python "${example}" "こんにちは" 49 | fi 50 | done 51 | - name: Run tests for examples/use_*.py 52 | shell: bash 53 | run: | 54 | for example in examples/use_*.py; do 55 | if [[ -f "${example}" ]]; then 56 | echo "Running tests for ${example}" 57 | uv run python "${example}" "こんにちは" 58 | fi 59 | done 60 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | jobs: 6 | test: 7 | name: Run tests with pytest 8 | container: kunlp/jumanpp-knp:ubuntu22.04 9 | runs-on: 
ubuntu-22.04 10 | strategy: 11 | max-parallel: 6 12 | fail-fast: false 13 | matrix: 14 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | - name: Install required apt packages 19 | run: | 20 | export DEBIAN_FRONTEND=noninteractive 21 | apt-get update -yq 22 | apt-get install -yq curl build-essential libsqlite3-dev libffi-dev libssl-dev 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install uv 28 | uses: astral-sh/setup-uv@v6 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: uv sync --extra cli --no-cache 33 | - name: Install KWJA 34 | # KWJA does not support Python 3.13 35 | if: ${{ matrix.python-version != '3.13' && matrix.python-version != '3.13t' }} 36 | run: | 37 | uv tool install kwja 38 | kwja --model-size tiny --text "テスト" 39 | - name: Run tests with coverage 40 | if: ${{ matrix.python-version == '3.10' }} 41 | run: | 42 | uv run pytest --cov=./ --cov-report=xml -v ./tests 43 | - name: Run tests without coverage 44 | if: ${{ matrix.python-version != '3.10' }} 45 | run: | 46 | uv run pytest -v ./tests 47 | - name: Install git for codecov 48 | if: ${{ matrix.python-version == '3.10' }} 49 | run: | 50 | apt-get install -yq git 51 | - name: Upload coverage to Codecov 52 | if: ${{ matrix.python-version == '3.10' }} 53 | uses: codecov/codecov-action@v5 54 | with: 55 | files: ./coverage.xml 56 | name: codecov-umbrella 57 | token: ${{ secrets.CODECOV_TOKEN }} 58 | verbose: true 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / 
packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 | dmypy.json 112 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: 
trailing-whitespace 7 | - id: check-docstring-first 8 | - id: check-yaml 9 | - id: check-toml 10 | - id: check-added-large-files 11 | exclude: "assets/logo.*" 12 | - repo: https://github.com/astral-sh/ruff-pre-commit 13 | rev: v0.11.12 14 | hooks: 15 | - id: ruff 16 | args: [--fix, --exit-non-zero-on-fix] 17 | - id: ruff-format 18 | - repo: https://github.com/pre-commit/mirrors-mypy 19 | rev: v1.16.0 20 | hooks: 21 | - id: mypy 22 | additional_dependencies: 23 | - fastapi 24 | - jinja2 25 | - pygments 26 | - rich 27 | - typer-slim 28 | - types-click 29 | - types-PyYAML 30 | - typing-extensions 31 | - uvicorn 32 | - repo: https://github.com/pre-commit/mirrors-prettier 33 | rev: v4.0.0-alpha.8 34 | hooks: 35 | - id: prettier 36 | - repo: https://github.com/Riverside-Healthcare/djLint 37 | rev: v1.36.4 38 | hooks: 39 | - id: djlint-jinja 40 | - id: djlint-reformat-jinja 41 | - repo: https://github.com/rhysd/actionlint 42 | rev: v1.7.7 43 | hooks: 44 | - id: actionlint 45 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.9" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt 14 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | Maintained with: 4 | 5 | - [Hirokazu Kiyomaru](mailto:h.kiyomaru@gmail.com) 6 | - [Nobuhiro Ueda](mailto:ueda@nlp.ist.i.kyoto-u.ac.jp) 7 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | title: "rhoknp: Yet another Python binding for Juman++/KNP/KWJA" 4 | authors: 5 | - family-names: Kiyomaru 6 | given-names: Hirokazu 7 | - family-names: Ueda 8 | given-names: Nobuhiro 9 | version: 1.6.0 10 | repository-code: "https://github.com/ku-nlp/rhoknp" 11 | date-released: 2023-11-08 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to _rhoknp_ 2 | 3 | Thank you for your interest in improving _rhoknp_! 4 | Our [contributing documentation](https://rhoknp.readthedocs.io/en/latest/contributing/index.html) contains what you need to know about contributing to _rhoknp_. 5 | We look forward to your contributions! 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Kyoto University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | rhoknp logo 4 | 5 |

6 | 7 |

rhoknp: Yet another Python binding for Juman++/KNP/KWJA

8 | 9 |

10 | Test 11 | Codecov 12 | CodeFactor 13 | PyPI 14 | PyPI - Python Version 15 | Documentation 16 | Ruff 17 |

18 | 19 | --- 20 | 21 | **Documentation**: [https://rhoknp.readthedocs.io/en/latest/](https://rhoknp.readthedocs.io/en/latest/) 22 | 23 | **Source Code**: [https://github.com/ku-nlp/rhoknp](https://github.com/ku-nlp/rhoknp) 24 | 25 | --- 26 | 27 | _rhoknp_ is a Python binding for [Juman++](https://github.com/ku-nlp/jumanpp), [KNP](https://github.com/ku-nlp/knp), and [KWJA](https://github.com/ku-nlp/kwja).[^1] 28 | 29 | [^1]: The logo was generated by OpenAI DALL·E 2. 30 | 31 | ```python 32 | import rhoknp 33 | 34 | # Perform morphological analysis by Juman++ 35 | jumanpp = rhoknp.Jumanpp() 36 | sentence = jumanpp.apply_to_sentence( 37 | "電気抵抗率は電気の通しにくさを表す物性値である。" 38 | ) 39 | 40 | # Access to the result 41 | for morpheme in sentence.morphemes: # a.k.a. keitai-so 42 | ... 43 | 44 | # Save the result 45 | with open("result.jumanpp", "wt") as f: 46 | f.write(sentence.to_jumanpp()) 47 | 48 | # Load the result 49 | with open("result.jumanpp", "rt") as f: 50 | sentence = rhoknp.Sentence.from_jumanpp(f.read()) 51 | ``` 52 | 53 | ## Requirements 54 | 55 | - Python 3.9+ 56 | - (Optional) [Juman++](https://github.com/ku-nlp/jumanpp) v2.0.0-rc3+ 57 | - (Optional) [KNP](https://github.com/ku-nlp/knp) 5.0+ 58 | - (Optional) [KWJA](https://github.com/ku-nlp/kwja) 1.0.0+ 59 | 60 | ## Installation 61 | 62 | ```shell 63 | pip install rhoknp 64 | ``` 65 | 66 | ## Quick tour 67 | 68 | Let's begin by using Juman++ with rhoknp. 69 | Here, we present a simple example demonstrating how Juman++ can be used to analyze a sentence. 70 | 71 | ```python 72 | # Perform morphological analysis by Juman++ 73 | jumanpp = rhoknp.Jumanpp() 74 | sentence = jumanpp.apply_to_sentence("電気抵抗率は電気の通しにくさを表す物性値である。") 75 | ``` 76 | 77 | You can easily access the individual morphemes that make up the sentence. 78 | 79 | ```python 80 | for morpheme in sentence.morphemes: # a.k.a. keitai-so 81 | ... 82 | ``` 83 | 84 | Sentence objects can be saved in the JUMAN format. 
85 | 86 | ```python 87 | # Save the sentence in the JUMAN format 88 | with open("sentence.jumanpp", "wt") as f: 89 | f.write(sentence.to_jumanpp()) 90 | 91 | # Load the sentence 92 | with open("sentence.jumanpp", "rt") as f: 93 | sentence = rhoknp.Sentence.from_jumanpp(f.read()) 94 | ``` 95 | 96 | Almost the same APIs are available for KNP. 97 | 98 | ```python 99 | # Perform language analysis by KNP 100 | knp = rhoknp.KNP() 101 | sentence = knp.apply_to_sentence("電気抵抗率は電気の通しにくさを表す物性値である。") 102 | ``` 103 | 104 | KNP performs language analysis at multiple levels. 105 | 106 | ```python 107 | for clause in sentence.clauses: # a.k.a., setsu 108 | ... 109 | for phrase in sentence.phrases: # a.k.a. bunsetsu 110 | ... 111 | for base_phrase in sentence.base_phrases: # a.k.a. kihon-ku 112 | ... 113 | for morpheme in sentence.morphemes: # a.k.a. keitai-so 114 | ... 115 | ``` 116 | 117 | Sentence objects can be saved in the KNP format. 118 | 119 | ```python 120 | # Save the sentence in the KNP format 121 | with open("sentence.knp", "wt") as f: 122 | f.write(sentence.to_knp()) 123 | 124 | # Load the sentence 125 | with open("sentence.knp", "rt") as f: 126 | sentence = rhoknp.Sentence.from_knp(f.read()) 127 | ``` 128 | 129 | Furthermore, rhoknp provides convenient APIs for document-level language analysis. 130 | 131 | ```python 132 | document = rhoknp.Document.from_raw_text( 133 | "電気抵抗率は電気の通しにくさを表す物性値である。単に抵抗率とも呼ばれる。" 134 | ) 135 | # If you know sentence boundaries, you can use `Document.from_sentences` instead. 136 | document = rhoknp.Document.from_sentences( 137 | [ 138 | "電気抵抗率は電気の通しにくさを表す物性値である。", 139 | "単に抵抗率とも呼ばれる。", 140 | ] 141 | ) 142 | ``` 143 | 144 | Document objects can be handled in a similar manner as Sentence objects. 145 | 146 | ```python 147 | # Perform morphological analysis by Juman++ 148 | document = jumanpp.apply_to_document(document) 149 | 150 | # Access language units in the document 151 | for sentence in document.sentences: 152 | ... 
153 | for morpheme in document.morphemes: 154 | ... 155 | 156 | # Save language analysis by Juman++ 157 | with open("document.jumanpp", "wt") as f: 158 | f.write(document.to_jumanpp()) 159 | 160 | # Load language analysis by Juman++ 161 | with open("document.jumanpp", "rt") as f: 162 | document = rhoknp.Document.from_jumanpp(f.read()) 163 | ``` 164 | 165 | For more information, please refer to the [examples](./examples) and [documentation](https://rhoknp.readthedocs.io/en/latest/). 166 | 167 | ## Main differences from [pyknp](https://github.com/ku-nlp/pyknp/) 168 | 169 | [_pyknp_](https://pypi.org/project/pyknp/) serves as the official Python binding for Juman++ and KNP. 170 | In the development of rhoknp, we redesigned the API, considering the current use cases of pyknp. 171 | The key differences between the two are as follows: 172 | 173 | - **Support for document-level language analysis**: rhoknp allows you to load and instantiate the results of document-level language analysis, including cohesion analysis and discourse relation analysis. 174 | - **Strict type-awareness**: rhoknp has been thoroughly annotated with type annotations, ensuring strict type checking and improved code clarity. 175 | - **Comprehensive test suite**: rhoknp is extensively tested with a comprehensive test suite. You can view the code coverage report on [Codecov](https://app.codecov.io/gh/ku-nlp/rhoknp). 176 | 177 | ## License 178 | 179 | MIT 180 | 181 | ## Contributing 182 | 183 | We warmly welcome contributions to rhoknp. 184 | You can get started by reading the [contribution guide](https://rhoknp.readthedocs.io/en/latest/contributing/index.html). 
185 | 186 | ## Reference 187 | 188 | - [KNP FORMAT](http://cr.fvcrc.i.nagoya-u.ac.jp/~sasano/knp/format.html) 189 | - [KNP - KUROHASHI-CHU-MURAWAKI LAB](https://nlp.ist.i.kyoto-u.ac.jp/?KNP) 190 | -------------------------------------------------------------------------------- /assets/KNP.sublime-syntax: -------------------------------------------------------------------------------- 1 | %YAML 1.2 2 | --- 3 | name: KNP 4 | file_extensions: [knp] 5 | scope: source.knp 6 | 7 | contexts: 8 | main: 9 | - match: '^[^+*\#\"<> ]+' 10 | scope: variable 11 | - match: '(?<=\s)[^+\#\"<> ]+' 12 | scope: variable 13 | - match: "<" 14 | scope: keyword 15 | push: feature 16 | - match: ^\+ 17 | scope: keyword 18 | push: tag_bnst 19 | - match: ^\* 20 | scope: keyword 21 | push: tag_bnst 22 | - match: ^EOS$ 23 | scope: constant 24 | - match: \" 25 | scope: string 26 | push: string 27 | - match: ^\# 28 | scope: comment 29 | push: comment 30 | string: 31 | - match: '[^\"]+' 32 | scope: string 33 | - match: \" 34 | scope: string 35 | pop: true 36 | tag_bnst: 37 | - match: (-1|\d+)[DPAI] 38 | scope: constant.language 39 | - match: "<" 40 | scope: keyword 41 | push: feature 42 | - match: $ 43 | pop: true 44 | feature: 45 | - match: \" 46 | scope: string 47 | push: string 48 | - match: ">" 49 | scope: keyword 50 | pop: true 51 | - match: ":" 52 | scope: keyword 53 | push: feature_value 54 | - match: "=" 55 | scope: keyword 56 | - match: '[^ :><\"=]+' 57 | scope: storage.type 58 | feature_value: 59 | - match: "[^><]+(?=>)" 60 | scope: variable.parameter 61 | pop: true 62 | comment: 63 | - match: "S-ID:" 64 | scope: comment 65 | push: sid 66 | - match: \S+ 67 | scope: comment 68 | - match: $ 69 | pop: true 70 | sid: 71 | - match: \S+ 72 | scope: constant.numeric 73 | pop: true 74 | -------------------------------------------------------------------------------- /assets/logo-original.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/assets/logo-original.png -------------------------------------------------------------------------------- /assets/logo-wide.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/assets/logo-wide.xcf -------------------------------------------------------------------------------- /assets/logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/assets/logo.xcf -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/favicon-16x16.png -------------------------------------------------------------------------------- /docs/_static/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/favicon-32x32.png -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/favicon.ico -------------------------------------------------------------------------------- /docs/_static/logo-wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/logo-wide.png -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/authors.md: -------------------------------------------------------------------------------- 1 | ```{include} ../AUTHORS.md 2 | 3 | ``` 4 | -------------------------------------------------------------------------------- /docs/cli/index.md: 
-------------------------------------------------------------------------------- 1 | # CLI Tools 2 | 3 | _rhoknp_ provides a command-line interface (CLI). 4 | 5 | Before using the CLI, you need to install _rhoknp_ with the following command: 6 | 7 | ```{eval-rst} 8 | .. prompt:: 9 | :prompts: $ 10 | 11 | pip install rhoknp[cli] 12 | ``` 13 | 14 | ## cat 15 | 16 | The `cat` command prints KNP files with syntax highlighting. 17 | 18 | ```{eval-rst} 19 | .. prompt:: 20 | :prompts: $ 21 | 22 | rhoknp cat [--dark] 23 | ``` 24 | 25 | ## serve 26 | 27 | The `serve` command starts a web server to provide a playground for the given language analyzer. 28 | 29 | ```{eval-rst} 30 | .. prompt:: 31 | :prompts: $ 32 | 33 | rhoknp serve {jumanpp|knp|kwja} [--host HOST] [--port PORT] 34 | ``` 35 | 36 | ## show 37 | 38 | The `show` command shows the given KNP file in a tree format. 39 | 40 | ```{eval-rst} 41 | .. prompt:: 42 | :prompts: $ 43 | 44 | rhoknp show [--pos] [--rel] 45 | ``` 46 | 47 | ## stats 48 | 49 | The `stats` command shows the statistics of the given KNP file. 50 | 51 | ```{eval-rst} 52 | .. prompt:: 53 | :prompts: $ 54 | 55 | rhoknp stats [--json] 56 | ``` 57 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import sys 14 | from pathlib import Path 15 | 16 | sys.path.insert(0, str(Path("../src").resolve())) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "rhoknp" 22 | copyright = "2021, Hirokazu Kiyomaru and Nobuhiro Ueda" 23 | author = "Hirokazu Kiyomaru and Nobuhiro Ueda" 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | "sphinx.ext.autodoc", 33 | "sphinx.ext.viewcode", 34 | "sphinx.ext.napoleon", 35 | "sphinx_copybutton", 36 | "sphinx-prompt", 37 | "myst_parser", 38 | ] 39 | 40 | # sphinx.ext.autodoc 41 | autodoc_default_options = { 42 | "members": True, 43 | "show-inheritance": True, 44 | "undoc-members": True, 45 | "exclude-members": ",".join(["__weakref__", "count", "parent_unit", "child_units"]), 46 | "member-order": "bysource", 47 | } 48 | 49 | # sphinx_copybutton 50 | copybutton_prompt_text = r">>> |\.\.\. |\$ " 51 | copybutton_prompt_is_regexp = True 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ["_templates"] 55 | 56 | # List of patterns, relative to source directory, that match files and 57 | # directories to ignore when looking for source files. 58 | # This pattern also affects html_static_path and html_extra_path. 59 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 60 | 61 | 62 | # -- Options for HTML output ------------------------------------------------- 63 | 64 | # The theme to use for HTML and HTML Help pages. See the documentation for 65 | # a list of builtin themes. 66 | # 67 | html_theme = "furo" 68 | html_logo = "_static/logo-wide.png" 69 | 70 | # Add any paths that contain custom static files (such as style sheets) here, 71 | # relative to this directory. 
They are copied after the builtin static files, 72 | # so a file named "default.css" will overwrite the builtin "default.css". 73 | html_static_path = ["_static"] 74 | -------------------------------------------------------------------------------- /docs/contributing/index.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Thank you for your interest in improving _rhoknp_! 4 | We give an overview of contributing to the _rhoknp_ project. 5 | 6 | ## Development Environment 7 | 8 | Development should be done using the latest version of Python. 9 | As of this writing, it is Python 3.10. 10 | 11 | Install the development dependencies using [uv](https://docs.astral.sh/uv/). 12 | 13 | ```{eval-rst} 14 | .. prompt:: 15 | :prompts: $ 16 | 17 | uv sync 18 | pre-commit install 19 | ``` 20 | 21 | ## Submitting a Pull Request 22 | 23 | Before submitting a pull request, run lints and tests. 24 | 25 | ```{eval-rst} 26 | .. prompt:: 27 | :prompts: $ 28 | 29 | uv run pre-commit run --all-files 30 | uv run pytest 31 | ``` 32 | 33 | ## Testing 34 | 35 | If you are adding a new feature, please add a test for it. 36 | When the feature is large, first open an issue to discuss the idea. 37 | 38 | If you are fixing a bug, please add a test that exposes the bug and fails before applying your fix. 39 | -------------------------------------------------------------------------------- /docs/format/index.md: -------------------------------------------------------------------------------- 1 | # Juman++/KNP Format 2 | 3 | This page describes the format of the result of Juman++ and KNP. 4 | 5 | ## Juman++ 6 | 7 | Juman++ is a morphological analyzer for Japanese.
8 | We show an example of the result of Juman++: 9 | 10 | ``` 11 | # Language analysis of "麻生太郎はコーヒーを買って飲んだ。" 12 | 麻生 あそう 麻生 名詞 6 人名 5 * 0 * 0 "人名:日本:姓:135:0.00166" 13 | 太郎 たろう 太郎 名詞 6 人名 5 * 0 * 0 "人名:日本:名:45:0.00106" 14 | は は は 助詞 9 副助詞 2 * 0 * 0 NIL 15 | コーヒー こーひー コーヒー 名詞 6 普通名詞 1 * 0 * 0 "代表表記:珈琲/こーひー ドメイン:料理・食事 カテゴリ:人工物-食べ物" 16 | を を を 助詞 9 格助詞 1 * 0 * 0 NIL 17 | 買って かって 買う 動詞 2 * 0 子音動詞ワ行 12 タ系連用テ形 14 "代表表記:買う/かう ドメイン:家庭・暮らし;ビジネス 反義:動詞:売る/うる" 18 | 飲んだ のんだ 飲む 動詞 2 * 0 子音動詞マ行 9 タ形 10 "代表表記:飲む/のむ ドメイン:料理・食事" 19 | 。 。 。 特殊 1 句点 1 * 0 * 0 NIL 20 | EOS 21 | ``` 22 | 23 | Each line represents a morpheme (a.k.a. _keitai-so_) and formatted as `[surface form] [reading] [lemma] [pos] [pos ID] [pos subcategory] [pos subcategory ID] [conjugation type] [conjugation type ID] [conjugation form] [conjugation form ID] [semantic information]`. 24 | For example, `飲んだ のんだ 飲む 動詞 2 * 0 子音動詞マ行 9 タ形 10 "代表表記:飲む/のむ ドメイン:料理・食事"` indicates that the surface form is `飲んだ`, the reading is `のんだ`, the lemma is `飲む`, and the pos (part-of-speech) is `動詞`, and so forth. 25 | 26 | ## KNP 27 | 28 | KNP is a Japanese dependency parser. 
29 | We show an example of the result of KNP: 30 | 31 | ``` 32 | # Language analysis of "麻生太郎はコーヒーを買って飲んだ。" 33 | * 3D <文頭><人名><ハ><助詞><体言><係:未格><提題><区切:3-5><主題表現><格要素><連用要素><正規化代表表記:麻生/あそう+太郎/たろう><主辞代表表記:太郎/たろう> 34 | + 1D <文節内><係:文節内><文頭><人名><体言><名詞項候補><先行詞候補><正規化代表表記:麻生/あそう> 35 | 麻生 あそう 麻生 名詞 6 人名 5 * 0 * 0 "人名:日本:姓:135:0.00166 疑似代表表記 代表表記:麻生/あそう" <人名:日本:姓:135:0.00166><疑似代表表記><代表表記:麻生/あそう><正規化代表表記:麻生/あそう><漢字><かな漢字><名詞相当語><文頭><自立><内容語><タグ単位始><文節始><固有キー><用言表記先頭><用言表記末尾><用言意味表記末尾> 36 | + 4D <人名><ハ><助詞><体言><係:未格><提題><区切:3-5><主題表現><格要素><連用要素><名詞項候補><先行詞候補><正規化代表表記:太郎/たろう><主辞代表表記:太郎/たろう><解析格:ガ> 37 | 太郎 たろう 太郎 名詞 6 人名 5 * 0 * 0 "人名:日本:名:45:0.00106 疑似代表表記 代表表記:太郎/たろう" <人名:日本:名:45:0.00106><疑似代表表記><代表表記:太郎/たろう><正規化代表表記:太郎/たろう><漢字><かな漢字><名詞相当語><自立><複合←><内容語><タグ単位始><固有キー><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾> 38 | は は は 助詞 9 副助詞 2 * 0 * 0 NIL <かな漢字><ひらがな><付属> 39 | * 2D <ヲ><助詞><体言><係:ヲ格><区切:0-0><格要素><連用要素><正規化代表表記:珈琲/こーひー><主辞代表表記:珈琲/こーひー> 40 | + 3D <ヲ><助詞><体言><係:ヲ格><区切:0-0><格要素><連用要素><名詞項候補><先行詞候補><正規化代表表記:珈琲/こーひー><主辞代表表記:珈琲/こーひー><解析格:ヲ> 41 | コーヒー こーひー コーヒー 名詞 6 普通名詞 1 * 0 * 0 "代表表記:珈琲/こーひー ドメイン:料理・食事 カテゴリ:人工物-食べ物" <代表表記:珈琲/こーひー><ドメイン:料理・食事><カテゴリ:人工物-食べ物><正規化代表表記:珈琲/こーひー><記英数カ><カタカナ><名詞相当語><自立><内容語><タグ単位始><文節始><固有キー><文節主辞> 42 | を を を 助詞 9 格助詞 1 * 0 * 0 NIL <かな漢字><ひらがな><付属> 43 | * 3D <用言:動><係:連用><レベル:A><区切:3-5><連用要素><連用節><動態述語><正規化代表表記:買う/かう><主辞代表表記:買う/かう> 44 | + 4D <用言:動><係:連用><レベル:A><区切:3-5><連用要素><連用節><動態述語><正規化代表表記:買う/かう><主辞代表表記:買う/かう><用言代表表記:買う/かう><節-区切><節-主辞><格関係2:ヲ:コーヒー><格解析結果:買う/かう:動1:ガ/U/-/-/-/-;ヲ/C/コーヒー/2/0/1;ニ/U/-/-/-/-;ト/U/-/-/-/-;デ/U/-/-/-/-;時間/U/-/-/-/-><標準用言代表表記:買う/かう> 45 | 買って かって 買う 動詞 2 * 0 子音動詞ワ行 12 タ系連用テ形 14 "代表表記:買う/かう ドメイン:家庭・暮らし;ビジネス 反義:動詞:売る/うる" <代表表記:買う/かう><ドメイン:家庭・暮らし;ビジネス><反義:動詞:売る/うる><正規化代表表記:買う/かう><かな漢字><活用語><自立><内容語><タグ単位始><文節始><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾> 46 | * -1D <文末><時制:過去><句点><用言:動><レベル:C><区切:5-5><係:文末><提題受:30><主節><格要素><連用要素><動態述語><正規化代表表記:飲む/のむ><主辞代表表記:飲む/のむ> 47 | + -1D 
<文末><時制:過去><句点><用言:動><レベル:C><区切:5-5><係:文末><提題受:30><主節><格要素><連用要素><動態述語><正規化代表表記:飲む/のむ><主辞代表表記:飲む/のむ><用言代表表記:飲む/のむ><節-区切><節-主辞><主題格:一人称優位><格関係1:ガ:太郎><格解析結果:飲む/のむ:動8:ガ/N/太郎/1/0/1;ヲ/U/-/-/-/-;ニ/U/-/-/-/-;デ/U/-/-/-/-;時間/U/-/-/-/-><標準用言代表表記:飲む/のむ> 48 | 飲んだ のんだ 飲む 動詞 2 * 0 子音動詞マ行 9 タ形 10 "代表表記:飲む/のむ ドメイン:料理・食事" <代表表記:飲む/のむ><ドメイン:料理・食事><正規化代表表記:飲む/のむ><かな漢字><活用語><表現文末><自立><内容語><タグ単位始><文節始><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾> 49 | 。 。 。 特殊 1 句点 1 * 0 * 0 NIL <英記号><記号><文末><付属> 50 | EOS 51 | ``` 52 | 53 | The line starting with `*` represents the beginning of a phrase (a.k.a. _bunsetsu_) and is formatted as `* [parent phrase index][dependency type] [semantic information]`. 54 | For example, the line `* 3D <文頭><人名>` indicates that the phrase modifies the `3`rd phrase with the dependency type of `D` and includes the semantic information of `<文頭>` and `<人名>`. 55 | 56 | The line starting with `+` represents the beginning of a base-phrase (a.k.a. _kihon-ku_) and is formatted as `+ [parent base-phrase index][dependency type] [semantic information]`. 57 | For example, the line `+ 1D <文節内><係:文節内>` indicates that the base-phrase modifies the `1`st base-phrase with the dependency type of `D` and includes the semantic information of `<文節内>` and `<係:文節内>`. 58 | 59 | Lines with neither `*` nor `+` represent morphemes. 60 | The format is almost the same as Juman++'s, except that a column representing the semantic information is added at the end. 61 | 62 | ## Misc 63 | 64 | - Lines starting with `#` are comments. 65 | - `EOS` represents the end of the sentence.
66 | 67 | --- 68 | 69 | ## Reference 70 | 71 | - [KNP の基本的な出力の読み方 (in Japanese)](http://cr.fvcrc.i.nagoya-u.ac.jp/~sasano/knp/format.html) 72 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # rhoknp: Yet another Python binding for Juman++/KNP/KWJA 2 | 3 | [![Test](https://img.shields.io/github/actions/workflow/status/ku-nlp/rhoknp/test.yml?branch=main&logo=github&label=test&style=flat-square)](https://github.com/ku-nlp/rhoknp/actions/workflows/test.yml) 4 | [![Codecov](https://img.shields.io/codecov/c/github/ku-nlp/rhoknp?logo=codecov&style=flat-square)](https://codecov.io/gh/ku-nlp/rhoknp) 5 | [![CodeFactor](https://img.shields.io/codefactor/grade/github/ku-nlp/rhoknp?style=flat-square)](https://www.codefactor.io/repository/github/ku-nlp/rhoknp) 6 | [![PyPI](https://img.shields.io/pypi/v/rhoknp?style=flat-square)](https://pypi.org/project/rhoknp/) 7 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/rhoknp?style=flat-square)](https://pypi.org/project/rhoknp/) 8 | [![Documentation](https://img.shields.io/readthedocs/rhoknp?style=flat-square)](https://rhoknp.readthedocs.io/en/latest/?badge=latest) 9 | 10 | **rhoknp** is a Python binding for [Juman++](https://github.com/ku-nlp/jumanpp), [KNP](https://github.com/ku-nlp/knp), and [KWJA](https://github.com/ku-nlp/kwja). 11 | 12 | ```python3 13 | import rhoknp 14 | 15 | # Perform language analysis by Juman++ 16 | jumanpp = rhoknp.Jumanpp() 17 | sentence = jumanpp.apply_to_sentence("電気抵抗率は電気の通しにくさを表す物性値である。") 18 | 19 | # Dump language analysis by Juman++ 20 | with open("result.jumanpp", "wt") as f: 21 | f.write(sentence.to_jumanpp()) 22 | 23 | # Load language analysis by Juman++ 24 | with open("result.jumanpp", "rt") as f: 25 | sentence = rhoknp.Sentence.from_jumanpp(f.read()) 26 | ``` 27 | 28 | ```{admonition} Why not *pyknp*? 
29 | :class: note 30 | [*pyknp*](https://pypi.org/project/pyknp/) has been developed as the official Python binding for Juman++ and KNP. 31 | In *rhoknp*, we redesigned the API from the top-down, taking into account the current use cases of *pyknp*. 32 | The main differences from *pyknp* are as follows: 33 | 34 | - **Support document-level language analysis**: *rhoknp* can load and instantiate the result of document-level language analysis: i.e., cohesion analysis and discourse relation analysis. 35 | - **Strictly type-aware**: *rhoknp* is thoroughly annotated with type annotations. Efficient development is possible with the help of an IDE. 36 | - **Extensive test suite**: *rhoknp* is tested with an extensive test suite. See the code coverage at Codecov. 37 | ``` 38 | 39 | ```{toctree} 40 | --- 41 | hidden: 42 | caption: User Guide 43 | maxdepth: 1 44 | --- 45 | 46 | installation/index 47 | reference/index 48 | cli/index 49 | format/index 50 | ``` 51 | 52 | ```{toctree} 53 | --- 54 | hidden: 55 | caption: Development 56 | maxdepth: 1 57 | --- 58 | 59 | contributing/index 60 | authors 61 | ``` 62 | 63 | ```{toctree} 64 | --- 65 | hidden: 66 | caption: Project Links 67 | --- 68 | 69 | GitHub 70 | PyPI 71 | ``` 72 | 73 | ## Indices and tables 74 | 75 | - {ref}`genindex` 76 | - {ref}`search` 77 | -------------------------------------------------------------------------------- /docs/installation/index.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | - [Python](https://python.org/) 6 | - Supported versions: 3.9+ 7 | - [Juman++](https://github.com/ku-nlp/jumanpp) (Optional) 8 | - Supported versions: v2.0.0-rc3+ 9 | - [KNP](https://github.com/ku-nlp/knp) (Optional) 10 | - Supported versions: 5.0+ 11 | 12 | ```{note} 13 | If you just would like to load the result of language analysis by Juman++ and KNP, you do not need to install them. 
14 | ``` 15 | 16 | ## Installation 17 | 18 | We recommend installing _rhoknp_ with pip: 19 | 20 | ```{eval-rst} 21 | .. prompt:: 22 | :prompts: $ 23 | 24 | pip install rhoknp 25 | ``` 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/reference/index.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | ```{toctree} 4 | :maxdepth: 4 5 | 6 | rhoknp 7 | ``` 8 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cli.cli.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cli.cli module 2 | 3 | ```{eval-rst} 4 | .. 
automodule:: rhoknp.cli.cli 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cli.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cli package 2 | 3 | ```{toctree} 4 | :maxdepth: 4 5 | 6 | rhoknp.cli.cli 7 | rhoknp.cli.serve 8 | rhoknp.cli.show 9 | rhoknp.cli.stats 10 | ``` 11 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cli.serve.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cli.serve module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.cli.serve 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cli.show.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cli.show module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.cli.show 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cli.stats.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cli.stats module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.cli.stats 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cohesion.argument.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cohesion.argument module 2 | 3 | ```{eval-rst} 4 | .. 
automodule:: rhoknp.cohesion.argument 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cohesion.coreference.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cohesion.coreference module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.cohesion.coreference 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cohesion.discourse.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cohesion.discourse module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.cohesion.discourse 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cohesion.exophora.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cohesion.exophora module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.cohesion.exophora 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cohesion.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cohesion package 2 | 3 | ```{toctree} 4 | :maxdepth: 4 5 | 6 | rhoknp.cohesion.rel 7 | rhoknp.cohesion.pas 8 | rhoknp.cohesion.predicate 9 | rhoknp.cohesion.argument 10 | rhoknp.cohesion.exophora 11 | rhoknp.cohesion.coreference 12 | rhoknp.cohesion.discourse 13 | ``` 14 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cohesion.pas.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cohesion.pas module 2 | 3 | ```{eval-rst} 4 | .. 
automodule:: rhoknp.cohesion.pas 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cohesion.predicate.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cohesion.predicate module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.cohesion.predicate 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.cohesion.rel.md: -------------------------------------------------------------------------------- 1 | # rhoknp.cohesion.rel module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.cohesion.rel 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.md: -------------------------------------------------------------------------------- 1 | # rhoknp package 2 | 3 | ```{toctree} 4 | :maxdepth: 4 5 | 6 | rhoknp.processors 7 | rhoknp.units 8 | rhoknp.props 9 | rhoknp.cohesion 10 | rhoknp.utils 11 | rhoknp.cli 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.processors.jumanpp.md: -------------------------------------------------------------------------------- 1 | # rhoknp.processors.jumanpp module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.processors.jumanpp 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.processors.knp.md: -------------------------------------------------------------------------------- 1 | # rhoknp.processors.knp module 2 | 3 | ```{eval-rst} 4 | .. 
automodule:: rhoknp.processors.knp 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.processors.kwja.md: -------------------------------------------------------------------------------- 1 | # rhoknp.processors.kwja module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.processors.kwja 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.processors.md: -------------------------------------------------------------------------------- 1 | # rhoknp.processors package 2 | 3 | ```{toctree} 4 | :maxdepth: 4 5 | 6 | rhoknp.processors.senter 7 | rhoknp.processors.jumanpp 8 | rhoknp.processors.knp 9 | rhoknp.processors.kwja 10 | rhoknp.processors.processor 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.processors.processor.md: -------------------------------------------------------------------------------- 1 | # rhoknp.processors.processor module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.processors.processor 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.processors.senter.md: -------------------------------------------------------------------------------- 1 | # rhoknp.processors.senter module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.processors.senter 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.props.dependency.md: -------------------------------------------------------------------------------- 1 | # rhoknp.props.dependency module 2 | 3 | ```{eval-rst} 4 | .. 
automodule:: rhoknp.props.dependency 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.props.feature.md: -------------------------------------------------------------------------------- 1 | # rhoknp.props.feature module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.props.feature 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.props.md: -------------------------------------------------------------------------------- 1 | # rhoknp.props package 2 | 3 | ```{toctree} 4 | :maxdepth: 4 5 | 6 | rhoknp.props.dependency 7 | rhoknp.props.feature 8 | rhoknp.props.semantics 9 | rhoknp.props.named_entity 10 | rhoknp.props.memo 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.props.memo.md: -------------------------------------------------------------------------------- 1 | # rhoknp.props.memo module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.props.memo 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.props.named_entity.md: -------------------------------------------------------------------------------- 1 | # rhoknp.props.named_entity module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.props.named_entity 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.props.semantics.md: -------------------------------------------------------------------------------- 1 | # rhoknp.props.semantics module 2 | 3 | ```{eval-rst} 4 | .. 
automodule:: rhoknp.props.semantics 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.units.base_phrase.md: -------------------------------------------------------------------------------- 1 | # rhoknp.units.base_phrase module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.units.base_phrase 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.units.clause.md: -------------------------------------------------------------------------------- 1 | # rhoknp.units.clause module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.units.clause 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.units.document.md: -------------------------------------------------------------------------------- 1 | # rhoknp.units.document module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.units.document 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.units.md: -------------------------------------------------------------------------------- 1 | # rhoknp.units package 2 | 3 | ```{toctree} 4 | :maxdepth: 4 5 | 6 | rhoknp.units.document 7 | rhoknp.units.sentence 8 | rhoknp.units.clause 9 | rhoknp.units.phrase 10 | rhoknp.units.base_phrase 11 | rhoknp.units.morpheme 12 | rhoknp.units.unit 13 | ``` 14 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.units.morpheme.md: -------------------------------------------------------------------------------- 1 | # rhoknp.units.morpheme module 2 | 3 | ```{eval-rst} 4 | .. 
automodule:: rhoknp.units.morpheme 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.units.phrase.md: -------------------------------------------------------------------------------- 1 | # rhoknp.units.phrase module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.units.phrase 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.units.sentence.md: -------------------------------------------------------------------------------- 1 | # rhoknp.units.sentence module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.units.sentence 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.units.unit.md: -------------------------------------------------------------------------------- 1 | # rhoknp.units.unit module 2 | 3 | ```{eval-rst} 4 | .. automodule:: rhoknp.units.unit 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.utils.md: -------------------------------------------------------------------------------- 1 | # rhoknp.utils package 2 | 3 | ```{toctree} 4 | :maxdepth: 4 5 | 6 | rhoknp.utils.reader 7 | ``` 8 | -------------------------------------------------------------------------------- /docs/reference/rhoknp.utils.reader.md: -------------------------------------------------------------------------------- 1 | # rhoknp.utils.reader module 2 | 3 | ```{eval-rst} 4 | .. 
automodule:: rhoknp.utils.reader 5 | ``` 6 | 7 | ```{toctree} 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv export --only-group docs --no-annotate --no-hashes -o docs/requirements.txt 3 | alabaster==0.7.16 ; python_full_version < '3.10' 4 | alabaster==1.0.0 ; python_full_version >= '3.10' 5 | babel==2.17.0 6 | beautifulsoup4==4.13.4 7 | certifi==2025.4.26 8 | charset-normalizer==3.4.2 9 | colorama==0.4.6 ; sys_platform == 'win32' 10 | docutils==0.21.2 11 | furo==2024.8.6 12 | idna==3.10 13 | imagesize==1.4.1 14 | importlib-metadata==8.7.0 ; python_full_version < '3.10' 15 | jinja2==3.1.6 16 | markdown-it-py==3.0.0 17 | markupsafe==3.0.2 18 | mdit-py-plugins==0.4.2 19 | mdurl==0.1.2 20 | myst-parser==3.0.1 ; python_full_version < '3.10' 21 | myst-parser==4.0.1 ; python_full_version >= '3.10' 22 | packaging==25.0 23 | pygments==2.19.1 24 | pyyaml==6.0.2 25 | requests==2.32.3 26 | roman-numerals-py==3.1.0 ; python_full_version >= '3.11' 27 | snowballstemmer==2.2.0 28 | soupsieve==2.7 29 | sphinx==7.4.7 ; python_full_version < '3.10' 30 | sphinx==8.1.3 ; python_full_version == '3.10.*' 31 | sphinx==8.2.3 ; python_full_version >= '3.11' 32 | sphinx-basic-ng==1.0.0b2 33 | sphinx-copybutton==0.5.2 34 | sphinx-prompt==1.8.0 ; python_full_version < '3.10' 35 | sphinx-prompt==1.9.0 ; python_full_version >= '3.10' 36 | sphinxcontrib-applehelp==2.0.0 37 | sphinxcontrib-devhelp==2.0.0 38 | sphinxcontrib-htmlhelp==2.1.0 39 | sphinxcontrib-jsmath==1.0.1 40 | sphinxcontrib-qthelp==2.0.0 41 | sphinxcontrib-serializinghtml==2.0.0 42 | tomli==2.2.1 ; python_full_version < '3.11' 43 | typing-extensions==4.13.2 44 | urllib3==2.4.0 45 | zipp==3.21.0 ; python_full_version < '3.10' 46 | 
-------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | We host a number of example scripts. 4 | 5 | ## Perform language analysis 6 | 7 | - [Juman++](./apply_jumanpp.py) 8 | - [KNP](./apply_knp.py) 9 | - [KWJA](./apply_kwja.py) 10 | 11 | ## Load language analysis results 12 | 13 | - [Juman++](./load_jumanpp.py) 14 | - [KNP](./load_knp.py) 15 | - [KWJA](./load_knp.py) 16 | 17 | ## Use language analysis results 18 | 19 | - [Morphological analysis](./use_morphological_analysis.py) 20 | - [Dependency parsing](./use_dependency_parsing.py) 21 | - [Named entity recognition](./use_named_entity_recognition.py) 22 | - [Discourse relation analysis](./use_discourse_relation_analysis.py) 23 | - [Predicate-argument structure analysis](./use_predicate_argument_structure_analysis.py) 24 | - [Coreference resolution](./use_coreference_resolution.py) 25 | -------------------------------------------------------------------------------- /examples/apply_jumanpp.py: -------------------------------------------------------------------------------- 1 | """Example code for applying Juman++ to the given sentence. 2 | 3 | Usage: 4 | $ python examples/apply_jumanpp.py "今日はいい天気ですね。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import Jumanpp 10 | 11 | # Create a Jumanpp instance. 12 | jumanpp = Jumanpp() 13 | 14 | # Apply Jumanpp to a sentence. 15 | sent = jumanpp.apply_to_sentence(sys.argv[1]) 16 | 17 | # Get information. 
18 | for mrph in sent.morphemes: 19 | print(f"Text: {mrph.text}") 20 | print(f"Reading: {mrph.reading}") 21 | print(f"Lemma: {mrph.lemma}") 22 | print(f"POS: {mrph.pos}") 23 | print(f"Sub POS: {mrph.subpos}") 24 | print(f"Conjugation (type): {mrph.conjtype}") 25 | print(f"Conjugation (form): {mrph.conjform}") 26 | print("---") 27 | -------------------------------------------------------------------------------- /examples/apply_knp.py: -------------------------------------------------------------------------------- 1 | """Example code for applying KNP to the given sentence. 2 | 3 | Usage: 4 | $ python examples/apply_knp.py "今日はいい天気ですね。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import KNP 10 | 11 | # Create a KNP instance. 12 | knp = KNP() 13 | 14 | # Apply KNP to a sentence. 15 | sent = knp.apply_to_sentence(sys.argv[1]) 16 | 17 | # Get information. 18 | for mrph in sent.morphemes: 19 | print(f"Text: {mrph.text}") 20 | print(f"Reading: {mrph.reading}") 21 | print(f"Lemma: {mrph.lemma}") 22 | print(f"POS: {mrph.pos}") 23 | print(f"Sub POS: {mrph.subpos}") 24 | print(f"Conjugation (type): {mrph.conjtype}") 25 | print(f"Conjugation (form): {mrph.conjform}") 26 | print("---") 27 | -------------------------------------------------------------------------------- /examples/apply_kwja.py: -------------------------------------------------------------------------------- 1 | """Example code for applying KWJA to the given sentence. 2 | 3 | Usage: 4 | $ python examples/apply_kwja.py "今日はいい天気ですね。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import KWJA 10 | 11 | # Create a KWJA instance. 12 | kwja = KWJA(options=["--model-size", "tiny"]) 13 | 14 | # Apply KWJA to a document. 15 | doc = kwja.apply_to_document(sys.argv[1], timeout=120) 16 | 17 | # Get information. 
18 | for mrph in doc.morphemes: 19 | print(f"Text: {mrph.text}") 20 | print(f"Reading: {mrph.reading}") 21 | print(f"Lemma: {mrph.lemma}") 22 | print(f"POS: {mrph.pos}") 23 | print(f"Sub POS: {mrph.subpos}") 24 | print(f"Conjugation (type): {mrph.conjtype}") 25 | print(f"Conjugation (form): {mrph.conjform}") 26 | print("---") 27 | -------------------------------------------------------------------------------- /examples/load_jumanpp.py: -------------------------------------------------------------------------------- 1 | """Example code for loading the result of Juman++ from a file. 2 | 3 | Usage: 4 | $ python examples/load_jumanpp.py example.jumanpp 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import Sentence 10 | from rhoknp.utils.reader import chunk_by_sentence 11 | 12 | with open(sys.argv[1]) as f: 13 | for jumanpp in chunk_by_sentence(f): 14 | sent = Sentence.from_jumanpp(jumanpp) 15 | print(f"Successfully loaded a sentence: {sent.text}") 16 | -------------------------------------------------------------------------------- /examples/load_knp.py: -------------------------------------------------------------------------------- 1 | """Example code for loading the result of KNP/KWJA from a file. 2 | 3 | Usage: 4 | $ python examples/load_knp.py example.jumanpp 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import Sentence 10 | from rhoknp.utils.reader import chunk_by_sentence 11 | 12 | with open(sys.argv[1]) as f: 13 | for knp in chunk_by_sentence(f): 14 | sent = Sentence.from_knp(knp) 15 | print(f"Successfully loaded a sentence: {sent.text}") 16 | -------------------------------------------------------------------------------- /examples/use_coreference_resolution.py: -------------------------------------------------------------------------------- 1 | """Example code for using the result of coreference resolution. 
2 | 3 | Usage: 4 | $ python examples/use_coreference_resolution.py "ソビエト連邦はソ連ともよばれる。同国の首都はモスクワである。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import KWJA, BasePhrase 10 | 11 | # Create a KWJA instance. 12 | kwja = KWJA() 13 | 14 | # Apply KWJA to a document. 15 | doc = kwja.apply_to_document(sys.argv[1]) 16 | 17 | # Get information. 18 | for base_phrase in doc.base_phrases: 19 | coreferents: list[BasePhrase] = base_phrase.get_coreferents() 20 | if len(coreferents) > 0: 21 | print(f"Mention {base_phrase}") 22 | for coreferring_mention in coreferents: 23 | print(f" = {coreferring_mention}") 24 | print("---") 25 | -------------------------------------------------------------------------------- /examples/use_dependency_parsing.py: -------------------------------------------------------------------------------- 1 | """Example code for using the result of dependency parsing. 2 | 3 | Usage: 4 | $ python examples/use_dependency_parsing.py "今日はいい天気ですね。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import KNP 10 | 11 | # Create a KNP instance. 12 | knp = KNP() 13 | 14 | # Apply KNP to a sentence. 15 | sent = knp.apply_to_sentence(sys.argv[1]) 16 | 17 | # Get information. 18 | for phrase in sent.phrases: 19 | parent = phrase.parent 20 | if parent: 21 | print(f"{phrase.text} -> {parent.text}") 22 | else: 23 | print(f"{phrase.text} -> ROOT") 24 | -------------------------------------------------------------------------------- /examples/use_discourse_relation_analysis.py: -------------------------------------------------------------------------------- 1 | """Example code for using the result of discourse relation analysis. 2 | 3 | Usage: 4 | $ python examples/use_discourse_relation_analysis.py "風が吹いたら桶屋が儲かる。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import KNP 10 | 11 | # Create a KNP instance. 12 | knp = KNP() 13 | 14 | # Apply KNP to a sentence. 15 | sent = knp.apply_to_sentence(sys.argv[1]) 16 | 17 | # Get information. 
18 | if sent.is_clause_tag_required() is True: 19 | print("KNP might be too old; please update it.") 20 | sys.exit(1) 21 | 22 | discourse_relations = [] 23 | for clause in sent.clauses: 24 | discourse_relations.extend(clause.discourse_relations) 25 | 26 | if discourse_relations: 27 | print(f"Found {len(discourse_relations)} discourse relations:") 28 | for i, discourse_relation in enumerate(discourse_relations, start=1): 29 | modifier = discourse_relation.modifier 30 | head = discourse_relation.head 31 | label = discourse_relation.label 32 | print(f' {i}. "{modifier}" -({label.value})-> "{head}"') 33 | else: 34 | print("No discourse relation found.") 35 | -------------------------------------------------------------------------------- /examples/use_morphological_analysis.py: -------------------------------------------------------------------------------- 1 | """Example code for using the result of morphological analysis. 2 | 3 | Usage: 4 | $ python examples/use_morphological_analysis.py "今日はいい天気ですね。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import Jumanpp 10 | 11 | # Create a Jumanpp instance. 12 | jumanpp = Jumanpp() 13 | 14 | # Apply Jumanpp to a sentence. 15 | sent = jumanpp.apply_to_sentence(sys.argv[1]) 16 | 17 | # Get information. 18 | for mrph in sent.morphemes: 19 | print(f"Text: {mrph.text}") 20 | print(f"Reading: {mrph.reading}") 21 | print(f"Lemma: {mrph.lemma}") 22 | print(f"POS: {mrph.pos}") 23 | print(f"Sub POS: {mrph.subpos}") 24 | print(f"Conjugation (type): {mrph.conjtype}") 25 | print(f"Conjugation (form): {mrph.conjform}") 26 | print("---") 27 | -------------------------------------------------------------------------------- /examples/use_named_entity_recognition.py: -------------------------------------------------------------------------------- 1 | """Example code for using the result of named entity recognition. 
2 | 3 | Usage: 4 | $ python examples/use_named_entity_recognition.py "太郎は花子が読んでいる本を次郎に渡した。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import KNP 10 | 11 | # Create a KNP instance. 12 | knp = KNP() 13 | 14 | # Apply KNP to a sentence. 15 | sent = knp.apply_to_sentence(sys.argv[1]) 16 | 17 | # Get information. 18 | if sent.named_entities: 19 | print(f"Found {len(sent.named_entities)} named entities:") 20 | for i, named_entity in enumerate(sent.named_entities, start=1): 21 | print(f' {i}. "{named_entity.text}" ({named_entity.category.value})') 22 | else: 23 | print("No named entity found.") 24 | -------------------------------------------------------------------------------- /examples/use_predicate_argument_structure_analysis.py: -------------------------------------------------------------------------------- 1 | """Example code for using the result of predicate-argument structure analysis. 2 | 3 | Usage: 4 | $ python examples/use_predicate_argument_structure_analysis.py "太郎は花子が読んでいる本を次郎に渡した。" 5 | """ 6 | 7 | import sys 8 | 9 | from rhoknp import KWJA 10 | from rhoknp.cohesion import Argument 11 | 12 | # Create a KWJA instance. 13 | kwja = KWJA() 14 | 15 | # Apply KWJA to a document. 16 | doc = kwja.apply_to_document(sys.argv[1]) 17 | 18 | # Get information. 
19 | for base_phrase in doc.base_phrases: 20 | pas = base_phrase.pas 21 | if pas.is_empty() is True: 22 | continue 23 | all_arguments: dict[str, list[Argument]] = pas.get_all_arguments() 24 | print(f"Predicate: {pas.predicate}") 25 | for case, arguments in all_arguments.items(): 26 | print(f" {case}格: ", end="") 27 | print(", ".join(str(argument) for argument in arguments)) 28 | print("---") 29 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "rhoknp" 3 | version = "1.7.1" 4 | description = "Yet another Python binding for Juman++/KNP/KWJA" 5 | license = "MIT" 6 | authors = [ 7 | { name = "Hirokazu Kiyomaru", email = "h.kiyomaru@gmail.com"}, 8 | { name = "Nobuhiro Ueda", email = "ueda@nlp.i.kyoto-u.ac.jp"}, 9 | ] 10 | maintainers = [ 11 | { name = "Hirokazu Kiyomaru", email = "h.kiyomaru@gmail.com"}, 12 | { name = "Nobuhiro Ueda", email = "ueda@nlp.i.kyoto-u.ac.jp"}, 13 | ] 14 | readme = "README.md" 15 | keywords = ["NLP", "Japanese", "Juman++", "KNP", "KWJA"] 16 | classifiers = [ 17 | "License :: OSI Approved :: MIT License", 18 | "Natural Language :: Japanese", 19 | "Operating System :: MacOS", 20 | "Operating System :: MacOS :: MacOS X", 21 | "Operating System :: Microsoft :: Windows", 22 | "Operating System :: POSIX :: Linux", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Programming Language :: Python :: 3.13", 29 | "Topic :: Scientific/Engineering", 30 | "Topic :: Software Development :: Libraries", 31 | "Topic :: Software Development :: Libraries :: Python Modules", 32 | "Topic :: Text Processing", 33 | "Topic :: Text Processing :: Linguistic", 34 | ] 35 | requires-python = ">=3.9" 36 | 37 | dependencies = [ 38 | 
"typing-extensions>=4.4; python_version < '3.12'" 39 | ] 40 | 41 | [project.optional-dependencies] 42 | cli = [ 43 | "typer-slim>=0.15.2", 44 | "PyYAML>=6.0", 45 | "rich>=12.6", 46 | "uvicorn>=0.30.0", 47 | "fastapi>=0.111.0", 48 | "jinja2>=3.1.4", 49 | "pygments>=2.18.0", 50 | ] 51 | 52 | [project.urls] 53 | Homepage = "https://github.com/ku-nlp/rhoknp" 54 | Documentation = "https://rhoknp.readthedocs.io/en/latest" 55 | Repository = "https://github.com/ku-nlp/rhoknp" 56 | Issues = "https://github.com/ku-nlp/rhoknp/issues" 57 | 58 | [project.scripts] 59 | rhoknp = "rhoknp.cli.cli:app" 60 | 61 | [dependency-groups] 62 | dev = [ 63 | "ipdb>=0.13.13", 64 | ] 65 | test = [ 66 | "pytest>=8.0", 67 | "coverage[toml]>=7.3", 68 | "pytest-cov>=6.0", 69 | "httpx>=0.25", 70 | ] 71 | docs = [ 72 | "Sphinx>=7.0; python_version < '3.10'", 73 | "Sphinx>=8.0; python_version >= '3.10'", 74 | "sphinx-prompt>=1.8; python_version < '3.10'", 75 | "sphinx-prompt>=1.9; python_version >= '3.10'", 76 | "sphinx-copybutton>=0.5.0", 77 | "myst-parser>=3.0; python_version < '3.10'", 78 | "myst-parser>=4.0; python_version >= '3.10'", 79 | "markdown-it-py>=3.0", 80 | "furo>=2024.4", 81 | "typing-extensions>=4.4", 82 | ] 83 | 84 | [build-system] 85 | requires = ["hatchling"] 86 | build-backend = "hatchling.build" 87 | 88 | [tool.hatch.build.targets.sdist] 89 | only-include = ["/src/rhoknp"] 90 | 91 | [tool.uv] 92 | package = true 93 | default-groups = ["dev", "test"] 94 | 95 | [tool.ruff] 96 | line-length = 120 97 | indent-width = 4 98 | src = ["src"] 99 | target-version = "py39" # The minimum Python version to target 100 | 101 | [tool.ruff.lint] 102 | select = ["F", "E", "W", "I", "B", "PL", "PD", "NPY", "RUF", "UP", "TID", "COM", "PT", "D", "ARG", "PYI", "ANN", "G", "FBT", "EM", "TRY", "PTH", "T", "INP"] 103 | #select = ["ALL"] 104 | ignore = [ 105 | "PLR0911", # Too many return statements 106 | "PLR0912", # Too many branches 107 | "PLR0913", # Too many arguments in function definition 108 | 
"PLR0915", # Too many statements 109 | "E501", # Line too long 110 | "RUF001", # String contains ambiguous `ノ` (KATAKANA LETTER NO). Did you mean `/` (SOLIDUS)? 111 | "RUF002", # Docstring contains ambiguous `,` (FULLWIDTH COMMA). Did you mean `,` (COMMA)? 112 | "RUF003", # Comment contains ambiguous `(` (FULLWIDTH LEFT PARENTHESIS). Did you mean `(` (LEFT PARENTHESIS)? 113 | "UP037", # Remove quotes from type annotation 114 | "COM812", # Trailing comma missing 115 | "PLR2004", # Magic value used in comparison 116 | "D100", # Missing docstring in public module 117 | "D105", # Missing docstring in magic method 118 | "D107", # Missing docstring in `__init__` 119 | "D301", # Use `r"""` if any backslashes in a docstring 120 | "D403", # First word of the first line should be properly capitalized 121 | "D415", # First line should end with a period, question mark, or exclamation point 122 | "ANN002", # Missing type annotation for `*args` 123 | "ANN003", # Missing type annotation for `**kwargs` 124 | "FA100", # Missing `from __future__ import annotations`, but uses `...` 125 | "S101", # Use of `assert` detected 126 | "G004", # Logging statement uses f-string 127 | "FBT001", # Boolean-typed positional argument in function definition 128 | "FBT002", # Boolean default positional argument in function definition 129 | "FBT003", # Boolean positional value in function call 130 | "EM101", # Exception must not use a string literal, assign to variable first 131 | "EM102", # Exception must not use an f-string literal, assign to variable first 132 | "TRY003", # Avoid specifying long messages outside the exception class 133 | ] 134 | 135 | [tool.ruff.lint.per-file-ignores] 136 | "__init__.py" = [ 137 | "D104", # Missing docstring in public package 138 | ] 139 | "tests/*" = [ 140 | "D", # pydocstyle 141 | "S101", # Use of `assert` detected 142 | "INP001", # File `...` is part of an implicit namespace package. 
Add an `__init__.py` 143 | ] 144 | "src/rhoknp/cli/*" = [ 145 | "T201", # `print` found 146 | ] 147 | "examples/*" = [ 148 | "T201", # `print` found 149 | "INP001", # File `...` is part of an implicit namespace package. Add an `__init__.py` 150 | "PTH123", # `open()` should be replaced by `Path.open()` 151 | ] 152 | "docs/conf.py" = [ 153 | "INP001", # File `...` is part of an implicit namespace package. Add an `__init__.py` 154 | ] 155 | 156 | [tool.ruff.lint.flake8-bugbear] 157 | extend-immutable-calls = ["typer.Argument", "typer.Option"] 158 | 159 | [tool.ruff.lint.flake8-tidy-imports] 160 | ban-relative-imports = "all" 161 | 162 | [tool.ruff.lint.pydocstyle] 163 | convention = "google" 164 | 165 | [tool.ruff.lint.mccabe] 166 | max-complexity = 20 # default: 10 167 | 168 | [tool.mypy] 169 | python_version = "3.9" 170 | 171 | [tool.coverage.run] 172 | omit = ["tests/*"] 173 | 174 | [tool.coverage.report] 175 | exclude_lines = [ 176 | "pragma: no cover", 177 | "def __repr__", # Do not complain about missing debug-only code 178 | "except ImportError", # Do not complain about packages we have installed 179 | # Do not complain if tests do not hit defensive assertion code 180 | "raise AssertionError", 181 | "raise NotImplementedError", 182 | "raise ImportError", 183 | # Do not complain if non-runnable code is not run 184 | "if TYPE_CHECKING:", 185 | "if __name__ == .__main__.:", 186 | "@(abc\\.)?abstractmethod", # Do not complain about abstract methods 187 | "@overload", # Do not complain about overloads 188 | ] 189 | -------------------------------------------------------------------------------- /src/rhoknp/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | 3 | from rhoknp.processors import KNP, KWJA, Jumanpp, RegexSenter 4 | from rhoknp.units import BasePhrase, Clause, Document, Morpheme, Phrase, Sentence 5 | 6 | __version__ = version("rhoknp") 7 | 8 | __all__ = [ 9 | "KNP", 
10 | "KWJA", 11 | "BasePhrase", 12 | "Clause", 13 | "Document", 14 | "Jumanpp", 15 | "Morpheme", 16 | "Phrase", 17 | "RegexSenter", 18 | "Sentence", 19 | "__version__", 20 | ] 21 | -------------------------------------------------------------------------------- /src/rhoknp/cli/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import rhoknp.cli.cli 3 | import rhoknp.cli.serve 4 | import rhoknp.cli.show 5 | import rhoknp.cli.stats # noqa: F401 6 | except ImportError as e: 7 | raise ImportError( 8 | f"{e.msg}\nExtra dependencies are required to use the CLI. Install them with `pip install rhoknp[cli]`." 9 | ) from e 10 | -------------------------------------------------------------------------------- /src/rhoknp/cli/cat.py: -------------------------------------------------------------------------------- 1 | from typing import ClassVar 2 | 3 | from pygments import highlight 4 | from pygments.formatters import TerminalFormatter 5 | from pygments.lexer import RegexLexer, bygroups, default 6 | from pygments.token import Comment, Generic, Literal, Name, Number, String, Text, Whitespace 7 | 8 | from rhoknp import BasePhrase, Document, Morpheme, Phrase 9 | 10 | 11 | class KNPLexer(RegexLexer): 12 | """KNP の出力を色付けするための Lexer.""" 13 | 14 | name: ClassVar[str] = "KNP" 15 | url: ClassVar[str] = "https://github.com/ku-nlp/knp" 16 | filenames: ClassVar[list[str]] = ["*.knp", "*.kwja"] 17 | mimetypes: ClassVar[list[str]] = ["text/plain"] 18 | 19 | tokens = { # noqa: RUF012 20 | "root": [ 21 | (r"\s+", Whitespace), 22 | (rf"(?={Phrase.PAT.pattern})", Text, "phrase"), 23 | (rf"(?={BasePhrase.PAT.pattern})", Text, "base_phrase"), 24 | (rf"(?={Morpheme.PAT.pattern})", Text, "morpheme"), 25 | (r"^#.*$", Comment.Single), 26 | (r"^EOS$", Generic.Subheading), 27 | ], 28 | "phrase": [ 29 | (r"\s+", Whitespace), 30 | (r"^\*", Generic.Heading), 31 | (r"(-?\d+)([DPAI])", bygroups(Number, Literal.String)), 32 | (r"<", Name.Tag, "tag"), 33 
| default("#pop"), 34 | ], 35 | "base_phrase": [ 36 | (r"\s+", Whitespace), 37 | (r"^\+", Generic.Heading), 38 | (r"(-?\d+)([DPAI])", bygroups(Number, Literal.String)), 39 | (r":]+)(:)?([^>]+)?", bygroups(Name.Tag, Name.Tag, Name.Attribute)), 85 | (r">", Name.Tag, "#pop"), 86 | ], 87 | "rel_tag": [ 88 | (r"\s+", Whitespace), 89 | (r'(\S+=)("\S+?")', bygroups(Name.Attribute, String)), 90 | (r"/>", Name.Tag, "#pop"), 91 | ], 92 | } 93 | 94 | 95 | def print_document(document: Document, is_dark: bool = False) -> None: 96 | """KNP ファイルを色付きで表示. 97 | 98 | Args: 99 | document (Document): 文書. 100 | is_dark (bool, optional): ターミナルの背景色が dark なら True.デフォルトは False. 101 | """ 102 | formatter = TerminalFormatter(bg="dark" if is_dark else "light") 103 | print(highlight(document.to_knp(), KNPLexer(), formatter), end="") 104 | -------------------------------------------------------------------------------- /src/rhoknp/cli/cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | import typer 7 | import yaml 8 | 9 | from rhoknp import Document, __version__ 10 | from rhoknp.cli.cat import print_document 11 | from rhoknp.cli.serve import AnalyzerType, serve_analyzer 12 | from rhoknp.cli.show import draw_tree 13 | from rhoknp.cli.stats import get_document_statistics 14 | 15 | app = typer.Typer(help="rhoknp CLI utilities.") 16 | 17 | 18 | def version_callback(value: bool) -> None: 19 | """バージョンを表示. 20 | 21 | Args: 22 | value: True ならバージョンを表示してプログラムを終了. 
23 | """ 24 | if value: 25 | print(f"rhoknp version: {__version__}") 26 | raise typer.Exit 27 | 28 | 29 | @app.callback() 30 | def main( 31 | _: bool = typer.Option(False, "--version", "-v", callback=version_callback, help="Show version and exit."), 32 | ) -> None: 33 | """CLI のメイン関数.""" 34 | 35 | 36 | @app.command(help="Print KNP files with syntax highlighting.") 37 | def cat( 38 | knp_path: Optional[Path] = typer.Argument(None, exists=True, dir_okay=False, help="Path to knp file to show."), 39 | dark: bool = typer.Option(False, "--dark", "-d", help="Use dark background."), 40 | ) -> None: 41 | """KNP ファイルを色付きで表示. 42 | 43 | Args: 44 | knp_path: KNP ファイルのパス. 45 | dark: True なら背景を黒にする. 46 | """ 47 | knp_text = sys.stdin.read() if knp_path is None else knp_path.read_text() 48 | doc = Document.from_knp(knp_text) 49 | print_document(doc, is_dark=dark) 50 | 51 | 52 | @app.command(help="Convert a KNP file into raw text, Juman++ format, or KNP format.") 53 | def convert( 54 | knp_path: Optional[Path] = typer.Argument( 55 | None, exists=True, dir_okay=False, help="Path to knp file to convert. If not given, read from stdin" 56 | ), 57 | format_: str = typer.Option("text", "--format", "-f", help="Format to convert to."), 58 | ) -> None: 59 | """KNP ファイルを種々のフォーマットに変換. 60 | 61 | Args: 62 | knp_path: KNP ファイルのパス. 63 | format_: 変換先のフォーマット."text", "jumanpp", "knp" のいずれか. 
64 | """ 65 | knp_text = sys.stdin.read() if knp_path is None else knp_path.read_text() 66 | doc = Document.from_knp(knp_text) 67 | if format_ == "text": 68 | print(doc.text) 69 | elif format_ == "jumanpp": 70 | print(doc.to_jumanpp(), end="") 71 | elif format_ == "knp": 72 | print(doc.to_knp(), end="") 73 | else: 74 | raise ValueError(f"Unknown format: {format_}") 75 | 76 | 77 | @app.command(help="Print given file content in tree format.") 78 | def show( 79 | knp_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to knp file to show"), 80 | pos: bool = typer.Option(False, "--pos", "-p", help="Show POS characters."), 81 | rel: bool = typer.Option(False, "--rel", "-r", help="Show contents of tags."), 82 | pas: bool = typer.Option(False, "--pas", help="Show predicate-argument structures."), 83 | ) -> None: 84 | """KNP ファイルを読み込み係り受けを可視化. 85 | 86 | Args: 87 | knp_path: KNP ファイルのパス. 88 | pos: True なら同時に品詞を表示. 89 | rel: True なら同時に タグの内容を表示. 90 | pas: True なら同時に述語項構造を表示. 91 | """ 92 | doc = Document.from_knp(knp_path.read_text()) 93 | for sent in doc.sentences: 94 | print(sent.comment) 95 | draw_tree(sent.base_phrases, show_pos=pos, show_rel=rel, show_pas=pas) 96 | 97 | 98 | @app.command(help="Show statistics of given KNP file.") 99 | def stats( 100 | knp_path: Path = typer.Argument( 101 | ..., exists=True, dir_okay=False, help="Path to knp file to calculate statistics on." 102 | ), 103 | use_json: bool = typer.Option(False, "--json", "-j", help="Output statistics in JSON format."), 104 | ) -> None: 105 | """KNP ファイルを読み込みその統計情報を出力. 106 | 107 | Args: 108 | knp_path: KNP ファイルのパス. 109 | use_json: JSON 形式で出力. 
110 | """ 111 | doc = Document.from_knp(knp_path.read_text()) 112 | doc_stats = get_document_statistics(doc) 113 | if use_json: 114 | print(json.dumps(doc_stats, ensure_ascii=False, indent=4)) 115 | else: 116 | print(yaml.dump(doc_stats, allow_unicode=True, sort_keys=False), end="") 117 | 118 | 119 | @app.command(help="Serve an analyzer as HTTP server.") 120 | def serve( 121 | analyzer: AnalyzerType = typer.Argument(..., help="Analyzer to use. Choose from jumanpp, knp, kwja."), 122 | host: str = typer.Option("localhost", "--host", "-h", help="Host to listen on."), 123 | port: int = typer.Option(8000, "--port", "-p", help="Port to listen on."), 124 | base_url: str = typer.Option("/", "--base-url", help="Root path of the server."), 125 | analyzer_args: Optional[list[str]] = typer.Argument(None, help="Additional arguments for the analyzer."), 126 | ) -> None: 127 | """解析器を起動し,HTTP サーバとして提供. 128 | 129 | Args: 130 | analyzer: 解析器の種類. 131 | host: ホスト. 132 | port: ポート. 133 | base_url: ベース URL. 134 | analyzer_args: 解析器のオプション. 
135 | """ 136 | serve_analyzer(analyzer, host, port, base_url, analyzer_args) # pragma: no cover 137 | 138 | 139 | if __name__ == "__main__": 140 | app() 141 | -------------------------------------------------------------------------------- /src/rhoknp/cli/show.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections.abc import Sequence 3 | from typing import TextIO, Union 4 | 5 | from rich.console import Console 6 | from rich.table import Table 7 | from rich.text import Text 8 | 9 | from rhoknp.cohesion import EndophoraArgument 10 | from rhoknp.props.dependency import DepType 11 | from rhoknp.units.base_phrase import BasePhrase 12 | from rhoknp.units.phrase import Phrase 13 | 14 | POS_MARK = { 15 | "特殊": "*", 16 | "動詞": "v", 17 | "形容詞": "j", 18 | "判定詞": "c", 19 | "助動詞": "x", 20 | "名詞": "n", 21 | "固有名詞": "N", 22 | "人名": "J", 23 | "地名": "C", 24 | "組織名": "A", 25 | "指示詞": "d", 26 | "副詞": "a", 27 | "助詞": "p", 28 | "接続詞": "c", 29 | "連体詞": "m", 30 | "感動詞": "!", 31 | "接頭辞": "p", 32 | "接尾辞": "s", 33 | "未定義語": "?", 34 | } 35 | 36 | 37 | def draw_tree( 38 | leaves: Union[Sequence[Phrase], Sequence[BasePhrase]], 39 | fh: TextIO = sys.stdout, 40 | show_pos: bool = False, 41 | show_rel: bool = False, 42 | show_pas: bool = False, 43 | ) -> None: 44 | """構文木を指定された fh に出力. 45 | 46 | Args: 47 | leaves: 構文木の葉となる文節列または基本句列. 48 | fh: 出力先. 49 | show_pos: True なら同時に品詞を表示する. 50 | show_rel: True なら同時に タグの内容を表示する. 51 | show_pas: True なら同時に述語項構造を表示する. 
52 | """ 53 | console = Console(file=fh) 54 | table = Table.grid(padding=(0, 2)) 55 | limit = len(leaves) 56 | item = [[""] * limit for _ in leaves] 57 | active_column = [0] * limit 58 | limit -= 1 59 | 60 | for i in range(limit): 61 | parent_index = leaves[i].parent_index 62 | dep_type = leaves[i].dep_type 63 | assert parent_index is not None, "parent_index has not been set" 64 | para_row = leaves[i].dep_type == DepType.PARALLEL 65 | for j in range(i + 1, limit + 1): 66 | if j < parent_index: 67 | if active_column[j] == 2: 68 | item[i][j] = "╋" if para_row else "╂" 69 | elif active_column[j] == 1: 70 | item[i][j] = "┿" if para_row else "┼" 71 | else: 72 | item[i][j] = "━" if para_row else "─" 73 | elif j == parent_index: 74 | if dep_type in (DepType.PARALLEL, DepType.IMPERFECT_PARALLEL, DepType.APPOSITION): 75 | item[i][j] = str(dep_type.value) 76 | elif active_column[j] == 2: 77 | item[i][j] = "┨" 78 | elif active_column[j] == 1: 79 | item[i][j] = "┤" 80 | else: 81 | item[i][j] = "┐" 82 | if active_column[j] == 2: 83 | pass 84 | elif para_row: 85 | active_column[j] = 2 86 | else: 87 | active_column[j] = 1 88 | else: # noqa: PLR5501 89 | if active_column[j] == 2: 90 | item[i][j] = "┃" 91 | elif active_column[j] == 1: 92 | item[i][j] = "│" 93 | else: 94 | item[i][j] = " " 95 | 96 | lines: list[str] = [] 97 | for i in range(len(leaves)): 98 | line = _leaf_string(leaves[i], show_pos) 99 | for j in range(i + 1, len(leaves)): 100 | line += _extend_horizontal(item[i][j]) + item[i][j] 101 | lines.append(line) 102 | 103 | max_length = max(_str_real_length(line) for line in lines) 104 | for line, leaf in zip(lines, leaves): 105 | diff = max_length - _str_real_length(line) 106 | tree_string = " " * diff + line 107 | feat_string = _feat_string(leaf, show_rel, show_pas) if isinstance(leaf, BasePhrase) else "" 108 | table.add_row(Text(tree_string), Text(feat_string)) 109 | console.print(table) 110 | 111 | 112 | def _extend_horizontal(token: str) -> str: 113 | if token in ("╂", 
"┼", "┤", "┨", "┐", "─", "I", "A"): 114 | return "─" 115 | elif token in ("╋", "┿", "━", "P"): 116 | return "━" 117 | else: 118 | return " " 119 | 120 | 121 | def _leaf_string(leaf: Union[Phrase, BasePhrase], show_pos: bool) -> str: 122 | ret = "" 123 | for morpheme in leaf.morphemes: 124 | ret += morpheme.text 125 | if show_pos is True: 126 | if morpheme.subpos in ("固有名詞", "人名", "地名"): 127 | ret += POS_MARK[morpheme.subpos] 128 | else: 129 | ret += POS_MARK[morpheme.pos] 130 | return ret 131 | 132 | 133 | def _str_real_length(string: str) -> int: 134 | return Text(string).cell_len 135 | 136 | 137 | def _feat_string(base_phrase: BasePhrase, show_rel: bool, show_pas: bool) -> str: 138 | tag_strings: list[str] = [] 139 | if show_rel is True: 140 | for tag in base_phrase.rel_tags: 141 | tag_strings.append(f"{tag.type}:{tag.target}") 142 | if show_pas is True: 143 | for case, arguments in base_phrase.pas.get_all_arguments(relax=False).items(): 144 | for arg in arguments: 145 | core_text = _get_core_text(arg.base_phrase) if isinstance(arg, EndophoraArgument) else str(arg) 146 | tag_string = f"{case}:{core_text}" 147 | if tag_string not in tag_strings: 148 | tag_strings.append(tag_string) 149 | return " ".join(tag_strings) 150 | 151 | 152 | def _get_core_text(base_phrase: BasePhrase) -> str: 153 | """Get the core text without ancillary words.""" 154 | morphemes = base_phrase.morphemes 155 | start_index = 0 156 | for morpheme in morphemes: 157 | if morpheme.pos in ("助詞", "特殊", "判定詞"): 158 | start_index += 1 159 | else: 160 | break 161 | end_index = len(morphemes) 162 | for morpheme in reversed(morphemes): 163 | if morpheme.pos in ("助詞", "特殊", "判定詞"): 164 | end_index -= 1 165 | else: 166 | break 167 | ret = "".join(m.text for m in morphemes[start_index:end_index]) 168 | if not ret: 169 | start_index = 0 170 | end_index = len(morphemes) 171 | return "".join(m.text for m in morphemes[start_index:end_index]) 172 | 
--------------------------------------------------------------------------------
/src/rhoknp/cli/static/css/style.css:
--------------------------------------------------------------------------------
/* templates/components/raw_input.jinja2 */
.input-text {
  white-space: pre-wrap;
  margin: 0 0.5em 1em;
  padding: 0.5em;
}

.result {
  margin: 0 0.5em 1em;
  padding: 0.5em;
}

/* templates/components/named_entity_recognition.jinja2 */
/* Highlight span for a recognized named entity; background varies by class. */
.entity {
  margin: 0 0.25em;
  line-height: 1;
  border-radius: 0.35em;
}

.entity-organization {
  background: #7aecec;
}

.entity-person {
  background: #aa9cfc;
}

.entity-location {
  background: #ff9561;
}

.entity-artifact {
  background: #bfeeb7;
}

/* Date/time share one color, as do money/percent. */
.entity-date,
.entity-time {
  background: #bfe1d9;
}

.entity-money,
.entity-percent {
  background: #e4e7d2;
}

/* Small type label rendered inside the entity highlight. */
.entity-label {
  font-size: 0.8em;
  font-weight: bold;
  line-height: 1;
  border-radius: 0.35em;
  vertical-align: middle;
}
--------------------------------------------------------------------------------
/src/rhoknp/cli/static/images/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/src/rhoknp/cli/static/images/apple-touch-icon.png
--------------------------------------------------------------------------------
/src/rhoknp/cli/static/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/src/rhoknp/cli/static/images/favicon.ico
--------------------------------------------------------------------------------
/* Persist the open/closed state of the analysis-result accordion items across
 * page reloads, using localStorage keyed by each item's element id. */

/* localStorage key for an accordion item. */
const storageKey = (item) => `accordion-${item.id}`;

/* Reflect an open/closed state in the DOM for one accordion item. */
const setItemShown = (item, shown) => {
  item.querySelector(".accordion-button").classList.toggle("collapsed", !shown);
  item.querySelector(".accordion-collapse").classList.toggle("show", shown);
};

/* Items tagged open-by-default are seeded as "true" on the first visit only,
 * so a user's later choice is never overwritten. */
const defaultOpenAccordionItems = document.querySelectorAll(
  ".accordion-item-default-open",
);
defaultOpenAccordionItems.forEach((item) => {
  if (localStorage.getItem(storageKey(item)) === null) {
    localStorage.setItem(storageKey(item), "true");
  }
});

const accordionItems = document.querySelectorAll(".accordion-item");

/* Record state changes triggered through Bootstrap's collapse component. */
accordionItems.forEach((item) => {
  item.addEventListener("shown.bs.collapse", () => {
    localStorage.setItem(storageKey(item), "true");
  });
  item.addEventListener("hidden.bs.collapse", () => {
    localStorage.setItem(storageKey(item), "false");
  });
});

/* Restore the saved state on page load.
 * (Removed a leftover `console.log(state)` debug statement.) */
accordionItems.forEach((item) => {
  setItemShown(item, localStorage.getItem(storageKey(item)) === "true");
});

/* "Show all" / "hide all" buttons update every item and its saved state. */
const showAllButton = document.querySelector("#show-all-button");
showAllButton.addEventListener("click", () => {
  accordionItems.forEach((item) => {
    localStorage.setItem(storageKey(item), "true");
    setItemShown(item, true);
  });
});

const hideAllButton = document.querySelector("#hide-all-button");
hideAllButton.addEventListener("click", () => {
  accordionItems.forEach((item) => {
    localStorage.setItem(storageKey(item), "false");
    setItemShown(item, false);
  });
});
document.clauses) 40 | if not document.is_senter_required(): 41 | stats["other"]["named_entity"] = sum(len(sentence.named_entities) for sentence in document.sentences) 42 | return stats 43 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/base.jinja2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {{ title }} 8 | 9 | 11 | 15 | 17 | 21 | 24 | 28 | 29 | 30 | 31 | {% include "components/navbar.jinja2" %} 32 |
33 | {% include "components/form.jinja2" %} 34 | {% if analyzed_document %} 35 |
36 | {% include "components/raw_input.jinja2" %} 37 |
解析結果
38 |
39 | {% include "components/show_all_button.jinja2" %} 40 | {% include "components/hide_all_button.jinja2" %} 41 | {% block result %} 42 | {% endblock result %} 43 |
44 | {% endif %} 45 | {% if error %} 46 | {% include "components/error.jinja2" %} 47 | {% endif %} 48 |
49 | 50 | 51 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/dependency_parsing.jinja2: -------------------------------------------------------------------------------- 1 |
3 |
4 | 10 |
11 |
14 |
15 |
{{ tree }}
16 |
17 |
18 |
19 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/discourse_parsing.jinja2: -------------------------------------------------------------------------------- 1 |
2 |
3 | 9 |
10 |
13 |
14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | {% for clause in analyzed_document.clauses %} 25 | {% for discourse_relation in clause.discourse_relations %} 26 | 27 | 28 | 35 | 36 | 37 | 38 | {% endfor %} 39 | {% endfor %} 40 | 41 |
談話関係タイプModifierHead
{{ discourse_relation.label.value }} 29 | {% if discourse_relation.is_explicit %} 30 | 明示的 31 | {% else %} 32 | 非明示的 33 | {% endif %} 34 | {{ discourse_relation.modifier.text }}{{ discourse_relation.head.text }}
42 |
43 |
44 |
45 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/error.jinja2: -------------------------------------------------------------------------------- 1 | 8 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/form.jinja2: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 | 5 |
6 |

Model: {{ version }}

7 | 8 |
9 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/hide_all_button.jinja2: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/morphological_analysis.jinja2: -------------------------------------------------------------------------------- 1 |
3 |
4 | 10 |
11 |
14 |
15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | {% for morpheme in analyzed_document.morphemes %} 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | {% endfor %} 41 | 42 |
表層文字列読み原形品詞品詞細分類活用型活用形意味情報
{{ morpheme.text }}{{ morpheme.reading }}{{ morpheme.lemma }}{{ morpheme.pos }}{{ morpheme.subpos }}{{ morpheme.conjtype }}{{ morpheme.conjform }}{{ morpheme.semantics.to_sstring().strip('"') }}
43 |
44 |
45 |
46 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/named_entity_recognition.jinja2: -------------------------------------------------------------------------------- 1 |
2 |
3 | 9 |
10 |
13 |
14 | {% for span in get_entity_spans(analyzed_document) %} 15 | {% if span.label %} 16 | 17 | {{ span.text }} 18 | {{ span.label }} 19 | 20 | {% else %} 21 | {{ span.text }} 22 | {% endif %} 23 | {% endfor %} 24 |
25 |
26 |
27 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/navbar.jinja2: -------------------------------------------------------------------------------- 1 | 6 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/raw_input.jinja2: -------------------------------------------------------------------------------- 1 |
テキスト
2 |
{{ text }}
3 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/raw_output.jinja2: -------------------------------------------------------------------------------- 1 |
2 |
3 | 9 |
10 |
13 |
14 |
{{ raw_output }}
15 |
16 |
17 |
18 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/show_all_button.jinja2: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/typo_correction.jinja2: -------------------------------------------------------------------------------- 1 |
2 |
3 | 9 |
10 |
13 |
14 | {% for diff in get_string_diff(text, analyzed_document.text) %} 15 | {% if diff.label == '+' %} 16 | {{ diff.text }} 17 | {% elif diff.label == '-' %} 18 | {{ diff.text }} 19 | {% else %} 20 | {{ diff.text }} 21 | {% endif %} 22 | {% endfor %} 23 |
24 |
25 |
26 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/components/word_splitting.jinja2: -------------------------------------------------------------------------------- 1 |
3 |
4 | 10 |
11 |
14 |
15 | {% for morpheme in analyzed_document.morphemes %}{{ morpheme.text + " " }}{% endfor %} 16 |
17 |
18 |
19 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/jumanpp.jinja2: -------------------------------------------------------------------------------- 1 | {% extends "base.jinja2" %} 2 | {% block result %} 3 |
4 | {% include "components/word_splitting.jinja2" %} 5 | {% include "components/morphological_analysis.jinja2" %} 6 | {% with raw_output = analyzed_document.to_jumanpp() %} 7 | {% include "components/raw_output.jinja2" %} 8 | {% endwith %} 9 |
10 | {% endblock result %} 11 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/knp.jinja2: -------------------------------------------------------------------------------- 1 | {% extends "base.jinja2" %} 2 | {% block result %} 3 |
4 | {% include "components/word_splitting.jinja2" %} 5 | {% include "components/morphological_analysis.jinja2" %} 6 | {% with tree = draw_tree(analyzed_document, show_pas=True) %} 7 | {% include "components/dependency_parsing.jinja2" %} 8 | {% endwith %} 9 | {% include "components/discourse_parsing.jinja2" %} 10 | {% with raw_output = analyzed_document.to_knp() %} 11 | {% include "components/raw_output.jinja2" %} 12 | {% endwith %} 13 |
14 | {% endblock result %} 15 | -------------------------------------------------------------------------------- /src/rhoknp/cli/templates/kwja.jinja2: -------------------------------------------------------------------------------- 1 | {% extends "base.jinja2" %} 2 | {% block result %} 3 |
4 | {% include "components/typo_correction.jinja2" %} 5 | {% include "components/word_splitting.jinja2" %} 6 | {% include "components/morphological_analysis.jinja2" %} 7 | {% include "components/named_entity_recognition.jinja2" %} 8 | {% with tree = draw_tree(analyzed_document, show_rel=True) %} 9 | {% include "components/dependency_parsing.jinja2" %} 10 | {% endwith %} 11 | {% include "components/discourse_parsing.jinja2" %} 12 | {% with raw_output = analyzed_document.to_knp() %} 13 | {% include "components/raw_output.jinja2" %} 14 | {% endwith %} 15 |
16 | {% endblock result %} 17 | -------------------------------------------------------------------------------- /src/rhoknp/cohesion/__init__.py: -------------------------------------------------------------------------------- 1 | from rhoknp.cohesion.argument import Argument, ArgumentType, EndophoraArgument, ExophoraArgument 2 | from rhoknp.cohesion.coreference import Entity, EntityManager 3 | from rhoknp.cohesion.discourse import DiscourseRelation, DiscourseRelationLabel, DiscourseRelationTag 4 | from rhoknp.cohesion.exophora import ExophoraReferent, ExophoraReferentType 5 | from rhoknp.cohesion.pas import Pas 6 | from rhoknp.cohesion.predicate import Predicate 7 | from rhoknp.cohesion.rel import RelMode, RelTag, RelTagList 8 | 9 | __all__ = [ 10 | "Argument", 11 | "ArgumentType", 12 | "DiscourseRelation", 13 | "DiscourseRelationLabel", 14 | "DiscourseRelationTag", 15 | "EndophoraArgument", 16 | "Entity", 17 | "EntityManager", 18 | "ExophoraArgument", 19 | "ExophoraReferent", 20 | "ExophoraReferentType", 21 | "Pas", 22 | "Predicate", 23 | "RelMode", 24 | "RelTag", 25 | "RelTagList", 26 | ] 27 | -------------------------------------------------------------------------------- /src/rhoknp/cohesion/argument.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from enum import Enum 3 | from typing import TYPE_CHECKING, Optional, Union 4 | 5 | from rhoknp.cohesion.exophora import ExophoraReferent 6 | from rhoknp.cohesion.predicate import Predicate 7 | 8 | if TYPE_CHECKING: 9 | from rhoknp.cohesion.pas import Pas 10 | from rhoknp.units.base_phrase import BasePhrase 11 | from rhoknp.units.clause import Clause 12 | from rhoknp.units.document import Document 13 | from rhoknp.units.phrase import Phrase 14 | from rhoknp.units.sentence import Sentence 15 | 16 | _HIRAGANA = "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろわをんーゎゐゑゕゖゔゝゞ" 17 | _KATAKANA = 
"ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロワヲンーヮヰヱヵヶヴヽヾ" 18 | HIRA2KATA = str.maketrans(_HIRAGANA, _KATAKANA) 19 | 20 | 21 | class ArgumentType(Enum): 22 | """項のタイプ.""" 23 | 24 | CASE_EXPLICIT = "C" #: 直接係り受けをもつ格要素(格は明示されている). 25 | CASE_HIDDEN = "N" #: 直接係り受けをもつ格要素(格は明示されていない). 26 | OMISSION = "O" #: 省略の指示対象. 27 | DEMONSTRATIVE = "D" #: 指示詞の指示対象. 28 | EXOPHORA = "E" #: 特殊(不特定:人など). 29 | UNASSIGNED = "U" #: 格要素の割り当てなし. 30 | 31 | 32 | class BaseArgument(ABC): 33 | """項の基底クラス. 34 | 35 | Args: 36 | case: 述語に対する格. 37 | arg_type: 項のタイプ. 38 | """ 39 | 40 | def __init__(self, case: str, arg_type: ArgumentType) -> None: 41 | self.case: str = case #: 述語に対する格. 42 | self.type: ArgumentType = arg_type #: 項のタイプ. 43 | self.optional: bool = False #: 修飾的な項かどうか. 44 | self._pas: Optional["Pas"] = None 45 | 46 | @abstractmethod 47 | def __str__(self) -> str: 48 | raise NotImplementedError 49 | 50 | @abstractmethod 51 | def __repr__(self) -> str: 52 | raise NotImplementedError 53 | 54 | @abstractmethod 55 | def __eq__(self, other: object) -> bool: 56 | raise NotImplementedError 57 | 58 | @property 59 | def pas(self) -> "Pas": 60 | """述語項構造.""" 61 | assert self._pas is not None 62 | return self._pas 63 | 64 | @pas.setter 65 | def pas(self, pas: "Pas") -> None: 66 | """述語項構造.""" 67 | self._pas = pas 68 | 69 | def is_special(self) -> bool: 70 | """外界照応なら True.""" 71 | return self.type == ArgumentType.EXOPHORA 72 | 73 | 74 | class EndophoraArgument(BaseArgument): 75 | """文脈中の基本句に対応する項を表すクラス. 76 | 77 | Args: 78 | case: 述語に対する格. 79 | base_phrase: 項の核となる基本句. 80 | arg_type: 項のタイプ. 81 | """ 82 | 83 | def __init__( 84 | self, 85 | case: str, 86 | base_phrase: "BasePhrase", 87 | predicate: Predicate, 88 | arg_type: Optional[ArgumentType] = None, 89 | ) -> None: 90 | super().__init__(case, arg_type or self._get_arg_type(predicate, base_phrase, case)) 91 | self.base_phrase = base_phrase #: 項の核となる基本句. 
92 | 93 | def __repr__(self) -> str: 94 | return f"<{self.__module__}.{self.__class__.__name__}: {self.case!r}, {self.base_phrase.text!r}>" 95 | 96 | def __str__(self) -> str: 97 | return self.base_phrase.text 98 | 99 | def __eq__(self, other: object) -> bool: 100 | if not isinstance(other, type(self)): 101 | return False 102 | if self._pas is not None and other._pas is not None: 103 | if self.pas.predicate != other.pas.predicate: 104 | return False 105 | return self.case == other.case and self.base_phrase == other.base_phrase 106 | 107 | @property 108 | def document(self) -> "Document": 109 | """項の核となる基本句が属する文書. 110 | 111 | Raises: 112 | AttributeError: 解析結果にアクセスできない場合. 113 | """ 114 | return self.base_phrase.document 115 | 116 | @property 117 | def sentence(self) -> "Sentence": 118 | """項の核となる基本句が属する文.""" 119 | return self.base_phrase.sentence 120 | 121 | @property 122 | def clause(self) -> "Clause": 123 | """項の核となる基本句が属する節. 124 | 125 | Raises: 126 | AttributeError: 解析結果にアクセスできない場合. 127 | """ 128 | return self.base_phrase.clause 129 | 130 | @property 131 | def phrase(self) -> "Phrase": 132 | """項の核となる基本句が属する文節.""" 133 | return self.base_phrase.phrase 134 | 135 | @staticmethod 136 | def _get_arg_type(predicate: Predicate, arg_base_phrase: "BasePhrase", case: str) -> ArgumentType: 137 | if predicate.base_phrase.parent_index is None: 138 | return ArgumentType.UNASSIGNED 139 | if arg_base_phrase in predicate.base_phrase.children: 140 | tail_morpheme = arg_base_phrase.morphemes[-1] 141 | if tail_morpheme.subpos == "格助詞" and tail_morpheme.text.translate(HIRA2KATA) == case: 142 | return ArgumentType.CASE_EXPLICIT 143 | else: 144 | return ArgumentType.CASE_HIDDEN 145 | elif predicate.base_phrase.parent and predicate.base_phrase.parent == arg_base_phrase: 146 | return ArgumentType.CASE_HIDDEN 147 | else: 148 | return ArgumentType.OMISSION 149 | 150 | 151 | class ExophoraArgument(BaseArgument): 152 | """外界照応の照応先に対応する項を表すクラス. 153 | 154 | Args: 155 | case: 述語に対する格. 
import logging
import re
from enum import Enum
from typing import ClassVar, Optional

logger = logging.getLogger(__name__)


class ExophoraReferentType(Enum):
    """Enumeration of exophora referent types (referents outside the text)."""

    WRITER = "著者"
    READER = "読者"
    UNSPECIFIED_PERSON = "不特定:人"
    UNSPECIFIED_MATTER = "不特定:物"
    UNSPECIFIED_SITUATION = "不特定:状況"
    PREVIOUS_SENTENCE = "前文"
    NEXT_SENTENCE = "後文"
    OTHER = "OTHER"


class ExophoraReferent:
    """A referent of exophora, e.g. the writer/reader or an unspecified person.

    Args:
        text: Surface string of the referent, e.g. "著者" or "不特定:人1".
    """

    # The named groups "type" and "index" are required: __init__ reads
    # match["type"] and match["index"] below.  "index" is an optional trailing
    # number distinguishing multiple referents of the same type.
    PAT: ClassVar[re.Pattern] = re.compile(
        rf"^(?P<type>{'|'.join(t.value for t in ExophoraReferentType if t != ExophoraReferentType.OTHER)})"
        r"(?P<index>\d*)$"
    )

    def __init__(self, text: str) -> None:
        self.index: Optional[int] = None  # Trailing index of the referent, if any.
        self._other_text: Optional[str] = None  # Raw text kept for OTHER-type referents.
        match: Optional[re.Match[str]] = self.PAT.match(text)
        if match is None:
            # Unknown referent strings are preserved verbatim as OTHER.
            logger.warning(f"unknown exophora referent found: {text}")
            self.type = ExophoraReferentType.OTHER
            self._other_text = text
        else:
            index = match["index"]
            if index:
                self.index = int(index)
            self.type = ExophoraReferentType(match["type"])

    @property
    def text(self) -> str:
        """Surface string of this referent."""
        if self.type != ExophoraReferentType.OTHER:
            return str(self.type.value) + str(self.index or "")
        else:
            assert self._other_text is not None
            return self._other_text

    def is_singleton(self) -> bool:
        """Return True if only one such entity can exist in a document."""
        if self.type in (ExophoraReferentType.WRITER, ExophoraReferentType.READER):
            return True
        # An explicit index (e.g. 不特定:人1) pins down a single entity.
        return self.index is not None

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(text={self.text!r})"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, type(self)) or self.type != other.type:
            return False
        if self.type == ExophoraReferentType.OTHER:
            return self._other_text == other._other_text
        return self.index == other.index
17 | cfid: 格フーレムID. 18 | """ 19 | 20 | def __init__(self, base_phrase: "BasePhrase", cfid: Optional[str] = None) -> None: 21 | self.base_phrase: "BasePhrase" = base_phrase #: 述語の核となる基本句. 22 | self.cfid: Optional[str] = cfid #: 格フーレムID. 23 | self._pas: Optional["Pas"] = None 24 | 25 | @property 26 | def text(self) -> str: 27 | """表層文字列.""" 28 | return self.base_phrase.text 29 | 30 | @property 31 | def sid(self) -> str: 32 | """文 ID.""" 33 | return self.base_phrase.sentence.sid 34 | 35 | @property 36 | def pas(self) -> "Pas": 37 | """述語項構造.""" 38 | assert self._pas is not None 39 | return self._pas 40 | 41 | @pas.setter 42 | def pas(self, pas: "Pas") -> None: 43 | """述語項構造. 44 | 45 | Args: 46 | pas: 述語項構造. 47 | """ 48 | self._pas = pas 49 | 50 | @property 51 | def document(self) -> "Document": 52 | """述語の核となる基本句が属する文書. 53 | 54 | Raises: 55 | AttributeError: 解析結果にアクセスできない場合. 56 | """ 57 | return self.base_phrase.document 58 | 59 | @property 60 | def sentence(self) -> "Sentence": 61 | """述語の核となる基本句が属する文.""" 62 | return self.base_phrase.sentence 63 | 64 | @property 65 | def clause(self) -> "Clause": 66 | """述語の核となる基本句が属する節. 67 | 68 | Raises: 69 | AttributeError: 解析結果にアクセスできない場合. 
70 | """ 71 | return self.base_phrase.clause 72 | 73 | @property 74 | def phrase(self) -> "Phrase": 75 | """述語の核となる基本句が属する文節.""" 76 | return self.base_phrase.phrase 77 | 78 | def __str__(self) -> str: 79 | return self.text 80 | 81 | def __repr__(self) -> str: 82 | return f"<{self.__module__}.{self.__class__.__name__}: {self.text!r}>" 83 | 84 | def __eq__(self, other: object) -> bool: 85 | if not isinstance(other, type(self)) or self.base_phrase != other.base_phrase: 86 | return False 87 | if self.cfid is None or other.cfid is None: 88 | return True 89 | return self.cfid == other.cfid 90 | -------------------------------------------------------------------------------- /src/rhoknp/cohesion/rel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from dataclasses import dataclass 4 | from enum import Enum 5 | from typing import ClassVar, Optional 6 | 7 | CASE_TYPES = [ 8 | "ガ", 9 | "デ", 10 | "ト", 11 | "ニ", 12 | "ノ", 13 | "ヘ", 14 | "ヲ", 15 | "カラ", 16 | "ガ2", 17 | "ノ?", 18 | "マデ", 19 | "ヨリ", 20 | "トイウ", 21 | "トシテ", 22 | "トスル", 23 | "ニオク", 24 | "ニシテ", 25 | "ニツク", 26 | "ニトル", 27 | "ニヨル", 28 | "マデニ", 29 | "ニオイテ", 30 | "ニカワル", 31 | "ニソッテ", 32 | "ニツイテ", 33 | "ニトッテ", 34 | "ニムケテ", 35 | "ニムケル", 36 | "ニヨッテ", 37 | "ニヨラズ", 38 | "ニアワセテ", 39 | "ニカギッテ", 40 | "ニカギラズ", 41 | "ニカランデ", 42 | "ニカワッテ", 43 | "ニカンシテ", 44 | "ニカンスル", 45 | "ニクラベテ", 46 | "ニクワエテ", 47 | "ニタイシテ", 48 | "ニタイスル", 49 | "ニツヅイテ", 50 | "ニナランデ", 51 | "ヲツウジテ", 52 | "ヲツウジル", 53 | "ヲノゾイテ", 54 | "ヲフクメテ", 55 | "ヲメグッテ", 56 | "ニトモナッテ", 57 | "ニモトヅイテ", 58 | "無", 59 | "修飾", 60 | "判ガ", 61 | "時間", 62 | "外の関係", 63 | ] 64 | CASE_TYPES += [case + "≒" for case in CASE_TYPES] 65 | 66 | COREF_TYPES = ["=", "=構", "=役"] 67 | COREF_TYPES += [coref + "≒" for coref in COREF_TYPES] 68 | 69 | logger = logging.getLogger(__name__) 70 | 71 | 72 | class RelMode(Enum): 73 | """同一の基本句に同一タイプの関係タグが複数付いている場合にそれらの関係を表す列挙体. 74 | 75 | .. 
note:: 76 | 各関係タグの具体例は以下の通りである: 77 | 78 | * AND 79 | (例)太郎と花子が学校から<帰った>(ガ格:太郎, ガ格:花子 [and]) 80 | * OR 81 | (例)私は田園調布か国立に<住みたい>(ガ格:私, ニ格:田園調布, ニ格:国立 [or]) 82 | * AMBIGUOUS 83 | (例)高知県の橋本知事は…国籍条項を<撤廃する>方針を明らかにした(ガ格:高知県, ガ格:橋本知事 [?], ガ格:不特定:人 [?], ヲ格:条項, 外の関係:方針) 84 | 85 | .. note:: 86 | target が「なし」の場合,同じタイプの関係タグが任意的要素であることを示す. 87 | (例)太郎は一人で<立っていた>(ガ格:太郎, デ格:一人, デ格:なし [?]) 88 | """ 89 | 90 | AND = "AND" #: 関係の対象が並列である. 91 | OR = "OR" #: 「AかB」のように意味的に or である. 92 | AMBIGUOUS = "?" #: いずれの解釈も妥当であり,文脈から判断ができない. 93 | 94 | 95 | @dataclass(frozen=True) 96 | class RelTag: 97 | """関係タグ付きコーパスにおける タグを表すクラス.""" 98 | 99 | PAT: ClassVar[re.Pattern] = re.compile( 100 | r'' 102 | ) 103 | type: str 104 | target: str 105 | sid: Optional[str] 106 | base_phrase_index: Optional[int] 107 | mode: Optional[RelMode] 108 | 109 | def __post_init__(self) -> None: 110 | if self.is_coreference(): 111 | if self.type not in COREF_TYPES: 112 | logger.warning(f"Unknown coreference type: {self.type} ({self})") 113 | else: # noqa: PLR5501 114 | if self.type not in CASE_TYPES: 115 | logger.warning(f"Unknown case type: {self.type} ({self})") 116 | 117 | def to_fstring(self) -> str: 118 | """素性文字列に変換.""" 119 | ret = f' bool: 130 | """共参照・照応関係を表すタグなら True.""" 131 | return self.type.startswith("=") 132 | 133 | 134 | class RelTagList(list[RelTag]): 135 | """関係タグ付きコーパスにおける タグの列を表すクラス.""" 136 | 137 | @classmethod 138 | def from_fstring(cls, fstring: str) -> "RelTagList": 139 | """KNP における素性文字列からオブジェクトを作成.""" 140 | rel_tags = [] 141 | for match in RelTag.PAT.finditer(fstring): 142 | rel_tags.append( 143 | RelTag( 144 | type=match["type"], 145 | target=match["target"], 146 | sid=match["sid"], 147 | base_phrase_index=int(match["id"]) if match["id"] else None, 148 | mode=RelMode(match["mode"]) if match["mode"] else None, 149 | ) 150 | ) 151 | return cls(rel_tags) 152 | 153 | def to_fstring(self) -> str: 154 | """素性文字列に変換.""" 155 | return "".join(rel_tag.to_fstring() for rel_tag in self) 156 | 157 | def 
class Processor(ABC):
    """Abstract base class for text analyzers (Juman++, KNP, KWJA, ...)."""

    @overload
    def __call__(self, text: str, timeout: int = 10) -> Document: ...

    @overload
    def __call__(self, text: Sentence, timeout: int = 10) -> Sentence: ...

    @overload
    def __call__(self, text: Document, timeout: int = 10) -> Document: ...

    def __call__(self, text: Union[str, Sentence, Document], timeout: int = 10) -> Union[Document, Sentence]:
        """Apply the analyzer to the given text.

        Args:
            text: Text to analyze.
            timeout: Maximum processing time in seconds.

        Raises:
            TypeError: If ``text`` is not a str, Sentence, or Document.

        .. note::
            This method dispatches to ``apply_to_document`` or ``apply_to_sentence``
            depending on the argument type:
            ``str`` -> ``apply_to_document``;
            ``Sentence`` -> ``apply_to_sentence``;
            ``Document`` -> ``apply_to_document``.
        """
        return self.apply(text, timeout=timeout)

    @overload
    def apply(self, text: str, timeout: int = 10) -> Document: ...

    @overload
    def apply(self, text: Sentence, timeout: int = 10) -> Sentence: ...

    @overload
    def apply(self, text: Document, timeout: int = 10) -> Document: ...

    def apply(self, text: Union[str, Sentence, Document], timeout: int = 10) -> Union[Document, Sentence]:
        """Apply the analyzer to the given text.

        Args:
            text: Text to analyze.
            timeout: Maximum processing time in seconds.

        Raises:
            TypeError: If ``text`` is not a str, Sentence, or Document.

        .. note::
            This method dispatches to ``apply_to_document`` or ``apply_to_sentence``
            depending on the argument type:
            ``str`` -> ``apply_to_document``;
            ``Sentence`` -> ``apply_to_sentence``;
            ``Document`` -> ``apply_to_document``.
        """
        # Document is checked together with str, and before Sentence, so that a plain
        # string is always treated as a whole document.
        if isinstance(text, (Document, str)):
            return self.apply_to_document(text, timeout=timeout)
        elif isinstance(text, Sentence):
            return self.apply_to_sentence(text, timeout=timeout)
        else:
            raise TypeError("Invalid type: text must be str, Sentence, or Document")

    @abstractmethod
    def apply_to_document(self, document: Union[Document, str], timeout: int = 10) -> Document:
        """Apply the analyzer to a document.

        Args:
            document: Document (or raw text) to analyze.
            timeout: Maximum processing time in seconds.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_to_sentence(self, sentence: Union[Sentence, str], timeout: int = 10) -> Sentence:
        """Apply the analyzer to a sentence.

        Args:
            sentence: Sentence (or raw text) to analyze.
            timeout: Maximum processing time in seconds.
        """
        raise NotImplementedError
    def _split_document(self, text: str) -> list[str]:
        """Split raw text into a list of sentence strings.

        The text is processed line by line: each line is first cut at period-like
        characters (``_PERIOD_PAT``), then adjacent fragments are re-merged while a
        parenthesis or quotation-bracket pair is still open, so that parenthesized
        or quoted passages are not split in half.

        Args:
            text: Raw document text.
        """
        if text == "":
            return []

        def split_text_by_period(text: str) -> list[str]:
            # Cut the text right after every run of period-like characters.
            segments: list[str] = []
            start: int = 0
            for match in self._PERIOD_PAT.finditer(text):
                end: int = match.end()
                segments.append(text[start:end])
                start = end
            if start < len(text):
                segments.append(text[start:])  # trailing fragment with no period
            return [segment.strip() for segment in segments]

        sentences: list[str] = []
        for line in text.split("\n"):
            # Split by periods
            sentence_candidates: list[str] = split_text_by_period(line)

            # Merge sentence candidates so that strings in parentheses or brackets are not split
            parenthesis_level: int = 0
            hook_bracket_level: int = 0
            double_hook_bracket_level: int = 0
            sentence: str = ""
            while sentence_candidates:
                sentence_candidate: str = sentence_candidates.pop(0)

                sentence += sentence_candidate

                # Track nesting depth of half-width and full-width parentheses and of
                # 「」 / 『』 quotation brackets across the accumulated candidates.
                parenthesis_level += sentence_candidate.count("(") - sentence_candidate.count(")")
                parenthesis_level += sentence_candidate.count("(") - sentence_candidate.count(")")
                hook_bracket_level += sentence_candidate.count("「") - sentence_candidate.count("」")
                double_hook_bracket_level += sentence_candidate.count("『") - sentence_candidate.count("』")
                if parenthesis_level == hook_bracket_level == double_hook_bracket_level == 0:
                    # All pairs balanced: finalize the sentence accumulated so far.
                    if sentence.strip():
                        sentences.append(sentence.strip())
                    sentence = ""
            if sentence.strip():
                # A pair was left unbalanced at end of line; fall back to the plain
                # period-based split for the leftover text.
                sentences.extend(split_text_by_period(sentence.strip()))

        return sentences
"""文節,基本句,形態素の素性情報を表すクラス.""" 10 | 11 | IGNORE_TAG_PREFIXES: ClassVar[set[str]] = {"rel ", "memo "} 12 | _FEATURE_KEY_PAT: ClassVar[re.Pattern] = re.compile(r"(?P([^:\"]|\"[^\"]*?\")+?)") 13 | _FEATURE_VALUE_PAT: ClassVar[re.Pattern] = re.compile(r"(?P([^>\\]|\\>?)+)") 14 | PAT: ClassVar[re.Pattern] = re.compile( 15 | rf"(?P(<{_FEATURE_KEY_PAT.pattern}(:{_FEATURE_VALUE_PAT.pattern})?>)*)" 16 | ) 17 | FEATURE_PAT: ClassVar[re.Pattern] = re.compile( 18 | rf"<(?!({'|'.join(IGNORE_TAG_PREFIXES)})){_FEATURE_KEY_PAT.pattern}(:{_FEATURE_VALUE_PAT.pattern})?>" 19 | ) 20 | 21 | def __setitem__(self, key: str, value: Union[str, bool]) -> None: 22 | if key == "rel": 23 | logger.warning( 24 | f"Adding 'rel' to {self.__class__.__name__} is not supported and was ignored. Instead, add a RelTag " 25 | f"object to BasePhrase.rel_tags and call Document.reparse()." 26 | ) 27 | return 28 | if key == "memo": 29 | logger.warning( 30 | f"Adding 'memo' to {self.__class__.__name__} is not supported and was ignored. Instead, set a MemoTag " 31 | f"object to BasePhrase.memo_tag." 32 | ) 33 | return 34 | super().__setitem__(key, value) 35 | 36 | @classmethod 37 | def from_fstring(cls, fstring: str) -> "FeatureDict": 38 | """素性文字列をパースして辞書型に変換する. 39 | 40 | 例:"<正規化代表表記:遅れる/おくれる>" -> {"正規化代表表記": "遅れる/おくれる"} 41 | 42 | Args: 43 | fstring: KNP 形式における素性文字列. 
44 | """ 45 | features = cls() 46 | for match in cls.FEATURE_PAT.finditer(fstring): 47 | features[match["key"]] = match["value"].replace(r"\>", ">") if match["value"] is not None else True 48 | return features 49 | 50 | def to_fstring(self) -> str: 51 | """素性文字列に変換.""" 52 | return "".join(self._item_to_fstring(k, v) for k, v in self.items()) 53 | 54 | @staticmethod 55 | def _item_to_fstring(key: str, value: Union[str, bool]) -> str: 56 | if value is False: 57 | return "" 58 | if value is True: 59 | return f"<{key}>" 60 | escaped_value = value.replace(">", r"\>") # escape ">" 61 | return f"<{key}:{escaped_value}>" 62 | -------------------------------------------------------------------------------- /src/rhoknp/props/memo.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | from typing import ClassVar 4 | 5 | 6 | @dataclass(frozen=True) 7 | class MemoTag: 8 | """関係タグ付きコーパスにおける タグを表すクラス.""" 9 | 10 | PAT: ClassVar[re.Pattern] = re.compile(r'') 11 | text: str = "" #: メモの内容. 
12 | 13 | @classmethod 14 | def from_fstring(cls, fstring: str) -> "MemoTag": 15 | """KNP における素性文字列からオブジェクトを作成.""" 16 | match = cls.PAT.search(fstring) 17 | memo_tag = MemoTag(text=match["text"] if match is not None else "") 18 | return memo_tag 19 | 20 | def to_fstring(self) -> str: 21 | """素性文字列に変換.""" 22 | return f'' 23 | 24 | def __str__(self) -> str: 25 | return self.to_fstring() 26 | 27 | def __bool__(self) -> bool: 28 | return bool(self.text) 29 | -------------------------------------------------------------------------------- /src/rhoknp/props/named_entity.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from dataclasses import dataclass 4 | from enum import Enum 5 | from typing import TYPE_CHECKING, ClassVar, Optional 6 | 7 | if TYPE_CHECKING: 8 | from rhoknp.units.morpheme import Morpheme 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class NamedEntityCategory(Enum): 14 | """固有表現カテゴリを表す列挙体.""" 15 | 16 | ORGANIZATION = "ORGANIZATION" 17 | PERSON = "PERSON" 18 | LOCATION = "LOCATION" 19 | ARTIFACT = "ARTIFACT" 20 | DATE = "DATE" 21 | TIME = "TIME" 22 | MONEY = "MONEY" 23 | PERCENT = "PERCENT" 24 | OPTIONAL = "OPTIONAL" 25 | 26 | @classmethod 27 | def has_value(cls, value: str) -> bool: 28 | """有効な固有表現カテゴリであれば True. 29 | 30 | Args: 31 | value: 固有表現のカテゴリ. 
    @staticmethod
    def _find_morpheme_span(name: str, candidates: list["Morpheme"]) -> Optional[range]:
        """Return the range of morphemes whose concatenated surface equals ``name``.

        The search prefers the span with the latest possible end: ``stop`` shrinks
        from ``len(candidates)`` and, for each ``stop``, the span is grown leftward
        from ``start = stop - 1``, so the first exact match found ends as late as
        possible within ``candidates``.

        Args:
            name: Surface string of the named entity.
            candidates: Morphemes that may constitute the named entity.

        Returns:
            The matching index range, or None when no concatenation equals ``name``.
        """
        stop = len(candidates)
        while stop > 0:
            for start in reversed(range(stop)):
                if "".join(m.text for m in candidates[start:stop]) == name:
                    return range(start, stop)
            stop -= 1
        return None
class Unit(ABC):
    """Abstract base class for linguistic units."""

    def __init__(self) -> None:
        # Cached surface string; lazily computed from child units when not set.
        self._text: Optional[str] = None

    def __post_init__(self) -> None:
        # Recursively finalize all child units after construction.
        if self.child_units is not None:
            for child_unit in self.child_units:
                child_unit.__post_init__()

    @abstractmethod
    def __eq__(self, other: object) -> bool:
        raise NotImplementedError

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        return f"<{self.__module__}.{self.__class__.__name__}: {self.text!r}>"

    @property
    @abstractmethod
    def parent_unit(self) -> Optional["Unit"]:
        """The parent (enclosing) linguistic unit."""
        raise NotImplementedError

    @property
    @abstractmethod
    def child_units(self) -> Optional[Sequence["Unit"]]:
        """The child (contained) linguistic units."""
        raise NotImplementedError

    @property
    def text(self) -> str:
        """Surface string of this unit.

        Falls back to concatenating the children's surface strings (caching the
        result) when no explicit text has been set; raises AttributeError when
        neither an explicit text nor child units are available.
        """
        if self._text is not None:
            return self._text
        if self.child_units is not None:
            self._text = "".join(str(child_unit) for child_unit in self.child_units)
            return self._text
        raise AttributeError

    @text.setter
    def text(self, text: str) -> None:
        """Set the surface string of this unit.

        Args:
            text: Surface string.
        """
        self._text = text
def extract_did_and_sid(comment_line: str, patterns: list[re.Pattern]) -> tuple[Optional[str], Optional[str], str]:
    """Extract the document ID and sentence ID from a comment line.

    Args:
        comment_line: A comment line (e.g. ``# S-ID:w201106-0000060050-1``).
        patterns: Regex patterns with ``did``/``sid`` groups; the first one that
            matches the S-ID string is used.

    Returns:
        The document ID (or None), the sentence ID (or None), and the remainder
        of the comment line.
    """
    match_sid = re.match(r"# S-ID: ?(\S*)( .+)?$", comment_line)
    if match_sid is None:
        # Not an S-ID comment at all; return the comment body unchanged.
        return None, None, comment_line.lstrip("#").lstrip()
    sid_string = match_sid[1]
    trailing = match_sid[2]
    for pattern in patterns:
        found = pattern.match(sid_string)
        if found:
            return found["did"], found["sid"], trailing.lstrip() if trailing else ""
    logger.warning(f"Invalid S-ID: {sid_string}")
    return None, None, comment_line.lstrip("#").lstrip()
def chunk_by_document(f: TextIO, doc_id_format: Union[str, Callable] = "default") -> Iterator[str]:
    """Yield chunks of an analysis file, one document at a time.

    Args:
        f: File object to split.
        doc_id_format: How to derive a document ID from a sentence ID (S-ID).

    Example:
        >>> from rhoknp.units import Document
        >>> from rhoknp.utils.reader import chunk_by_document
        >>> with open("example.knp") as f:
        ...     for knp in chunk_by_document(f):
        ...         document = Document.from_knp(knp)

    .. note::
        Supported string formats:
            * "default": everything before the last hyphen of the S-ID.
              (e.g. # S-ID:A-X-1 -> document ID: A-X)
            * "kwdlc": extract the document ID from a KWDLC-style S-ID.
              (e.g. # S-ID:w201106-0000060050-1 -> document ID: w201106-0000060050)
            * "wac": extract the document ID from a WAC-style S-ID.
              (e.g. # S-ID:wiki00100176-00 -> document ID: wiki00100176)

        A callable is treated as a function extracting the document ID from the
        first line of a sentence chunk; the "default" behavior corresponds to:

        >>> def default_doc_id_format(line: str) -> str:
        ...     return line.lstrip("# S-ID:").rsplit("-", maxsplit=1)[0]
    """
    extract_doc_id: Callable[[str], Optional[str]]
    if isinstance(doc_id_format, str):
        pattern_by_name = {
            "default": Sentence.SID_PAT,
            "kwdlc": Sentence.SID_PAT_KWDLC,
            "wac": Sentence.SID_PAT_WAC,
        }
        if doc_id_format not in pattern_by_name:
            raise ValueError(f"Invalid doc_id_format: {doc_id_format}")
        extract_doc_id = partial(_extract_doc_id, pat=pattern_by_name[doc_id_format])
    elif callable(doc_id_format):
        extract_doc_id = doc_id_format
    else:
        raise TypeError(f"Invalid doc_id_format: {doc_id_format}")

    current_chunk: list[str] = []
    last_doc_id: Optional[str] = None
    for sentence_text in chunk_by_sentence(f):
        doc_id = extract_doc_id(sentence_text.split("\n")[0])
        # Flush the buffered document when the ID changes or cannot be determined.
        if current_chunk and (doc_id is None or doc_id != last_doc_id):
            yield "".join(current_chunk)
            current_chunk = []
        current_chunk.append(sentence_text)
        last_doc_id = doc_id
    if current_chunk:
        yield "".join(current_chunk)
#!/usr/bin/env bash
# Mock of the Juman++ analyzer used by the unit tests.
# Reads stdin line by line forever and emits a fixed two-morpheme analysis.
# Specific sentinel inputs simulate slow or failing behavior — presumably
# consumed by timeout/error-handling tests (confirm against tests/).

while true; do
    read -r line

    # Sentinel: simulate a slow analyzer.
    if [ "$line" = "time consuming input" ]; then
        sleep 5
    fi

    # Sentinel: simulate a crash, writing two lines to stderr and exiting non-zero.
    if [ "$line" = "error causing input" ]; then
        echo 'エラー1' >&2
        echo 'エラー2' >&2
        exit 1
    fi

    # Sentinels for a downstream KNP mock: forward the marker as a comment line.
    if [ "$line" = "knp time consuming input" ]; then
        echo '# knp time consuming input'
    fi

    if [ "$line" = "knp error causing input" ]; then
        echo '# knp error causing input'
    fi

    # Fixed canned analysis in Juman++ output format, terminated by EOS.
    echo 'こんにちは こんにちは こんにちは 感動詞 12 * 0 * 0 * 0 "代表表記:こんにちは/こんにちは"'
    echo 'さようなら さようなら さようなら 感動詞 12 * 0 * 0 * 0 "代表表記:さようなら/さようなら"'
    echo 'EOS'
done
#!/usr/bin/env bash
# Mock of the KWJA analyzer used by the unit tests.
# Reads stdin line by line forever and emits a fixed one-sentence KNP-format
# analysis terminated by EOS and EOD. Sentinel inputs simulate slow or failing
# behavior — presumably consumed by timeout/error-handling tests (confirm).

while true; do
    read -r line

    # Sentinel: simulate a slow analyzer.
    if [ "$line" = "time consuming input" ]; then
        sleep 5
    fi

    # Sentinel: simulate a crash, writing two lines to stderr and exiting non-zero.
    if [ "$line" = "error causing input" ]; then
        echo 'エラー1' >&2
        echo 'エラー2' >&2
        exit 1
    fi

    # Fixed canned analysis in KNP output format (document-level: EOS then EOD).
    echo '# S-ID:1 KNP:5.0-5c637eb DATE:2023/08/23 SCORE:-22.40768'
    echo '* -1D <文頭><文末><体言><用言:判><体言止><レベル:C><区切:5-5><裸名詞><提題受:30><主節><状態述語><正規化代表表記:こんにちは/こんにちは><主辞代表表記:こんにちは/こんにちは>'
    echo '+ -1D <文頭><文末><体言><用言:判><体言止><レベル:C><区切:5-5><裸名詞><提題受:30><主節><状態述語><判定詞句><名詞項候補><正規化代表表記:こんにちは/こんにちは><主辞代表表記:こんにちは/こんにちは><用言代表表記:こんにちは/こんにちは><節-区切><節-主辞><時制:非過去><格解析結果:こんにちは/こんにちは:判0:ニ/U/-/-/-/-;カラ/U/-/-/-/-><標準用言代表表記:こんにちは/こんにちは>'
    echo 'こんにちは こんにちは こんにちは 感動詞 12 * 0 * 0 * 0 "代表表記:こんにちは/こんにちは" <代表表記:こんにちは/こんにちは><正規化代表表記:こんにちは/こんにちは><かな漢字><ひらがな><文頭><文末><表現文末><自立><内容語><タグ単位始><文節始><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾>'
    echo 'EOS'
    echo 'EOD'
done
"代表表記:見る/みる 自他動詞:自:見える/みえる 補文ト" 26 | 。 。 。 特殊 1 句点 1 * 0 * 0 NIL 27 | EOS 28 | """ 29 | ) 30 | 31 | 32 | def test_print_document() -> None: 33 | document = Document.from_knp(knp) 34 | print_document(document) 35 | 36 | 37 | def test_print_document_dark() -> None: 38 | document = Document.from_knp(knp) 39 | print_document(document, is_dark=True) 40 | -------------------------------------------------------------------------------- /tests/cli/test_cli.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import textwrap 3 | 4 | import pytest 5 | from typer.testing import CliRunner 6 | 7 | from rhoknp import Document, __version__ 8 | from rhoknp.cli.cli import app 9 | 10 | runner = CliRunner() 11 | 12 | 13 | knp_text = textwrap.dedent( 14 | """\ 15 | # S-ID:1 16 | * 1D 17 | + 1D 18 | 望遠 ぼうえん 望遠 名詞 6 普通名詞 1 * 0 * 0 "代表表記:望遠/ぼうえん カテゴリ:抽象物" 19 | + 2D 20 | 鏡 きょう 鏡 名詞 6 普通名詞 1 * 0 * 0 "代表表記:鏡/きょう カテゴリ:人工物-その他 漢字読み:音" 21 | で で で 助詞 9 格助詞 1 * 0 * 0 NIL 22 | * 2D 23 | + 3D 24 | 泳いで およいで 泳ぐ 動詞 2 * 0 子音動詞ガ行 4 タ系連用テ形 14 "代表表記:泳ぐ/およぐ" 25 | いる いる いる 接尾辞 14 動詞性接尾辞 7 母音動詞 1 基本形 2 "代表表記:いる/いる" 26 | * 3D 27 | + 4D 28 | 少女 しょうじょ 少女 名詞 6 普通名詞 1 * 0 * 0 "代表表記:少女/しょうじょ カテゴリ:人" 29 | を を を 助詞 9 格助詞 1 * 0 * 0 NIL 30 | * -1D 31 | + -1D <節-区切><節-主辞> 32 | 見た みた 見る 動詞 2 * 0 母音動詞 1 タ形 10 "代表表記:見る/みる 自他動詞:自:見える/みえる 補文ト" 33 | 。 。 。 特殊 1 句点 1 * 0 * 0 NIL 34 | EOS 35 | """ 36 | ) 37 | 38 | 39 | def test_version() -> None: 40 | result = runner.invoke(app, ["-v"]) 41 | assert result.exit_code == 0 42 | assert result.stdout.strip() == f"rhoknp version: {__version__}" 43 | 44 | 45 | def test_cat() -> None: 46 | doc = Document.from_knp(knp_text) 47 | with tempfile.NamedTemporaryFile("wt") as f: 48 | f.write(doc.to_knp()) 49 | f.flush() 50 | result = runner.invoke(app, ["cat", f.name]) 51 | assert result.exit_code == 0 52 | 53 | 54 | @pytest.fixture 55 | def _mock_stdin(monkeypatch: pytest.MonkeyPatch) -> None: 56 | monkeypatch.setattr("sys.stdin", 
def test_stats_json() -> None:
    """`rhoknp stats --json` should exit successfully on a valid KNP file."""
    doc = Document.from_knp(knp_text)
    with tempfile.NamedTemporaryFile("wt") as f:
        f.write(doc.to_knp())
        # Flush before invoking the CLI: without it the file may still be empty
        # on disk when the command reads it (the sibling tests all flush).
        f.flush()
        result = runner.invoke(app, ["stats", f.name, "--json"])
        assert result.exit_code == 0
def test_stats() -> None:
    """get_document_statistics should count units, cohesion and NEs for the fixture."""
    expected = {
        "unit": {
            "sentence": 1,
            "clause": 1,
            "phrase": 4,
            "base_phrase": 5,
            "morpheme": 9,
        },
        "cohesion": {
            "predicate": 0,
            "argument": 0,
            "coreference": 0,
            "discourse": 0,
        },
        "other": {
            "named_entity": 0,
        },
    }
    assert get_document_statistics(Document.from_knp(knp)) == expected
ExophoraArgument, ExophoraReferent, Pas, Predicate 6 | from rhoknp.units import BasePhrase 7 | 8 | 9 | def test_endophora_argument() -> None: 10 | argument_base_phrase = BasePhrase.from_knp( 11 | textwrap.dedent( 12 | """\ 13 | + 4D 14 | 彼 かれ 彼 名詞 6 普通名詞 1 * 0 * 0 15 | は は は 助詞 9 副助詞 2 * 0 * 0 16 | """ 17 | ) 18 | ) 19 | predicate_base_phrase = BasePhrase.from_knp( 20 | textwrap.dedent( 21 | """\ 22 | + -1D 23 | 言う いう 言う 動詞 2 * 0 子音動詞ワ行 12 基本形 2 24 | """ 25 | ) 26 | ) 27 | another_predicate_base_phrase = BasePhrase.from_knp( 28 | textwrap.dedent( 29 | """\ 30 | + -1D 31 | 食べる たべる 食べる 動詞 2 * 0 母音動詞 1 基本形 2 32 | """ 33 | ) 34 | ) 35 | arg_type = ArgumentType.OMISSION 36 | pas = Pas(Predicate(predicate_base_phrase)) 37 | argument = EndophoraArgument("ガ", argument_base_phrase, pas.predicate, arg_type=arg_type) 38 | argument.pas = pas 39 | assert argument.case == "ガ" 40 | assert argument.type == arg_type 41 | assert argument.optional is False 42 | assert argument.is_special() is False 43 | assert argument.pas == pas 44 | assert argument.base_phrase == argument_base_phrase 45 | with pytest.raises(AssertionError): 46 | _ = argument.document 47 | with pytest.raises(AssertionError): 48 | _ = argument.sentence 49 | with pytest.raises(AssertionError): 50 | _ = argument.clause 51 | with pytest.raises(AssertionError): 52 | _ = argument.phrase 53 | 54 | assert repr(argument) == "" 55 | assert str(argument) == argument_base_phrase.text 56 | assert argument != "test" 57 | another_argument = EndophoraArgument("ガ", argument_base_phrase, pas.predicate, arg_type=ArgumentType.EXOPHORA) 58 | another_argument.pas = pas 59 | assert argument == another_argument 60 | 61 | another_pas = Pas(Predicate(another_predicate_base_phrase)) 62 | another_argument.pas = another_pas 63 | assert argument != another_argument 64 | 65 | 66 | def test_exophora_argument() -> None: 67 | predicate_base_phrase = BasePhrase.from_knp( 68 | textwrap.dedent( 69 | """\ 70 | + -1D 71 | 言う いう 言う 動詞 2 * 0 子音動詞ワ行 12 基本形 
2 72 | """ 73 | ) 74 | ) 75 | another_predicate_base_phrase = BasePhrase.from_knp( 76 | textwrap.dedent( 77 | """\ 78 | + -1D 79 | 食べる たべる 食べる 動詞 2 * 0 母音動詞 1 基本形 2 80 | """ 81 | ) 82 | ) 83 | pas = Pas(Predicate(predicate_base_phrase)) 84 | exophora_referent = ExophoraReferent("不特定:人") 85 | argument = ExophoraArgument("ガ", exophora_referent, eid=3) 86 | argument.pas = pas 87 | assert argument.case == "ガ" 88 | assert argument.type == ArgumentType.EXOPHORA 89 | assert argument.optional is False 90 | assert argument.is_special() is True 91 | assert argument.pas == pas 92 | assert argument.exophora_referent == exophora_referent 93 | assert argument.eid == 3 94 | assert repr(argument) == f"ExophoraArgument(case='ガ', exophora_referent={exophora_referent!r}, eid=3)" 95 | assert eval(repr(argument)) == argument 96 | assert str(argument) == "不特定:人" 97 | assert argument != "test" 98 | another_argument = ExophoraArgument("ガ", exophora_referent, eid=1) 99 | another_argument.pas = pas 100 | assert argument == another_argument 101 | 102 | another_pas = Pas(Predicate(another_predicate_base_phrase)) 103 | another_argument.pas = another_pas 104 | assert argument != another_argument 105 | -------------------------------------------------------------------------------- /tests/cohesion/test_exophora.py: -------------------------------------------------------------------------------- 1 | from rhoknp.cohesion.exophora import ExophoraReferent, ExophoraReferentType 2 | 3 | 4 | def test_exophora() -> None: 5 | referent = ExophoraReferent("著者") 6 | assert referent.type == ExophoraReferentType.WRITER 7 | assert referent.index is None 8 | assert str(referent) == "著者" 9 | assert repr(referent) == "ExophoraReferent(text='著者')" 10 | assert eval(repr(referent)) == referent 11 | 12 | 13 | def test_exophora_number() -> None: 14 | referent = ExophoraReferent("不特定:人3") 15 | assert referent.type == ExophoraReferentType.UNSPECIFIED_PERSON 16 | assert referent.index == 3 17 | assert str(referent) == 
"不特定:人3" 18 | assert repr(referent) == "ExophoraReferent(text='不特定:人3')" 19 | assert eval(repr(referent)) == referent 20 | 21 | 22 | def test_exophora_other() -> None: 23 | referent = ExophoraReferent("ほげほげ2") 24 | assert referent.type == ExophoraReferentType.OTHER 25 | assert referent.index is None 26 | assert str(referent) == "ほげほげ2" 27 | assert repr(referent) == "ExophoraReferent(text='ほげほげ2')" 28 | assert eval(repr(referent)) == referent 29 | -------------------------------------------------------------------------------- /tests/cohesion/test_predicate.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | 3 | import pytest 4 | 5 | from rhoknp.cohesion import Pas, Predicate 6 | from rhoknp.units import BasePhrase 7 | 8 | 9 | def test_predicate() -> None: 10 | knp = textwrap.dedent( 11 | """\ 12 | + -1D <格解析結果:行く/いく:動12:ガ/N/彼/0/0/1;ニ/U/-/-/-/-;デ/U/-/-/-/-;ヘ/C/大学/3/0/1;時間/U/-/-/-/-> 13 | 行った いった 行く 動詞 2 * 0 子音動詞カ行促音便形 3 タ形 10 14 | 。 。 。 特殊 1 句点 1 * 0 * 0 NIL 15 | """ 16 | ) 17 | base_phrase = BasePhrase.from_knp(knp) 18 | predicate = Predicate(base_phrase, cfid="行く/いく:動12") 19 | pas = Pas(predicate) 20 | predicate.pas = pas 21 | assert predicate.base_phrase == base_phrase 22 | assert predicate.cfid == "行く/いく:動12" 23 | assert predicate.text == "行った。" 24 | with pytest.raises(AssertionError): 25 | _ = predicate.document 26 | with pytest.raises(AssertionError): 27 | _ = predicate.sentence 28 | with pytest.raises(AssertionError): 29 | _ = predicate.clause 30 | with pytest.raises(AssertionError): 31 | _ = predicate.phrase 32 | with pytest.raises(AssertionError): 33 | _ = predicate.sid 34 | assert predicate.pas == pas 35 | assert str(predicate) == "行った。" 36 | assert repr(predicate) == "" 37 | 38 | assert predicate != "test" 39 | another_predicate = Predicate(base_phrase, cfid="行く/いく:動12") 40 | assert predicate == another_predicate 41 | another_predicate = Predicate(base_phrase, cfid=None) 42 | assert predicate == 
another_predicate 43 | another_predicate = Predicate(base_phrase, cfid="行く/いく:動3") 44 | assert predicate != another_predicate 45 | -------------------------------------------------------------------------------- /tests/cohesion/test_rel.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rhoknp.cohesion.rel import RelMode, RelTagList 4 | 5 | FSTRINGS = [ 6 | """""", 7 | """""", 8 | """""", 9 | ] 10 | 11 | 12 | def test_from_fstring_0() -> None: 13 | rel_tags = RelTagList.from_fstring(FSTRINGS[0]) 14 | assert len(rel_tags) == 3 15 | 16 | rel_tag = rel_tags[0] 17 | assert rel_tag.type == "=≒" 18 | assert rel_tag.target == "オフェンス" 19 | assert rel_tag.sid == "w201106-0001519365-1" 20 | assert rel_tag.base_phrase_index == 3 21 | assert rel_tag.mode is None 22 | 23 | rel_tag = rel_tags[1] 24 | assert rel_tag.type == "=≒" 25 | assert rel_tag.target == "ディフェンス" 26 | assert rel_tag.sid == "w201106-0001519365-1" 27 | assert rel_tag.base_phrase_index == 4 28 | assert rel_tag.mode == RelMode.AND 29 | 30 | rel_tag = rel_tags[2] 31 | assert rel_tag.type == "ノ?" 
32 | assert rel_tag.target == "著者" 33 | assert rel_tag.sid is None 34 | assert rel_tag.base_phrase_index is None 35 | assert rel_tag.mode is None 36 | 37 | 38 | def test_from_fstring_1() -> None: 39 | rel_tags = RelTagList.from_fstring(FSTRINGS[1]) 40 | assert len(rel_tags) == 2 41 | 42 | rel_tag = rel_tags[0] 43 | assert rel_tag.type == "ガ" 44 | assert rel_tag.target == ">" 45 | assert rel_tag.sid == "202209271752-05054-00" 46 | assert rel_tag.base_phrase_index == 0 47 | assert rel_tag.mode is None 48 | 49 | rel_tag = rel_tags[1] 50 | assert rel_tag.type == "ニ" 51 | assert rel_tag.target == "不特定:人" 52 | assert rel_tag.sid is None 53 | assert rel_tag.base_phrase_index is None 54 | assert rel_tag.mode is None 55 | 56 | 57 | @pytest.mark.parametrize("fstring", FSTRINGS) 58 | def test_to_fstring(fstring: str) -> None: 59 | rel_tags = RelTagList.from_fstring(fstring) 60 | assert rel_tags.to_fstring() == fstring 61 | 62 | 63 | @pytest.mark.parametrize("fstring", FSTRINGS) 64 | def test_str(fstring: str) -> None: 65 | rel_tags = RelTagList.from_fstring(fstring) 66 | assert str(rel_tags) == fstring 67 | -------------------------------------------------------------------------------- /tests/data/w201106-0000060050.knp: -------------------------------------------------------------------------------- 1 | # S-ID:w201106-0000060050-1 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-44.94406 MOD:2017/10/15 MEMO: 2 | * 2D 3 | + 1D 4 | コイン こいん コイン 名詞 6 普通名詞 1 * 0 * 0 5 | + 3D 6 | トス とす トス 名詞 6 サ変名詞 2 * 0 * 0 7 | を を を 助詞 9 格助詞 1 * 0 * 0 8 | * 2D 9 | + 3D 10 | 3 さん 3 名詞 6 数詞 7 * 0 * 0 11 | 回 かい 回 接尾辞 14 名詞性名詞助数辞 3 * 0 * 0 12 | * -1D 13 | + -1D 14 | 行う おこなう 行う 動詞 2 * 0 子音動詞ワ行 12 基本形 2 15 | 。 。 。 特殊 1 句点 1 * 0 * 0 16 | EOS 17 | # S-ID:w201106-0000060050-2 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-64.95916 MOD:2013/04/13 18 | * 1D 19 | + 1D 20 | 表 おもて 表 名詞 6 普通名詞 1 * 0 * 0 21 | が が が 助詞 9 格助詞 1 * 0 * 0 22 | * 2D 23 | + 2D 24 | 出た でた 出る 動詞 2 * 0 母音動詞 1 タ形 10 
25 | * 5D 26 | + 5D 27 | 数 かず 数 名詞 6 普通名詞 1 * 0 * 0 28 | だけ だけ だけ 助詞 9 副助詞 2 * 0 * 0 29 | 、 、 、 特殊 1 読点 2 * 0 * 0 30 | * 4D 31 | + 4D 32 | フィールド ふぃーるど フィールド 名詞 6 普通名詞 1 * 0 * 0 33 | 上 じょう 上 接尾辞 14 名詞性名詞接尾辞 2 * 0 * 0 34 | の の の 助詞 9 接続助詞 3 * 0 * 0 35 | * 5D 36 | + 5D 37 | モンスター もんすたー モンスター 名詞 6 普通名詞 1 * 0 * 0 38 | を を を 助詞 9 格助詞 1 * 0 * 0 39 | * -1D 40 | + -1D 41 | 破壊 はかい 破壊 名詞 6 サ変名詞 2 * 0 * 0 42 | する する する 動詞 2 * 0 サ変動詞 16 基本形 2 43 | 。 。 。 特殊 1 句点 1 * 0 * 0 44 | EOS 45 | # S-ID:w201106-0000060050-3 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-130.82529 MOD:2016/07/22 MEMO: 46 | * 1D 47 | + 1D 48 | この この この 指示詞 7 連体詞形態指示詞 2 * 0 * 0 49 | * 6D 50 | + 8D 51 | 効果 こうか 効果 名詞 6 普通名詞 1 * 0 * 0 52 | は は は 助詞 9 副助詞 2 * 0 * 0 53 | * 3D 54 | + 3D 55 | 1 いち 1 名詞 6 数詞 7 * 0 * 0 56 | + 4D 57 | ターン たーん ターン 名詞 6 サ変名詞 2 * 0 * 0 58 | に に に 助詞 9 格助詞 1 * 0 * 0 59 | * 6D 60 | + 8D 61 | 1 いち 1 名詞 6 数詞 7 * 0 * 0 62 | 度 ど 度 接尾辞 14 名詞性名詞助数辞 3 * 0 * 0 63 | だけ だけ だけ 助詞 9 副助詞 2 * 0 * 0 64 | * 5D 65 | + 7D 66 | 自分 じぶん 自分 名詞 6 普通名詞 1 * 0 * 0 67 | の の の 助詞 9 接続助詞 3 * 0 * 0 68 | * 6D 69 | + 7D 70 | メイン めいん メインだ 形容詞 3 * 0 ナノ形容詞 22 語幹 1 71 | + 8D 72 | フェイズ ふぇいず フェイズ 名詞 6 普通名詞 1 * 0 * 0 73 | に に に 助詞 9 格助詞 1 * 0 * 0 74 | * -1D 75 | + -1D 76 | 使用 しよう 使用 名詞 6 サ変名詞 2 * 0 * 0 77 | する する する 動詞 2 * 0 サ変動詞 16 基本形 2 78 | 事 こと 事 名詞 6 普通名詞 1 * 0 * 0 79 | が が が 助詞 9 格助詞 1 * 0 * 0 80 | できる できる できる 動詞 2 * 0 母音動詞 1 基本形 2 81 | 。 。 。 特殊 1 句点 1 * 0 * 0 82 | EOS 83 | -------------------------------------------------------------------------------- /tests/data/w201106-0000074273.knp: -------------------------------------------------------------------------------- 1 | # S-ID:w201106-0000074273-1 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-55.96971 MOD:2011/07/04 2 | * 2D 3 | + 2D 4 | 7 なな 7 名詞 6 数詞 7 * 0 * 0 5 | つ つ つ 接尾辞 14 名詞性名詞助数辞 3 * 0 * 0 6 | の の の 助詞 9 接続助詞 3 * 0 * 0 7 | * 2D 8 | + 2D 9 | 女神 めがみ 女神 名詞 6 普通名詞 1 * 0 * 0 10 | の の の 助詞 9 接続助詞 3 * 0 * 0 11 | * 4D 12 | + 4D 13 | 果実 かじつ 果実 名詞 
6 普通名詞 1 * 0 * 0 14 | が が が 助詞 9 格助詞 1 * 0 * 0 15 | * 4D 16 | + 4D 17 | 全て すべて 全て 副詞 8 * 0 * 0 * 0 18 | * 6D 19 | + 7D 20 | そろったら そろったら そろう 動詞 2 * 0 子音動詞ワ行 12 タ系条件形 13 21 | * 6D 22 | + 6D 23 | 天使 てんし 天使 名詞 6 普通名詞 1 * 0 * 0 24 | + 7D 25 | 界 かい 界 名詞 6 普通名詞 1 * 0 * 0 26 | に に に 助詞 9 格助詞 1 * 0 * 0 27 | * -1D 28 | + -1D 29 | 向かい むかい 向かう 動詞 2 * 0 子音動詞ワ行 12 基本連用形 8 30 | ます ます ます 接尾辞 14 動詞性接尾辞 7 動詞性接尾辞ます型 31 基本形 2 31 | 。 。 。 特殊 1 句点 1 * 0 * 0 32 | EOS 33 | # S-ID:w201106-0000074273-2 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-97.54113 MOD:2011/07/04 34 | * 1D 35 | + 1D 36 | ダーマ だーま ダーマ 名詞 6 固有名詞 3 * 0 * 0 "品詞変更:ダーマ-ダーマ-ダーマ-15-2-0-0" 37 | + 2D 38 | 神殿 しんでん 神殿 名詞 6 普通名詞 1 * 0 * 0 39 | + 3D 40 | 南西 なんせい 南西 名詞 6 普通名詞 1 * 0 * 0 41 | に に に 助詞 9 格助詞 1 * 0 * 0 42 | * 3D 43 | + 5D 44 | ある ある ある 動詞 2 * 0 子音動詞ラ行 10 基本形 2 45 | * 3D 46 | + 5D 47 | 青い あおい 青い 形容詞 3 * 0 イ形容詞アウオ段 18 基本形 2 48 | * 4D 49 | + 6D 50 | 木 き 木 名詞 6 普通名詞 1 * 0 * 0 51 | へ へ へ 助詞 9 格助詞 1 * 0 * 0 52 | * 7D 53 | + 9D 54 | 行き いき 行く 動詞 2 * 0 子音動詞カ行促音便形 3 基本連用形 8 55 | * 6D 56 | + 8D 57 | 天 てん 天 名詞 6 普通名詞 1 * 0 * 0 58 | の の の 助詞 9 接続助詞 3 * 0 * 0 59 | * 7D 60 | + 9D 61 | 箱舟 はこぶね 箱舟 名詞 6 普通名詞 1 * 0 * 0 62 | を を を 助詞 9 格助詞 1 * 0 * 0 63 | * -1D 64 | + -1D 65 | 呼ぶ よぶ 呼ぶ 動詞 2 * 0 子音動詞バ行 8 基本形 2 66 | 。 。 。 特殊 1 句点 1 * 0 * 0 67 | EOS 68 | # S-ID:w201106-0000074273-3 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-98.41177 MOD:2013/04/22 69 | * 2D 70 | + 2D 71 | 途中 とちゅう 途中 名詞 6 時相名詞 10 * 0 * 0 72 | で で で 助詞 9 格助詞 1 * 0 * 0 73 | * 2D 74 | + 2D 75 | イベント いべんと イベント 名詞 6 普通名詞 1 * 0 * 0 76 | が が が 助詞 9 格助詞 1 * 0 * 0 77 | * 5P 78 | + 5P 79 | 発生 はっせい 発生 名詞 6 サ変名詞 2 * 0 * 0 80 | し し する 動詞 2 * 0 サ変動詞 16 基本連用形 8 81 | 、 、 、 特殊 1 読点 2 * 0 * 0 82 | * 5D 83 | + 5D 84 | 自動 じどう 自動 名詞 6 普通名詞 1 * 0 * 0 85 | 的に てきに 的だ 接尾辞 14 形容詞性名詞接尾辞 6 ナ形容詞 21 ダ列基本連用形 8 86 | * 5D 87 | + 5D 88 | ナザム なざむ ナザム 名詞 6 地名 4 * 0 * 0 "品詞変更:ナザム-ナザム-ナザム-15-2-0-0" 89 | 村 むら 村 接尾辞 14 名詞性特殊接尾辞 4 * 0 * 0 90 | へ へ へ 助詞 9 格助詞 1 * 0 * 0 91 | * -1D 92 | + -1D 
93 | 行き いき 行く 動詞 2 * 0 子音動詞カ行促音便形 3 基本連用形 8 94 | ます ます ます 接尾辞 14 動詞性接尾辞 7 動詞性接尾辞ます型 31 基本形 2 95 | 。 。 。 特殊 1 句点 1 * 0 * 0 96 | EOS 97 | -------------------------------------------------------------------------------- /tests/data/wiki00100176.knp: -------------------------------------------------------------------------------- 1 | # S-ID:wiki00100176-00 KNP:5.0-6a1f607d DATE:2022/04/11 SCORE:50.00000 MOD:2022/04/29 MEMO: 2 | * 2D 3 | + 1D 4 | 株式 かぶしき 株式 名詞 6 普通名詞 1 * 0 * 0 5 | + 3D 6 | 会社 がいしゃ 会社 名詞 6 普通名詞 1 * 0 * 0 7 | + 3D 8 | ポニー ぽにー ポニー 名詞 6 普通名詞 1 * 0 * 0 9 | + 11D 10 | キャニオン きゃにおん キャニオン 名詞 6 普通名詞 1 * 0 * 0 11 | は は は 助詞 9 副助詞 2 * 0 * 0 12 | 、 、 、 特殊 1 読点 2 * 0 * 0 13 | * 2D 14 | + 5P 15 | フジ ふじ フジ 名詞 6 組織名 6 * 0 * 0 16 | + 6D 17 | サンケイ さんけい サンケイ 名詞 6 組織名 6 * 0 * 0 18 | + 11D 19 | グループ ぐるーぷ グループ 名詞 6 普通名詞 1 * 0 * 0 20 | の の の 助詞 9 接続助詞 3 * 0 * 0 21 | * -1D 22 | + 11D 23 | 大手 おおて 大手 名詞 6 普通名詞 1 * 0 * 0 24 | + 9P 25 | 映像 えいぞう 映像 名詞 6 普通名詞 1 * 0 * 0 26 | ・ ・ ・ 特殊 1 記号 5 * 0 * 0 27 | + 10D 28 | 音楽 おんがく 音楽 名詞 6 普通名詞 1 * 0 * 0 29 | + 11D 30 | ソフト そふと ソフト 名詞 6 普通名詞 1 * 0 * 0 31 | + -1D 32 | メーカー めーかー メーカー 名詞 6 普通名詞 1 * 0 * 0 33 | である である だ 判定詞 4 * 0 判定詞 25 デアル列基本形 15 34 | 。 。 。 特殊 1 句点 1 * 0 * 0 35 | EOS 36 | # S-ID:wiki00100176-01 KNP:5.0-6a1f607d DATE:2022/04/11 SCORE:50.00000 MOD:2022/04/29 MEMO: 37 | * 1D 38 | + 1D 39 | 通称 つうしょう 通称 名詞 6 普通名詞 1 * 0 * 0 40 | は は は 助詞 9 副助詞 2 * 0 * 0 41 | * -1D 42 | + -1D 43 | 「 「 「 特殊 1 括弧始 3 * 0 * 0 44 | ポニキャン ぽにきゃん ポニキャン 名詞 6 組織名 6 * 0 * 0 45 | 」 」 」 特殊 1 括弧終 4 * 0 * 0 46 | 。 。 。 特殊 1 句点 1 * 0 * 0 47 | EOS 48 | # S-ID:wiki00100176-02 KNP:5.0-6a1f607d DATE:2022/04/11 SCORE:0.00000 MOD:2022/04/29 MEMO: 49 | * 1D 50 | + 1D 51 | フジ ふじ フジ 名詞 6 組織名 6 * 0 * 0 52 | ・ ・ ・ 特殊 1 記号 5 * 0 * 0 53 | + 2D 54 | メディア めでぃあ メディア 名詞 6 普通名詞 1 * 0 * 0 55 | ・ ・ ・ 特殊 1 記号 5 * 0 * 0 56 | + 4D 57 | ホールディングス ほーるでぃんぐす ホールディングス 名詞 6 普通名詞 1 * 0 * 0 58 | の の の 助詞 9 接続助詞 3 * 0 * 0 59 | * -1D 60 | + 4D 61 | 連結 れんけつ 連結 名詞 6 サ変名詞 2 * 0 * 0 62 | + -1D 63 | 子会社 こがいしゃ 
子会社 名詞 6 普通名詞 1 * 0 * 0 64 | 。 。 。 特殊 1 句点 1 * 0 * 0 65 | EOS 66 | -------------------------------------------------------------------------------- /tests/processors/test_kwja.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rhoknp import KNP, KWJA, Document, Jumanpp, Sentence 4 | 5 | is_kwja_available = KWJA(options=["--model-size", "tiny", "--tasks", "typo"]).is_available() 6 | 7 | 8 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 9 | def test_get_version() -> None: 10 | kwja = KWJA(options=["--model-size", "tiny"]) 11 | _ = kwja.get_version() 12 | 13 | 14 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 15 | def test_is_available() -> None: 16 | kwja = KWJA(options=["--model-size", "tiny"]) 17 | assert kwja.is_available() is True 18 | 19 | kwja = KWJA("kwjaaaaaaaaaaaaaaaaa") 20 | assert kwja.is_available() is False 21 | 22 | with pytest.raises(RuntimeError): 23 | _ = kwja.apply_to_sentence("test") 24 | 25 | with pytest.raises(RuntimeError): 26 | _ = kwja.apply_to_document("test") 27 | 28 | with pytest.raises(RuntimeError): 29 | _ = kwja.get_version() 30 | 31 | 32 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 33 | def test_typo() -> None: 34 | kwja = KWJA(options=["--model-size", "tiny", "--tasks", "typo"]) 35 | text = "人口知能" 36 | doc = kwja.apply_to_document(text) 37 | assert doc.text == "人工知能" 38 | 39 | 40 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 41 | def test_char() -> None: 42 | kwja = KWJA(options=["--model-size", "tiny", "--tasks", "char"]) 43 | text = "こんにちは。さようなら。" 44 | doc = kwja.apply_to_document(text) 45 | morphemes = doc.morphemes 46 | assert len(morphemes) > 0 47 | morpheme = morphemes[0] 48 | assert text.startswith(morpheme.text) 49 | assert morpheme.reading == "*" 50 | assert morpheme.lemma == "*" 51 | 52 | 53 | @pytest.mark.skipif(not is_kwja_available, 
reason="KWJA is not available") 54 | def test_seq2seq() -> None: 55 | kwja = KWJA(options=["--model-size", "tiny", "--tasks", "char,seq2seq"]) 56 | text = "こんにちは。さようなら。" 57 | doc = kwja.apply_to_document(text, timeout=60) 58 | morphemes = doc.morphemes 59 | assert len(morphemes) > 0 60 | morpheme = morphemes[0] 61 | assert text.startswith(morpheme.text) 62 | assert text.startswith(morpheme.reading) 63 | assert text.startswith(morpheme.lemma) 64 | 65 | 66 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 67 | def test_word() -> None: 68 | kwja = KWJA(options=["--model-size", "tiny", "--tasks", "char,word"]) 69 | text = "こんにちは。さようなら。" 70 | doc = kwja.apply_to_document(text) 71 | morphemes = doc.morphemes 72 | assert len(morphemes) > 0 73 | assert text.startswith(morphemes[0].text) 74 | base_phrases = doc.base_phrases 75 | assert len(base_phrases) > 0 76 | assert text.startswith(base_phrases[0].text) 77 | phrases = doc.phrases 78 | assert len(phrases) > 0 79 | assert text.startswith(phrases[0].text) 80 | clauses = doc.clauses 81 | assert len(clauses) > 0 82 | assert text.startswith(clauses[0].text) 83 | 84 | 85 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 86 | def test_raw_input() -> None: 87 | kwja = KWJA(options=["--model-size", "tiny", "--tasks", "typo", "--input-format", "raw"]) 88 | text = "人口知能" 89 | doc = kwja.apply_to_document(text) 90 | assert doc.text == "人工知能" 91 | 92 | 93 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 94 | def test_jumanpp_input() -> None: 95 | doc0 = Document.from_raw_text("こんにちは。さようなら。") 96 | doc0.doc_id = "test" 97 | doc1 = Jumanpp().apply_to_document(doc0) 98 | for idx, sent in enumerate(doc1.sentences): 99 | sent.sent_id = f"test-{idx}" 100 | assert not doc1.is_jumanpp_required() 101 | doc2 = KWJA(options=["--model-size", "tiny", "--tasks", "word", "--input-format", "jumanpp"]).apply_to_document( 102 | doc1 103 | ) 104 | assert doc1.doc_id == 
doc2.doc_id 105 | assert [sent.sid for sent in doc2.sentences] == [sent.sid for sent in doc1.sentences] 106 | assert [sent.text for sent in doc2.sentences] == [sent.text for sent in doc1.sentences] 107 | assert [mrph.text for mrph in doc2.morphemes] == [mrph.text for mrph in doc1.morphemes] 108 | 109 | 110 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 111 | def test_knp_input() -> None: 112 | text = "こんにちは。さようなら。" 113 | doc1 = KNP().apply_to_document(text) 114 | assert not doc1.is_knp_required() 115 | doc2 = KWJA(options=["--model-size", "tiny", "--tasks", "word", "--input-format", "knp"]).apply_to_document(doc1) 116 | assert doc1.doc_id == doc2.doc_id 117 | assert [sent.sid for sent in doc2.sentences] == [sent.sid for sent in doc1.sentences] 118 | assert [sent.text for sent in doc2.sentences] == [sent.text for sent in doc1.sentences] 119 | assert [mrph.text for mrph in doc2.morphemes] == [mrph.text for mrph in doc1.morphemes] 120 | 121 | 122 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 123 | def test_apply() -> None: 124 | kwja = KWJA(options=["--model-size", "tiny"]) 125 | text = "外国人参政権" 126 | assert isinstance(kwja.apply(text), Document) 127 | assert isinstance(kwja.apply(Document.from_raw_text(text)), Document) 128 | with pytest.raises(NotImplementedError): 129 | _ = kwja.apply(Sentence.from_raw_text(text)) 130 | with pytest.raises(TypeError): 131 | _ = kwja.apply(1) # type: ignore 132 | 133 | 134 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 135 | def test_keep_doc_id_document() -> None: 136 | kwja = KWJA(options=["--model-size", "tiny"]) 137 | doc = Document.from_sentences(["こんにちは。", "さようなら。"]) 138 | doc.doc_id = "test" 139 | for sent in doc.sentences: 140 | sent.doc_id = "test" 141 | doc = kwja.apply_to_document(doc) 142 | assert doc.doc_id == "test" 143 | for sent in doc.sentences: 144 | assert sent.doc_id == "test" 145 | 146 | 147 | def test_timeout_error() -> 
None: 148 | kwja = KWJA("tests/bin/kwja-mock.sh", skip_sanity_check=True) 149 | with pytest.raises(TimeoutError): 150 | _ = kwja.apply_to_document("time consuming input", timeout=1) 151 | 152 | 153 | def test_runtime_error() -> None: 154 | kwja = KWJA("tests/bin/kwja-mock.sh", skip_sanity_check=True) 155 | with pytest.raises(RuntimeError): 156 | _ = kwja.apply_to_document("error causing input") 157 | 158 | 159 | def test_unsupported_option() -> None: 160 | with pytest.raises(ValueError, match=r"invalid task: \['wakachi'\]"): 161 | _ = KWJA(options=["--model-size", "tiny", "--tasks", "wakachi"]) 162 | with pytest.raises(ValueError, match="invalid input format: seq2seq"): 163 | _ = KWJA(options=["--model-size", "tiny", "--input-format", "seq2seq"]) 164 | 165 | 166 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 167 | def test_apply_to_sentence() -> None: 168 | kwja = KWJA(options=["--model-size", "tiny"]) 169 | with pytest.raises(NotImplementedError): 170 | _ = kwja.apply_to_sentence("外国人参政権") 171 | 172 | 173 | @pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available") 174 | def test_repr() -> None: 175 | kwja = KWJA(options=["--model-size", "tiny", "--tasks", "char,word"]) 176 | assert repr(kwja) == "KWJA(executable='kwja', options=['--model-size', 'tiny', '--tasks', 'char,word'])" 177 | -------------------------------------------------------------------------------- /tests/processors/test_regex_senter.py: -------------------------------------------------------------------------------- 1 | import time 2 | from unittest.mock import MagicMock 3 | 4 | import pytest 5 | 6 | from rhoknp import Document, RegexSenter, Sentence 7 | 8 | 9 | @pytest.mark.parametrize( 10 | ("document", "sentence_strings"), 11 | [ 12 | ( 13 | "", 14 | [], 15 | ), 16 | ( 17 | "天気がいいので散歩した。", 18 | ["天気がいいので散歩した。"], 19 | ), 20 | ( 21 | "天気がいいので散歩した。散歩の途中で先生に出会った。", 22 | ["天気がいいので散歩した。", "散歩の途中で先生に出会った。"], 23 | ), 24 | ( 25 | 
"天気がいいので散歩した.散歩の途中で先生に出会った.", 26 | ["天気がいいので散歩した.", "散歩の途中で先生に出会った."], 27 | ), 28 | ( 29 | "天気がいいので散歩した\n散歩の途中で先生に出会った", 30 | ["天気がいいので散歩した", "散歩の途中で先生に出会った"], 31 | ), 32 | ( 33 | "天気がいいので散歩した。散歩の途中で Michael に出会った。", 34 | ["天気がいいので散歩した。", "散歩の途中で Michael に出会った。"], 35 | ), 36 | ( 37 | "今何時ですか?次の予定があるので失礼します。", 38 | ["今何時ですか?", "次の予定があるので失礼します。"], 39 | ), 40 | ( 41 | "今何時ですか?次の予定があるので失礼します。", 42 | ["今何時ですか?", "次の予定があるので失礼します。"], 43 | ), 44 | ( 45 | "今何時ですか!次の予定があるので失礼します。", 46 | ["今何時ですか!", "次の予定があるので失礼します。"], 47 | ), 48 | ( 49 | "今何時ですか! 次の予定があるので失礼します。", 50 | ["今何時ですか!", "次の予定があるので失礼します。"], 51 | ), 52 | ( 53 | "今何時ですか???次の予定があるので失礼します!!!", 54 | ["今何時ですか???", "次の予定があるので失礼します!!!"], 55 | ), 56 | ( 57 | "お疲れ様です♪次の予定があるので失礼します。", 58 | ["お疲れ様です♪", "次の予定があるので失礼します。"], 59 | ), 60 | ( 61 | "お疲れ様です★次の予定があるので失礼します。", 62 | ["お疲れ様です★", "次の予定があるので失礼します。"], 63 | ), 64 | ( 65 | "お疲れ様です☆次の予定があるので失礼します。", 66 | ["お疲れ様です☆", "次の予定があるので失礼します。"], 67 | ), 68 | ( 69 | "なるほど…これは難しい問題ですね。", 70 | ["なるほど…", "これは難しい問題ですね。"], 71 | ), 72 | ( 73 | "テレビで「今年の夏は暑いので、熱中症に注意しましょう。」と言っていた。", 74 | ["テレビで「今年の夏は暑いので、熱中症に注意しましょう。」と言っていた。"], 75 | ), 76 | ( 77 | "そんな(笑\n安心してください(笑", 78 | ["そんな(笑", "安心してください(笑"], 79 | ), 80 | ( 81 | "『君の名は。』は良い作品でした。", 82 | ["『君の名は。』は良い作品でした。"], 83 | ), 84 | ( 85 | "次の問いに答えよ。 1) tan30°は有理数か。 2) tan1°は有理数か。", 86 | ["次の問いに答えよ。", "1) tan30°は有理数か。", "2) tan1°は有理数か。"], 87 | ), 88 | ( 89 | "やっと掃除終わった_(:3 」∠)_もう24時…さっさと寝よう。", 90 | ["やっと掃除終わった_(:3 」∠)_もう24時…", "さっさと寝よう。"], 91 | ), 92 | ], 93 | ) 94 | def test_apply_to_document(document: str, sentence_strings: list[str]) -> None: 95 | senter = RegexSenter() 96 | doc = senter.apply_to_document(document) 97 | for i, sentence in enumerate(doc.sentences): 98 | assert sentence.text == sentence_strings[i] 99 | 100 | 101 | def test_apply_to_sentence() -> None: 102 | senter = RegexSenter() 103 | text = "天気がいいので散歩した。" 104 | sent = senter.apply_to_sentence(text) 105 | assert sent.text == text 106 | 107 | 108 | def test_keep_id_sentence() -> 
None: 109 | senter = RegexSenter() 110 | sent = Sentence.from_raw_text("天気がいいので散歩した。") 111 | sent.doc_id = "test" 112 | sent.sent_id = "test-1" 113 | sent = senter.apply_to_sentence(sent) 114 | assert sent.doc_id == "test" 115 | assert sent.sent_id == "test-1" 116 | 117 | 118 | def test_keep_id_document() -> None: 119 | senter = RegexSenter() 120 | doc = Document.from_raw_text("天気がいいので散歩した。散歩の途中で先生に出会った。") 121 | doc.doc_id = "test" 122 | doc = senter.apply_to_document(doc) 123 | assert doc.doc_id == "test" 124 | for sent in doc.sentences: 125 | assert sent.doc_id == "test" 126 | 127 | 128 | def test_repr() -> None: 129 | senter = RegexSenter() 130 | assert repr(senter) == "RegexSenter()" 131 | 132 | 133 | def test_timeout() -> None: 134 | senter = RegexSenter() 135 | senter._split_document = MagicMock(side_effect=lambda _: time.sleep(5)) # type: ignore 136 | with pytest.raises(TimeoutError): 137 | senter.apply_to_document("天気がいいので散歩した。", timeout=3) 138 | -------------------------------------------------------------------------------- /tests/props/test_features.py: -------------------------------------------------------------------------------- 1 | from dataclasses import astuple, dataclass 2 | from typing import Union 3 | 4 | import pytest 5 | 6 | from rhoknp.props import FeatureDict 7 | 8 | 9 | @dataclass(frozen=True) 10 | class FeaturesTestCase: 11 | fstring: str 12 | features: dict[str, Union[str, bool]] 13 | length: int 14 | 15 | 16 | cases = [ 17 | FeaturesTestCase( 18 | fstring="""<文節内><係:文節内><文頭><体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>""", 19 | features={ 20 | "BGH": "構文/こうぶん", 21 | "文節内": True, 22 | "係": "文節内", 23 | "文頭": True, 24 | "体言": True, 25 | "名詞項候補": True, 26 | "先行詞候補": True, 27 | "正規化代表表記": "構文/こうぶん", 28 | }, 29 | length=8, 30 | ), 31 | FeaturesTestCase( 32 | fstring="""""", 33 | features={ 34 | 'ALT-京都-きょうと-京都-6-4-0-0-"代表表記:京都/きょうと 地名:日本:府"': True, 35 | }, 36 | length=1, 37 | ), 38 | FeaturesTestCase( 39 | fstring=r"""タグ>""", 40 | features={ 41 | "NE": 
r"OPTIONAL:html>タグ", 42 | }, 43 | length=1, 44 | ), 45 | FeaturesTestCase( 46 | fstring=r"""<係チ:非用言格解析||用言&&文節内:T解析格-ヲ><正規化代表表記:”/”><主辞代表表記:”/”><照応詞候補:最高">""", 47 | features={ 48 | "係チ": r"非用言格解析||用言&&文節内:T解析格-ヲ", 49 | "正規化代表表記": "”/”", 50 | "主辞代表表記": "”/”", 51 | "照応詞候補": '最高"', 52 | "EID": "2", 53 | }, 54 | length=5, 55 | ), 56 | ] 57 | 58 | 59 | cases_with_ignored_tag = [ 60 | FeaturesTestCase( 61 | fstring="""<解析済><体言>""", 62 | features={ 63 | "BGH": "関心/かんしん", 64 | "解析済": True, 65 | "体言": True, 66 | }, 67 | length=3, 68 | ), 69 | ] 70 | 71 | 72 | @pytest.mark.parametrize(("fstring", "features", "length"), [astuple(case) for case in cases + cases_with_ignored_tag]) 73 | def test_from_fstring(fstring: str, features: dict[str, Union[str, bool]], length: int) -> None: 74 | fs = FeatureDict.from_fstring(fstring) 75 | assert len(fs) == length 76 | assert dict(fs) == features 77 | assert fs.get("dummy") is None 78 | 79 | 80 | @pytest.mark.parametrize("fstring", [case.fstring for case in cases]) 81 | def test_to_fstring(fstring: str) -> None: 82 | fs = FeatureDict.from_fstring(fstring) 83 | assert fs.to_fstring() == fstring 84 | 85 | 86 | def test_false() -> None: 87 | assert FeatureDict._item_to_fstring("sem", False) == "" 88 | 89 | 90 | def test_ignore_tag_prefix() -> None: 91 | features = FeatureDict() 92 | features["rel"] = 'type="ノ" target="ユーザー" sid="w201106-0000060560-1" id="1"' 93 | assert len(features) == 0 94 | 95 | features["memo"] = 'text="メモ"' 96 | assert len(features) == 0 97 | 98 | 99 | def test_modification() -> None: 100 | features = FeatureDict.from_fstring("""<用言:動><主節>""") 101 | assert features.to_fstring() == """<用言:動><主節>""" 102 | # Update 103 | features["用言"] = "判" 104 | assert features.to_fstring() == """<用言:判><主節>""" 105 | # Insert 106 | features["文末"] = True 107 | assert features.to_fstring() == """<用言:判><主節><文末>""" 108 | # Delete 109 | del features["主節"] 110 | assert features.to_fstring() == """<用言:判><文末>""" 111 | 
-------------------------------------------------------------------------------- /tests/props/test_memo.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rhoknp.props.memo import MemoTag 4 | 5 | CASES = [ 6 | { 7 | "from_fstring": """<体言>""", 8 | "to_fstring": """""", 9 | "text": "メモ", 10 | "bool": True, 11 | }, 12 | { 13 | "from_fstring": """""", 14 | "to_fstring": """""", 15 | "text": "メモ1", 16 | "bool": True, 17 | }, 18 | { 19 | "from_fstring": """""", 20 | "to_fstring": """""", 21 | "text": "", 22 | "bool": False, 23 | }, 24 | { 25 | "from_fstring": """""", 26 | "to_fstring": """""", 27 | "text": """<メモ> 'quote' "double quote\"""", 28 | "bool": True, 29 | }, 30 | ] 31 | 32 | 33 | @pytest.mark.parametrize("case", CASES) 34 | def test_from_fstring(case: dict) -> None: 35 | memo_tag = MemoTag.from_fstring(case["from_fstring"]) 36 | assert memo_tag.text == case["text"] 37 | 38 | 39 | @pytest.mark.parametrize("case", CASES) 40 | def test_to_fstring(case: dict) -> None: 41 | memo_tag = MemoTag.from_fstring(case["from_fstring"]) 42 | assert memo_tag.to_fstring() == case["to_fstring"] 43 | 44 | 45 | @pytest.mark.parametrize("case", CASES) 46 | def test_str(case: dict) -> None: 47 | memo_tag = MemoTag.from_fstring(case["from_fstring"]) 48 | assert str(memo_tag) == case["to_fstring"] 49 | 50 | 51 | @pytest.mark.parametrize("case", CASES) 52 | def test_bool(case: dict) -> None: 53 | memo_tag = MemoTag.from_fstring(case["from_fstring"]) 54 | assert bool(memo_tag) == case["bool"] 55 | -------------------------------------------------------------------------------- /tests/props/test_named_entity.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | from pathlib import Path 3 | from typing import Any 4 | 5 | import pytest 6 | 7 | from rhoknp import Document, Sentence 8 | from rhoknp.props import NamedEntity, NamedEntityCategory 9 | 10 | 11 | 
@pytest.mark.parametrize(
    "case",
    [
        # NOTE(review): every "fstring" below is empty, yet test_ne asserts
        # to_fstring() equality against it.  The original literals most likely
        # contained "<NE:...>" tags that were stripped by an HTML-unaware
        # extraction step — verify against the repository.
        {
            "doc_id": "w201106-0000060877",
            "named_entities": [
                {
                    "category": NamedEntityCategory.ORGANIZATION,
                    "text": "柏市ひまわり園",
                    "fstring": "",
                },
                {
                    "category": NamedEntityCategory.DATE,
                    "text": "平成23年度",
                    "fstring": "",
                },
            ],
        },
        {
            "doc_id": "w201106-0000074273",
            "named_entities": [
                {
                    "category": NamedEntityCategory.LOCATION,
                    "text": "ダーマ神殿",
                    "fstring": "",
                },
                {
                    "category": NamedEntityCategory.ARTIFACT,
                    "text": "天の箱舟",
                    "fstring": "",
                },
                {
                    "category": NamedEntityCategory.LOCATION,
                    "text": "ナザム村",
                    "fstring": "",
                },
            ],
        },
    ],
)
def test_ne(case: dict[str, Any]) -> None:
    """Named entities parsed from a KNP data file match the expected
    category, surface text, str() form, and serialized fstring, in order."""
    doc = Document.from_knp(Path(f"tests/data/{case['doc_id']}.knp").read_text())
    actual_nes = doc.named_entities
    expected_nes = case["named_entities"]
    assert len(actual_nes) == len(expected_nes)
    for actual_ne, expected_ne in zip(actual_nes, expected_nes):
        assert actual_ne.category == expected_ne["category"]
        assert actual_ne.text == expected_ne["text"]
        assert str(actual_ne) == expected_ne["text"]
        assert actual_ne.to_fstring() == expected_ne["fstring"]


def test_from_fstring_malformed_line() -> None:
    """A malformed NE fstring yields None instead of raising."""
    # NOTE(review): empty literal — the original malformed "<NE:...>" tag was
    # presumably stripped during extraction.
    fstring = ""
    ne = NamedEntity.from_fstring(fstring, [])
    assert ne is None


def test_unknown_category() -> None:
    """An NE fstring with an unknown category yields None."""
    # NOTE(review): empty literal — original tag stripped during extraction.
    fstring = ""
    sentence = Sentence.from_knp(
        textwrap.dedent(
            """\
            # S-ID:1
            * -1D
            + -1D
            アンノウン アンノウン アンノウン 名詞 6 普通名詞 1 * 0 * 0
            EOS
            """
        )
    )
    ne = NamedEntity.from_fstring(fstring, sentence.morphemes)
    assert ne is None


def test_span_not_found() -> None:
    """An NE whose surface span cannot be located in the sentence yields None."""
    # NOTE(review): empty literal — original tag stripped during extraction.
    fstring = ""
    sentence = Sentence.from_knp(
        textwrap.dedent(
            """\
            # S-ID:1
            * -1D
            + 1D
            東京 とうきょう 東京 名詞 6 地名 4 * 0 * 0
            + -1D
            大学 だいがく 大学 名詞 6 普通名詞 1 * 0 * 0
            EOS
            """
        )
    )
    ne = NamedEntity.from_fstring(fstring, sentence.morphemes)
    assert ne is None


@pytest.mark.parametrize(
    "case",
    [
        # Escaping round-trip cases: each supplies the (escaped) fstring, the
        # expected category and unescaped surface text, and the KNP input.
        dict(
            # NOTE(review): remnant literal — the leading part of this raw
            # string (likely r"<NE:OPTIONAL:\<html\>タグ>") was stripped
            # during extraction; only the tail survives.
            fstring=r"タグ>",
            category=NamedEntityCategory.OPTIONAL,
            text="html>タグ",
            knp=textwrap.dedent(
                """\
                # S-ID:1
                * -1D
                + 1D
                < < < 特殊 1 括弧始 3 * 0 * 0
                html html html 名詞 6 普通名詞 1 * 0 * 0
                > > > 特殊 1 括弧終 4 * 0 * 0
                + -1D
                タグ たぐ タグ 名詞 6 普通名詞 1 * 0 * 0
                EOS
                """
            ),
        ),
        dict(
            fstring=r"",
            category=NamedEntityCategory.OPTIONAL,
            # NOTE(review): the region below is corrupted — everything from a
            # "<" in the text= literal through a later ">" (original file
            # lines 129-137, including the knp=textwrap.dedent( opener) was
            # swallowed by an HTML-unaware extraction step.  Preserved as
            # found; reconstruct from the repository.
            text=" > > 特殊 1 括弧終 4 * 0 * 0
            + -1D
            タグ たぐ タグ 名詞 6 普通名詞 1 * 0 * 0
            EOS
            """
            ),
        ),
        dict(
            fstring=r"",
            category=NamedEntityCategory.OPTIONAL,
            text=r"バック\スラッシュ",
            knp=textwrap.dedent(
                r"""
                * 1D
                + 2D
                バック ばっく バック 名詞 6 サ変名詞 2 * 0 * 0
                + 2D
                \ \ \ 特殊 1 記号 5 * 0 * 0
                * -1D
                + -1D
                スラッシュ すらっしゅ スラッシュ 名詞 6 普通名詞 1 * 0 * 0
                EOS
                """.lstrip("\n")
            ),
        ),
    ],
)
def test_escape(case: dict[str, Any]) -> None:
    """Escaped characters in NE fstrings round-trip through from_fstring /
    to_fstring while .text exposes the unescaped surface form."""
    sentence = Sentence.from_knp(case["knp"])
    ne = NamedEntity.from_fstring(case["fstring"], sentence.morphemes)
    assert ne is not None
    assert ne.category == case["category"]
    assert ne.text == case["text"]
    assert ne.to_fstring() == case["fstring"]


def test_escape_in_knp() -> None:
    """A KNP document containing an escaped NE tag parses and re-serializes
    byte-identically (to_knp() round-trip)."""
    knp_text = textwrap.dedent(
        r"""
        # S-ID:1
        * -1D
        + 1D
        < < < 特殊 1 括弧始 3 * 0 * 0
        html html html 名詞 6 普通名詞 1 * 0 * 0
        > > > 特殊 1 括弧終 4 * 0 * 0
        + -1D タグ>
        タグ たぐ タグ 名詞 6 普通名詞 1 * 0 * 0
        EOS
        """.lstrip("\n")
    )
    # NOTE(review): the "+ -1D タグ>" line above is a stripped remnant of an
    # original "+ -1D <NE:...>" annotation — verify against the repository.
    sentence = Sentence.from_knp(knp_text)
    assert sentence.named_entities[0].text == "html>タグ"
    assert sentence.to_knp() == knp_text
-------------------------------------------------------------------------------- /tests/props/test_semantics.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import pytest 4 | 5 | from rhoknp.props import SemanticsDict 6 | 7 | CASES = [ 8 | { 9 | "sstring": '"代表表記:天気/てんき カテゴリ:抽象物"', 10 | "dict_": {"代表表記": "天気/てんき", "カテゴリ": "抽象物"}, 11 | }, 12 | { 13 | "sstring": '"代表表記:新/しん 内容語 NE:ORGANIZATION:head"', 14 | "dict_": {"代表表記": "新/しん", "内容語": True, "NE": "ORGANIZATION:head"}, 15 | }, 16 | { 17 | "sstring": "NIL", 18 | "dict_": {}, 19 | }, 20 | ] 21 | 22 | 23 | @pytest.mark.parametrize("case", CASES) 24 | def test_from_fstring(case: dict[str, Any]) -> None: 25 | semantics = SemanticsDict.from_sstring(case["sstring"]) 26 | assert dict(semantics) == case["dict_"] 27 | 28 | 29 | @pytest.mark.parametrize("case", CASES) 30 | def test_to_fstring(case: dict[str, Any]) -> None: 31 | semantics = SemanticsDict(case["dict_"], is_nil=True) 32 | assert semantics.to_sstring() == case["sstring"] 33 | 34 | 35 | def test_false() -> None: 36 | assert SemanticsDict._item_to_sstring("sem", False) == "" 37 | 38 | 39 | def test_empty_dict() -> None: 40 | semantics = SemanticsDict({}) 41 | assert semantics.to_sstring() == "" 42 | 43 | 44 | def test_void() -> None: 45 | semantics = SemanticsDict() 46 | assert semantics.to_sstring() == "" 47 | 48 | 49 | def test_empty_string() -> None: 50 | semantics = SemanticsDict.from_sstring("") 51 | assert semantics.to_sstring() == "" 52 | 53 | 54 | def test_modification() -> None: 55 | features = SemanticsDict.from_sstring('"代表表記:天気/てんき カテゴリ:抽象物"') 56 | assert features.to_sstring() == '"代表表記:天気/てんき カテゴリ:抽象物"' 57 | # Update 58 | features["代表表記"] = "転機/てんき" 59 | assert features.to_sstring() == '"代表表記:転機/てんき カテゴリ:抽象物"' 60 | # Insert 61 | features["内容語"] = True 62 | assert features.to_sstring() == '"代表表記:転機/てんき カテゴリ:抽象物 内容語"' 63 | # Delete 64 | del features["カテゴリ"] 65 | assert 
features.to_sstring() == '"代表表記:転機/てんき 内容語"' 66 | -------------------------------------------------------------------------------- /tests/utils/test_comment.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Optional 3 | 4 | import pytest 5 | 6 | from rhoknp import Sentence 7 | from rhoknp.utils.comment import extract_did_and_sid, is_comment_line 8 | 9 | 10 | @pytest.mark.parametrize( 11 | ("line", "expected"), 12 | [ 13 | ("# S-ID:1", True), 14 | ("# foo-bar", True), 15 | ("#", True), 16 | ("// S-ID:1", False), 17 | ("// foo-bar", False), 18 | ("//", False), 19 | ('# # # 未定義語 15 その他 1 * 0 * 0 "未知語:その他 品詞推定:特殊"', False), 20 | ], 21 | ) 22 | def test_is_comment_line(line: str, expected: bool) -> None: 23 | assert is_comment_line(line) == expected 24 | 25 | 26 | @pytest.mark.parametrize( 27 | ("pat", "line", "doc_id", "sent_id"), 28 | [ 29 | (Sentence.SID_PAT, "# S-ID:", "", ""), 30 | (Sentence.SID_PAT, "# S-ID:1", "", "1"), 31 | (Sentence.SID_PAT, "# S-ID:123", "", "123"), 32 | (Sentence.SID_PAT, "# S-ID:1a", "1a", "1a"), 33 | (Sentence.SID_PAT, "# S-ID:1-a", "1-a", "1-a"), 34 | (Sentence.SID_PAT, "# S-ID:1-1", "1", "1-1"), 35 | (Sentence.SID_PAT, "# S-ID:1-2", "1", "1-2"), 36 | (Sentence.SID_PAT, "# S-ID:a-1", "a", "a-1"), 37 | (Sentence.SID_PAT, "# S-ID:a-2", "a", "a-2"), 38 | (Sentence.SID_PAT_KWDLC, "# S-ID:w201106-0000060050-1", "w201106-0000060050", "w201106-0000060050-1"), 39 | (Sentence.SID_PAT_WAC, "# S-ID:wiki00100176-00", "wiki00100176", "wiki00100176-00"), 40 | ], 41 | ) 42 | def test_extract_doc_id(pat: re.Pattern, line: str, doc_id: Optional[str], sent_id: Optional[str]) -> None: 43 | did, sid, _ = extract_did_and_sid(line, [pat]) 44 | assert did == doc_id 45 | assert sid == sent_id 46 | -------------------------------------------------------------------------------- /tests/utils/test_reader.py: -------------------------------------------------------------------------------- 1 | 
import textwrap 2 | from io import StringIO 3 | from typing import Any 4 | 5 | import pytest 6 | 7 | from rhoknp.utils.reader import chunk_by_document, chunk_by_sentence 8 | 9 | CASES = [ 10 | { 11 | "text": textwrap.dedent( 12 | """\ 13 | # S-ID:A-X-1 14 | EOS 15 | # S-ID:A-X-2 16 | EOS 17 | # S-ID:A-Y-1 18 | EOS 19 | """ 20 | ), 21 | "sentences": [ 22 | "# S-ID:A-X-1\nEOS\n", 23 | "# S-ID:A-X-2\nEOS\n", 24 | "# S-ID:A-Y-1\nEOS\n", 25 | ], 26 | "documents": [ 27 | "# S-ID:A-X-1\nEOS\n# S-ID:A-X-2\nEOS\n", 28 | "# S-ID:A-Y-1\nEOS\n", 29 | ], 30 | "doc_id_format": "default", 31 | }, 32 | { 33 | "text": textwrap.dedent( 34 | """\ 35 | # S-ID:w201106-0000060050-1 36 | EOS 37 | # S-ID:w201106-0000060050-2 38 | EOS 39 | """ 40 | ), 41 | "sentences": [ 42 | "# S-ID:w201106-0000060050-1\nEOS\n", 43 | "# S-ID:w201106-0000060050-2\nEOS\n", 44 | ], 45 | "documents": [ 46 | "# S-ID:w201106-0000060050-1\nEOS\n# S-ID:w201106-0000060050-2\nEOS\n", 47 | ], 48 | "doc_id_format": "kwdlc", 49 | }, 50 | { 51 | "text": textwrap.dedent( 52 | """\ 53 | # S-ID:wiki00100176-00 54 | EOS 55 | # S-ID:wiki00100176-01 56 | EOS 57 | """ 58 | ), 59 | "sentences": [ 60 | "# S-ID:wiki00100176-00\nEOS\n", 61 | "# S-ID:wiki00100176-01\nEOS\n", 62 | ], 63 | "documents": [ 64 | "# S-ID:wiki00100176-00\nEOS\n# S-ID:wiki00100176-01\nEOS\n", 65 | ], 66 | "doc_id_format": "wac", 67 | }, 68 | { 69 | "text": textwrap.dedent( 70 | """\ 71 | # 1-1 72 | EOS 73 | # 1-2 74 | EOS 75 | # 2-1 76 | EOS 77 | """ 78 | ), 79 | "sentences": [ 80 | "# 1-1\nEOS\n", 81 | "# 1-2\nEOS\n", 82 | "# 2-1\nEOS\n", 83 | ], 84 | "documents": [ 85 | "# 1-1\nEOS\n# 1-2\nEOS\n", 86 | "# 2-1\nEOS\n", 87 | ], 88 | "doc_id_format": lambda x: x.lstrip("# ").split("-")[0], 89 | }, 90 | # empty line 91 | { 92 | "text": textwrap.dedent( 93 | """\ 94 | # S-ID:1-1 95 | EOS 96 | 97 | # S-ID:1-2 98 | EOS 99 | """ 100 | ), 101 | "sentences": [ 102 | "# S-ID:1-1\nEOS\n", 103 | "# S-ID:1-2\nEOS\n", 104 | ], 105 | "documents": [ 106 | "# 
S-ID:1-1\nEOS\n# S-ID:1-2\nEOS\n", 107 | ], 108 | "doc_id_format": "default", 109 | }, 110 | # no sid 111 | { 112 | "text": textwrap.dedent( 113 | """\ 114 | # 1-1 115 | EOS 116 | # 1-2 117 | EOS 118 | """ 119 | ), 120 | "sentences": [ 121 | "# 1-1\nEOS\n", 122 | "# 1-2\nEOS\n", 123 | ], 124 | "documents": [ 125 | "# 1-1\nEOS\n", 126 | "# 1-2\nEOS\n", 127 | ], 128 | "doc_id_format": "default", 129 | }, 130 | # no trailing EOS 131 | { 132 | "text": textwrap.dedent( 133 | """\ 134 | # S-ID:1-1 135 | EOS 136 | # S-ID:1-2 137 | """ 138 | ), 139 | "sentences": [ 140 | "# S-ID:1-1\nEOS\n", 141 | "# S-ID:1-2\n", 142 | ], 143 | "documents": [ 144 | "# S-ID:1-1\nEOS\n# S-ID:1-2\n", 145 | ], 146 | "doc_id_format": "default", 147 | }, 148 | # invalid sid 149 | { 150 | "text": textwrap.dedent( 151 | """\ 152 | # S-ID:1-1 153 | EOS 154 | # S-ID:1-2 155 | EOS 156 | # S-ID:2-1 157 | EOS 158 | """ 159 | ), 160 | "sentences": [ 161 | "# S-ID:1-1\nEOS\n", 162 | "# S-ID:1-2\nEOS\n", 163 | "# S-ID:2-1\nEOS\n", 164 | ], 165 | "documents": [ 166 | "# S-ID:1-1\nEOS\n", 167 | "# S-ID:1-2\nEOS\n", 168 | "# S-ID:2-1\nEOS\n", 169 | ], 170 | "doc_id_format": "kwdlc", 171 | }, 172 | ] 173 | 174 | 175 | @pytest.mark.parametrize("case", CASES) 176 | def test_chunk_by_sentence(case: dict[str, Any]) -> None: 177 | actual = list(chunk_by_sentence(StringIO(case["text"]))) 178 | assert actual == case["sentences"] 179 | 180 | 181 | @pytest.mark.parametrize("case", CASES) 182 | def test_chunk_by_document(case: dict[str, Any]) -> None: 183 | actual = list(chunk_by_document(StringIO(case["text"]), doc_id_format=case["doc_id_format"])) 184 | assert actual == case["documents"] 185 | 186 | 187 | def test_chunk_by_document_value_error() -> None: 188 | with pytest.raises(ValueError, match="Invalid doc_id_format: ERROR"): 189 | _ = list(chunk_by_document(StringIO(""), doc_id_format="ERROR")) # type: ignore 190 | 191 | 192 | def test_chunk_by_document_type_error() -> None: 193 | with pytest.raises(TypeError): 
194 | _ = list(chunk_by_document(StringIO(""), doc_id_format=1)) # type: ignore 195 | --------------------------------------------------------------------------------