├── .github
├── dependabot.yml
└── workflows
│ ├── build.yml
│ ├── codeql-analysis.yml
│ ├── dependabot-auto-merge.yml
│ ├── docs-requirements.yml
│ ├── lint.yml
│ ├── publish.yml
│ ├── pylock.yml
│ ├── release.yml
│ ├── test-example.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── AUTHORS.md
├── CITATION.cff
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── assets
├── KNP.sublime-syntax
├── logo-original.png
├── logo-wide.xcf
└── logo.xcf
├── docs
├── Makefile
├── _static
│ ├── favicon-16x16.png
│ ├── favicon-32x32.png
│ ├── favicon.ico
│ ├── logo-wide.png
│ └── logo.png
├── authors.md
├── cli
│ └── index.md
├── conf.py
├── contributing
│ └── index.md
├── format
│ └── index.md
├── index.md
├── installation
│ └── index.md
├── make.bat
├── reference
│ ├── index.md
│ ├── rhoknp.cli.cli.md
│ ├── rhoknp.cli.md
│ ├── rhoknp.cli.serve.md
│ ├── rhoknp.cli.show.md
│ ├── rhoknp.cli.stats.md
│ ├── rhoknp.cohesion.argument.md
│ ├── rhoknp.cohesion.coreference.md
│ ├── rhoknp.cohesion.discourse.md
│ ├── rhoknp.cohesion.exophora.md
│ ├── rhoknp.cohesion.md
│ ├── rhoknp.cohesion.pas.md
│ ├── rhoknp.cohesion.predicate.md
│ ├── rhoknp.cohesion.rel.md
│ ├── rhoknp.md
│ ├── rhoknp.processors.jumanpp.md
│ ├── rhoknp.processors.knp.md
│ ├── rhoknp.processors.kwja.md
│ ├── rhoknp.processors.md
│ ├── rhoknp.processors.processor.md
│ ├── rhoknp.processors.senter.md
│ ├── rhoknp.props.dependency.md
│ ├── rhoknp.props.feature.md
│ ├── rhoknp.props.md
│ ├── rhoknp.props.memo.md
│ ├── rhoknp.props.named_entity.md
│ ├── rhoknp.props.semantics.md
│ ├── rhoknp.units.base_phrase.md
│ ├── rhoknp.units.clause.md
│ ├── rhoknp.units.document.md
│ ├── rhoknp.units.md
│ ├── rhoknp.units.morpheme.md
│ ├── rhoknp.units.phrase.md
│ ├── rhoknp.units.sentence.md
│ ├── rhoknp.units.unit.md
│ ├── rhoknp.utils.md
│ └── rhoknp.utils.reader.md
└── requirements.txt
├── examples
├── README.md
├── apply_jumanpp.py
├── apply_knp.py
├── apply_kwja.py
├── load_jumanpp.py
├── load_knp.py
├── use_coreference_resolution.py
├── use_dependency_parsing.py
├── use_discourse_relation_analysis.py
├── use_morphological_analysis.py
├── use_named_entity_recognition.py
└── use_predicate_argument_structure_analysis.py
├── pyproject.toml
├── src
└── rhoknp
│ ├── __init__.py
│ ├── cli
│ ├── __init__.py
│ ├── cat.py
│ ├── cli.py
│ ├── serve.py
│ ├── show.py
│ ├── static
│ │ ├── css
│ │ │ └── style.css
│ │ ├── images
│ │ │ ├── apple-touch-icon.png
│ │ │ └── favicon.ico
│ │ └── js
│ │ │ └── script.js
│ ├── stats.py
│ └── templates
│ │ ├── base.jinja2
│ │ ├── components
│ │ ├── dependency_parsing.jinja2
│ │ ├── discourse_parsing.jinja2
│ │ ├── error.jinja2
│ │ ├── form.jinja2
│ │ ├── hide_all_button.jinja2
│ │ ├── morphological_analysis.jinja2
│ │ ├── named_entity_recognition.jinja2
│ │ ├── navbar.jinja2
│ │ ├── raw_input.jinja2
│ │ ├── raw_output.jinja2
│ │ ├── show_all_button.jinja2
│ │ ├── typo_correction.jinja2
│ │ └── word_splitting.jinja2
│ │ ├── jumanpp.jinja2
│ │ ├── knp.jinja2
│ │ └── kwja.jinja2
│ ├── cohesion
│ ├── __init__.py
│ ├── argument.py
│ ├── coreference.py
│ ├── discourse.py
│ ├── exophora.py
│ ├── pas.py
│ ├── predicate.py
│ └── rel.py
│ ├── processors
│ ├── __init__.py
│ ├── jumanpp.py
│ ├── knp.py
│ ├── kwja.py
│ ├── processor.py
│ └── senter.py
│ ├── props
│ ├── __init__.py
│ ├── dependency.py
│ ├── feature.py
│ ├── memo.py
│ ├── named_entity.py
│ └── semantics.py
│ ├── py.typed
│ ├── units
│ ├── __init__.py
│ ├── base_phrase.py
│ ├── clause.py
│ ├── document.py
│ ├── morpheme.py
│ ├── phrase.py
│ ├── sentence.py
│ └── unit.py
│ └── utils
│ ├── __init__.py
│ ├── comment.py
│ └── reader.py
├── tests
├── bin
│ ├── jumanpp-mock.sh
│ ├── knp-mock.sh
│ └── kwja-mock.sh
├── cli
│ ├── test_cat.py
│ ├── test_cli.py
│ ├── test_serve.py
│ ├── test_show.py
│ └── test_stats.py
├── cohesion
│ ├── test_argument.py
│ ├── test_coreference.py
│ ├── test_discourse.py
│ ├── test_exophora.py
│ ├── test_pas.py
│ ├── test_predicate.py
│ └── test_rel.py
├── data
│ ├── w201106-0000060050.knp
│ ├── w201106-0000060560.knp
│ ├── w201106-0000060877.knp
│ ├── w201106-0000074273.knp
│ └── wiki00100176.knp
├── processors
│ ├── test_jumanpp.py
│ ├── test_knp.py
│ ├── test_kwja.py
│ └── test_regex_senter.py
├── props
│ ├── test_features.py
│ ├── test_memo.py
│ ├── test_named_entity.py
│ └── test_semantics.py
├── units
│ ├── test_base_phrase.py
│ ├── test_clause.py
│ ├── test_document.py
│ ├── test_morpheme.py
│ ├── test_phrase.py
│ └── test_sentence.py
└── utils
│ ├── test_comment.py
│ └── test_reader.py
└── uv.lock
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "uv"
9 | directory: "/"
10 | schedule:
11 | interval: "monthly"
12 | timezone: "Asia/Tokyo"
13 | groups:
14 | dependencies:
15 | patterns:
16 | - "*"
17 | target-branch: "develop"
18 | ignore:
19 | - dependency-name: "*"
20 | update-types: ["version-update:semver-major"]
21 |
22 | - package-ecosystem: "github-actions"
23 | # Workflow files stored in the
24 | # default location of `.github/workflows`
25 | directory: "/"
26 | schedule:
27 | interval: "monthly"
28 | timezone: "Asia/Tokyo"
29 | target-branch: "develop"
30 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | name: Build the project
8 | runs-on: ${{ matrix.os }}
9 | strategy:
10 | max-parallel: 18
11 | fail-fast: false
12 | matrix:
13 | os: [ubuntu-latest, macos-latest, windows-latest]
14 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
15 | steps:
16 | - name: Checkout repository
17 | uses: actions/checkout@v4
18 | - name: Set up Python ${{ matrix.python-version }}
19 | uses: actions/setup-python@v5
20 | id: setup-python
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Install uv
24 | uses: astral-sh/setup-uv@v6
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Build package
28 | run: uv build -o dist
29 | - name: Install virtualenv and create virtual environment
30 | run: |
31 | uv tool install virtualenv
32 | virtualenv -p ${{ matrix.python-version }} .venv
33 | - name: Install rhoknp from wheel (non-Windows)
34 | if: ${{ matrix.os != 'windows-latest' }}
35 | run: |
36 | source .venv/bin/activate
37 | wheelFile=$(ls dist/*.whl)
38 | pip install "${wheelFile}[cli]"
39 | - name: Install rhoknp from wheel (Windows)
40 | if: ${{ matrix.os == 'windows-latest' }}
41 | run: |
42 | .\.venv\Scripts\Activate
43 | $wheelFile = (Get-ChildItem -Path dist -Filter *.whl).FullName
44 | pip install "${wheelFile}[cli]"
45 | shell: pwsh
46 | - name: Run rhoknp (non-Windows)
47 | if: ${{ matrix.os != 'windows-latest' }}
48 | run: |
49 | source .venv/bin/activate
50 | rhoknp --version
51 | rhoknp --help
52 | - name: Run rhoknp (Windows)
53 | if: ${{ matrix.os == 'windows-latest' }}
54 | run: |
55 | .\.venv\Scripts\Activate
56 | rhoknp --version
57 | rhoknp --help
58 | shell: pwsh
59 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: ["main"]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: ["main"]
20 | schedule:
21 | - cron: "37 23 * * 0"
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: ["python"]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v4
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v3
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 |
52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
53 | # queries: security-extended,security-and-quality
54 |
55 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
56 | # If this step fails, then you should remove it and run the build manually (see below)
57 | - name: Autobuild
58 | uses: github/codeql-action/autobuild@v3
59 |
60 | # ℹ️ Command-line programs to run using the OS shell.
61 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
62 |
63 | # If the Autobuild fails above, remove it and uncomment the following three lines.
64 |       #    and modify them (or add more) to build your code if your project requires it; please refer to the EXAMPLE below for guidance.
65 |
66 | # - run: |
67 | # echo "Run, Build Application using script"
68 | # ./location_of_script_within_repo/buildscript.sh
69 |
70 | - name: Perform CodeQL Analysis
71 | uses: github/codeql-action/analyze@v3
72 | with:
73 | category: "/language:${{matrix.language}}"
74 |
--------------------------------------------------------------------------------
/.github/workflows/dependabot-auto-merge.yml:
--------------------------------------------------------------------------------
1 | # https://docs.github.com/en/code-security/dependabot/working-with-dependabot/automating-dependabot-with-github-actions#enable-auto-merge-on-a-pull-request
2 | name: Dependabot auto-merge
3 | on: pull_request_target
4 |
5 | permissions:
6 | pull-requests: write
7 | contents: write
8 |
9 | jobs:
10 | dependabot:
11 | runs-on: ubuntu-latest
12 | if: ${{ github.actor == 'dependabot[bot]' }}
13 | steps:
14 | - name: Dependabot metadata
15 | id: metadata
16 | uses: dependabot/fetch-metadata@v2
17 | with:
18 | github-token: "${{ secrets.GITHUB_TOKEN }}"
19 | - name: Wait for tests to pass
20 | uses: lewagon/wait-on-check-action@v1.3.4
21 | with:
22 | ref: ${{ github.event.pull_request.head.sha }}
23 | # running-workflow-name: "Test" # this condition does not work
24 | check-regexp: Run tests with pytest.*
25 | repo-token: ${{ secrets.GITHUB_TOKEN }}
26 | wait-interval: 60 # Check every 60 seconds
27 | - name: Enable auto-merge for Dependabot PRs
28 | if: ${{ steps.metadata.outputs.update-type != 'version-update:semver-major' }}
29 | run: gh pr merge --auto --merge "$PR_URL"
30 | env:
31 | PR_URL: ${{github.event.pull_request.html_url}}
32 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
33 |
--------------------------------------------------------------------------------
/.github/workflows/docs-requirements.yml:
--------------------------------------------------------------------------------
1 | name: Generate `docs/requirements.txt`
2 |
3 | on:
4 | push:
5 | paths:
6 | - "pyproject.toml"
7 | - "uv.lock"
8 |
9 | jobs:
10 | generate-requirements:
11 | name: Generate `docs/requirements.txt` from pyproject.toml
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 | - name: Install uv
16 | uses: astral-sh/setup-uv@v6
17 | - name: Export requirements.txt
18 | run: uv export --only-group docs --no-annotate --no-hashes -o docs/requirements.txt
19 | - name: Commit and push changes
20 | uses: stefanzweifel/git-auto-commit-action@v5
21 | with:
22 | commit_message: update docs/requirements.txt
23 | # Optional glob pattern of files which should be added to the commit
24 | file_pattern: docs/requirements.txt
25 | # Optional. Prevents the shell from expanding filenames.
26 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html
27 | disable_globbing: true
28 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: Lint
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | lint:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - name: Checkout repository
10 | uses: actions/checkout@v4
11 | - name: Set up Python 3.9
12 | uses: actions/setup-python@v5
13 | with:
14 | python-version: "3.9"
15 | - name: Run linters
16 | run: |
17 | pipx install pre-commit
18 | pre-commit run --all-files
19 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 |
3 | on:
4 | push:
5 | tags:
6 | - "v*.*.*"
7 | workflow_dispatch:
8 |
9 | jobs:
10 | build-publish:
11 | runs-on: ubuntu-latest
12 | # https://github.com/pypa/gh-action-pypi-publish?tab=readme-ov-file#trusted-publishing
13 | environment:
14 | name: pypi
15 | url: https://pypi.org/p/rhoknp
16 | permissions:
17 | id-token: write
18 | steps:
19 | - uses: actions/checkout@v4
20 | - name: Install uv
21 | uses: astral-sh/setup-uv@v6
22 | - name: Build package
23 | run: uv build -o dist
24 | - name: Publish package
25 | uses: pypa/gh-action-pypi-publish@release/v1
26 | with:
27 | verbose: true
28 |
--------------------------------------------------------------------------------
/.github/workflows/pylock.yml:
--------------------------------------------------------------------------------
1 | name: Generate pylock.toml
2 |
3 | on:
4 | push:
5 | paths:
6 | - "pyproject.toml"
7 | - "uv.lock"
8 |
9 | jobs:
10 | generate-pylock:
11 | name: Generate pylock.toml from pyproject.toml
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 | - name: Install uv
16 | uses: astral-sh/setup-uv@v6
17 | - name: Export pylock.toml
18 | run: uv export -o pylock.toml
19 | - name: Commit and push changes
20 | uses: stefanzweifel/git-auto-commit-action@v5
21 | with:
22 |           commit_message: update pylock.toml
23 |           # Optional glob pattern of files which should be added to the commit
24 |           file_pattern: pylock.toml
25 | # Optional. Prevents the shell from expanding filenames.
26 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html
27 | disable_globbing: true
28 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | tags:
6 | - "v*.*.*"
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | permissions:
12 | contents: write
13 | steps:
14 | - name: Checkout code
15 | uses: actions/checkout@v4
16 | - name: Create release
17 | id: create_release
18 | uses: softprops/action-gh-release@v2
19 | with:
20 | tag_name: ${{ github.ref_name }}
21 | draft: false
22 | prerelease: false
23 |
--------------------------------------------------------------------------------
/.github/workflows/test-example.yml:
--------------------------------------------------------------------------------
1 | name: TestExample
2 |
3 | on:
4 | schedule:
5 |     - cron: "0 3 */16 * *" # Runs at 3:00 UTC on the 1st and 17th of every month.
6 | workflow_dispatch:
7 |
8 | jobs:
9 | test-example:
10 | name: Run tests for examples
11 | container: kunlp/jumanpp-knp:ubuntu22.04
12 | runs-on: ubuntu-22.04
13 | strategy:
14 | max-parallel: 5
15 | fail-fast: false
16 | matrix:
17 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
18 | steps:
19 | - name: Checkout repository
20 | uses: actions/checkout@v4
21 | - name: Install required apt packages
22 | run: |
23 | export DEBIAN_FRONTEND=noninteractive
24 | apt-get update -yq
25 | apt-get install -yq curl build-essential libsqlite3-dev libffi-dev
26 | - name: Set up Python ${{ matrix.python-version }}
27 | uses: actions/setup-python@v5
28 | with:
29 | python-version: ${{ matrix.python-version }}
30 | - name: Install uv
31 | uses: astral-sh/setup-uv@v6
32 | with:
33 | python-version: ${{ matrix.python-version }}
34 | - name: Install dependencies
35 | run: |
36 | uv sync --group test --extra cli --no-cache
37 | - name: Install KWJA
38 | # KWJA does not support Python 3.13
39 |         if: ${{ matrix.python-version != '3.13' && matrix.python-version != '3.13t' }}
40 | run: |
41 | uv tool install kwja
42 | - name: Run tests for all files under examples/apply_*.py
43 | shell: bash
44 | run: |
45 | for example in examples/apply_*.py; do
46 | if [[ -f "${example}" ]]; then
47 | echo "Running tests for ${example}"
48 | uv run python "${example}" "こんにちは"
49 | fi
50 | done
51 | - name: Run tests for examples/use_*.py
52 | shell: bash
53 | run: |
54 | for example in examples/use_*.py; do
55 | if [[ -f "${example}" ]]; then
56 | echo "Running tests for ${example}"
57 | uv run python "${example}" "こんにちは"
58 | fi
59 | done
60 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on: [push, pull_request, workflow_dispatch]
4 |
5 | jobs:
6 | test:
7 | name: Run tests with pytest
8 | container: kunlp/jumanpp-knp:ubuntu22.04
9 | runs-on: ubuntu-22.04
10 | strategy:
11 | max-parallel: 6
12 | fail-fast: false
13 | matrix:
14 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
15 | steps:
16 | - name: Checkout repository
17 | uses: actions/checkout@v4
18 | - name: Install required apt packages
19 | run: |
20 | export DEBIAN_FRONTEND=noninteractive
21 | apt-get update -yq
22 | apt-get install -yq curl build-essential libsqlite3-dev libffi-dev libssl-dev
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install uv
28 | uses: astral-sh/setup-uv@v6
29 | with:
30 | python-version: ${{ matrix.python-version }}
31 | - name: Install dependencies
32 | run: uv sync --extra cli --no-cache
33 | - name: Install KWJA
34 | # KWJA does not support Python 3.13
35 | if: ${{ matrix.python-version != '3.13' && matrix.python-version != '3.13t' }}
36 | run: |
37 | uv tool install kwja
38 | kwja --model-size tiny --text "テスト"
39 | - name: Run tests with coverage
40 | if: ${{ matrix.python-version == '3.10' }}
41 | run: |
42 | uv run pytest --cov=./ --cov-report=xml -v ./tests
43 | - name: Run tests without coverage
44 | if: ${{ matrix.python-version != '3.10' }}
45 | run: |
46 | uv run pytest -v ./tests
47 | - name: Install git for codecov
48 | if: ${{ matrix.python-version == '3.10' }}
49 | run: |
50 | apt-get install -yq git
51 | - name: Upload coverage to Codecov
52 | if: ${{ matrix.python-version == '3.10' }}
53 | uses: codecov/codecov-action@v5
54 | with:
55 | files: ./coverage.xml
56 | name: codecov-umbrella
57 | token: ${{ secrets.CODECOV_TOKEN }}
58 | verbose: true
59 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .nox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 | .pytest_cache/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 | db.sqlite3
59 |
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 |
64 | # Scrapy stuff:
65 | .scrapy
66 |
67 | # Sphinx documentation
68 | docs/_build/
69 |
70 | # PyBuilder
71 | target/
72 |
73 | # Jupyter Notebook
74 | .ipynb_checkpoints
75 |
76 | # IPython
77 | profile_default/
78 | ipython_config.py
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # SageMath parsed files
87 | *.sage.py
88 |
89 | # Environments
90 | .env
91 | .venv
92 | env/
93 | venv/
94 | ENV/
95 | env.bak/
96 | venv.bak/
97 |
98 | # Spyder project settings
99 | .spyderproject
100 | .spyproject
101 |
102 | # Rope project settings
103 | .ropeproject
104 |
105 | # mkdocs documentation
106 | /site
107 |
108 | # mypy
109 | .mypy_cache/
110 | .dmypy.json
111 | dmypy.json
112 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v5.0.0
4 | hooks:
5 | - id: end-of-file-fixer
6 | - id: trailing-whitespace
7 | - id: check-docstring-first
8 | - id: check-yaml
9 | - id: check-toml
10 | - id: check-added-large-files
11 | exclude: "assets/logo.*"
12 | - repo: https://github.com/astral-sh/ruff-pre-commit
13 | rev: v0.11.12
14 | hooks:
15 | - id: ruff
16 | args: [--fix, --exit-non-zero-on-fix]
17 | - id: ruff-format
18 | - repo: https://github.com/pre-commit/mirrors-mypy
19 | rev: v1.16.0
20 | hooks:
21 | - id: mypy
22 | additional_dependencies:
23 | - fastapi
24 | - jinja2
25 | - pygments
26 | - rich
27 | - typer-slim
28 | - types-click
29 | - types-PyYAML
30 | - typing-extensions
31 | - uvicorn
32 | - repo: https://github.com/pre-commit/mirrors-prettier
33 | rev: v4.0.0-alpha.8
34 | hooks:
35 | - id: prettier
36 | - repo: https://github.com/Riverside-Healthcare/djLint
37 | rev: v1.36.4
38 | hooks:
39 | - id: djlint-jinja
40 | - id: djlint-reformat-jinja
41 | - repo: https://github.com/rhysd/actionlint
42 | rev: v1.7.7
43 | hooks:
44 | - id: actionlint
45 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-22.04
5 | tools:
6 | python: "3.9"
7 |
8 | sphinx:
9 | configuration: docs/conf.py
10 |
11 | python:
12 | install:
13 | - requirements: docs/requirements.txt
14 |
--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
1 | # Authors
2 |
3 | Maintained with:
4 |
5 | - [Hirokazu Kiyomaru](mailto:h.kiyomaru@gmail.com)
6 | - [Nobuhiro Ueda](mailto:ueda@nlp.ist.i.kyoto-u.ac.jp)
7 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | title: "rhoknp: Yet another Python binding for Juman++/KNP/KWJA"
4 | authors:
5 | - family-names: Kiyomaru
6 | given-names: Hirokazu
7 | - family-names: Ueda
8 | given-names: Nobuhiro
9 | version: 1.6.0
10 | repository-code: "https://github.com/ku-nlp/rhoknp"
11 | date-released: 2023-11-08
12 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to _rhoknp_
2 |
3 | Thank you for your interest in improving _rhoknp_!
4 | Our [contributing documentation](https://rhoknp.readthedocs.io/en/latest/contributing/index.html) contains what you need to know about contributing to _rhoknp_.
5 | We look forward to your contributions!
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Kyoto University
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | rhoknp: Yet another Python binding for Juman++/KNP/KWJA
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | ---
20 |
21 | **Documentation**: [https://rhoknp.readthedocs.io/en/latest/](https://rhoknp.readthedocs.io/en/latest/)
22 |
23 | **Source Code**: [https://github.com/ku-nlp/rhoknp](https://github.com/ku-nlp/rhoknp)
24 |
25 | ---
26 |
27 | _rhoknp_ is a Python binding for [Juman++](https://github.com/ku-nlp/jumanpp), [KNP](https://github.com/ku-nlp/knp), and [KWJA](https://github.com/ku-nlp/kwja).[^1]
28 |
29 | [^1]: The logo was generated by OpenAI DALL·E 2.
30 |
31 | ```python
32 | import rhoknp
33 |
34 | # Perform morphological analysis by Juman++
35 | jumanpp = rhoknp.Jumanpp()
36 | sentence = jumanpp.apply_to_sentence(
37 | "電気抵抗率は電気の通しにくさを表す物性値である。"
38 | )
39 |
40 | # Access to the result
41 | for morpheme in sentence.morphemes: # a.k.a. keitai-so
42 | ...
43 |
44 | # Save the result
45 | with open("result.jumanpp", "wt") as f:
46 | f.write(sentence.to_jumanpp())
47 |
48 | # Load the result
49 | with open("result.jumanpp", "rt") as f:
50 | sentence = rhoknp.Sentence.from_jumanpp(f.read())
51 | ```
52 |
53 | ## Requirements
54 |
55 | - Python 3.9+
56 | - (Optional) [Juman++](https://github.com/ku-nlp/jumanpp) v2.0.0-rc3+
57 | - (Optional) [KNP](https://github.com/ku-nlp/knp) 5.0+
58 | - (Optional) [KWJA](https://github.com/ku-nlp/kwja) 1.0.0+
59 |
60 | ## Installation
61 |
62 | ```shell
63 | pip install rhoknp
64 | ```
65 |
66 | ## Quick tour
67 |
68 | Let's begin by using Juman++ with rhoknp.
69 | Here, we present a simple example demonstrating how Juman++ can be used to analyze a sentence.
70 |
71 | ```python
72 | # Perform morphological analysis by Juman++
73 | jumanpp = rhoknp.Jumanpp()
74 | sentence = jumanpp.apply_to_sentence("電気抵抗率は電気の通しにくさを表す物性値である。")
75 | ```
76 |
77 | You can easily access the individual morphemes that make up the sentence.
78 |
79 | ```python
80 | for morpheme in sentence.morphemes: # a.k.a. keitai-so
81 | ...
82 | ```
83 |
84 | Sentence objects can be saved in the JUMAN format.
85 |
86 | ```python
87 | # Save the sentence in the JUMAN format
88 | with open("sentence.jumanpp", "wt") as f:
89 | f.write(sentence.to_jumanpp())
90 |
91 | # Load the sentence
92 | with open("sentence.jumanpp", "rt") as f:
93 | sentence = rhoknp.Sentence.from_jumanpp(f.read())
94 | ```
95 |
96 | Almost the same APIs are available for KNP.
97 |
98 | ```python
99 | # Perform language analysis by KNP
100 | knp = rhoknp.KNP()
101 | sentence = knp.apply_to_sentence("電気抵抗率は電気の通しにくさを表す物性値である。")
102 | ```
103 |
104 | KNP performs language analysis at multiple levels.
105 |
106 | ```python
107 | for clause in sentence.clauses: # a.k.a., setsu
108 | ...
109 | for phrase in sentence.phrases: # a.k.a. bunsetsu
110 | ...
111 | for base_phrase in sentence.base_phrases: # a.k.a. kihon-ku
112 | ...
113 | for morpheme in sentence.morphemes: # a.k.a. keitai-so
114 | ...
115 | ```
116 |
117 | Sentence objects can be saved in the KNP format.
118 |
119 | ```python
120 | # Save the sentence in the KNP format
121 | with open("sentence.knp", "wt") as f:
122 | f.write(sentence.to_knp())
123 |
124 | # Load the sentence
125 | with open("sentence.knp", "rt") as f:
126 | sentence = rhoknp.Sentence.from_knp(f.read())
127 | ```
128 |
129 | Furthermore, rhoknp provides convenient APIs for document-level language analysis.
130 |
131 | ```python
132 | document = rhoknp.Document.from_raw_text(
133 | "電気抵抗率は電気の通しにくさを表す物性値である。単に抵抗率とも呼ばれる。"
134 | )
135 | # If you know sentence boundaries, you can use `Document.from_sentences` instead.
136 | document = rhoknp.Document.from_sentences(
137 | [
138 | "電気抵抗率は電気の通しにくさを表す物性値である。",
139 | "単に抵抗率とも呼ばれる。",
140 | ]
141 | )
142 | ```
143 |
144 | Document objects can be handled in a similar manner as Sentence objects.
145 |
146 | ```python
147 | # Perform morphological analysis by Juman++
148 | document = jumanpp.apply_to_document(document)
149 |
150 | # Access language units in the document
151 | for sentence in document.sentences:
152 | ...
153 | for morpheme in document.morphemes:
154 | ...
155 |
156 | # Save language analysis by Juman++
157 | with open("document.jumanpp", "wt") as f:
158 | f.write(document.to_jumanpp())
159 |
160 | # Load language analysis by Juman++
161 | with open("document.jumanpp", "rt") as f:
162 | document = rhoknp.Document.from_jumanpp(f.read())
163 | ```
164 |
165 | For more information, please refer to the [examples](./examples) and [documentation](https://rhoknp.readthedocs.io/en/latest/).
166 |
167 | ## Main differences from [pyknp](https://github.com/ku-nlp/pyknp/)
168 |
169 | [_pyknp_](https://pypi.org/project/pyknp/) serves as the official Python binding for Juman++ and KNP.
170 | In the development of rhoknp, we redesigned the API, considering the current use cases of pyknp.
171 | The key differences between the two are as follows:
172 |
173 | - **Support for document-level language analysis**: rhoknp allows you to load and instantiate the results of document-level language analysis, including cohesion analysis and discourse relation analysis.
174 | - **Strict type-awareness**: rhoknp has been thoroughly annotated with type annotations, ensuring strict type checking and improved code clarity.
175 | - **Comprehensive test suite**: rhoknp is extensively tested with a comprehensive test suite. You can view the code coverage report on [Codecov](https://app.codecov.io/gh/ku-nlp/rhoknp).
176 |
177 | ## License
178 |
179 | MIT
180 |
181 | ## Contributing
182 |
183 | We warmly welcome contributions to rhoknp.
184 | You can get started by reading the [contribution guide](https://rhoknp.readthedocs.io/en/latest/contributing/index.html).
185 |
186 | ## Reference
187 |
188 | - [KNP FORMAT](http://cr.fvcrc.i.nagoya-u.ac.jp/~sasano/knp/format.html)
189 | - [KNP - KUROHASHI-CHU-MURAWAKI LAB](https://nlp.ist.i.kyoto-u.ac.jp/?KNP)
190 |
--------------------------------------------------------------------------------
/assets/KNP.sublime-syntax:
--------------------------------------------------------------------------------
1 | %YAML 1.2
2 | ---
3 | name: KNP
4 | file_extensions: [knp]
5 | scope: source.knp
6 |
7 | contexts:
8 | main:
9 | - match: '^[^+*\#\"<> ]+'
10 | scope: variable
11 | - match: '(?<=\s)[^+\#\"<> ]+'
12 | scope: variable
13 | - match: "<"
14 | scope: keyword
15 | push: feature
16 | - match: ^\+
17 | scope: keyword
18 | push: tag_bnst
19 | - match: ^\*
20 | scope: keyword
21 | push: tag_bnst
22 | - match: ^EOS$
23 | scope: constant
24 | - match: \"
25 | scope: string
26 | push: string
27 | - match: ^\#
28 | scope: comment
29 | push: comment
30 | string:
31 | - match: '[^\"]+'
32 | scope: string
33 | - match: \"
34 | scope: string
35 | pop: true
36 | tag_bnst:
37 | - match: (-1|\d+)[DPAI]
38 | scope: constant.language
39 | - match: "<"
40 | scope: keyword
41 | push: feature
42 | - match: $
43 | pop: true
44 | feature:
45 | - match: \"
46 | scope: string
47 | push: string
48 | - match: ">"
49 | scope: keyword
50 | pop: true
51 | - match: ":"
52 | scope: keyword
53 | push: feature_value
54 | - match: "="
55 | scope: keyword
56 | - match: '[^ :><\"=]+'
57 | scope: storage.type
58 | feature_value:
59 | - match: "[^><]+(?=>)"
60 | scope: variable.parameter
61 | pop: true
62 | comment:
63 | - match: "S-ID:"
64 | scope: comment
65 | push: sid
66 | - match: \S+
67 | scope: comment
68 | - match: $
69 | pop: true
70 | sid:
71 | - match: \S+
72 | scope: constant.numeric
73 | pop: true
74 |
--------------------------------------------------------------------------------
/assets/logo-original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/assets/logo-original.png
--------------------------------------------------------------------------------
/assets/logo-wide.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/assets/logo-wide.xcf
--------------------------------------------------------------------------------
/assets/logo.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/assets/logo.xcf
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_static/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/favicon-16x16.png
--------------------------------------------------------------------------------
/docs/_static/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/favicon-32x32.png
--------------------------------------------------------------------------------
/docs/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/favicon.ico
--------------------------------------------------------------------------------
/docs/_static/logo-wide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/logo-wide.png
--------------------------------------------------------------------------------
/docs/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/docs/_static/logo.png
--------------------------------------------------------------------------------
/docs/authors.md:
--------------------------------------------------------------------------------
1 | ```{include} ../AUTHORS.md
2 |
3 | ```
4 |
--------------------------------------------------------------------------------
/docs/cli/index.md:
--------------------------------------------------------------------------------
1 | # CLI Tools
2 |
3 | _rhoknp_ provides a command-line interface (CLI).
4 |
5 | Before using the CLI, you need to install _rhoknp_ with the following command:
6 |
7 | ```{eval-rst}
8 | .. prompt::
9 | :prompts: $
10 |
11 | pip install rhoknp[cli]
12 | ```
13 |
14 | ## cat
15 |
16 | The `cat` command prints KNP files with syntax highlighting.
17 |
18 | ```{eval-rst}
19 | .. prompt::
20 | :prompts: $
21 |
22 | rhoknp cat [--dark]
23 | ```
24 |
25 | ## serve
26 |
27 | The `serve` command starts a web server to provide a playground for the given language analyzer.
28 |
29 | ```{eval-rst}
30 | .. prompt::
31 | :prompts: $
32 |
33 | rhoknp serve {jumanpp|knp|kwja} [--host HOST] [--port PORT]
34 | ```
35 |
36 | ## show
37 |
38 | The `show` command shows the given KNP file in a tree format.
39 |
40 | ```{eval-rst}
41 | .. prompt::
42 | :prompts: $
43 |
44 | rhoknp show [--pos] [--rel]
45 | ```
46 |
47 | ## stats
48 |
49 | The `stats` command shows the statistics of the given KNP file.
50 |
51 | ```{eval-rst}
52 | .. prompt::
53 | :prompts: $
54 |
55 | rhoknp stats [--json]
56 | ```
57 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import sys
14 | from pathlib import Path
15 |
16 | sys.path.insert(0, str(Path("../src").resolve()))  # make the in-repo rhoknp package importable for autodoc without installing it
17 |
18 |
19 | # -- Project information -----------------------------------------------------
20 |
21 | project = "rhoknp"
22 | copyright = "2021, Hirokazu Kiyomaru and Nobuhiro Ueda"  # Sphinx requires this exact name; intentionally shadows the builtin
23 | author = "Hirokazu Kiyomaru and Nobuhiro Ueda"
24 |
25 |
26 | # -- General configuration ---------------------------------------------------
27 |
28 | # Add any Sphinx extension module names here, as strings. They can be
29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
30 | # ones.
31 | extensions = [
32 |     "sphinx.ext.autodoc",
33 |     "sphinx.ext.viewcode",
34 |     "sphinx.ext.napoleon",
35 |     "sphinx_copybutton",
36 |     "sphinx-prompt",
37 |     "myst_parser",
38 | ]
39 |
40 | # sphinx.ext.autodoc
41 | autodoc_default_options = {
42 |     "members": True,
43 |     "show-inheritance": True,
44 |     "undoc-members": True,
45 |     "exclude-members": ",".join(["__weakref__", "count", "parent_unit", "child_units"]),  # hide noisy structural/internal members from the API docs
46 |     "member-order": "bysource",
47 | }
48 |
49 | # sphinx_copybutton
50 | copybutton_prompt_text = r">>> |\.\.\. |\$ "  # prompts stripped from code blocks when copied
51 | copybutton_prompt_is_regexp = True
52 |
53 | # Add any paths that contain templates here, relative to this directory.
54 | templates_path = ["_templates"]
55 |
56 | # List of patterns, relative to source directory, that match files and
57 | # directories to ignore when looking for source files.
58 | # This pattern also affects html_static_path and html_extra_path.
59 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
60 |
61 |
62 | # -- Options for HTML output -------------------------------------------------
63 |
64 | # The theme to use for HTML and HTML Help pages. See the documentation for
65 | # a list of builtin themes.
66 | #
67 | html_theme = "furo"
68 | html_logo = "_static/logo-wide.png"
69 |
70 | # Add any paths that contain custom static files (such as style sheets) here,
71 | # relative to this directory. They are copied after the builtin static files,
72 | # so a file named "default.css" will overwrite the builtin "default.css".
73 | html_static_path = ["_static"]
--------------------------------------------------------------------------------
/docs/contributing/index.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Thank you for your interest in improving _rhoknp_!
4 | This page gives an overview of how to contribute to the _rhoknp_ project.
5 |
6 | ## Development Environment
7 |
8 | Development should be done using the latest version of Python.
9 | As of this writing, it is Python 3.10.
10 |
11 | Install the development dependencies using [uv](https://docs.astral.sh/uv/).
12 |
13 | ```{eval-rst}
14 | .. prompt::
15 | :prompts: $
16 |
17 | uv sync
18 | pre-commit install
19 | ```
20 |
21 | ## Submitting a Pull Request
22 |
23 | Before submitting a pull request, run the linters and tests.
24 |
25 | ```{eval-rst}
26 | .. prompt::
27 | :prompts: $
28 |
29 | uv run pre-commit run --all-files
30 | uv run pytest
31 | ```
32 |
33 | ## Testing
34 |
35 | If you are adding a new feature, please add a test for it.
36 | When the feature is large, first open an issue to discuss the idea.
37 |
38 | If you are fixing a bug, please add a test that exposes the bug and fails before applying your fix.
39 |
--------------------------------------------------------------------------------
/docs/format/index.md:
--------------------------------------------------------------------------------
1 | # Juman++/KNP Format
2 |
3 | This page describes the format of the result of Juman++ and KNP.
4 |
5 | ## Juman++
6 |
7 | Juman++ is a morphological analyzer for Japanese.
8 | We show an example of the result of Juman++:
9 |
10 | ```
11 | # Language analysis of "麻生太郎はコーヒーを買って飲んだ。"
12 | 麻生 あそう 麻生 名詞 6 人名 5 * 0 * 0 "人名:日本:姓:135:0.00166"
13 | 太郎 たろう 太郎 名詞 6 人名 5 * 0 * 0 "人名:日本:名:45:0.00106"
14 | は は は 助詞 9 副助詞 2 * 0 * 0 NIL
15 | コーヒー こーひー コーヒー 名詞 6 普通名詞 1 * 0 * 0 "代表表記:珈琲/こーひー ドメイン:料理・食事 カテゴリ:人工物-食べ物"
16 | を を を 助詞 9 格助詞 1 * 0 * 0 NIL
17 | 買って かって 買う 動詞 2 * 0 子音動詞ワ行 12 タ系連用テ形 14 "代表表記:買う/かう ドメイン:家庭・暮らし;ビジネス 反義:動詞:売る/うる"
18 | 飲んだ のんだ 飲む 動詞 2 * 0 子音動詞マ行 9 タ形 10 "代表表記:飲む/のむ ドメイン:料理・食事"
19 | 。 。 。 特殊 1 句点 1 * 0 * 0 NIL
20 | EOS
21 | ```
22 |
23 | Each line represents a morpheme (a.k.a. _keitai-so_) and is formatted as `[surface form] [reading] [lemma] [pos] [pos ID] [pos subcategory] [pos subcategory ID] [conjugation type] [conjugation type ID] [conjugation form] [conjugation form ID] [semantic information]`.
24 | For example, `飲んだ のんだ 飲む 動詞 2 * 0 子音動詞マ行 9 タ形 10 "代表表記:飲む/のむ ドメイン:料理・食事"` indicates that the surface form is `飲んだ`, the reading is `のんだ`, the lemma is `飲む`, and the pos (part-of-speech) is `動詞`, and so forth.
25 |
26 | ## KNP
27 |
28 | KNP is a Japanese dependency parser.
29 | We show an example of the result of KNP:
30 |
31 | ```
32 | # Language analysis of "麻生太郎はコーヒーを買って飲んだ。"
33 | * 3D <文頭><人名><ハ><助詞><体言><係:未格><提題><区切:3-5><主題表現><格要素><連用要素><正規化代表表記:麻生/あそう+太郎/たろう><主辞代表表記:太郎/たろう>
34 | + 1D <文節内><係:文節内><文頭><人名><体言><名詞項候補><先行詞候補><正規化代表表記:麻生/あそう>
35 | 麻生 あそう 麻生 名詞 6 人名 5 * 0 * 0 "人名:日本:姓:135:0.00166 疑似代表表記 代表表記:麻生/あそう" <人名:日本:姓:135:0.00166><疑似代表表記><代表表記:麻生/あそう><正規化代表表記:麻生/あそう><漢字><かな漢字><名詞相当語><文頭><自立><内容語><タグ単位始><文節始><固有キー><用言表記先頭><用言表記末尾><用言意味表記末尾>
36 | + 4D <人名><ハ><助詞><体言><係:未格><提題><区切:3-5><主題表現><格要素><連用要素><名詞項候補><先行詞候補><正規化代表表記:太郎/たろう><主辞代表表記:太郎/たろう><解析格:ガ>
37 | 太郎 たろう 太郎 名詞 6 人名 5 * 0 * 0 "人名:日本:名:45:0.00106 疑似代表表記 代表表記:太郎/たろう" <人名:日本:名:45:0.00106><疑似代表表記><代表表記:太郎/たろう><正規化代表表記:太郎/たろう><漢字><かな漢字><名詞相当語><自立><複合←><内容語><タグ単位始><固有キー><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾>
38 | は は は 助詞 9 副助詞 2 * 0 * 0 NIL <かな漢字><ひらがな><付属>
39 | * 2D <ヲ><助詞><体言><係:ヲ格><区切:0-0><格要素><連用要素><正規化代表表記:珈琲/こーひー><主辞代表表記:珈琲/こーひー>
40 | + 3D <ヲ><助詞><体言><係:ヲ格><区切:0-0><格要素><連用要素><名詞項候補><先行詞候補><正規化代表表記:珈琲/こーひー><主辞代表表記:珈琲/こーひー><解析格:ヲ>
41 | コーヒー こーひー コーヒー 名詞 6 普通名詞 1 * 0 * 0 "代表表記:珈琲/こーひー ドメイン:料理・食事 カテゴリ:人工物-食べ物" <代表表記:珈琲/こーひー><ドメイン:料理・食事><カテゴリ:人工物-食べ物><正規化代表表記:珈琲/こーひー><記英数カ><カタカナ><名詞相当語><自立><内容語><タグ単位始><文節始><固有キー><文節主辞>
42 | を を を 助詞 9 格助詞 1 * 0 * 0 NIL <かな漢字><ひらがな><付属>
43 | * 3D <用言:動><係:連用><レベル:A><区切:3-5><連用要素><連用節><動態述語><正規化代表表記:買う/かう><主辞代表表記:買う/かう>
44 | + 4D <用言:動><係:連用><レベル:A><区切:3-5><連用要素><連用節><動態述語><正規化代表表記:買う/かう><主辞代表表記:買う/かう><用言代表表記:買う/かう><節-区切><節-主辞><格関係2:ヲ:コーヒー><格解析結果:買う/かう:動1:ガ/U/-/-/-/-;ヲ/C/コーヒー/2/0/1;ニ/U/-/-/-/-;ト/U/-/-/-/-;デ/U/-/-/-/-;時間/U/-/-/-/-><標準用言代表表記:買う/かう>
45 | 買って かって 買う 動詞 2 * 0 子音動詞ワ行 12 タ系連用テ形 14 "代表表記:買う/かう ドメイン:家庭・暮らし;ビジネス 反義:動詞:売る/うる" <代表表記:買う/かう><ドメイン:家庭・暮らし;ビジネス><反義:動詞:売る/うる><正規化代表表記:買う/かう><かな漢字><活用語><自立><内容語><タグ単位始><文節始><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾>
46 | * -1D <文末><時制:過去><句点><用言:動><レベル:C><区切:5-5><係:文末><提題受:30><主節><格要素><連用要素><動態述語><正規化代表表記:飲む/のむ><主辞代表表記:飲む/のむ>
47 | + -1D <文末><時制:過去><句点><用言:動><レベル:C><区切:5-5><係:文末><提題受:30><主節><格要素><連用要素><動態述語><正規化代表表記:飲む/のむ><主辞代表表記:飲む/のむ><用言代表表記:飲む/のむ><節-区切><節-主辞><主題格:一人称優位><格関係1:ガ:太郎><格解析結果:飲む/のむ:動8:ガ/N/太郎/1/0/1;ヲ/U/-/-/-/-;ニ/U/-/-/-/-;デ/U/-/-/-/-;時間/U/-/-/-/-><標準用言代表表記:飲む/のむ>
48 | 飲んだ のんだ 飲む 動詞 2 * 0 子音動詞マ行 9 タ形 10 "代表表記:飲む/のむ ドメイン:料理・食事" <代表表記:飲む/のむ><ドメイン:料理・食事><正規化代表表記:飲む/のむ><かな漢字><活用語><表現文末><自立><内容語><タグ単位始><文節始><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾>
49 | 。 。 。 特殊 1 句点 1 * 0 * 0 NIL <英記号><記号><文末><付属>
50 | EOS
51 | ```
52 |
53 | The line starting with `*` represents the beginning of a phrase (a.k.a. _bunsetsu_) and is formatted as `* [parent phrase index][dependency type] [semantic information]`.
54 | For example, the line `* 3D <文頭><人名>` indicates that the phrase modifies the `3`rd phrase with the dependency type of `D` and includes the semantic information of `<文頭>` and `<人名>`.
55 |
56 | The line starting with `+` represents the beginning of a base-phrase (a.k.a. _kihon-ku_) and is formatted as `+ [parent base-phrase index][dependency type] [semantic information]`.
57 | For example, the line `+ 1D <文節内><係:文節内>` indicates that the base-phrase modifies the `1`st base-phrase with the dependency type of `D` and includes the semantic information of `<文節内>` and `<係:文節内>`.
58 |
59 | Lines with neither `*` nor `+` represent morphemes.
60 | The format is almost the same as Juman++'s, except that a column representing the semantic information is added at the end.
61 |
62 | ## Misc
63 |
64 | - Lines starting with `#` are comments.
65 | - `EOS` represents the end of the sentence.
66 |
67 | ---
68 |
69 | ## Reference
70 |
71 | - [KNP の基本的な出力の読み方 (in Japanese)](http://cr.fvcrc.i.nagoya-u.ac.jp/~sasano/knp/format.html)
72 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # rhoknp: Yet another Python binding for Juman++/KNP/KWJA
2 |
3 | [](https://github.com/ku-nlp/rhoknp/actions/workflows/test.yml)
4 | [](https://codecov.io/gh/ku-nlp/rhoknp)
5 | [](https://www.codefactor.io/repository/github/ku-nlp/rhoknp)
6 | [](https://pypi.org/project/rhoknp/)
7 | [](https://pypi.org/project/rhoknp/)
8 | [](https://rhoknp.readthedocs.io/en/latest/?badge=latest)
9 |
10 | **rhoknp** is a Python binding for [Juman++](https://github.com/ku-nlp/jumanpp), [KNP](https://github.com/ku-nlp/knp), and [KWJA](https://github.com/ku-nlp/kwja).
11 |
12 | ```python3
13 | import rhoknp
14 |
15 | # Perform language analysis by Juman++
16 | jumanpp = rhoknp.Jumanpp()
17 | sentence = jumanpp.apply_to_sentence("電気抵抗率は電気の通しにくさを表す物性値である。")
18 |
19 | # Dump language analysis by Juman++
20 | with open("result.jumanpp", "wt") as f:
21 | f.write(sentence.to_jumanpp())
22 |
23 | # Load language analysis by Juman++
24 | with open("result.jumanpp", "rt") as f:
25 | sentence = rhoknp.Sentence.from_jumanpp(f.read())
26 | ```
27 |
28 | ```{admonition} Why not *pyknp*?
29 | :class: note
30 | [*pyknp*](https://pypi.org/project/pyknp/) has been developed as the official Python binding for Juman++ and KNP.
31 | In *rhoknp*, we redesigned the API from the ground up, taking into account the current use cases of *pyknp*.
32 | The main differences from *pyknp* are as follows:
33 |
34 | - **Support document-level language analysis**: *rhoknp* can load and instantiate the result of document-level language analysis: i.e., cohesion analysis and discourse relation analysis.
35 | - **Strictly type-aware**: *rhoknp* is thoroughly annotated with type annotations. Efficient development is possible with the help of an IDE.
36 | - **Extensive test suite**: *rhoknp* is tested with an extensive test suite. See the code coverage at Codecov.
37 | ```
38 |
39 | ```{toctree}
40 | ---
41 | hidden:
42 | caption: User Guide
43 | maxdepth: 1
44 | ---
45 |
46 | installation/index
47 | reference/index
48 | cli/index
49 | format/index
50 | ```
51 |
52 | ```{toctree}
53 | ---
54 | hidden:
55 | caption: Development
56 | maxdepth: 1
57 | ---
58 |
59 | contributing/index
60 | authors
61 | ```
62 |
63 | ```{toctree}
64 | ---
65 | hidden:
66 | caption: Project Links
67 | ---
68 |
69 | GitHub
70 | PyPI
71 | ```
72 |
73 | ## Indices and tables
74 |
75 | - {ref}`genindex`
76 | - {ref}`search`
77 |
--------------------------------------------------------------------------------
/docs/installation/index.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | ## Requirements
4 |
5 | - [Python](https://python.org/)
6 | - Supported versions: 3.9+
7 | - [Juman++](https://github.com/ku-nlp/jumanpp) (Optional)
8 | - Supported versions: v2.0.0-rc3+
9 | - [KNP](https://github.com/ku-nlp/knp) (Optional)
10 | - Supported versions: 5.0+
11 |
12 | ```{note}
13 | If you only want to load existing results of language analysis by Juman++ and KNP, you do not need to install them.
14 | ```
15 |
16 | ## Installation
17 |
18 | We recommend installing _rhoknp_ with pip:
19 |
20 | ```{eval-rst}
21 | .. prompt::
22 | :prompts: $
23 |
24 | pip install rhoknp
25 | ```
26 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.https://www.sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/reference/index.md:
--------------------------------------------------------------------------------
1 | # API Reference
2 |
3 | ```{toctree}
4 | :maxdepth: 4
5 |
6 | rhoknp
7 | ```
8 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cli.cli.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cli.cli module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cli.cli
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cli.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cli package
2 |
3 | ```{toctree}
4 | :maxdepth: 4
5 |
6 | rhoknp.cli.cli
7 | rhoknp.cli.serve
8 | rhoknp.cli.show
9 | rhoknp.cli.stats
10 | ```
11 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cli.serve.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cli.serve module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cli.serve
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cli.show.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cli.show module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cli.show
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cli.stats.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cli.stats module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cli.stats
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cohesion.argument.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cohesion.argument module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cohesion.argument
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cohesion.coreference.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cohesion.coreference module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cohesion.coreference
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cohesion.discourse.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cohesion.discourse module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cohesion.discourse
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cohesion.exophora.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cohesion.exophora module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cohesion.exophora
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cohesion.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cohesion package
2 |
3 | ```{toctree}
4 | :maxdepth: 4
5 |
6 | rhoknp.cohesion.rel
7 | rhoknp.cohesion.pas
8 | rhoknp.cohesion.predicate
9 | rhoknp.cohesion.argument
10 | rhoknp.cohesion.exophora
11 | rhoknp.cohesion.coreference
12 | rhoknp.cohesion.discourse
13 | ```
14 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cohesion.pas.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cohesion.pas module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cohesion.pas
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cohesion.predicate.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cohesion.predicate module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cohesion.predicate
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.cohesion.rel.md:
--------------------------------------------------------------------------------
1 | # rhoknp.cohesion.rel module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.cohesion.rel
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.md:
--------------------------------------------------------------------------------
1 | # rhoknp package
2 |
3 | ```{toctree}
4 | :maxdepth: 4
5 |
6 | rhoknp.processors
7 | rhoknp.units
8 | rhoknp.props
9 | rhoknp.cohesion
10 | rhoknp.utils
11 | rhoknp.cli
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.processors.jumanpp.md:
--------------------------------------------------------------------------------
1 | # rhoknp.processors.jumanpp module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.processors.jumanpp
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.processors.knp.md:
--------------------------------------------------------------------------------
1 | # rhoknp.processors.knp module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.processors.knp
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.processors.kwja.md:
--------------------------------------------------------------------------------
1 | # rhoknp.processors.kwja module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.processors.kwja
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.processors.md:
--------------------------------------------------------------------------------
1 | # rhoknp.processors package
2 |
3 | ```{toctree}
4 | :maxdepth: 4
5 |
6 | rhoknp.processors.senter
7 | rhoknp.processors.jumanpp
8 | rhoknp.processors.knp
9 | rhoknp.processors.kwja
10 | rhoknp.processors.processor
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.processors.processor.md:
--------------------------------------------------------------------------------
1 | # rhoknp.processors.processor module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.processors.processor
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.processors.senter.md:
--------------------------------------------------------------------------------
1 | # rhoknp.processors.senter module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.processors.senter
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.props.dependency.md:
--------------------------------------------------------------------------------
1 | # rhoknp.props.dependency module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.props.dependency
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.props.feature.md:
--------------------------------------------------------------------------------
1 | # rhoknp.props.feature module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.props.feature
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.props.md:
--------------------------------------------------------------------------------
1 | # rhoknp.props package
2 |
3 | ```{toctree}
4 | :maxdepth: 4
5 |
6 | rhoknp.props.dependency
7 | rhoknp.props.feature
8 | rhoknp.props.semantics
9 | rhoknp.props.named_entity
10 | rhoknp.props.memo
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.props.memo.md:
--------------------------------------------------------------------------------
1 | # rhoknp.props.memo module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.props.memo
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.props.named_entity.md:
--------------------------------------------------------------------------------
1 | # rhoknp.props.named_entity module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.props.named_entity
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.props.semantics.md:
--------------------------------------------------------------------------------
1 | # rhoknp.props.semantics module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.props.semantics
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.units.base_phrase.md:
--------------------------------------------------------------------------------
1 | # rhoknp.units.base_phrase module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.units.base_phrase
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.units.clause.md:
--------------------------------------------------------------------------------
1 | # rhoknp.units.clause module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.units.clause
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.units.document.md:
--------------------------------------------------------------------------------
1 | # rhoknp.units.document module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.units.document
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.units.md:
--------------------------------------------------------------------------------
1 | # rhoknp.units package
2 |
3 | ```{toctree}
4 | :maxdepth: 4
5 |
6 | rhoknp.units.document
7 | rhoknp.units.sentence
8 | rhoknp.units.clause
9 | rhoknp.units.phrase
10 | rhoknp.units.base_phrase
11 | rhoknp.units.morpheme
12 | rhoknp.units.unit
13 | ```
14 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.units.morpheme.md:
--------------------------------------------------------------------------------
1 | # rhoknp.units.morpheme module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.units.morpheme
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.units.phrase.md:
--------------------------------------------------------------------------------
1 | # rhoknp.units.phrase module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.units.phrase
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.units.sentence.md:
--------------------------------------------------------------------------------
1 | # rhoknp.units.sentence module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.units.sentence
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.units.unit.md:
--------------------------------------------------------------------------------
1 | # rhoknp.units.unit module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.units.unit
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.utils.md:
--------------------------------------------------------------------------------
1 | # rhoknp.utils package
2 |
3 | ```{toctree}
4 | :maxdepth: 4
5 |
6 | rhoknp.utils.reader
7 | ```
8 |
--------------------------------------------------------------------------------
/docs/reference/rhoknp.utils.reader.md:
--------------------------------------------------------------------------------
1 | # rhoknp.utils.reader module
2 |
3 | ```{eval-rst}
4 | .. automodule:: rhoknp.utils.reader
5 | ```
6 |
7 | ```{toctree}
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | # uv export --only-group docs --no-annotate --no-hashes -o docs/requirements.txt
3 | alabaster==0.7.16 ; python_full_version < '3.10'
4 | alabaster==1.0.0 ; python_full_version >= '3.10'
5 | babel==2.17.0
6 | beautifulsoup4==4.13.4
7 | certifi==2025.4.26
8 | charset-normalizer==3.4.2
9 | colorama==0.4.6 ; sys_platform == 'win32'
10 | docutils==0.21.2
11 | furo==2024.8.6
12 | idna==3.10
13 | imagesize==1.4.1
14 | importlib-metadata==8.7.0 ; python_full_version < '3.10'
15 | jinja2==3.1.6
16 | markdown-it-py==3.0.0
17 | markupsafe==3.0.2
18 | mdit-py-plugins==0.4.2
19 | mdurl==0.1.2
20 | myst-parser==3.0.1 ; python_full_version < '3.10'
21 | myst-parser==4.0.1 ; python_full_version >= '3.10'
22 | packaging==25.0
23 | pygments==2.19.1
24 | pyyaml==6.0.2
25 | requests==2.32.3
26 | roman-numerals-py==3.1.0 ; python_full_version >= '3.11'
27 | snowballstemmer==2.2.0
28 | soupsieve==2.7
29 | sphinx==7.4.7 ; python_full_version < '3.10'
30 | sphinx==8.1.3 ; python_full_version == '3.10.*'
31 | sphinx==8.2.3 ; python_full_version >= '3.11'
32 | sphinx-basic-ng==1.0.0b2
33 | sphinx-copybutton==0.5.2
34 | sphinx-prompt==1.8.0 ; python_full_version < '3.10'
35 | sphinx-prompt==1.9.0 ; python_full_version >= '3.10'
36 | sphinxcontrib-applehelp==2.0.0
37 | sphinxcontrib-devhelp==2.0.0
38 | sphinxcontrib-htmlhelp==2.1.0
39 | sphinxcontrib-jsmath==1.0.1
40 | sphinxcontrib-qthelp==2.0.0
41 | sphinxcontrib-serializinghtml==2.0.0
42 | tomli==2.2.1 ; python_full_version < '3.11'
43 | typing-extensions==4.13.2
44 | urllib3==2.4.0
45 | zipp==3.21.0 ; python_full_version < '3.10'
46 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | We host a number of example scripts.
4 |
5 | ## Perform language analysis
6 |
7 | - [Juman++](./apply_jumanpp.py)
8 | - [KNP](./apply_knp.py)
9 | - [KWJA](./apply_kwja.py)
10 |
11 | ## Load language analysis results
12 |
13 | - [Juman++](./load_jumanpp.py)
14 | - [KNP](./load_knp.py)
15 | - [KWJA](./load_knp.py)
16 |
17 | ## Use language analysis results
18 |
19 | - [Morphological analysis](./use_morphological_analysis.py)
20 | - [Dependency parsing](./use_dependency_parsing.py)
21 | - [Named entity recognition](./use_named_entity_recognition.py)
22 | - [Discourse relation analysis](./use_discourse_relation_analysis.py)
23 | - [Predicate-argument structure analysis](./use_predicate_argument_structure_analysis.py)
24 | - [Coreference resolution](./use_coreference_resolution.py)
25 |
--------------------------------------------------------------------------------
/examples/apply_jumanpp.py:
--------------------------------------------------------------------------------
1 | """Example code for applying Juman++ to the given sentence.
2 |
3 | Usage:
4 | $ python examples/apply_jumanpp.py "今日はいい天気ですね。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import Jumanpp
10 |
11 | # Create a Jumanpp instance.
12 | jumanpp = Jumanpp()
13 |
14 | # Apply Jumanpp to a sentence.
15 | sent = jumanpp.apply_to_sentence(sys.argv[1])
16 |
17 | # Get information.
18 | for mrph in sent.morphemes:
19 | print(f"Text: {mrph.text}")
20 | print(f"Reading: {mrph.reading}")
21 | print(f"Lemma: {mrph.lemma}")
22 | print(f"POS: {mrph.pos}")
23 | print(f"Sub POS: {mrph.subpos}")
24 | print(f"Conjugation (type): {mrph.conjtype}")
25 | print(f"Conjugation (form): {mrph.conjform}")
26 | print("---")
27 |
--------------------------------------------------------------------------------
/examples/apply_knp.py:
--------------------------------------------------------------------------------
1 | """Example code for applying KNP to the given sentence.
2 |
3 | Usage:
4 | $ python examples/apply_knp.py "今日はいい天気ですね。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import KNP
10 |
11 | # Create a KNP instance.
12 | knp = KNP()
13 |
14 | # Apply KNP to a sentence.
15 | sent = knp.apply_to_sentence(sys.argv[1])
16 |
17 | # Get information.
18 | for mrph in sent.morphemes:
19 | print(f"Text: {mrph.text}")
20 | print(f"Reading: {mrph.reading}")
21 | print(f"Lemma: {mrph.lemma}")
22 | print(f"POS: {mrph.pos}")
23 | print(f"Sub POS: {mrph.subpos}")
24 | print(f"Conjugation (type): {mrph.conjtype}")
25 | print(f"Conjugation (form): {mrph.conjform}")
26 | print("---")
27 |
--------------------------------------------------------------------------------
/examples/apply_kwja.py:
--------------------------------------------------------------------------------
1 | """Example code for applying KWJA to the given sentence.
2 |
3 | Usage:
4 | $ python examples/apply_kwja.py "今日はいい天気ですね。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import KWJA
10 |
11 | # Create a KWJA instance.
12 | kwja = KWJA(options=["--model-size", "tiny"])
13 |
14 | # Apply KWJA to a document.
15 | doc = kwja.apply_to_document(sys.argv[1], timeout=120)
16 |
17 | # Get information.
18 | for mrph in doc.morphemes:
19 | print(f"Text: {mrph.text}")
20 | print(f"Reading: {mrph.reading}")
21 | print(f"Lemma: {mrph.lemma}")
22 | print(f"POS: {mrph.pos}")
23 | print(f"Sub POS: {mrph.subpos}")
24 | print(f"Conjugation (type): {mrph.conjtype}")
25 | print(f"Conjugation (form): {mrph.conjform}")
26 | print("---")
27 |
--------------------------------------------------------------------------------
/examples/load_jumanpp.py:
--------------------------------------------------------------------------------
1 | """Example code for loading the result of Juman++ from a file.
2 |
3 | Usage:
4 | $ python examples/load_jumanpp.py example.jumanpp
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import Sentence
10 | from rhoknp.utils.reader import chunk_by_sentence
11 |
12 | with open(sys.argv[1]) as f:
13 | for jumanpp in chunk_by_sentence(f):
14 | sent = Sentence.from_jumanpp(jumanpp)
15 | print(f"Successfully loaded a sentence: {sent.text}")
16 |
--------------------------------------------------------------------------------
/examples/load_knp.py:
--------------------------------------------------------------------------------
1 | """Example code for loading the result of KNP/KWJA from a file.
2 |
3 | Usage:
4 | $ python examples/load_knp.py example.jumanpp
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import Sentence
10 | from rhoknp.utils.reader import chunk_by_sentence
11 |
12 | with open(sys.argv[1]) as f:
13 | for knp in chunk_by_sentence(f):
14 | sent = Sentence.from_knp(knp)
15 | print(f"Successfully loaded a sentence: {sent.text}")
16 |
--------------------------------------------------------------------------------
/examples/use_coreference_resolution.py:
--------------------------------------------------------------------------------
1 | """Example code for using the result of coreference resolution.
2 |
3 | Usage:
4 | $ python examples/use_coreference_resolution.py "ソビエト連邦はソ連ともよばれる。同国の首都はモスクワである。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import KWJA, BasePhrase
10 |
11 | # Create a KWJA instance.
12 | kwja = KWJA()
13 |
14 | # Apply KWJA to a document.
15 | doc = kwja.apply_to_document(sys.argv[1])
16 |
17 | # Get information.
18 | for base_phrase in doc.base_phrases:
19 | coreferents: list[BasePhrase] = base_phrase.get_coreferents()
20 | if len(coreferents) > 0:
21 | print(f"Mention {base_phrase}")
22 | for coreferring_mention in coreferents:
23 | print(f" = {coreferring_mention}")
24 | print("---")
25 |
--------------------------------------------------------------------------------
/examples/use_dependency_parsing.py:
--------------------------------------------------------------------------------
1 | """Example code for using the result of dependency parsing.
2 |
3 | Usage:
4 | $ python examples/use_dependency_parsing.py "今日はいい天気ですね。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import KNP
10 |
11 | # Create a KNP instance.
12 | knp = KNP()
13 |
14 | # Apply KNP to a sentence.
15 | sent = knp.apply_to_sentence(sys.argv[1])
16 |
17 | # Get information.
18 | for phrase in sent.phrases:
19 | parent = phrase.parent
20 | if parent:
21 | print(f"{phrase.text} -> {parent.text}")
22 | else:
23 | print(f"{phrase.text} -> ROOT")
24 |
--------------------------------------------------------------------------------
/examples/use_discourse_relation_analysis.py:
--------------------------------------------------------------------------------
1 | """Example code for using the result of discourse relation analysis.
2 |
3 | Usage:
4 | $ python examples/use_discourse_relation_analysis.py "風が吹いたら桶屋が儲かる。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import KNP
10 |
11 | # Create a KNP instance.
12 | knp = KNP()
13 |
14 | # Apply KNP to a sentence.
15 | sent = knp.apply_to_sentence(sys.argv[1])
16 |
17 | # Get information.
18 | if sent.is_clause_tag_required() is True:
19 | print("KNP might be too old; please update it.")
20 | sys.exit(1)
21 |
22 | discourse_relations = []
23 | for clause in sent.clauses:
24 | discourse_relations.extend(clause.discourse_relations)
25 |
26 | if discourse_relations:
27 | print(f"Found {len(discourse_relations)} discourse relations:")
28 | for i, discourse_relation in enumerate(discourse_relations, start=1):
29 | modifier = discourse_relation.modifier
30 | head = discourse_relation.head
31 | label = discourse_relation.label
32 | print(f' {i}. "{modifier}" -({label.value})-> "{head}"')
33 | else:
34 | print("No discourse relation found.")
35 |
--------------------------------------------------------------------------------
/examples/use_morphological_analysis.py:
--------------------------------------------------------------------------------
1 | """Example code for using the result of morphological analysis.
2 |
3 | Usage:
4 | $ python examples/use_morphological_analysis.py "今日はいい天気ですね。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import Jumanpp
10 |
11 | # Create a Jumanpp instance.
12 | jumanpp = Jumanpp()
13 |
14 | # Apply Jumanpp to a sentence.
15 | sent = jumanpp.apply_to_sentence(sys.argv[1])
16 |
17 | # Get information.
18 | for mrph in sent.morphemes:
19 | print(f"Text: {mrph.text}")
20 | print(f"Reading: {mrph.reading}")
21 | print(f"Lemma: {mrph.lemma}")
22 | print(f"POS: {mrph.pos}")
23 | print(f"Sub POS: {mrph.subpos}")
24 | print(f"Conjugation (type): {mrph.conjtype}")
25 | print(f"Conjugation (form): {mrph.conjform}")
26 | print("---")
27 |
--------------------------------------------------------------------------------
/examples/use_named_entity_recognition.py:
--------------------------------------------------------------------------------
1 | """Example code for using the result of named entity recognition.
2 |
3 | Usage:
4 | $ python examples/use_named_entity_recognition.py "太郎は花子が読んでいる本を次郎に渡した。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import KNP
10 |
11 | # Create a KNP instance.
12 | knp = KNP()
13 |
14 | # Apply KNP to a sentence.
15 | sent = knp.apply_to_sentence(sys.argv[1])
16 |
17 | # Get information.
18 | if sent.named_entities:
19 | print(f"Found {len(sent.named_entities)} named entities:")
20 | for i, named_entity in enumerate(sent.named_entities, start=1):
21 | print(f' {i}. "{named_entity.text}" ({named_entity.category.value})')
22 | else:
23 | print("No named entity found.")
24 |
--------------------------------------------------------------------------------
/examples/use_predicate_argument_structure_analysis.py:
--------------------------------------------------------------------------------
1 | """Example code for using the result of predicate-argument structure analysis.
2 |
3 | Usage:
4 | $ python examples/use_predicate_argument_structure_analysis.py "太郎は花子が読んでいる本を次郎に渡した。"
5 | """
6 |
7 | import sys
8 |
9 | from rhoknp import KWJA
10 | from rhoknp.cohesion import Argument
11 |
12 | # Create a KWJA instance.
13 | kwja = KWJA()
14 |
15 | # Apply KWJA to a document.
16 | doc = kwja.apply_to_document(sys.argv[1])
17 |
18 | # Get information.
19 | for base_phrase in doc.base_phrases:
20 | pas = base_phrase.pas
21 | if pas.is_empty() is True:
22 | continue
23 | all_arguments: dict[str, list[Argument]] = pas.get_all_arguments()
24 | print(f"Predicate: {pas.predicate}")
25 | for case, arguments in all_arguments.items():
26 | print(f" {case}格: ", end="")
27 | print(", ".join(str(argument) for argument in arguments))
28 | print("---")
29 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "rhoknp"
3 | version = "1.7.1"
4 | description = "Yet another Python binding for Juman++/KNP/KWJA"
5 | license = "MIT"
6 | authors = [
7 | { name = "Hirokazu Kiyomaru", email = "h.kiyomaru@gmail.com"},
8 | { name = "Nobuhiro Ueda", email = "ueda@nlp.i.kyoto-u.ac.jp"},
9 | ]
10 | maintainers = [
11 | { name = "Hirokazu Kiyomaru", email = "h.kiyomaru@gmail.com"},
12 | { name = "Nobuhiro Ueda", email = "ueda@nlp.i.kyoto-u.ac.jp"},
13 | ]
14 | readme = "README.md"
15 | keywords = ["NLP", "Japanese", "Juman++", "KNP", "KWJA"]
16 | classifiers = [
17 | "License :: OSI Approved :: MIT License",
18 | "Natural Language :: Japanese",
19 | "Operating System :: MacOS",
20 | "Operating System :: MacOS :: MacOS X",
21 | "Operating System :: Microsoft :: Windows",
22 | "Operating System :: POSIX :: Linux",
23 | "Programming Language :: Python :: 3",
24 | "Programming Language :: Python :: 3.9",
25 | "Programming Language :: Python :: 3.10",
26 | "Programming Language :: Python :: 3.11",
27 | "Programming Language :: Python :: 3.12",
28 | "Programming Language :: Python :: 3.13",
29 | "Topic :: Scientific/Engineering",
30 | "Topic :: Software Development :: Libraries",
31 | "Topic :: Software Development :: Libraries :: Python Modules",
32 | "Topic :: Text Processing",
33 | "Topic :: Text Processing :: Linguistic",
34 | ]
35 | requires-python = ">=3.9"
36 |
37 | dependencies = [
38 | "typing-extensions>=4.4; python_version < '3.12'"
39 | ]
40 |
41 | [project.optional-dependencies]
42 | cli = [
43 | "typer-slim>=0.15.2",
44 | "PyYAML>=6.0",
45 | "rich>=12.6",
46 | "uvicorn>=0.30.0",
47 | "fastapi>=0.111.0",
48 | "jinja2>=3.1.4",
49 | "pygments>=2.18.0",
50 | ]
51 |
52 | [project.urls]
53 | Homepage = "https://github.com/ku-nlp/rhoknp"
54 | Documentation = "https://rhoknp.readthedocs.io/en/latest"
55 | Repository = "https://github.com/ku-nlp/rhoknp"
56 | Issues = "https://github.com/ku-nlp/rhoknp/issues"
57 |
58 | [project.scripts]
59 | rhoknp = "rhoknp.cli.cli:app"
60 |
61 | [dependency-groups]
62 | dev = [
63 | "ipdb>=0.13.13",
64 | ]
65 | test = [
66 | "pytest>=8.0",
67 | "coverage[toml]>=7.3",
68 | "pytest-cov>=6.0",
69 | "httpx>=0.25",
70 | ]
71 | docs = [
72 | "Sphinx>=7.0; python_version < '3.10'",
73 | "Sphinx>=8.0; python_version >= '3.10'",
74 | "sphinx-prompt>=1.8; python_version < '3.10'",
75 | "sphinx-prompt>=1.9; python_version >= '3.10'",
76 | "sphinx-copybutton>=0.5.0",
77 | "myst-parser>=3.0; python_version < '3.10'",
78 | "myst-parser>=4.0; python_version >= '3.10'",
79 | "markdown-it-py>=3.0",
80 | "furo>=2024.4",
81 | "typing-extensions>=4.4",
82 | ]
83 |
84 | [build-system]
85 | requires = ["hatchling"]
86 | build-backend = "hatchling.build"
87 |
88 | [tool.hatch.build.targets.sdist]
89 | only-include = ["/src/rhoknp"]
90 |
91 | [tool.uv]
92 | package = true
93 | default-groups = ["dev", "test"]
94 |
95 | [tool.ruff]
96 | line-length = 120
97 | indent-width = 4
98 | src = ["src"]
99 | target-version = "py39" # The minimum Python version to target
100 |
101 | [tool.ruff.lint]
102 | select = ["F", "E", "W", "I", "B", "PL", "PD", "NPY", "RUF", "UP", "TID", "COM", "PT", "D", "ARG", "PYI", "ANN", "G", "FBT", "EM", "TRY", "PTH", "T", "INP"]
103 | #select = ["ALL"]
104 | ignore = [
105 | "PLR0911", # Too many return statements
106 | "PLR0912", # Too many branches
107 | "PLR0913", # Too many arguments in function definition
108 | "PLR0915", # Too many statements
109 | "E501", # Line too long
110 | "RUF001", # String contains ambiguous `ノ` (KATAKANA LETTER NO). Did you mean `/` (SOLIDUS)?
111 | "RUF002", # Docstring contains ambiguous `,` (FULLWIDTH COMMA). Did you mean `,` (COMMA)?
112 | "RUF003", # Comment contains ambiguous `(` (FULLWIDTH LEFT PARENTHESIS). Did you mean `(` (LEFT PARENTHESIS)?
113 | "UP037", # Remove quotes from type annotation
114 | "COM812", # Trailing comma missing
115 | "PLR2004", # Magic value used in comparison
116 | "D100", # Missing docstring in public module
117 | "D105", # Missing docstring in magic method
118 | "D107", # Missing docstring in `__init__`
119 | "D301", # Use `r"""` if any backslashes in a docstring
120 | "D403", # First word of the first line should be properly capitalized
121 | "D415", # First line should end with a period, question mark, or exclamation point
122 | "ANN002", # Missing type annotation for `*args`
123 | "ANN003", # Missing type annotation for `**kwargs`
124 | "FA100", # Missing `from __future__ import annotations`, but uses `...`
125 | "S101", # Use of `assert` detected
126 | "G004", # Logging statement uses f-string
127 | "FBT001", # Boolean-typed positional argument in function definition
128 | "FBT002", # Boolean default positional argument in function definition
129 | "FBT003", # Boolean positional value in function call
130 | "EM101", # Exception must not use a string literal, assign to variable first
131 | "EM102", # Exception must not use an f-string literal, assign to variable first
132 | "TRY003", # Avoid specifying long messages outside the exception class
133 | ]
134 |
135 | [tool.ruff.lint.per-file-ignores]
136 | "__init__.py" = [
137 | "D104", # Missing docstring in public package
138 | ]
139 | "tests/*" = [
140 | "D", # pydocstyle
141 | "S101", # Use of `assert` detected
142 | "INP001", # File `...` is part of an implicit namespace package. Add an `__init__.py`
143 | ]
144 | "src/rhoknp/cli/*" = [
145 | "T201", # `print` found
146 | ]
147 | "examples/*" = [
148 | "T201", # `print` found
149 | "INP001", # File `...` is part of an implicit namespace package. Add an `__init__.py`
150 | "PTH123", # `open()` should be replaced by `Path.open()`
151 | ]
152 | "docs/conf.py" = [
153 | "INP001", # File `...` is part of an implicit namespace package. Add an `__init__.py`
154 | ]
155 |
156 | [tool.ruff.lint.flake8-bugbear]
157 | extend-immutable-calls = ["typer.Argument", "typer.Option"]
158 |
159 | [tool.ruff.lint.flake8-tidy-imports]
160 | ban-relative-imports = "all"
161 |
162 | [tool.ruff.lint.pydocstyle]
163 | convention = "google"
164 |
165 | [tool.ruff.lint.mccabe]
166 | max-complexity = 20 # default: 10
167 |
168 | [tool.mypy]
169 | python_version = "3.9"
170 |
171 | [tool.coverage.run]
172 | omit = ["tests/*"]
173 |
174 | [tool.coverage.report]
175 | exclude_lines = [
176 | "pragma: no cover",
177 | "def __repr__", # Do not complain about missing debug-only code
178 | "except ImportError", # Do not complain about packages we have installed
179 | # Do not complain if tests do not hit defensive assertion code
180 | "raise AssertionError",
181 | "raise NotImplementedError",
182 | "raise ImportError",
183 | # Do not complain if non-runnable code is not run
184 | "if TYPE_CHECKING:",
185 | "if __name__ == .__main__.:",
186 | "@(abc\\.)?abstractmethod", # Do not complain about abstract methods
187 | "@overload", # Do not complain about overloads
188 | ]
189 |
--------------------------------------------------------------------------------
/src/rhoknp/__init__.py:
--------------------------------------------------------------------------------
from importlib.metadata import version

from rhoknp.processors import KNP, KWJA, Jumanpp, RegexSenter
from rhoknp.units import BasePhrase, Clause, Document, Morpheme, Phrase, Sentence

# Read the version from the installed distribution metadata so it is
# maintained in a single place (pyproject.toml).
__version__ = version("rhoknp")

# Names re-exported as the public top-level API of the package.
__all__ = [
    "KNP",
    "KWJA",
    "BasePhrase",
    "Clause",
    "Document",
    "Jumanpp",
    "Morpheme",
    "Phrase",
    "RegexSenter",
    "Sentence",
    "__version__",
]
21 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/__init__.py:
--------------------------------------------------------------------------------
# Import the CLI modules eagerly so that a missing optional dependency
# (the `cli` extra declared in pyproject.toml) surfaces immediately with
# an actionable error message instead of failing later at call time.
try:
    import rhoknp.cli.cli
    import rhoknp.cli.serve
    import rhoknp.cli.show
    import rhoknp.cli.stats  # noqa: F401
except ImportError as e:
    # Re-raise with an installation hint, chaining the original error for context.
    raise ImportError(
        f"{e.msg}\nExtra dependencies are required to use the CLI. Install them with `pip install rhoknp[cli]`."
    ) from e
10 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/cat.py:
--------------------------------------------------------------------------------
1 | from typing import ClassVar
2 |
3 | from pygments import highlight
4 | from pygments.formatters import TerminalFormatter
5 | from pygments.lexer import RegexLexer, bygroups, default
6 | from pygments.token import Comment, Generic, Literal, Name, Number, String, Text, Whitespace
7 |
8 | from rhoknp import BasePhrase, Document, Morpheme, Phrase
9 |
10 |
11 | class KNPLexer(RegexLexer):
12 | """KNP の出力を色付けするための Lexer."""
13 |
14 | name: ClassVar[str] = "KNP"
15 | url: ClassVar[str] = "https://github.com/ku-nlp/knp"
16 | filenames: ClassVar[list[str]] = ["*.knp", "*.kwja"]
17 | mimetypes: ClassVar[list[str]] = ["text/plain"]
18 |
19 | tokens = { # noqa: RUF012
20 | "root": [
21 | (r"\s+", Whitespace),
22 | (rf"(?={Phrase.PAT.pattern})", Text, "phrase"),
23 | (rf"(?={BasePhrase.PAT.pattern})", Text, "base_phrase"),
24 | (rf"(?={Morpheme.PAT.pattern})", Text, "morpheme"),
25 | (r"^#.*$", Comment.Single),
26 | (r"^EOS$", Generic.Subheading),
27 | ],
28 | "phrase": [
29 | (r"\s+", Whitespace),
30 | (r"^\*", Generic.Heading),
31 | (r"(-?\d+)([DPAI])", bygroups(Number, Literal.String)),
32 | (r"<", Name.Tag, "tag"),
33 | default("#pop"),
34 | ],
35 | "base_phrase": [
36 | (r"\s+", Whitespace),
37 | (r"^\+", Generic.Heading),
38 | (r"(-?\d+)([DPAI])", bygroups(Number, Literal.String)),
39 | (r":]+)(:)?([^>]+)?", bygroups(Name.Tag, Name.Tag, Name.Attribute)),
85 | (r">", Name.Tag, "#pop"),
86 | ],
87 | "rel_tag": [
88 | (r"\s+", Whitespace),
89 | (r'(\S+=)("\S+?")', bygroups(Name.Attribute, String)),
90 | (r"/>", Name.Tag, "#pop"),
91 | ],
92 | }
93 |
94 |
def print_document(document: Document, is_dark: bool = False) -> None:
    """Print a document in KNP format with syntax highlighting.

    Args:
        document (Document): The document to print.
        is_dark (bool, optional): True if the terminal background is dark. Defaults to False.
    """
    background = "dark" if is_dark else "light"
    formatter = TerminalFormatter(bg=background)
    highlighted = highlight(document.to_knp(), KNPLexer(), formatter)
    print(highlighted, end="")
104 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/cli.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from pathlib import Path
4 | from typing import Optional
5 |
6 | import typer
7 | import yaml
8 |
9 | from rhoknp import Document, __version__
10 | from rhoknp.cli.cat import print_document
11 | from rhoknp.cli.serve import AnalyzerType, serve_analyzer
12 | from rhoknp.cli.show import draw_tree
13 | from rhoknp.cli.stats import get_document_statistics
14 |
# Top-level Typer application; the subcommands below register themselves via decorators.
app = typer.Typer(help="rhoknp CLI utilities.")
16 |
17 |
def version_callback(value: bool) -> None:
    """Print the version and exit.

    Args:
        value: If True, print the version and terminate the program.
    """
    if not value:
        return
    print(f"rhoknp version: {__version__}")
    raise typer.Exit
27 |
28 |
@app.callback()
def main(
    _: bool = typer.Option(False, "--version", "-v", callback=version_callback, help="Show version and exit."),
) -> None:
    """Entry point of the CLI."""
34 |
35 |
36 | @app.command(help="Print KNP files with syntax highlighting.")
37 | def cat(
38 | knp_path: Optional[Path] = typer.Argument(None, exists=True, dir_okay=False, help="Path to knp file to show."),
39 | dark: bool = typer.Option(False, "--dark", "-d", help="Use dark background."),
40 | ) -> None:
41 | """KNP ファイルを色付きで表示.
42 |
43 | Args:
44 | knp_path: KNP ファイルのパス.
45 | dark: True なら背景を黒にする.
46 | """
47 | knp_text = sys.stdin.read() if knp_path is None else knp_path.read_text()
48 | doc = Document.from_knp(knp_text)
49 | print_document(doc, is_dark=dark)
50 |
51 |
52 | @app.command(help="Convert a KNP file into raw text, Juman++ format, or KNP format.")
53 | def convert(
54 | knp_path: Optional[Path] = typer.Argument(
55 | None, exists=True, dir_okay=False, help="Path to knp file to convert. If not given, read from stdin"
56 | ),
57 | format_: str = typer.Option("text", "--format", "-f", help="Format to convert to."),
58 | ) -> None:
59 | """KNP ファイルを種々のフォーマットに変換.
60 |
61 | Args:
62 | knp_path: KNP ファイルのパス.
63 | format_: 変換先のフォーマット."text", "jumanpp", "knp" のいずれか.
64 | """
65 | knp_text = sys.stdin.read() if knp_path is None else knp_path.read_text()
66 | doc = Document.from_knp(knp_text)
67 | if format_ == "text":
68 | print(doc.text)
69 | elif format_ == "jumanpp":
70 | print(doc.to_jumanpp(), end="")
71 | elif format_ == "knp":
72 | print(doc.to_knp(), end="")
73 | else:
74 | raise ValueError(f"Unknown format: {format_}")
75 |
76 |
77 | @app.command(help="Print given file content in tree format.")
78 | def show(
79 | knp_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to knp file to show"),
80 | pos: bool = typer.Option(False, "--pos", "-p", help="Show POS characters."),
81 | rel: bool = typer.Option(False, "--rel", "-r", help="Show contents of tags."),
82 | pas: bool = typer.Option(False, "--pas", help="Show predicate-argument structures."),
83 | ) -> None:
84 | """KNP ファイルを読み込み係り受けを可視化.
85 |
86 | Args:
87 | knp_path: KNP ファイルのパス.
88 | pos: True なら同時に品詞を表示.
89 | rel: True なら同時に タグの内容を表示.
90 | pas: True なら同時に述語項構造を表示.
91 | """
92 | doc = Document.from_knp(knp_path.read_text())
93 | for sent in doc.sentences:
94 | print(sent.comment)
95 | draw_tree(sent.base_phrases, show_pos=pos, show_rel=rel, show_pas=pas)
96 |
97 |
98 | @app.command(help="Show statistics of given KNP file.")
99 | def stats(
100 | knp_path: Path = typer.Argument(
101 | ..., exists=True, dir_okay=False, help="Path to knp file to calculate statistics on."
102 | ),
103 | use_json: bool = typer.Option(False, "--json", "-j", help="Output statistics in JSON format."),
104 | ) -> None:
105 | """KNP ファイルを読み込みその統計情報を出力.
106 |
107 | Args:
108 | knp_path: KNP ファイルのパス.
109 | use_json: JSON 形式で出力.
110 | """
111 | doc = Document.from_knp(knp_path.read_text())
112 | doc_stats = get_document_statistics(doc)
113 | if use_json:
114 | print(json.dumps(doc_stats, ensure_ascii=False, indent=4))
115 | else:
116 | print(yaml.dump(doc_stats, allow_unicode=True, sort_keys=False), end="")
117 |
118 |
119 | @app.command(help="Serve an analyzer as HTTP server.")
120 | def serve(
121 | analyzer: AnalyzerType = typer.Argument(..., help="Analyzer to use. Choose from jumanpp, knp, kwja."),
122 | host: str = typer.Option("localhost", "--host", "-h", help="Host to listen on."),
123 | port: int = typer.Option(8000, "--port", "-p", help="Port to listen on."),
124 | base_url: str = typer.Option("/", "--base-url", help="Root path of the server."),
125 | analyzer_args: Optional[list[str]] = typer.Argument(None, help="Additional arguments for the analyzer."),
126 | ) -> None:
127 | """解析器を起動し,HTTP サーバとして提供.
128 |
129 | Args:
130 | analyzer: 解析器の種類.
131 | host: ホスト.
132 | port: ポート.
133 | base_url: ベース URL.
134 | analyzer_args: 解析器のオプション.
135 | """
136 | serve_analyzer(analyzer, host, port, base_url, analyzer_args) # pragma: no cover
137 |
138 |
139 | if __name__ == "__main__":
140 | app()
141 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/show.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections.abc import Sequence
3 | from typing import TextIO, Union
4 |
5 | from rich.console import Console
6 | from rich.table import Table
7 | from rich.text import Text
8 |
9 | from rhoknp.cohesion import EndophoraArgument
10 | from rhoknp.props.dependency import DepType
11 | from rhoknp.units.base_phrase import BasePhrase
12 | from rhoknp.units.phrase import Phrase
13 |
# Mapping from part-of-speech / sub-POS names to the single-character marks
# appended after each morpheme by `_leaf_string` when POS display is enabled.
POS_MARK = {
    "特殊": "*",
    "動詞": "v",
    "形容詞": "j",
    "判定詞": "c",
    "助動詞": "x",
    "名詞": "n",
    "固有名詞": "N",
    "人名": "J",
    "地名": "C",
    "組織名": "A",
    "指示詞": "d",
    "副詞": "a",
    "助詞": "p",
    "接続詞": "c",
    "連体詞": "m",
    "感動詞": "!",
    "接頭辞": "p",
    "接尾辞": "s",
    "未定義語": "?",
}
35 |
36 |
def draw_tree(
    leaves: Union[Sequence[Phrase], Sequence[BasePhrase]],
    fh: TextIO = sys.stdout,
    show_pos: bool = False,
    show_rel: bool = False,
    show_pas: bool = False,
) -> None:
    """Write a dependency tree of the given units to ``fh``.

    Args:
        leaves: Phrases or base phrases that form the leaves of the tree.
        fh: Output destination.
        show_pos: If True, also show a part-of-speech mark after each morpheme.
        show_rel: If True, also show the contents of rel tags.
        show_pas: If True, also show predicate-argument structures.
    """
    console = Console(file=fh)
    table = Table.grid(padding=(0, 2))
    limit = len(leaves)
    # item[i][j] is the box-drawing character placed in column j of row i.
    item = [[""] * limit for _ in leaves]
    # active_column[j]: 0 = no edge passes through column j yet, 1 = a normal
    # edge passes through, 2 = a parallel edge passes through (parallel edges
    # are drawn with the heavier line characters).
    active_column = [0] * limit
    limit -= 1

    # Fill in connector characters row by row; the last leaf has no outgoing edge.
    for i in range(limit):
        parent_index = leaves[i].parent_index
        dep_type = leaves[i].dep_type
        assert parent_index is not None, "parent_index has not been set"
        para_row = leaves[i].dep_type == DepType.PARALLEL
        for j in range(i + 1, limit + 1):
            if j < parent_index:
                # Edge continues horizontally; cross any vertical lines already drawn.
                if active_column[j] == 2:
                    item[i][j] = "╋" if para_row else "╂"
                elif active_column[j] == 1:
                    item[i][j] = "┿" if para_row else "┼"
                else:
                    item[i][j] = "━" if para_row else "─"
            elif j == parent_index:
                # Edge turns upward toward its head at this column.
                if dep_type in (DepType.PARALLEL, DepType.IMPERFECT_PARALLEL, DepType.APPOSITION):
                    # Show the dependency-type letter instead of a corner character.
                    item[i][j] = str(dep_type.value)
                elif active_column[j] == 2:
                    item[i][j] = "┨"
                elif active_column[j] == 1:
                    item[i][j] = "┤"
                else:
                    item[i][j] = "┐"
                # Mark the column as occupied; a parallel edge (2) is never downgraded.
                if active_column[j] == 2:
                    pass
                elif para_row:
                    active_column[j] = 2
                else:
                    active_column[j] = 1
            else:  # noqa: PLR5501
                # Past the head column: draw the vertical line of whichever edge occupies it.
                if active_column[j] == 2:
                    item[i][j] = "┃"
                elif active_column[j] == 1:
                    item[i][j] = "│"
                else:
                    item[i][j] = " "

    # Render each row: the leaf text followed by its connector characters.
    lines: list[str] = []
    for i in range(len(leaves)):
        line = _leaf_string(leaves[i], show_pos)
        for j in range(i + 1, len(leaves)):
            line += _extend_horizontal(item[i][j]) + item[i][j]
        lines.append(line)

    # Right-align all rows on their display width and attach optional feature strings.
    max_length = max(_str_real_length(line) for line in lines)
    for line, leaf in zip(lines, leaves):
        diff = max_length - _str_real_length(line)
        tree_string = " " * diff + line
        feat_string = _feat_string(leaf, show_rel, show_pas) if isinstance(leaf, BasePhrase) else ""
        table.add_row(Text(tree_string), Text(feat_string))
    console.print(table)
110 |
111 |
112 | def _extend_horizontal(token: str) -> str:
113 | if token in ("╂", "┼", "┤", "┨", "┐", "─", "I", "A"):
114 | return "─"
115 | elif token in ("╋", "┿", "━", "P"):
116 | return "━"
117 | else:
118 | return " "
119 |
120 |
def _leaf_string(leaf: Union[Phrase, BasePhrase], show_pos: bool) -> str:
    """Return the leaf label: the surface text, optionally with POS marks.

    Args:
        leaf: The phrase or base phrase to render.
        show_pos: If True, append the POS mark after each morpheme.
    """
    parts: list[str] = []
    for morpheme in leaf.morphemes:
        parts.append(morpheme.text)
        # Truthiness test instead of the non-idiomatic ``show_pos is True``.
        if show_pos:
            # Proper-noun sub-POS values have dedicated marks; fall back to the POS mark.
            key = morpheme.subpos if morpheme.subpos in ("固有名詞", "人名", "地名") else morpheme.pos
            parts.append(POS_MARK[key])
    # Join once instead of building the string with repeated concatenation.
    return "".join(parts)
131 |
132 |
def _str_real_length(string: str) -> int:
    """Return the display width of *string* in terminal cells, via rich's ``Text.cell_len``."""
    return Text(string).cell_len
135 |
136 |
def _feat_string(base_phrase: BasePhrase, show_rel: bool, show_pas: bool) -> str:
    """Return the feature annotation shown next to a base phrase in the tree.

    Args:
        base_phrase: The base phrase whose features are rendered.
        show_rel: If True, include ``type:target`` strings for each rel tag.
        show_pas: If True, include ``case:argument`` strings from the PAS.
    """
    tag_strings: list[str] = []
    # Truthiness tests instead of the non-idiomatic ``is True`` comparisons.
    if show_rel:
        tag_strings.extend(f"{tag.type}:{tag.target}" for tag in base_phrase.rel_tags)
    if show_pas:
        for case, arguments in base_phrase.pas.get_all_arguments(relax=False).items():
            for arg in arguments:
                # Endophoric arguments are shown by their core text; exophoric
                # ones by their string representation.
                core_text = _get_core_text(arg.base_phrase) if isinstance(arg, EndophoraArgument) else str(arg)
                tag_string = f"{case}:{core_text}"
                if tag_string not in tag_strings:  # deduplicate
                    tag_strings.append(tag_string)
    return " ".join(tag_strings)
150 |
151 |
152 | def _get_core_text(base_phrase: BasePhrase) -> str:
153 | """Get the core text without ancillary words."""
154 | morphemes = base_phrase.morphemes
155 | start_index = 0
156 | for morpheme in morphemes:
157 | if morpheme.pos in ("助詞", "特殊", "判定詞"):
158 | start_index += 1
159 | else:
160 | break
161 | end_index = len(morphemes)
162 | for morpheme in reversed(morphemes):
163 | if morpheme.pos in ("助詞", "特殊", "判定詞"):
164 | end_index -= 1
165 | else:
166 | break
167 | ret = "".join(m.text for m in morphemes[start_index:end_index])
168 | if not ret:
169 | start_index = 0
170 | end_index = len(morphemes)
171 | return "".join(m.text for m in morphemes[start_index:end_index])
172 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/static/css/style.css:
--------------------------------------------------------------------------------
/* templates/components/raw_input.jinja2 */
/* Raw input text box; pre-wrap preserves the user's line breaks. */
.input-text {
  white-space: pre-wrap;
  margin: 0 0.5em 1em;
  padding: 0.5em;
}

/* Container for an analysis result section. */
.result {
  margin: 0 0.5em 1em;
  padding: 0.5em;
}

/* templates/components/named_entity_recognition.jinja2 */
/* Highlighted span wrapping a recognized named entity. */
.entity {
  margin: 0 0.25em;
  line-height: 1;
  border-radius: 0.35em;
}

/* Per-category background colors for entity highlights. */
.entity-organization {
  background: #7aecec;
}

.entity-person {
  background: #aa9cfc;
}

.entity-location {
  background: #ff9561;
}

.entity-artifact {
  background: #bfeeb7;
}

/* Date and time share the same color. */
.entity-date {
  background: #bfe1d9;
}

.entity-time {
  background: #bfe1d9;
}

/* Money and percent share the same color. */
.entity-money {
  background: #e4e7d2;
}

.entity-percent {
  background: #e4e7d2;
}

/* Small category label rendered next to the highlighted entity text. */
.entity-label {
  font-size: 0.8em;
  font-weight: bold;
  line-height: 1;
  border-radius: 0.35em;
  vertical-align: middle;
}
59 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/static/images/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/src/rhoknp/cli/static/images/apple-touch-icon.png
--------------------------------------------------------------------------------
/src/rhoknp/cli/static/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/src/rhoknp/cli/static/images/favicon.ico
--------------------------------------------------------------------------------
/src/rhoknp/cli/static/js/script.js:
--------------------------------------------------------------------------------
/* Keep the status of the accordion to show analysis results */
const defaultOpenAccordionItems = document.querySelectorAll(
  ".accordion-item-default-open",
);
// Items marked open-by-default start out "true" in localStorage, but only on
// the first visit so a later user choice is not overridden.
defaultOpenAccordionItems.forEach((item) => {
  const itemId = `accordion-${item.id}`;
  if (localStorage.getItem(itemId) === null) {
    localStorage.setItem(itemId, "true");
  }
});

const accordionItems = document.querySelectorAll(".accordion-item");
// Persist every open/close action.
accordionItems.forEach((item) => {
  const itemId = `accordion-${item.id}`;
  item.addEventListener("shown.bs.collapse", () => {
    localStorage.setItem(itemId, "true");
  });
  item.addEventListener("hidden.bs.collapse", () => {
    localStorage.setItem(itemId, "false");
  });
});

// Restore the persisted state on page load.
// (Removed a leftover debug console.log of the state.)
accordionItems.forEach((item) => {
  const itemId = `accordion-${item.id}`;
  const state = localStorage.getItem(itemId);
  if (state === "true") {
    item.querySelector(".accordion-button").classList.remove("collapsed");
    item.querySelector(".accordion-collapse").classList.add("show");
  } else {
    item.querySelector(".accordion-button").classList.add("collapsed");
    item.querySelector(".accordion-collapse").classList.remove("show");
  }
});

// "Show all" opens every item and persists the choice.
const showAllButton = document.querySelector("#show-all-button");
showAllButton.addEventListener("click", () => {
  accordionItems.forEach((item) => {
    const itemId = `accordion-${item.id}`;
    localStorage.setItem(itemId, "true");
    item.querySelector(".accordion-button").classList.remove("collapsed");
    item.querySelector(".accordion-collapse").classList.add("show");
  });
});

// "Hide all" closes every item and persists the choice.
const hideAllButton = document.querySelector("#hide-all-button");
hideAllButton.addEventListener("click", () => {
  accordionItems.forEach((item) => {
    const itemId = `accordion-${item.id}`;
    localStorage.setItem(itemId, "false");
    item.querySelector(".accordion-button").classList.add("collapsed");
    item.querySelector(".accordion-collapse").classList.remove("show");
  });
});
55 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/stats.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from rhoknp import Document
4 | from rhoknp.cohesion.rel import CASE_TYPES, COREF_TYPES
5 |
6 |
def get_document_statistics(document: Document) -> dict[str, dict[str, int]]:
    """Collect statistics of a document.

    Args:
        document: The document to analyze.

    Returns:
        Statistics grouped into "unit", "cohesion", and "other" sections.
    """
    stats: dict[str, Any] = {"unit": {}, "cohesion": {}, "other": {}}
    # Unit counts: each count is reported only when the prerequisite analysis
    # has already been applied to the document.
    if not document.is_senter_required():
        stats["unit"]["sentence"] = len(document.sentences)
    if not document.is_clause_tag_required():
        stats["unit"]["clause"] = len(document.clauses)
    if not document.is_knp_required():
        stats["unit"]["phrase"] = len(document.phrases)
        stats["unit"]["base_phrase"] = len(document.base_phrases)
    if not document.is_jumanpp_required():
        stats["unit"]["morpheme"] = len(document.morphemes)
    # Cohesion counts. Use any()/counting generators instead of building
    # throwaway lists just to take their length.
    if not document.is_knp_required():
        # A base phrase counts as a predicate when it has at least one case-type rel tag.
        stats["cohesion"]["predicate"] = sum(
            any(rel_tag.type in CASE_TYPES for rel_tag in bp.rel_tags) for bp in document.base_phrases
        )
        stats["cohesion"]["argument"] = sum(
            sum(1 for rel_tag in bp.rel_tags if rel_tag.type in CASE_TYPES) for bp in document.base_phrases
        )
        stats["cohesion"]["coreference"] = sum(
            sum(1 for rel_tag in bp.rel_tags if rel_tag.type in COREF_TYPES) for bp in document.base_phrases
        )
    if not document.is_clause_tag_required():
        stats["cohesion"]["discourse"] = sum(len(clause.discourse_relations) for clause in document.clauses)
    if not document.is_senter_required():
        stats["other"]["named_entity"] = sum(len(sentence.named_entities) for sentence in document.sentences)
    return stats
43 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/base.jinja2:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | {{ title }}
8 |
9 |
11 |
15 |
17 |
21 |
24 |
28 |
29 |
30 |
31 | {% include "components/navbar.jinja2" %}
32 |
33 | {% include "components/form.jinja2" %}
34 | {% if analyzed_document %}
35 |
36 | {% include "components/raw_input.jinja2" %}
37 |
解析結果
38 |
39 | {% include "components/show_all_button.jinja2" %}
40 | {% include "components/hide_all_button.jinja2" %}
41 | {% block result %}
42 | {% endblock result %}
43 |
44 | {% endif %}
45 | {% if error %}
46 | {% include "components/error.jinja2" %}
47 | {% endif %}
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/dependency_parsing.jinja2:
--------------------------------------------------------------------------------
1 |
19 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/discourse_parsing.jinja2:
--------------------------------------------------------------------------------
1 |
2 |
10 |
13 |
14 |
15 |
16 |
17 | 談話関係 |
18 | タイプ |
19 | Modifier |
20 | Head |
21 |
22 |
23 |
24 | {% for clause in analyzed_document.clauses %}
25 | {% for discourse_relation in clause.discourse_relations %}
26 |
27 | {{ discourse_relation.label.value }} |
28 |
29 | {% if discourse_relation.is_explicit %}
30 | 明示的
31 | {% else %}
32 | 非明示的
33 | {% endif %}
34 | |
35 | {{ discourse_relation.modifier.text }} |
36 | {{ discourse_relation.head.text }} |
37 |
38 | {% endfor %}
39 | {% endfor %}
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/error.jinja2:
--------------------------------------------------------------------------------
1 |
2 |
解析器の実行中に以下のエラーが発生しました。
3 |
{{ error }}
4 |
5 | GitHub の Issue に報告してください。
6 |
7 |
8 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/form.jinja2:
--------------------------------------------------------------------------------
1 |
9 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/hide_all_button.jinja2:
--------------------------------------------------------------------------------
1 |
4 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/morphological_analysis.jinja2:
--------------------------------------------------------------------------------
1 |
3 |
11 |
14 |
15 |
16 |
17 |
18 | 表層文字列 |
19 | 読み |
20 | 原形 |
21 | 品詞 |
22 | 品詞細分類 |
23 | 活用型 |
24 | 活用形 |
25 | 意味情報 |
26 |
27 |
28 |
29 | {% for morpheme in analyzed_document.morphemes %}
30 |
31 | {{ morpheme.text }} |
32 | {{ morpheme.reading }} |
33 | {{ morpheme.lemma }} |
34 | {{ morpheme.pos }} |
35 | {{ morpheme.subpos }} |
36 | {{ morpheme.conjtype }} |
37 | {{ morpheme.conjform }} |
38 | {{ morpheme.semantics.to_sstring().strip('"') }} |
39 |
40 | {% endfor %}
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/named_entity_recognition.jinja2:
--------------------------------------------------------------------------------
1 |
2 |
10 |
13 |
14 | {% for span in get_entity_spans(analyzed_document) %}
15 | {% if span.label %}
16 |
17 | {{ span.text }}
18 | {{ span.label }}
19 |
20 | {% else %}
21 | {{ span.text }}
22 | {% endif %}
23 | {% endfor %}
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/navbar.jinja2:
--------------------------------------------------------------------------------
1 |
6 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/raw_input.jinja2:
--------------------------------------------------------------------------------
1 | テキスト
2 | {{ text }}
3 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/raw_output.jinja2:
--------------------------------------------------------------------------------
1 |
2 |
10 |
13 |
14 |
{{ raw_output }}
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/show_all_button.jinja2:
--------------------------------------------------------------------------------
1 |
4 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/typo_correction.jinja2:
--------------------------------------------------------------------------------
1 |
2 |
10 |
13 |
14 | {% for diff in get_string_diff(text, analyzed_document.text) %}
15 | {% if diff.label == '+' %}
16 | {{ diff.text }}
17 | {% elif diff.label == '-' %}
18 | {{ diff.text }}
19 | {% else %}
20 | {{ diff.text }}
21 | {% endif %}
22 | {% endfor %}
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/components/word_splitting.jinja2:
--------------------------------------------------------------------------------
1 |
3 |
11 |
14 |
15 | {% for morpheme in analyzed_document.morphemes %}{{ morpheme.text + " " }}{% endfor %}
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/jumanpp.jinja2:
--------------------------------------------------------------------------------
1 | {% extends "base.jinja2" %}
2 | {% block result %}
3 |
4 | {% include "components/word_splitting.jinja2" %}
5 | {% include "components/morphological_analysis.jinja2" %}
6 | {% with raw_output = analyzed_document.to_jumanpp() %}
7 | {% include "components/raw_output.jinja2" %}
8 | {% endwith %}
9 |
10 | {% endblock result %}
11 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/knp.jinja2:
--------------------------------------------------------------------------------
1 | {% extends "base.jinja2" %}
2 | {% block result %}
3 |
4 | {% include "components/word_splitting.jinja2" %}
5 | {% include "components/morphological_analysis.jinja2" %}
6 | {% with tree = draw_tree(analyzed_document, show_pas=True) %}
7 | {% include "components/dependency_parsing.jinja2" %}
8 | {% endwith %}
9 | {% include "components/discourse_parsing.jinja2" %}
10 | {% with raw_output = analyzed_document.to_knp() %}
11 | {% include "components/raw_output.jinja2" %}
12 | {% endwith %}
13 |
14 | {% endblock result %}
15 |
--------------------------------------------------------------------------------
/src/rhoknp/cli/templates/kwja.jinja2:
--------------------------------------------------------------------------------
1 | {% extends "base.jinja2" %}
2 | {% block result %}
3 |
4 | {% include "components/typo_correction.jinja2" %}
5 | {% include "components/word_splitting.jinja2" %}
6 | {% include "components/morphological_analysis.jinja2" %}
7 | {% include "components/named_entity_recognition.jinja2" %}
8 | {% with tree = draw_tree(analyzed_document, show_rel=True) %}
9 | {% include "components/dependency_parsing.jinja2" %}
10 | {% endwith %}
11 | {% include "components/discourse_parsing.jinja2" %}
12 | {% with raw_output = analyzed_document.to_knp() %}
13 | {% include "components/raw_output.jinja2" %}
14 | {% endwith %}
15 |
16 | {% endblock result %}
17 |
--------------------------------------------------------------------------------
/src/rhoknp/cohesion/__init__.py:
--------------------------------------------------------------------------------
1 | from rhoknp.cohesion.argument import Argument, ArgumentType, EndophoraArgument, ExophoraArgument
2 | from rhoknp.cohesion.coreference import Entity, EntityManager
3 | from rhoknp.cohesion.discourse import DiscourseRelation, DiscourseRelationLabel, DiscourseRelationTag
4 | from rhoknp.cohesion.exophora import ExophoraReferent, ExophoraReferentType
5 | from rhoknp.cohesion.pas import Pas
6 | from rhoknp.cohesion.predicate import Predicate
7 | from rhoknp.cohesion.rel import RelMode, RelTag, RelTagList
8 |
9 | __all__ = [
10 | "Argument",
11 | "ArgumentType",
12 | "DiscourseRelation",
13 | "DiscourseRelationLabel",
14 | "DiscourseRelationTag",
15 | "EndophoraArgument",
16 | "Entity",
17 | "EntityManager",
18 | "ExophoraArgument",
19 | "ExophoraReferent",
20 | "ExophoraReferentType",
21 | "Pas",
22 | "Predicate",
23 | "RelMode",
24 | "RelTag",
25 | "RelTagList",
26 | ]
27 |
--------------------------------------------------------------------------------
/src/rhoknp/cohesion/argument.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from enum import Enum
3 | from typing import TYPE_CHECKING, Optional, Union
4 |
5 | from rhoknp.cohesion.exophora import ExophoraReferent
6 | from rhoknp.cohesion.predicate import Predicate
7 |
8 | if TYPE_CHECKING:
9 | from rhoknp.cohesion.pas import Pas
10 | from rhoknp.units.base_phrase import BasePhrase
11 | from rhoknp.units.clause import Clause
12 | from rhoknp.units.document import Document
13 | from rhoknp.units.phrase import Phrase
14 | from rhoknp.units.sentence import Sentence
15 |
# Hiragana and katakana alphabets listed at corresponding positions; together
# they define a hiragana-to-katakana translation table, used to normalize case
# particle surface forms before comparing them with (katakana) case names.
_HIRAGANA = "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろわをんーゎゐゑゕゖゔゝゞ"
_KATAKANA = "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロワヲンーヮヰヱヵヶヴヽヾ"
HIRA2KATA = str.maketrans(_HIRAGANA, _KATAKANA)
19 |
20 |
class ArgumentType(Enum):
    """Type of an argument."""

    CASE_EXPLICIT = "C"  #: Case element in a direct dependency (the case is explicit).
    CASE_HIDDEN = "N"  #: Case element in a direct dependency (the case is not explicit).
    OMISSION = "O"  #: Referent of an omitted element.
    DEMONSTRATIVE = "D"  #: Referent of a demonstrative.
    EXOPHORA = "E"  #: Exophoric referent (e.g. 不特定:人).
    UNASSIGNED = "U"  #: No case element assigned.
30 |
31 |
class BaseArgument(ABC):
    """Base class for arguments.

    Args:
        case: The case of the argument relative to the predicate.
        arg_type: The type of the argument.
    """

    def __init__(self, case: str, arg_type: ArgumentType) -> None:
        self.case: str = case  #: The case relative to the predicate.
        self.type: ArgumentType = arg_type  #: The type of the argument.
        self.optional: bool = False  #: Whether the argument is adjunct-like (optional).
        self._pas: Optional["Pas"] = None  # attached later via the ``pas`` property

    @abstractmethod
    def __str__(self) -> str:
        raise NotImplementedError

    @abstractmethod
    def __repr__(self) -> str:
        raise NotImplementedError

    @abstractmethod
    def __eq__(self, other: object) -> bool:
        raise NotImplementedError

    @property
    def pas(self) -> "Pas":
        """The predicate-argument structure this argument belongs to."""
        assert self._pas is not None
        return self._pas

    @pas.setter
    def pas(self, pas: "Pas") -> None:
        """Set the predicate-argument structure."""
        self._pas = pas

    def is_special(self) -> bool:
        """Return True if this argument is exophoric."""
        return self.type == ArgumentType.EXOPHORA
72 |
73 |
class EndophoraArgument(BaseArgument):
    """Class representing an argument realized by a base phrase in the text (endophora).

    Args:
        case: The case of the argument relative to the predicate.
        base_phrase: The base phrase forming the core of the argument.
        arg_type: The type of the argument.
    """

    def __init__(
        self,
        case: str,
        base_phrase: "BasePhrase",
        predicate: Predicate,
        arg_type: Optional[ArgumentType] = None,
    ) -> None:
        # When arg_type is not given, infer it from the dependency relation
        # between the predicate and the argument base phrase.
        super().__init__(case, arg_type or self._get_arg_type(predicate, base_phrase, case))
        self.base_phrase = base_phrase  #: The base phrase forming the core of the argument.

    def __repr__(self) -> str:
        return f"<{self.__module__}.{self.__class__.__name__}: {self.case!r}, {self.base_phrase.text!r}>"

    def __str__(self) -> str:
        return self.base_phrase.text

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, type(self)):
            return False
        # Only compare predicates when both sides have a PAS attached.
        if self._pas is not None and other._pas is not None:
            if self.pas.predicate != other.pas.predicate:
                return False
        return self.case == other.case and self.base_phrase == other.base_phrase

    @property
    def document(self) -> "Document":
        """The document containing the argument's base phrase.

        Raises:
            AttributeError: If the analysis result is inaccessible.
        """
        return self.base_phrase.document

    @property
    def sentence(self) -> "Sentence":
        """The sentence containing the argument's base phrase."""
        return self.base_phrase.sentence

    @property
    def clause(self) -> "Clause":
        """The clause containing the argument's base phrase.

        Raises:
            AttributeError: If the analysis result is inaccessible.
        """
        return self.base_phrase.clause

    @property
    def phrase(self) -> "Phrase":
        """The phrase containing the argument's base phrase."""
        return self.base_phrase.phrase

    @staticmethod
    def _get_arg_type(predicate: Predicate, arg_base_phrase: "BasePhrase", case: str) -> ArgumentType:
        """Infer the argument type from the dependency between the predicate and the argument."""
        if predicate.base_phrase.parent_index is None:
            return ArgumentType.UNASSIGNED
        if arg_base_phrase in predicate.base_phrase.children:
            # The argument directly depends on the predicate; check whether the
            # trailing case particle spells out the case explicitly
            # (hiragana is normalized to katakana before comparison).
            tail_morpheme = arg_base_phrase.morphemes[-1]
            if tail_morpheme.subpos == "格助詞" and tail_morpheme.text.translate(HIRA2KATA) == case:
                return ArgumentType.CASE_EXPLICIT
            else:
                return ArgumentType.CASE_HIDDEN
        elif predicate.base_phrase.parent and predicate.base_phrase.parent == arg_base_phrase:
            return ArgumentType.CASE_HIDDEN
        else:
            # No direct dependency in either direction: the argument is omitted.
            return ArgumentType.OMISSION
149 |
150 |
class ExophoraArgument(BaseArgument):
    """Class representing an argument that refers to something outside the text (exophora).

    Args:
        case: The case of the argument relative to the predicate.
        exophora_referent: The exophoric referent (e.g. 不特定:人).
        eid: The entity ID.
    """

    def __init__(self, case: str, exophora_referent: ExophoraReferent, eid: int) -> None:
        super().__init__(case, ArgumentType.EXOPHORA)
        self.exophora_referent = exophora_referent  #: The exophoric referent.
        self.eid = eid  #: The entity ID.

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(case={self.case!r}, exophora_referent={self.exophora_referent!r}, "
            f"eid={self.eid!r})"
        )

    def __str__(self) -> str:
        return str(self.exophora_referent)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, type(self)):
            return False
        # Only compare predicates when both sides have a PAS attached.
        if self._pas is not None and other._pas is not None:
            if self.pas.predicate != other.pas.predicate:
                return False
        return self.case == other.case and self.exophora_referent == other.exophora_referent
181 |
182 |
183 | Argument = Union[EndophoraArgument, ExophoraArgument]
184 |
--------------------------------------------------------------------------------
/src/rhoknp/cohesion/exophora.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from enum import Enum
4 | from typing import ClassVar, Optional
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 |
class ExophoraReferentType(Enum):
    """Enum of referent types for exophora (reference to something outside the text)."""

    WRITER = "著者"  #: The writer of the text.
    READER = "読者"  #: The reader of the text.
    UNSPECIFIED_PERSON = "不特定:人"  #: An unspecified person.
    UNSPECIFIED_MATTER = "不特定:物"  #: An unspecified thing.
    UNSPECIFIED_SITUATION = "不特定:状況"  #: An unspecified situation.
    PREVIOUS_SENTENCE = "前文"  #: The previous sentence.
    NEXT_SENTENCE = "後文"  #: The next sentence.
    OTHER = "OTHER"  #: Any other referent.
20 |
21 |
class ExophoraReferent:
    """Class representing an exophoric referent."""

    # The pattern lost its named groups to markup stripping; ``__init__`` below
    # reads ``match["index"]`` and ``match["type"]``, so the groups must be
    # named ``type`` and ``index``. ``\d`` is equivalent to the former ``[0-9\d]``.
    PAT: ClassVar[re.Pattern] = re.compile(
        rf"^(?P<type>{'|'.join(t.value for t in ExophoraReferentType if t != ExophoraReferentType.OTHER)})"
        r"(?P<index>\d*)$"
    )

    def __init__(self, text: str) -> None:
        self.index: Optional[int] = None  # numeric suffix (e.g. the 1 in 不特定:人1), if any
        self._other_text: Optional[str] = None  # raw text, kept only for OTHER referents
        match: Optional[re.Match[str]] = self.PAT.match(text)
        if match is None:
            # Unknown referents are preserved verbatim under the OTHER type.
            logger.warning(f"unknown exophora referent found: {text}")
            self.type = ExophoraReferentType.OTHER
            self._other_text = text
        else:
            index = match["index"]
            if index:
                self.index = int(index)
            self.type = ExophoraReferentType(match["type"])

    @property
    def text(self) -> str:
        """Text representation of the exophoric referent."""
        if self.type != ExophoraReferentType.OTHER:
            return str(self.type.value) + str(self.index or "")
        else:
            assert self._other_text is not None
            return self._other_text

    def is_singleton(self) -> bool:
        """Return True if at most one such entity can exist in a document."""
        if self.type in (ExophoraReferentType.WRITER, ExophoraReferentType.READER):
            return True
        if self.index is not None:
            return True
        return False

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(text={self.text!r})"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, type(self)) or self.type != other.type:
            return False
        if self.type == ExophoraReferentType.OTHER:
            return self._other_text == other._other_text
        return self.index == other.index
73 |
--------------------------------------------------------------------------------
/src/rhoknp/cohesion/predicate.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, Optional
2 |
3 | if TYPE_CHECKING:
4 | from rhoknp.cohesion.pas import Pas
5 | from rhoknp.units.base_phrase import BasePhrase
6 | from rhoknp.units.clause import Clause
7 | from rhoknp.units.document import Document
8 | from rhoknp.units.phrase import Phrase
9 | from rhoknp.units.sentence import Sentence
10 |
11 |
class Predicate:
    """Class representing a predicate.

    Args:
        base_phrase: The base phrase forming the core of the predicate.
        cfid: The case frame ID.
    """

    def __init__(self, base_phrase: "BasePhrase", cfid: Optional[str] = None) -> None:
        self.base_phrase: "BasePhrase" = base_phrase  #: The base phrase forming the core of the predicate.
        self.cfid: Optional[str] = cfid  #: The case frame ID.
        self._pas: Optional["Pas"] = None  # attached later via the ``pas`` property

    @property
    def text(self) -> str:
        """Surface string of the predicate."""
        return self.base_phrase.text

    @property
    def sid(self) -> str:
        """Sentence ID."""
        return self.base_phrase.sentence.sid

    @property
    def pas(self) -> "Pas":
        """The predicate-argument structure."""
        assert self._pas is not None
        return self._pas

    @pas.setter
    def pas(self, pas: "Pas") -> None:
        """Set the predicate-argument structure.

        Args:
            pas: The predicate-argument structure.
        """
        self._pas = pas

    @property
    def document(self) -> "Document":
        """The document containing the predicate's base phrase.

        Raises:
            AttributeError: If the analysis result is inaccessible.
        """
        return self.base_phrase.document

    @property
    def sentence(self) -> "Sentence":
        """The sentence containing the predicate's base phrase."""
        return self.base_phrase.sentence

    @property
    def clause(self) -> "Clause":
        """The clause containing the predicate's base phrase.

        Raises:
            AttributeError: If the analysis result is inaccessible.
        """
        return self.base_phrase.clause

    @property
    def phrase(self) -> "Phrase":
        """The phrase containing the predicate's base phrase."""
        return self.base_phrase.phrase

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        return f"<{self.__module__}.{self.__class__.__name__}: {self.text!r}>"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, type(self)) or self.base_phrase != other.base_phrase:
            return False
        # A missing cfid on either side is treated as a wildcard.
        if self.cfid is None or other.cfid is None:
            return True
        return self.cfid == other.cfid
90 |
--------------------------------------------------------------------------------
/src/rhoknp/cohesion/rel.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from dataclasses import dataclass
4 | from enum import Enum
5 | from typing import ClassVar, Optional
6 |
# Relation types that represent case (predicate-argument) relations in the
# relation-annotated corpus; the "≒"-suffixed (approximate) variants are
# appended below.
CASE_TYPES = [
    "ガ",
    "デ",
    "ト",
    "ニ",
    "ノ",
    "ヘ",
    "ヲ",
    "カラ",
    "ガ2",
    "ノ?",
    "マデ",
    "ヨリ",
    "トイウ",
    "トシテ",
    "トスル",
    "ニオク",
    "ニシテ",
    "ニツク",
    "ニトル",
    "ニヨル",
    "マデニ",
    "ニオイテ",
    "ニカワル",
    "ニソッテ",
    "ニツイテ",
    "ニトッテ",
    "ニムケテ",
    "ニムケル",
    "ニヨッテ",
    "ニヨラズ",
    "ニアワセテ",
    "ニカギッテ",
    "ニカギラズ",
    "ニカランデ",
    "ニカワッテ",
    "ニカンシテ",
    "ニカンスル",
    "ニクラベテ",
    "ニクワエテ",
    "ニタイシテ",
    "ニタイスル",
    "ニツヅイテ",
    "ニナランデ",
    "ヲツウジテ",
    "ヲツウジル",
    "ヲノゾイテ",
    "ヲフクメテ",
    "ヲメグッテ",
    "ニトモナッテ",
    "ニモトヅイテ",
    "無",
    "修飾",
    "判ガ",
    "時間",
    "外の関係",
]
CASE_TYPES += [case + "≒" for case in CASE_TYPES]

# Relation types that represent coreference relations, plus their approximate
# ("≒") variants.
COREF_TYPES = ["=", "=構", "=役"]
COREF_TYPES += [coref + "≒" for coref in COREF_TYPES]

logger = logging.getLogger(__name__)
70 |
71 |
class RelMode(Enum):
    """Enum describing how multiple rel tags of the same type attached to one base phrase relate to each other.

    .. note::
        Examples of each mode:

        * AND
          (例)太郎と花子が学校から<帰った>(ガ格:太郎, ガ格:花子 [and])
        * OR
          (例)私は田園調布か国立に<住みたい>(ガ格:私, ニ格:田園調布, ニ格:国立 [or])
        * AMBIGUOUS
          (例)高知県の橋本知事は…国籍条項を<撤廃する>方針を明らかにした(ガ格:高知県, ガ格:橋本知事 [?], ガ格:不特定:人 [?], ヲ格:条項, 外の関係:方針)

    .. note::
        A target of 「なし」 indicates that a rel tag of that type is optional.
        (例)太郎は一人で<立っていた>(ガ格:太郎, デ格:一人, デ格:なし [?])
    """

    AND = "AND"  #: The targets of the relation are coordinated.
    OR = "OR"  #: The targets are semantically in an "A or B" relation.
    AMBIGUOUS = "?"  #: Every interpretation is plausible and the context cannot decide.
92 | AMBIGUOUS = "?" #: いずれの解釈も妥当であり,文脈から判断ができない.
93 |
94 |
@dataclass(frozen=True)
class RelTag:
    """Class representing a <rel> tag in the relation-annotated corpus."""

    # NOTE(review): the original pattern and serializer were destroyed by
    # markup stripping; reconstructed to match the group names used by
    # ``RelTagList.from_fstring`` (type, mode, target, sid, id) — confirm
    # against the upstream source.
    PAT: ClassVar[re.Pattern] = re.compile(
        r'<rel type="(?P<type>\S+?)"( mode="(?P<mode>[^>]+?)")?( target="(?P<target>.+?)")?'
        r'( sid="(?P<sid>.*?)" id="(?P<id>\d+?)")?/>'
    )
    type: str  # relation type (a case type or a coreference type)
    target: str  # surface string of the relation target
    sid: Optional[str]  # sentence ID of the target, if in-text
    base_phrase_index: Optional[int]  # index of the target base phrase within its sentence
    mode: Optional[RelMode]  # relation mode when several tags of one type coexist

    def __post_init__(self) -> None:
        # Warn about unknown relation types so annotation errors surface early.
        if self.is_coreference():
            if self.type not in COREF_TYPES:
                logger.warning(f"Unknown coreference type: {self.type} ({self})")
        # Flattened from a nested else/if (previously silenced with PLR5501).
        elif self.type not in CASE_TYPES:
            logger.warning(f"Unknown case type: {self.type} ({self})")

    def to_fstring(self) -> str:
        """Convert to a feature string."""
        ret = f'<rel type="{self.type}"'
        if self.mode is not None:
            ret += f' mode="{self.mode.value}"'
        ret += f' target="{self.target}"'
        if self.sid is not None:
            ret += f' sid="{self.sid}" id="{self.base_phrase_index}"'
        ret += "/>"
        return ret

    def is_coreference(self) -> bool:
        """Return True if this tag represents a coreference/anaphora relation."""
        return self.type.startswith("=")
132 |
133 |
class RelTagList(list[RelTag]):
    """Class representing a sequence of <rel> tags in the relation-annotated corpus."""

    @classmethod
    def from_fstring(cls, fstring: str) -> "RelTagList":
        """Create an instance from a KNP feature string."""
        rel_tags = []
        for match in RelTag.PAT.finditer(fstring):
            rel_tags.append(
                RelTag(
                    type=match["type"],
                    target=match["target"],
                    sid=match["sid"],
                    # Absent groups come through as empty/None; map them to None.
                    base_phrase_index=int(match["id"]) if match["id"] else None,
                    mode=RelMode(match["mode"]) if match["mode"] else None,
                )
            )
        return cls(rel_tags)

    def to_fstring(self) -> str:
        """Convert to a feature string."""
        return "".join(rel_tag.to_fstring() for rel_tag in self)

    def __str__(self) -> str:
        return self.to_fstring()
159 |
--------------------------------------------------------------------------------
/src/rhoknp/processors/__init__.py:
--------------------------------------------------------------------------------
1 | from rhoknp.processors.jumanpp import Jumanpp
2 | from rhoknp.processors.knp import KNP
3 | from rhoknp.processors.kwja import KWJA
4 | from rhoknp.processors.senter import RegexSenter
5 |
6 | __all__ = ["KNP", "KWJA", "Jumanpp", "RegexSenter"]
7 |
--------------------------------------------------------------------------------
/src/rhoknp/processors/processor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Union, overload
3 |
4 | from rhoknp.units import Document, Sentence
5 |
6 |
class Processor(ABC):
    """Abstract base class for analyzers."""

    @overload
    def __call__(self, text: str, timeout: int = 10) -> Document: ...

    @overload
    def __call__(self, text: Sentence, timeout: int = 10) -> Sentence: ...

    @overload
    def __call__(self, text: Document, timeout: int = 10) -> Document: ...

    def __call__(self, text: Union[str, Sentence, Document], timeout: int = 10) -> Union[Document, Sentence]:
        """Apply the analyzer to *text*.

        Args:
            text: The text to analyze.
            timeout: Maximum processing time in seconds.

        Raises:
            TypeError: If *text* is not a str, Sentence, or Document.

        .. note::
            Delegates to :meth:`apply`, which dispatches on the argument type:
            ``str`` and ``Document`` go to ``apply_to_document``; ``Sentence``
            goes to ``apply_to_sentence``.
        """
        return self.apply(text, timeout=timeout)

    @overload
    def apply(self, text: str, timeout: int = 10) -> Document: ...

    @overload
    def apply(self, text: Sentence, timeout: int = 10) -> Sentence: ...

    @overload
    def apply(self, text: Document, timeout: int = 10) -> Document: ...

    def apply(self, text: Union[str, Sentence, Document], timeout: int = 10) -> Union[Document, Sentence]:
        """Apply the analyzer to *text*.

        Args:
            text: The text to analyze.
            timeout: Maximum processing time in seconds.

        Raises:
            TypeError: If *text* is not a str, Sentence, or Document.

        .. note::
            Dispatches on the argument type: ``str`` and ``Document`` go to
            ``apply_to_document``; ``Sentence`` goes to ``apply_to_sentence``.
        """
        if isinstance(text, Sentence):
            return self.apply_to_sentence(text, timeout=timeout)
        if isinstance(text, (Document, str)):
            return self.apply_to_document(text, timeout=timeout)
        raise TypeError("Invalid type: text must be str, Sentence, or Document")

    @abstractmethod
    def apply_to_document(self, document: Union[Document, str], timeout: int = 10) -> Document:
        """Apply the analyzer to a document.

        Args:
            document: The document.
            timeout: Maximum processing time in seconds.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_to_sentence(self, sentence: Union[Sentence, str], timeout: int = 10) -> Sentence:
        """Apply the analyzer to a sentence.

        Args:
            sentence: The sentence.
            timeout: Maximum processing time in seconds.
        """
        raise NotImplementedError
88 |
--------------------------------------------------------------------------------
/src/rhoknp/processors/senter.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import threading
4 | from typing import ClassVar, Union
5 |
6 | try:
7 | from typing import override # type: ignore[attr-defined]
8 | except ImportError:
9 | from typing_extensions import override
10 |
11 | from rhoknp.processors.processor import Processor
12 | from rhoknp.units import Document, Sentence
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
class RegexSenter(Processor):
    """Regex-based sentence splitter.

    Example:
        >>> from rhoknp import RegexSenter
        >>> senter = RegexSenter()
        >>> document = senter.apply("天気が良かったので散歩した。途中で先生に会った。")
    """

    _PERIOD_PAT: ClassVar[re.Pattern] = re.compile(r"[。.?!♪☆★…?!]+")  #: Characters treated as sentence-ending punctuation.

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"

    @override
    def apply_to_document(self, document: Union[Document, str], timeout: int = 10) -> Document:
        """Apply RegexSenter to a document.

        Args:
            document: The document (or raw text) to split into sentences.
            timeout: Maximum processing time in seconds.

        Raises:
            TimeoutError: If splitting does not finish within ``timeout`` seconds.
        """
        if isinstance(document, str):
            document = Document(document)
        doc_id = document.doc_id

        sentences: list[str] = []

        def worker() -> None:
            nonlocal sentences
            sentences = self._split_document(document.text)

        # Run the splitter in a daemon thread so a pathological input cannot
        # hang the caller beyond `timeout`.
        thread = threading.Thread(target=worker, daemon=True)
        thread.start()
        thread.join(timeout)

        if thread.is_alive():
            raise TimeoutError(f"Operation timed out after {timeout} seconds.")

        ret = Document.from_sentences(sentences)
        if doc_id != "":
            # Propagate the original document ID to the result.
            ret.doc_id = doc_id
            for sentence in ret.sentences:
                sentence.doc_id = doc_id
        return ret

    @override
    def apply_to_sentence(self, sentence: Union[Sentence, str], timeout: int = 10) -> Sentence:
        """Apply RegexSenter to a sentence (a sentence needs no further splitting).

        Args:
            sentence: The sentence.
            timeout: Maximum processing time in seconds (unused).
        """
        if isinstance(sentence, str):
            sentence = Sentence(sentence)
        return sentence

    def _split_document(self, text: str) -> list[str]:
        """Split raw text into a list of sentence strings."""
        if text == "":
            return []

        def split_text_by_period(text: str) -> list[str]:
            # Cut after every run of sentence-ending punctuation.
            segments: list[str] = []
            start: int = 0
            for match in self._PERIOD_PAT.finditer(text):
                end: int = match.end()
                segments.append(text[start:end])
                start = end
            if start < len(text):
                segments.append(text[start:])
            return [segment.strip() for segment in segments]

        sentences: list[str] = []
        for line in text.split("\n"):
            # Split by periods
            sentence_candidates: list[str] = split_text_by_period(line)

            # Merge sentence candidates so that strings in parentheses or brackets are not split
            parenthesis_level: int = 0
            hook_bracket_level: int = 0
            double_hook_bracket_level: int = 0
            sentence: str = ""
            while sentence_candidates:
                sentence_candidate: str = sentence_candidates.pop(0)

                sentence += sentence_candidate

                # BUG FIX: the second counting line duplicated the half-width
                # parenthesis count; it must track full-width parentheses so
                # that text inside （…） is not split.
                parenthesis_level += sentence_candidate.count("(") - sentence_candidate.count(")")
                parenthesis_level += sentence_candidate.count("（") - sentence_candidate.count("）")
                hook_bracket_level += sentence_candidate.count("「") - sentence_candidate.count("」")
                double_hook_bracket_level += sentence_candidate.count("『") - sentence_candidate.count("』")
                if parenthesis_level == hook_bracket_level == double_hook_bracket_level == 0:
                    if sentence.strip():
                        sentences.append(sentence.strip())
                    sentence = ""
            if sentence.strip():
                # Unbalanced brackets left over: fall back to a plain period split.
                sentences.extend(split_text_by_period(sentence.strip()))

        return sentences
117 |
--------------------------------------------------------------------------------
/src/rhoknp/props/__init__.py:
--------------------------------------------------------------------------------
1 | from rhoknp.props.dependency import DepType
2 | from rhoknp.props.feature import FeatureDict
3 | from rhoknp.props.memo import MemoTag
4 | from rhoknp.props.named_entity import NamedEntity, NamedEntityCategory
5 | from rhoknp.props.semantics import SemanticsDict
6 |
7 | __all__ = [
8 | "DepType",
9 | "FeatureDict",
10 | "MemoTag",
11 | "NamedEntity",
12 | "NamedEntityCategory",
13 | "SemanticsDict",
14 | ]
15 |
--------------------------------------------------------------------------------
/src/rhoknp/props/dependency.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
class DepType(Enum):
    """Enumeration of dependency types between phrases and base phrases."""

    DEPENDENCY = "D"  # regular dependency
    PARALLEL = "P"  # parallel structure
    APPOSITION = "A"  # apposition
    IMPERFECT_PARALLEL = "I"  # imperfect parallel structure
11 |
--------------------------------------------------------------------------------
/src/rhoknp/props/feature.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from typing import ClassVar, Union
4 |
5 | logger = logging.getLogger(__name__)
6 |
7 |
class FeatureDict(dict[str, Union[str, bool]]):
    """Feature information of a phrase, base phrase, or morpheme.

    Maps a feature key to its string value, or to ``True`` for valueless features.
    """

    IGNORE_TAG_PREFIXES: ClassVar[set[str]] = {"rel ", "memo "}
    # BUG FIX: the named groups were missing (``(?P(...)`` is not valid regex
    # syntax and re.compile raises).  ``from_fstring`` reads match["key"] /
    # match["value"], so the groups must be named "key" and "value"; the outer
    # group of PAT is named "feats" for use by unit parsers.
    _FEATURE_KEY_PAT: ClassVar[re.Pattern] = re.compile(r"(?P<key>([^:\"]|\"[^\"]*?\")+?)")
    _FEATURE_VALUE_PAT: ClassVar[re.Pattern] = re.compile(r"(?P<value>([^>\\]|\\>?)+)")
    PAT: ClassVar[re.Pattern] = re.compile(
        rf"(?P<feats>(<{_FEATURE_KEY_PAT.pattern}(:{_FEATURE_VALUE_PAT.pattern})?>)*)"
    )
    FEATURE_PAT: ClassVar[re.Pattern] = re.compile(
        rf"<(?!({'|'.join(IGNORE_TAG_PREFIXES)})){_FEATURE_KEY_PAT.pattern}(:{_FEATURE_VALUE_PAT.pattern})?>"
    )

    def __setitem__(self, key: str, value: Union[str, bool]) -> None:
        """Set a feature, rejecting keys that are managed elsewhere ("rel", "memo")."""
        if key == "rel":
            logger.warning(
                f"Adding 'rel' to {self.__class__.__name__} is not supported and was ignored. Instead, add a RelTag "
                f"object to BasePhrase.rel_tags and call Document.reparse()."
            )
            return
        if key == "memo":
            logger.warning(
                f"Adding 'memo' to {self.__class__.__name__} is not supported and was ignored. Instead, set a MemoTag "
                f"object to BasePhrase.memo_tag."
            )
            return
        super().__setitem__(key, value)

    @classmethod
    def from_fstring(cls, fstring: str) -> "FeatureDict":
        """Parse a feature string into a dictionary.

        Example: "<正規化代表表記:遅れる/おくれる>" -> {"正規化代表表記": "遅れる/おくれる"}

        Args:
            fstring: Feature string in KNP format.
        """
        features = cls()
        for match in cls.FEATURE_PAT.finditer(fstring):
            # A feature without a value (e.g. <用言>) is stored as True.
            features[match["key"]] = match["value"].replace(r"\>", ">") if match["value"] is not None else True
        return features

    def to_fstring(self) -> str:
        """Serialize back to a feature string."""
        return "".join(self._item_to_fstring(k, v) for k, v in self.items())

    @staticmethod
    def _item_to_fstring(key: str, value: Union[str, bool]) -> str:
        """Serialize one key/value pair; False yields nothing, True yields <key>."""
        if value is False:
            return ""
        if value is True:
            return f"<{key}>"
        escaped_value = value.replace(">", r"\>")  # escape ">"
        return f"<{key}:{escaped_value}>"
62 |
--------------------------------------------------------------------------------
/src/rhoknp/props/memo.py:
--------------------------------------------------------------------------------
1 | import re
2 | from dataclasses import dataclass
3 | from typing import ClassVar
4 |
5 |
@dataclass(frozen=True)
class MemoTag:
    """Represents a memo tag in a relation-annotated corpus.

    Serialized form: ``<memo text="..."/>``.
    """

    # BUG FIX: the pattern and the f-string in ``to_fstring`` were empty
    # (the markup literal was stripped).  ``from_fstring`` reads
    # match["text"], so the group must be named "text".
    PAT: ClassVar[re.Pattern] = re.compile(r'<memo text="(?P<text>.*?)"/>')
    text: str = ""  # Memo content.

    @classmethod
    def from_fstring(cls, fstring: str) -> "MemoTag":
        """Create a MemoTag from a KNP feature string.

        Args:
            fstring: Feature string possibly containing a memo tag.
        """
        match = cls.PAT.search(fstring)
        memo_tag = MemoTag(text=match["text"] if match is not None else "")
        return memo_tag

    def to_fstring(self) -> str:
        """Serialize to the feature-string representation."""
        return f'<memo text="{self.text}"/>'

    def __str__(self) -> str:
        return self.to_fstring()

    def __bool__(self) -> bool:
        """True iff the memo has non-empty text."""
        return bool(self.text)
29 |
--------------------------------------------------------------------------------
/src/rhoknp/props/named_entity.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from dataclasses import dataclass
4 | from enum import Enum
5 | from typing import TYPE_CHECKING, ClassVar, Optional
6 |
7 | if TYPE_CHECKING:
8 | from rhoknp.units.morpheme import Morpheme
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
class NamedEntityCategory(Enum):
    """Enumeration of named-entity categories."""

    ORGANIZATION = "ORGANIZATION"
    PERSON = "PERSON"
    LOCATION = "LOCATION"
    ARTIFACT = "ARTIFACT"
    DATE = "DATE"
    TIME = "TIME"
    MONEY = "MONEY"
    PERCENT = "PERCENT"
    OPTIONAL = "OPTIONAL"

    @classmethod
    def has_value(cls, value: str) -> bool:
        """Return True if *value* names a valid named-entity category.

        Args:
            value: Candidate category string.
        """
        return value in {member.value for member in cls}
34 |
35 |
@dataclass
class NamedEntity:
    """Represents a named entity."""

    # BUG FIX: the pattern's leading part was stripped, leaving invalid regex
    # syntax.  ``from_fstring`` reads match["cat"] / match["name"], and
    # ``to_fstring`` emits "<NE:CATEGORY:text>", so the pattern is restored
    # accordingly.
    PAT: ClassVar[re.Pattern] = re.compile(r"<NE:(?P<cat>\w+):(?P<name>([^>\\]|\\>?)+)>")

    category: NamedEntityCategory  # Category of the named entity.
    morphemes: list["Morpheme"]  # Morphemes that compose the named entity.

    def __str__(self) -> str:
        return self.text

    @property
    def text(self) -> str:
        """Surface string of the named entity."""
        return "".join(m.text for m in self.morphemes)

    @classmethod
    def from_fstring(cls, fstring: str, candidate_morphemes: list["Morpheme"]) -> Optional["NamedEntity"]:
        """Create a NamedEntity from a KNP feature string.

        Args:
            fstring: Feature string, e.g. "<NE:ORGANIZATION:京都大学>".
            candidate_morphemes: Morphemes that may compose the entity.

        Returns:
            The parsed named entity, or None if the string or span is invalid.
        """
        match = cls.PAT.match(fstring)
        if match is None:
            logger.warning(f"{fstring} is not a valid NE fstring")
            return None
        category: str = match["cat"]
        if not NamedEntityCategory.has_value(category):
            logger.warning(f"{candidate_morphemes[0].sentence.sid}: unknown NE category: {category}")
            return None
        name: str = match["name"].replace(r"\>", ">")  # unescape ">"
        span = cls._find_morpheme_span(name, candidate_morphemes)
        if span is None:
            logger.warning(f"{candidate_morphemes[0].sentence.sid}: morpheme span of '{name}' not found")
            return None
        return NamedEntity(NamedEntityCategory(category), candidate_morphemes[span.start : span.stop])

    def to_fstring(self) -> str:
        """Serialize to the feature-string representation."""
        escaped_text = self.text.replace(">", r"\>")  # escape ">"
        return f"<NE:{self.category.value}:{escaped_text}>"

    @staticmethod
    def _find_morpheme_span(name: str, candidates: list["Morpheme"]) -> Optional[range]:
        """Return the range of candidate morphemes whose concatenation equals *name*.

        The search starts from the rightmost possible end, so the span closest
        to the end of *candidates* wins.

        Args:
            name: Surface string of the named entity.
            candidates: Candidate morphemes.
        """
        stop = len(candidates)
        while stop > 0:
            for start in reversed(range(stop)):
                if "".join(m.text for m in candidates[start:stop]) == name:
                    return range(start, stop)
            stop -= 1
        return None
91 |
--------------------------------------------------------------------------------
/src/rhoknp/props/semantics.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Optional, Union
3 |
4 |
class SemanticsDict(dict[str, Union[str, bool]]):
    """Semantic information of a morpheme.

    Maps a semantic key to its string value, or to ``True`` for valueless entries.
    """

    NIL = "NIL"
    # BUG FIX: the named groups were missing (``(?P(...)`` is invalid regex
    # syntax and re.compile raises).  ``from_sstring`` reads match["key"] /
    # match["value"]; the outer group of PAT is named "sems".
    PAT = re.compile(rf'(?P<sems>("[^"]+?")|{NIL})')
    SEM_PAT = re.compile(r"(?P<key>[^:\s]+)(:(?P<value>\S+))?(\s|$)")

    def __init__(self, semantics: Optional[dict[str, Union[str, bool]]] = None, is_nil: bool = False) -> None:
        if semantics is None:
            semantics = {}
        super().__init__(semantics)
        self.nil: bool = is_nil  # whether this dict stands for the literal "NIL"

    def is_nil(self) -> bool:
        """Return True if this represents NIL."""
        return self.nil

    @classmethod
    def from_sstring(cls, sstring: str) -> "SemanticsDict":
        """Parse a semantic-information string into a dictionary.

        Example: "代表表記:日本/にほん 地名:国" -> {"代表表記": "日本/にほん", "地名": "国"}

        Args:
            sstring: Semantic-information string in KNP format.
        """
        is_nil = sstring == cls.NIL
        semantics = {}
        if not is_nil:
            for match in cls.SEM_PAT.finditer(sstring.strip('"')):
                # A key without a value is stored as True.
                semantics[match["key"]] = match["value"] or True
        return cls(semantics, is_nil)

    def to_sstring(self) -> str:
        """Serialize back to a semantic-information string."""
        if len(self) == 0:
            return "" if not self.is_nil() else self.NIL
        return f'"{" ".join(self._item_to_sstring(k, v) for k, v in self.items())}"'

    @staticmethod
    def _item_to_sstring(key: str, value: Union[str, bool]) -> str:
        """Serialize one key/value pair; False yields nothing, True yields the bare key."""
        if value is False:
            return ""
        if value is True:
            return f"{key}"
        return f"{key}:{value}"
51 |
--------------------------------------------------------------------------------
/src/rhoknp/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/src/rhoknp/py.typed
--------------------------------------------------------------------------------
/src/rhoknp/units/__init__.py:
--------------------------------------------------------------------------------
1 | from rhoknp.units.base_phrase import BasePhrase
2 | from rhoknp.units.clause import Clause
3 | from rhoknp.units.document import Document
4 | from rhoknp.units.morpheme import Morpheme
5 | from rhoknp.units.phrase import Phrase
6 | from rhoknp.units.sentence import Sentence
7 |
8 | __all__ = ["BasePhrase", "Clause", "Document", "Morpheme", "Phrase", "Sentence"]
9 |
--------------------------------------------------------------------------------
/src/rhoknp/units/unit.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from collections.abc import Sequence
3 | from typing import Optional
4 |
5 |
class Unit(ABC):
    """Abstract base class for linguistic units."""

    def __init__(self) -> None:
        self._text: Optional[str] = None  # cached surface string

    def __post_init__(self) -> None:
        # Recursively finalize all descendant units.
        if self.child_units is None:
            return
        for child in self.child_units:
            child.__post_init__()

    @abstractmethod
    def __eq__(self, other: object) -> bool:
        raise NotImplementedError

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        return f"<{self.__module__}.{self.__class__.__name__}: {self.text!r}>"

    @property
    @abstractmethod
    def parent_unit(self) -> Optional["Unit"]:
        """The unit one level above, if any."""
        raise NotImplementedError

    @property
    @abstractmethod
    def child_units(self) -> Optional[Sequence["Unit"]]:
        """The units one level below, if any."""
        raise NotImplementedError

    @property
    def text(self) -> str:
        """Surface string of this unit.

        Computed lazily by concatenating the children when not set explicitly.

        Raises:
            AttributeError: If no text is set and there are no child units.
        """
        if self._text is None:
            if self.child_units is None:
                raise AttributeError
            self._text = "".join(str(child) for child in self.child_units)
        return self._text

    @text.setter
    def text(self, text: str) -> None:
        """Set the surface string of this unit.

        Args:
            text: Surface string.
        """
        self._text = text
57 |
--------------------------------------------------------------------------------
/src/rhoknp/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ku-nlp/rhoknp/62e79c2f2212cdcff63b0d11261817537bdae9f3/src/rhoknp/utils/__init__.py
--------------------------------------------------------------------------------
/src/rhoknp/utils/comment.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from typing import Optional
4 |
5 | from rhoknp.units.morpheme import Morpheme
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
def is_comment_line(line: str) -> bool:
    """Determine whether *line* is a comment line.

    A line counts as a comment only if it starts with "#" and is not
    recognized as a morpheme line.

    Args:
        line: The line to check.

    Returns:
        bool: True for comment lines.
    """
    if not line.startswith("#"):
        return False
    return not Morpheme.is_morpheme_line(line)
20 |
21 |
def extract_did_and_sid(comment_line: str, patterns: list[re.Pattern]) -> tuple[Optional[str], Optional[str], str]:
    """Extract the document ID and sentence ID from a comment line.

    Args:
        comment_line: The comment line.
        patterns: Regexes used to extract the document ID; the first that matches wins.

    Returns:
        Optional[str]: Document ID (None if not found).
        Optional[str]: Sentence ID (None if not found).
        str: The rest of the comment line.
    """
    sid_match = re.match(r"# S-ID: ?(\S*)( .+)?$", comment_line)
    if sid_match is None:
        return None, None, comment_line.lstrip("#").lstrip()
    sid_string = sid_match[1]
    remainder = sid_match[2].lstrip() if sid_match[2] else ""
    for pattern in patterns:
        matched = pattern.match(sid_string)
        if matched is not None:
            return matched["did"], matched["sid"], remainder
    logger.warning(f"Invalid S-ID: {sid_string}")
    return None, None, comment_line.lstrip("#").lstrip()
43 |
--------------------------------------------------------------------------------
/src/rhoknp/utils/reader.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from collections.abc import Iterator
4 | from functools import partial
5 | from typing import Callable, Optional, TextIO, Union
6 |
7 | from rhoknp import Sentence
8 | from rhoknp.utils.comment import extract_did_and_sid
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
def chunk_by_sentence(f: TextIO) -> Iterator[str]:
    """Yield one sentence's worth of an analysis-result file at a time.

    Args:
        f: The file to split.

    Example:
        >>> from rhoknp.units import Sentence
        >>> from rhoknp.utils.reader import chunk_by_sentence
        >>> with open("example.knp") as f:
        ...     for knp in chunk_by_sentence(f):
        ...         sentence = Sentence.from_knp(knp)
    """
    chunk: list[str] = []
    for raw_line in f:
        if not raw_line.strip():
            continue  # skip blank lines
        chunk.append(raw_line)
        if raw_line.rstrip("\n") == Sentence.EOS:
            yield "".join(chunk)
            chunk = []
    if chunk:  # trailing sentence with no EOS marker
        yield "".join(chunk)
36 |
37 |
def chunk_by_document(f: TextIO, doc_id_format: Union[str, Callable] = "default") -> Iterator[str]:
    """Yield one document's worth of an analysis-result file at a time.

    Args:
        f: The file to split.
        doc_id_format: Format of the document ID.

    Example:
        >>> from rhoknp.units import Document
        >>> from rhoknp.utils.reader import chunk_by_document
        >>> with open("example.knp") as f:
        ...     for knp in chunk_by_document(f):
        ...         document = Document.from_knp(knp)

    .. note::
        The following document-ID formats can be specified:
            * "default": everything before the last hyphen of the sentence ID (S-ID).
              e.g. # S-ID:A-X-1 -> document ID: A-X
            * "kwdlc": extract the document ID from a KWDLC sentence ID.
              e.g. # S-ID:w201106-0000060050-1 -> document ID: w201106-0000060050
            * "wac": extract the document ID from a WAC sentence ID.
              e.g. # S-ID:wiki00100176-00 -> document ID: wiki00100176

        If a callable is given, it is treated as a function that extracts the
        document ID from the first line of a sentence's analysis result.
        For example, the equivalent of "default" is:

        >>> def default_doc_id_format(line: str) -> str:
        ...     return line.lstrip("# S-ID:").rsplit("-", maxsplit=1)[0]
    """
    extract_doc_id: Callable[[str], Optional[str]]
    if isinstance(doc_id_format, str):
        pat_by_name = {
            "default": Sentence.SID_PAT,
            "kwdlc": Sentence.SID_PAT_KWDLC,
            "wac": Sentence.SID_PAT_WAC,
        }
        if doc_id_format not in pat_by_name:
            raise ValueError(f"Invalid doc_id_format: {doc_id_format}")
        extract_doc_id = partial(_extract_doc_id, pat=pat_by_name[doc_id_format])
    elif callable(doc_id_format):
        extract_doc_id = doc_id_format
    else:
        raise TypeError(f"Invalid doc_id_format: {doc_id_format}")

    prev_doc_id: Optional[str] = None
    chunk: list[str] = []
    for sentence in chunk_by_sentence(f):
        doc_id = extract_doc_id(sentence.split("\n")[0])
        # Start a new document whenever the ID changes or cannot be determined.
        if chunk and (doc_id is None or doc_id != prev_doc_id):
            yield "".join(chunk)
            chunk = []
        chunk.append(sentence)
        prev_doc_id = doc_id
    if chunk:
        yield "".join(chunk)
93 |
94 |
def _extract_doc_id(line: str, pat: re.Pattern) -> Optional[str]:
    """Extract the document ID from a comment line.

    Args:
        line: Comment line containing the sentence ID.
        pat: Regex used to extract the document ID.
    """
    return extract_did_and_sid(line, [pat])[0]
104 |
--------------------------------------------------------------------------------
/tests/bin/jumanpp-mock.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Mock of the Juman++ binary for tests: reads lines from stdin forever and
# prints a fixed two-morpheme analysis.  Special trigger inputs simulate a
# slow analyzer, a crashing analyzer, and inputs that should later trigger
# the KNP mock's slow/error paths.

while true; do
    read -r line

    # Simulate a slow analyzer.
    if [ "$line" = "time consuming input" ]; then
        sleep 5
    fi

    # Simulate an analyzer crash with stderr output.
    if [ "$line" = "error causing input" ]; then
        echo 'エラー1' >&2
        echo 'エラー2' >&2
        exit 1
    fi

    # Pass trigger lines through as comments so knp-mock.sh can react to them.
    if [ "$line" = "knp time consuming input" ]; then
        echo '# knp time consuming input'
    fi

    if [ "$line" = "knp error causing input" ]; then
        echo '# knp error causing input'
    fi

    # Fixed analysis result returned for every input.
    echo 'こんにちは こんにちは こんにちは 感動詞 12 * 0 * 0 * 0 "代表表記:こんにちは/こんにちは"'
    echo 'さようなら さようなら さようなら 感動詞 12 * 0 * 0 * 0 "代表表記:さようなら/さようなら"'
    echo 'EOS'
done
--------------------------------------------------------------------------------
/tests/bin/knp-mock.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Mock of the KNP binary for tests: reads lines from stdin forever and prints
# a fixed single-sentence analysis.  Special comment lines (produced by
# jumanpp-mock.sh) simulate a slow or crashing analyzer.

while true; do
    read -r line

    # Simulate a slow analyzer.
    if [ "$line" = "# knp time consuming input" ]; then
        sleep 5
    fi

    # Simulate an analyzer crash with stderr output.
    if [ "$line" = "# knp error causing input" ]; then
        echo 'エラー1' >&2
        echo 'エラー2' >&2
        exit 1
    fi

    # Fixed analysis result returned for every input.
    echo '# S-ID:1 KNP:5.0-5c637eb DATE:2023/08/23 SCORE:-22.40768'
    echo '* -1D <文頭><文末><体言><用言:判><体言止><レベル:C><区切:5-5><裸名詞><提題受:30><主節><状態述語><正規化代表表記:こんにちは/こんにちは><主辞代表表記:こんにちは/こんにちは>'
    echo '+ -1D <文頭><文末><体言><用言:判><体言止><レベル:C><区切:5-5><裸名詞><提題受:30><主節><状態述語><判定詞句><名詞項候補><正規化代表表記:こんにちは/こんにちは><主辞代表表記:こんにちは/こんにちは><用言代表表記:こんにちは/こんにちは><節-区切><節-主辞><時制:非過去><格解析結果:こんにちは/こんにちは:判0:ニ/U/-/-/-/-;カラ/U/-/-/-/-><標準用言代表表記:こんにちは/こんにちは>'
    echo 'こんにちは こんにちは こんにちは 感動詞 12 * 0 * 0 * 0 "代表表記:こんにちは/こんにちは" <代表表記:こんにちは/こんにちは><正規化代表表記:こんにちは/こんにちは><かな漢字><ひらがな><文頭><文末><表現文末><自立><内容語><タグ単位始><文節始><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾>'
    echo 'EOS'
done
22 |
--------------------------------------------------------------------------------
/tests/bin/kwja-mock.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Mock of the KWJA binary for tests: reads lines from stdin forever and prints
# a fixed single-document analysis (KNP format terminated by EOD).  Special
# trigger inputs simulate a slow or crashing analyzer.

while true; do
    read -r line

    # Simulate a slow analyzer.
    if [ "$line" = "time consuming input" ]; then
        sleep 5
    fi

    # Simulate an analyzer crash with stderr output.
    if [ "$line" = "error causing input" ]; then
        echo 'エラー1' >&2
        echo 'エラー2' >&2
        exit 1
    fi

    # Fixed analysis result returned for every input.
    echo '# S-ID:1 KNP:5.0-5c637eb DATE:2023/08/23 SCORE:-22.40768'
    echo '* -1D <文頭><文末><体言><用言:判><体言止><レベル:C><区切:5-5><裸名詞><提題受:30><主節><状態述語><正規化代表表記:こんにちは/こんにちは><主辞代表表記:こんにちは/こんにちは>'
    echo '+ -1D <文頭><文末><体言><用言:判><体言止><レベル:C><区切:5-5><裸名詞><提題受:30><主節><状態述語><判定詞句><名詞項候補><正規化代表表記:こんにちは/こんにちは><主辞代表表記:こんにちは/こんにちは><用言代表表記:こんにちは/こんにちは><節-区切><節-主辞><時制:非過去><格解析結果:こんにちは/こんにちは:判0:ニ/U/-/-/-/-;カラ/U/-/-/-/-><標準用言代表表記:こんにちは/こんにちは>'
    echo 'こんにちは こんにちは こんにちは 感動詞 12 * 0 * 0 * 0 "代表表記:こんにちは/こんにちは" <代表表記:こんにちは/こんにちは><正規化代表表記:こんにちは/こんにちは><かな漢字><ひらがな><文頭><文末><表現文末><自立><内容語><タグ単位始><文節始><文節主辞><用言表記先頭><用言表記末尾><用言意味表記末尾>'
    echo 'EOS'
    echo 'EOD'
done
23 |
--------------------------------------------------------------------------------
/tests/cli/test_cat.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 |
3 | from rhoknp.cli.cat import print_document
4 | from rhoknp.units import Document
5 |
# KNP analysis fixture: one sentence, "望遠鏡で泳いでいる少女を見た。".
knp = textwrap.dedent(
    """\
    # S-ID:1
    * 1D
    + 1D
    望遠 ぼうえん 望遠 名詞 6 普通名詞 1 * 0 * 0 "代表表記:望遠/ぼうえん カテゴリ:抽象物"
    + 2D
    鏡 きょう 鏡 名詞 6 普通名詞 1 * 0 * 0 "代表表記:鏡/きょう カテゴリ:人工物-その他 漢字読み:音"
    で で で 助詞 9 格助詞 1 * 0 * 0 NIL
    * 2D
    + 3D
    泳いで およいで 泳ぐ 動詞 2 * 0 子音動詞ガ行 4 タ系連用テ形 14 "代表表記:泳ぐ/およぐ"
    いる いる いる 接尾辞 14 動詞性接尾辞 7 母音動詞 1 基本形 2 "代表表記:いる/いる"
    * 3D
    + 4D
    少女 しょうじょ 少女 名詞 6 普通名詞 1 * 0 * 0 "代表表記:少女/しょうじょ カテゴリ:人"
    を を を 助詞 9 格助詞 1 * 0 * 0 NIL
    * -1D
    + -1D <節-区切><節-主辞>
    見た みた 見る 動詞 2 * 0 母音動詞 1 タ形 10 "代表表記:見る/みる 自他動詞:自:見える/みえる 補文ト"
    。 。 。 特殊 1 句点 1 * 0 * 0 NIL
    EOS
    """
)
31 |
def test_print_document() -> None:
    """print_document runs without error on a valid document."""
    doc = Document.from_knp(knp)
    print_document(doc)
35 |
36 |
def test_print_document_dark() -> None:
    """print_document runs without error in dark mode."""
    doc = Document.from_knp(knp)
    print_document(doc, is_dark=True)
40 |
--------------------------------------------------------------------------------
/tests/cli/test_cli.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import textwrap
3 |
4 | import pytest
5 | from typer.testing import CliRunner
6 |
7 | from rhoknp import Document, __version__
8 | from rhoknp.cli.cli import app
9 |
runner = CliRunner()  # shared Typer CLI test runner


# KNP analysis fixture used as input for the CLI commands under test
# (one sentence: "望遠鏡で泳いでいる少女を見た。").
knp_text = textwrap.dedent(
    """\
    # S-ID:1
    * 1D
    + 1D
    望遠 ぼうえん 望遠 名詞 6 普通名詞 1 * 0 * 0 "代表表記:望遠/ぼうえん カテゴリ:抽象物"
    + 2D
    鏡 きょう 鏡 名詞 6 普通名詞 1 * 0 * 0 "代表表記:鏡/きょう カテゴリ:人工物-その他 漢字読み:音"
    で で で 助詞 9 格助詞 1 * 0 * 0 NIL
    * 2D
    + 3D
    泳いで およいで 泳ぐ 動詞 2 * 0 子音動詞ガ行 4 タ系連用テ形 14 "代表表記:泳ぐ/およぐ"
    いる いる いる 接尾辞 14 動詞性接尾辞 7 母音動詞 1 基本形 2 "代表表記:いる/いる"
    * 3D
    + 4D
    少女 しょうじょ 少女 名詞 6 普通名詞 1 * 0 * 0 "代表表記:少女/しょうじょ カテゴリ:人"
    を を を 助詞 9 格助詞 1 * 0 * 0 NIL
    * -1D
    + -1D <節-区切><節-主辞>
    見た みた 見る 動詞 2 * 0 母音動詞 1 タ形 10 "代表表記:見る/みる 自他動詞:自:見える/みえる 補文ト"
    。 。 。 特殊 1 句点 1 * 0 * 0 NIL
    EOS
    """
)
37 |
38 |
def test_version() -> None:
    """`rhoknp -v` prints the package version and exits cleanly."""
    res = runner.invoke(app, ["-v"])
    assert res.exit_code == 0
    assert res.stdout.strip() == f"rhoknp version: {__version__}"
43 |
44 |
def test_cat() -> None:
    """`rhoknp cat FILE` succeeds on a valid KNP file."""
    with tempfile.NamedTemporaryFile("wt") as f:
        f.write(Document.from_knp(knp_text).to_knp())
        f.flush()
        res = runner.invoke(app, ["cat", f.name])
        assert res.exit_code == 0
52 |
53 |
@pytest.fixture
def _mock_stdin(monkeypatch: pytest.MonkeyPatch) -> None:
    # Substitute the KNP fixture text for sys.stdin so CLI commands that read
    # from stdin receive it.  NOTE(review): stdin is replaced with a plain str,
    # which presumably satisfies how the CLI consumes it — confirm.
    monkeypatch.setattr("sys.stdin", knp_text)
57 |
58 |
@pytest.mark.usefixtures("_mock_stdin")
def test_cat_stdin() -> None:
    """`rhoknp cat` reads from stdin when no file is given."""
    res = runner.invoke(app, ["cat"])
    assert res.exit_code == 0
63 |
64 |
def test_convert() -> None:
    """`rhoknp convert FILE` succeeds for every supported output format."""
    doc = Document.from_knp(knp_text)
    with tempfile.NamedTemporaryFile("wt") as f:
        f.write(doc.to_knp())
        f.flush()
        for format_ in ("text", "jumanpp", "knp"):
            assert runner.invoke(app, ["convert", f.name, "--format", format_]).exit_code == 0
73 |
74 |
@pytest.mark.usefixtures("_mock_stdin")
def test_convert_stdin() -> None:
    """`rhoknp convert` reads stdin and succeeds for every supported format."""
    for format_ in ("text", "jumanpp", "knp"):
        assert runner.invoke(app, ["convert", "--format", format_]).exit_code == 0
80 |
81 |
def test_convert_value_error() -> None:
    """`rhoknp convert` fails with exit code 1 for an unknown format."""
    with tempfile.NamedTemporaryFile("wt") as f:
        f.write(Document.from_knp(knp_text).to_knp())
        f.flush()
        res = runner.invoke(app, ["convert", f.name, "--format", "foo"])  # Unknown format
        assert res.exit_code == 1
89 |
90 |
def test_show() -> None:
    """`rhoknp show FILE` succeeds on a valid KNP file."""
    with tempfile.NamedTemporaryFile("wt") as f:
        f.write(Document.from_knp(knp_text).to_knp())
        f.flush()
        res = runner.invoke(app, ["show", f.name])
        assert res.exit_code == 0
98 |
99 |
def test_show_error() -> None:
    """`rhoknp show` exits with a usage error (2) for a nonexistent file."""
    res = runner.invoke(app, ["show", "foo.knp"])  # not exist
    assert res.exit_code == 2
103 |
104 |
def test_stats() -> None:
    """`rhoknp stats FILE` succeeds on a valid KNP file."""
    with tempfile.NamedTemporaryFile("wt") as f:
        f.write(Document.from_knp(knp_text).to_knp())
        f.flush()
        res = runner.invoke(app, ["stats", f.name])
        assert res.exit_code == 0
112 |
113 |
def test_stats_json() -> None:
    """`rhoknp stats --json FILE` succeeds on a valid KNP file."""
    doc = Document.from_knp(knp_text)
    with tempfile.NamedTemporaryFile("wt") as f:
        f.write(doc.to_knp())
        # BUG FIX: flush before invoking the CLI so it sees the written
        # content (all sibling tests flush; this one did not).
        f.flush()
        result = runner.invoke(app, ["stats", f.name, "--json"])
        assert result.exit_code == 0
120 |
121 |
def test_stats_error() -> None:
    """`rhoknp stats` exits with a usage error (2) for a nonexistent file."""
    res = runner.invoke(app, ["stats", "foo.knp"])  # not exist
    assert res.exit_code == 2
125 |
126 |
def test_serve_error() -> None:
    """`rhoknp serve` without required options exits with a usage error (2)."""
    res = runner.invoke(app, ["serve"])
    assert res.exit_code == 2
130 |
--------------------------------------------------------------------------------
/tests/cli/test_stats.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 |
3 | from rhoknp.cli.stats import get_document_statistics
4 | from rhoknp.units import Document
5 |
# KNP analysis fixture: one sentence, "望遠鏡で泳いでいる少女を見た。".
knp = textwrap.dedent(
    """\
    # S-ID:1
    * 1D
    + 1D
    望遠 ぼうえん 望遠 名詞 6 普通名詞 1 * 0 * 0 "代表表記:望遠/ぼうえん カテゴリ:抽象物"
    + 2D
    鏡 きょう 鏡 名詞 6 普通名詞 1 * 0 * 0 "代表表記:鏡/きょう カテゴリ:人工物-その他 漢字読み:音"
    で で で 助詞 9 格助詞 1 * 0 * 0 NIL
    * 2D
    + 3D
    泳いで およいで 泳ぐ 動詞 2 * 0 子音動詞ガ行 4 タ系連用テ形 14 "代表表記:泳ぐ/およぐ"
    いる いる いる 接尾辞 14 動詞性接尾辞 7 母音動詞 1 基本形 2 "代表表記:いる/いる"
    * 3D
    + 4D
    少女 しょうじょ 少女 名詞 6 普通名詞 1 * 0 * 0 "代表表記:少女/しょうじょ カテゴリ:人"
    を を を 助詞 9 格助詞 1 * 0 * 0 NIL
    * -1D
    + -1D <節-区切><節-主辞>
    見た みた 見る 動詞 2 * 0 母音動詞 1 タ形 10 "代表表記:見る/みる 自他動詞:自:見える/みえる 補文ト"
    。 。 。 特殊 1 句点 1 * 0 * 0 NIL
    EOS
    """
)
30 |
31 |
def test_stats() -> None:
    """get_document_statistics counts units, cohesion items, and named entities."""
    document = Document.from_knp(knp)
    expected = {
        "unit": {
            "sentence": 1,
            "clause": 1,
            "phrase": 4,
            "base_phrase": 5,
            "morpheme": 9,
        },
        "cohesion": {
            "predicate": 0,
            "argument": 0,
            "coreference": 0,
            "discourse": 0,
        },
        "other": {
            "named_entity": 0,
        },
    }
    assert get_document_statistics(document) == expected
53 |
--------------------------------------------------------------------------------
/tests/cohesion/test_argument.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 |
3 | import pytest
4 |
5 | from rhoknp.cohesion import ArgumentType, EndophoraArgument, ExophoraArgument, ExophoraReferent, Pas, Predicate
6 | from rhoknp.units import BasePhrase
7 |
8 |
def test_endophora_argument() -> None:
    """EndophoraArgument exposes case/type/pas and compares by content, not arg_type."""
    argument_base_phrase = BasePhrase.from_knp(
        textwrap.dedent(
            """\
            + 4D
            彼 かれ 彼 名詞 6 普通名詞 1 * 0 * 0
            は は は 助詞 9 副助詞 2 * 0 * 0
            """
        )
    )
    predicate_base_phrase = BasePhrase.from_knp(
        textwrap.dedent(
            """\
            + -1D
            言う いう 言う 動詞 2 * 0 子音動詞ワ行 12 基本形 2
            """
        )
    )
    another_predicate_base_phrase = BasePhrase.from_knp(
        textwrap.dedent(
            """\
            + -1D
            食べる たべる 食べる 動詞 2 * 0 母音動詞 1 基本形 2
            """
        )
    )
    pas = Pas(Predicate(predicate_base_phrase))
    argument = EndophoraArgument("ガ", argument_base_phrase, pas.predicate, arg_type=ArgumentType.OMISSION)
    argument.pas = pas
    assert argument.case == "ガ"
    assert argument.type == ArgumentType.OMISSION
    assert argument.optional is False
    assert argument.is_special() is False
    assert argument.pas == pas
    assert argument.base_phrase == argument_base_phrase
    # The base phrase was parsed in isolation, so there are no parent units to reach.
    for attribute in ("document", "sentence", "clause", "phrase"):
        with pytest.raises(AssertionError):
            getattr(argument, attribute)

    # NOTE(review): this repr expectation looks truncated in this copy — confirm.
    assert repr(argument) == ""
    assert str(argument) == argument_base_phrase.text
    assert argument != "test"
    # Equal despite a different arg_type: equality ignores how the argument was realized.
    another_argument = EndophoraArgument("ガ", argument_base_phrase, pas.predicate, arg_type=ArgumentType.EXOPHORA)
    another_argument.pas = pas
    assert argument == another_argument

    # A different predicate-argument structure breaks equality.
    another_argument.pas = Pas(Predicate(another_predicate_base_phrase))
    assert argument != another_argument
64 |
65 |
def test_exophora_argument() -> None:
    """ExophoraArgument carries an exophora referent and an entity id (eid)."""
    predicate_base_phrase = BasePhrase.from_knp(
        textwrap.dedent(
            """\
            + -1D
            言う いう 言う 動詞 2 * 0 子音動詞ワ行 12 基本形 2
            """
        )
    )
    another_predicate_base_phrase = BasePhrase.from_knp(
        textwrap.dedent(
            """\
            + -1D
            食べる たべる 食べる 動詞 2 * 0 母音動詞 1 基本形 2
            """
        )
    )
    referent = ExophoraReferent("不特定:人")
    pas = Pas(Predicate(predicate_base_phrase))
    argument = ExophoraArgument("ガ", referent, eid=3)
    argument.pas = pas
    assert argument.case == "ガ"
    assert argument.type == ArgumentType.EXOPHORA
    assert argument.optional is False
    assert argument.is_special() is True
    assert argument.pas == pas
    assert argument.exophora_referent == referent
    assert argument.eid == 3
    assert repr(argument) == f"ExophoraArgument(case='ガ', exophora_referent={referent!r}, eid=3)"
    # The repr round-trips through eval back to an equal object.
    assert eval(repr(argument)) == argument
    assert str(argument) == "不特定:人"
    assert argument != "test"
    # Equal despite a different eid, as long as case/referent/pas agree.
    another_argument = ExophoraArgument("ガ", referent, eid=1)
    another_argument.pas = pas
    assert argument == another_argument

    # A different predicate-argument structure breaks equality.
    another_argument.pas = Pas(Predicate(another_predicate_base_phrase))
    assert argument != another_argument
105 |
--------------------------------------------------------------------------------
/tests/cohesion/test_exophora.py:
--------------------------------------------------------------------------------
1 | from rhoknp.cohesion.exophora import ExophoraReferent, ExophoraReferentType
2 |
3 |
def test_exophora() -> None:
    """「著者」 is recognized as the WRITER referent type with no index."""
    referent = ExophoraReferent("著者")
    assert (referent.type, referent.index) == (ExophoraReferentType.WRITER, None)
    assert str(referent) == "著者"
    assert repr(referent) == "ExophoraReferent(text='著者')"
    assert eval(repr(referent)) == referent  # repr round-trips via eval
11 |
12 |
def test_exophora_number() -> None:
    """A trailing digit on a known referent is parsed off as the index."""
    referent = ExophoraReferent("不特定:人3")
    assert (referent.type, referent.index) == (ExophoraReferentType.UNSPECIFIED_PERSON, 3)
    assert str(referent) == "不特定:人3"
    assert repr(referent) == "ExophoraReferent(text='不特定:人3')"
    assert eval(repr(referent)) == referent  # repr round-trips via eval
20 |
21 |
def test_exophora_other() -> None:
    """Unrecognized referent text falls back to OTHER and keeps no index."""
    referent = ExophoraReferent("ほげほげ2")
    assert (referent.type, referent.index) == (ExophoraReferentType.OTHER, None)
    assert str(referent) == "ほげほげ2"
    assert repr(referent) == "ExophoraReferent(text='ほげほげ2')"
    assert eval(repr(referent)) == referent  # repr round-trips via eval
29 |
--------------------------------------------------------------------------------
/tests/cohesion/test_predicate.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 |
3 | import pytest
4 |
5 | from rhoknp.cohesion import Pas, Predicate
6 | from rhoknp.units import BasePhrase
7 |
8 |
def test_predicate() -> None:
    """Predicate wraps a base phrase; equality compares base phrase and cfid."""
    base_phrase = BasePhrase.from_knp(
        textwrap.dedent(
            """\
            + -1D <格解析結果:行く/いく:動12:ガ/N/彼/0/0/1;ニ/U/-/-/-/-;デ/U/-/-/-/-;ヘ/C/大学/3/0/1;時間/U/-/-/-/->
            行った いった 行く 動詞 2 * 0 子音動詞カ行促音便形 3 タ形 10
            。 。 。 特殊 1 句点 1 * 0 * 0 NIL
            """
        )
    )
    predicate = Predicate(base_phrase, cfid="行く/いく:動12")
    pas = Pas(predicate)
    predicate.pas = pas
    assert predicate.base_phrase == base_phrase
    assert predicate.cfid == "行く/いく:動12"
    assert predicate.text == "行った。"
    # Parsed in isolation, so parent units and the sentence id are unreachable.
    for attribute in ("document", "sentence", "clause", "phrase", "sid"):
        with pytest.raises(AssertionError):
            getattr(predicate, attribute)
    assert predicate.pas == pas
    assert str(predicate) == "行った。"
    # NOTE(review): this repr expectation looks truncated in this copy — confirm.
    assert repr(predicate) == ""

    assert predicate != "test"
    assert predicate == Predicate(base_phrase, cfid="行く/いく:動12")
    # A missing cfid on either side still compares equal...
    assert predicate == Predicate(base_phrase, cfid=None)
    # ...but two different concrete cfids do not.
    assert predicate != Predicate(base_phrase, cfid="行く/いく:動3")
45 |
--------------------------------------------------------------------------------
/tests/cohesion/test_rel.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from rhoknp.cohesion.rel import RelMode, RelTagList
4 |
# Raw rel-tag feature strings consumed by RelTagList.from_fstring.
# NOTE(review): the <rel .../> markup appears to have been stripped from this
# copy (all three entries are empty strings); the expected attributes can be
# reconstructed from the assertions in the tests below — verify against the
# pristine test data before relying on these fixtures.
FSTRINGS = [
    """""",
    """""",
    """""",
]
10 |
11 |
def test_from_fstring_0() -> None:
    """The first fixture parses into three rel tags with the expected fields."""
    rel_tags = RelTagList.from_fstring(FSTRINGS[0])
    assert len(rel_tags) == 3

    expected = [
        ("=≒", "オフェンス", "w201106-0001519365-1", 3, None),
        ("=≒", "ディフェンス", "w201106-0001519365-1", 4, RelMode.AND),
        ("ノ?", "著者", None, None, None),
    ]
    for rel_tag, (type_, target, sid, index, mode) in zip(rel_tags, expected):
        assert rel_tag.type == type_
        assert rel_tag.target == target
        assert rel_tag.sid == sid
        assert rel_tag.base_phrase_index == index
        assert rel_tag.mode == mode
36 |
37 |
def test_from_fstring_1() -> None:
    """The second fixture parses into two rel tags with the expected fields."""
    rel_tags = RelTagList.from_fstring(FSTRINGS[1])
    assert len(rel_tags) == 2

    expected = [
        ("ガ", ">", "202209271752-05054-00", 0, None),
        ("ニ", "不特定:人", None, None, None),
    ]
    for rel_tag, (type_, target, sid, index, mode) in zip(rel_tags, expected):
        assert rel_tag.type == type_
        assert rel_tag.target == target
        assert rel_tag.sid == sid
        assert rel_tag.base_phrase_index == index
        assert rel_tag.mode == mode
55 |
56 |
@pytest.mark.parametrize("fstring", FSTRINGS)
def test_to_fstring(fstring: str) -> None:
    """to_fstring round-trips the original feature string."""
    assert RelTagList.from_fstring(fstring).to_fstring() == fstring
61 |
62 |
@pytest.mark.parametrize("fstring", FSTRINGS)
def test_str(fstring: str) -> None:
    """str() matches the original feature string."""
    assert str(RelTagList.from_fstring(fstring)) == fstring
67 |
--------------------------------------------------------------------------------
/tests/data/w201106-0000060050.knp:
--------------------------------------------------------------------------------
1 | # S-ID:w201106-0000060050-1 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-44.94406 MOD:2017/10/15 MEMO:
2 | * 2D
3 | + 1D
4 | コイン こいん コイン 名詞 6 普通名詞 1 * 0 * 0
5 | + 3D
6 | トス とす トス 名詞 6 サ変名詞 2 * 0 * 0
7 | を を を 助詞 9 格助詞 1 * 0 * 0
8 | * 2D
9 | + 3D
10 | 3 さん 3 名詞 6 数詞 7 * 0 * 0
11 | 回 かい 回 接尾辞 14 名詞性名詞助数辞 3 * 0 * 0
12 | * -1D
13 | + -1D
14 | 行う おこなう 行う 動詞 2 * 0 子音動詞ワ行 12 基本形 2
15 | 。 。 。 特殊 1 句点 1 * 0 * 0
16 | EOS
17 | # S-ID:w201106-0000060050-2 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-64.95916 MOD:2013/04/13
18 | * 1D
19 | + 1D
20 | 表 おもて 表 名詞 6 普通名詞 1 * 0 * 0
21 | が が が 助詞 9 格助詞 1 * 0 * 0
22 | * 2D
23 | + 2D
24 | 出た でた 出る 動詞 2 * 0 母音動詞 1 タ形 10
25 | * 5D
26 | + 5D
27 | 数 かず 数 名詞 6 普通名詞 1 * 0 * 0
28 | だけ だけ だけ 助詞 9 副助詞 2 * 0 * 0
29 | 、 、 、 特殊 1 読点 2 * 0 * 0
30 | * 4D
31 | + 4D
32 | フィールド ふぃーるど フィールド 名詞 6 普通名詞 1 * 0 * 0
33 | 上 じょう 上 接尾辞 14 名詞性名詞接尾辞 2 * 0 * 0
34 | の の の 助詞 9 接続助詞 3 * 0 * 0
35 | * 5D
36 | + 5D
37 | モンスター もんすたー モンスター 名詞 6 普通名詞 1 * 0 * 0
38 | を を を 助詞 9 格助詞 1 * 0 * 0
39 | * -1D
40 | + -1D
41 | 破壊 はかい 破壊 名詞 6 サ変名詞 2 * 0 * 0
42 | する する する 動詞 2 * 0 サ変動詞 16 基本形 2
43 | 。 。 。 特殊 1 句点 1 * 0 * 0
44 | EOS
45 | # S-ID:w201106-0000060050-3 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-130.82529 MOD:2016/07/22 MEMO:
46 | * 1D
47 | + 1D
48 | この この この 指示詞 7 連体詞形態指示詞 2 * 0 * 0
49 | * 6D
50 | + 8D
51 | 効果 こうか 効果 名詞 6 普通名詞 1 * 0 * 0
52 | は は は 助詞 9 副助詞 2 * 0 * 0
53 | * 3D
54 | + 3D
55 | 1 いち 1 名詞 6 数詞 7 * 0 * 0
56 | + 4D
57 | ターン たーん ターン 名詞 6 サ変名詞 2 * 0 * 0
58 | に に に 助詞 9 格助詞 1 * 0 * 0
59 | * 6D
60 | + 8D
61 | 1 いち 1 名詞 6 数詞 7 * 0 * 0
62 | 度 ど 度 接尾辞 14 名詞性名詞助数辞 3 * 0 * 0
63 | だけ だけ だけ 助詞 9 副助詞 2 * 0 * 0
64 | * 5D
65 | + 7D
66 | 自分 じぶん 自分 名詞 6 普通名詞 1 * 0 * 0
67 | の の の 助詞 9 接続助詞 3 * 0 * 0
68 | * 6D
69 | + 7D
70 | メイン めいん メインだ 形容詞 3 * 0 ナノ形容詞 22 語幹 1
71 | + 8D
72 | フェイズ ふぇいず フェイズ 名詞 6 普通名詞 1 * 0 * 0
73 | に に に 助詞 9 格助詞 1 * 0 * 0
74 | * -1D
75 | + -1D
76 | 使用 しよう 使用 名詞 6 サ変名詞 2 * 0 * 0
77 | する する する 動詞 2 * 0 サ変動詞 16 基本形 2
78 | 事 こと 事 名詞 6 普通名詞 1 * 0 * 0
79 | が が が 助詞 9 格助詞 1 * 0 * 0
80 | できる できる できる 動詞 2 * 0 母音動詞 1 基本形 2
81 | 。 。 。 特殊 1 句点 1 * 0 * 0
82 | EOS
83 |
--------------------------------------------------------------------------------
/tests/data/w201106-0000074273.knp:
--------------------------------------------------------------------------------
1 | # S-ID:w201106-0000074273-1 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-55.96971 MOD:2011/07/04
2 | * 2D
3 | + 2D
4 | 7 なな 7 名詞 6 数詞 7 * 0 * 0
5 | つ つ つ 接尾辞 14 名詞性名詞助数辞 3 * 0 * 0
6 | の の の 助詞 9 接続助詞 3 * 0 * 0
7 | * 2D
8 | + 2D
9 | 女神 めがみ 女神 名詞 6 普通名詞 1 * 0 * 0
10 | の の の 助詞 9 接続助詞 3 * 0 * 0
11 | * 4D
12 | + 4D
13 | 果実 かじつ 果実 名詞 6 普通名詞 1 * 0 * 0
14 | が が が 助詞 9 格助詞 1 * 0 * 0
15 | * 4D
16 | + 4D
17 | 全て すべて 全て 副詞 8 * 0 * 0 * 0
18 | * 6D
19 | + 7D
20 | そろったら そろったら そろう 動詞 2 * 0 子音動詞ワ行 12 タ系条件形 13
21 | * 6D
22 | + 6D
23 | 天使 てんし 天使 名詞 6 普通名詞 1 * 0 * 0
24 | + 7D
25 | 界 かい 界 名詞 6 普通名詞 1 * 0 * 0
26 | に に に 助詞 9 格助詞 1 * 0 * 0
27 | * -1D
28 | + -1D
29 | 向かい むかい 向かう 動詞 2 * 0 子音動詞ワ行 12 基本連用形 8
30 | ます ます ます 接尾辞 14 動詞性接尾辞 7 動詞性接尾辞ます型 31 基本形 2
31 | 。 。 。 特殊 1 句点 1 * 0 * 0
32 | EOS
33 | # S-ID:w201106-0000074273-2 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-97.54113 MOD:2011/07/04
34 | * 1D
35 | + 1D
36 | ダーマ だーま ダーマ 名詞 6 固有名詞 3 * 0 * 0 "品詞変更:ダーマ-ダーマ-ダーマ-15-2-0-0"
37 | + 2D
38 | 神殿 しんでん 神殿 名詞 6 普通名詞 1 * 0 * 0
39 | + 3D
40 | 南西 なんせい 南西 名詞 6 普通名詞 1 * 0 * 0
41 | に に に 助詞 9 格助詞 1 * 0 * 0
42 | * 3D
43 | + 5D
44 | ある ある ある 動詞 2 * 0 子音動詞ラ行 10 基本形 2
45 | * 3D
46 | + 5D
47 | 青い あおい 青い 形容詞 3 * 0 イ形容詞アウオ段 18 基本形 2
48 | * 4D
49 | + 6D
50 | 木 き 木 名詞 6 普通名詞 1 * 0 * 0
51 | へ へ へ 助詞 9 格助詞 1 * 0 * 0
52 | * 7D
53 | + 9D
54 | 行き いき 行く 動詞 2 * 0 子音動詞カ行促音便形 3 基本連用形 8
55 | * 6D
56 | + 8D
57 | 天 てん 天 名詞 6 普通名詞 1 * 0 * 0
58 | の の の 助詞 9 接続助詞 3 * 0 * 0
59 | * 7D
60 | + 9D
61 | 箱舟 はこぶね 箱舟 名詞 6 普通名詞 1 * 0 * 0
62 | を を を 助詞 9 格助詞 1 * 0 * 0
63 | * -1D
64 | + -1D
65 | 呼ぶ よぶ 呼ぶ 動詞 2 * 0 子音動詞バ行 8 基本形 2
66 | 。 。 。 特殊 1 句点 1 * 0 * 0
67 | EOS
68 | # S-ID:w201106-0000074273-3 JUMAN:6.1-20101108 KNP:3.1-20101107 DATE:2011/06/21 SCORE:-98.41177 MOD:2013/04/22
69 | * 2D
70 | + 2D
71 | 途中 とちゅう 途中 名詞 6 時相名詞 10 * 0 * 0
72 | で で で 助詞 9 格助詞 1 * 0 * 0
73 | * 2D
74 | + 2D
75 | イベント いべんと イベント 名詞 6 普通名詞 1 * 0 * 0
76 | が が が 助詞 9 格助詞 1 * 0 * 0
77 | * 5P
78 | + 5P
79 | 発生 はっせい 発生 名詞 6 サ変名詞 2 * 0 * 0
80 | し し する 動詞 2 * 0 サ変動詞 16 基本連用形 8
81 | 、 、 、 特殊 1 読点 2 * 0 * 0
82 | * 5D
83 | + 5D
84 | 自動 じどう 自動 名詞 6 普通名詞 1 * 0 * 0
85 | 的に てきに 的だ 接尾辞 14 形容詞性名詞接尾辞 6 ナ形容詞 21 ダ列基本連用形 8
86 | * 5D
87 | + 5D
88 | ナザム なざむ ナザム 名詞 6 地名 4 * 0 * 0 "品詞変更:ナザム-ナザム-ナザム-15-2-0-0"
89 | 村 むら 村 接尾辞 14 名詞性特殊接尾辞 4 * 0 * 0
90 | へ へ へ 助詞 9 格助詞 1 * 0 * 0
91 | * -1D
92 | + -1D
93 | 行き いき 行く 動詞 2 * 0 子音動詞カ行促音便形 3 基本連用形 8
94 | ます ます ます 接尾辞 14 動詞性接尾辞 7 動詞性接尾辞ます型 31 基本形 2
95 | 。 。 。 特殊 1 句点 1 * 0 * 0
96 | EOS
97 |
--------------------------------------------------------------------------------
/tests/data/wiki00100176.knp:
--------------------------------------------------------------------------------
1 | # S-ID:wiki00100176-00 KNP:5.0-6a1f607d DATE:2022/04/11 SCORE:50.00000 MOD:2022/04/29 MEMO:
2 | * 2D
3 | + 1D
4 | 株式 かぶしき 株式 名詞 6 普通名詞 1 * 0 * 0
5 | + 3D
6 | 会社 がいしゃ 会社 名詞 6 普通名詞 1 * 0 * 0
7 | + 3D
8 | ポニー ぽにー ポニー 名詞 6 普通名詞 1 * 0 * 0
9 | + 11D
10 | キャニオン きゃにおん キャニオン 名詞 6 普通名詞 1 * 0 * 0
11 | は は は 助詞 9 副助詞 2 * 0 * 0
12 | 、 、 、 特殊 1 読点 2 * 0 * 0
13 | * 2D
14 | + 5P
15 | フジ ふじ フジ 名詞 6 組織名 6 * 0 * 0
16 | + 6D
17 | サンケイ さんけい サンケイ 名詞 6 組織名 6 * 0 * 0
18 | + 11D
19 | グループ ぐるーぷ グループ 名詞 6 普通名詞 1 * 0 * 0
20 | の の の 助詞 9 接続助詞 3 * 0 * 0
21 | * -1D
22 | + 11D
23 | 大手 おおて 大手 名詞 6 普通名詞 1 * 0 * 0
24 | + 9P
25 | 映像 えいぞう 映像 名詞 6 普通名詞 1 * 0 * 0
26 | ・ ・ ・ 特殊 1 記号 5 * 0 * 0
27 | + 10D
28 | 音楽 おんがく 音楽 名詞 6 普通名詞 1 * 0 * 0
29 | + 11D
30 | ソフト そふと ソフト 名詞 6 普通名詞 1 * 0 * 0
31 | + -1D
32 | メーカー めーかー メーカー 名詞 6 普通名詞 1 * 0 * 0
33 | である である だ 判定詞 4 * 0 判定詞 25 デアル列基本形 15
34 | 。 。 。 特殊 1 句点 1 * 0 * 0
35 | EOS
36 | # S-ID:wiki00100176-01 KNP:5.0-6a1f607d DATE:2022/04/11 SCORE:50.00000 MOD:2022/04/29 MEMO:
37 | * 1D
38 | + 1D
39 | 通称 つうしょう 通称 名詞 6 普通名詞 1 * 0 * 0
40 | は は は 助詞 9 副助詞 2 * 0 * 0
41 | * -1D
42 | + -1D
43 | 「 「 「 特殊 1 括弧始 3 * 0 * 0
44 | ポニキャン ぽにきゃん ポニキャン 名詞 6 組織名 6 * 0 * 0
45 | 」 」 」 特殊 1 括弧終 4 * 0 * 0
46 | 。 。 。 特殊 1 句点 1 * 0 * 0
47 | EOS
48 | # S-ID:wiki00100176-02 KNP:5.0-6a1f607d DATE:2022/04/11 SCORE:0.00000 MOD:2022/04/29 MEMO:
49 | * 1D
50 | + 1D
51 | フジ ふじ フジ 名詞 6 組織名 6 * 0 * 0
52 | ・ ・ ・ 特殊 1 記号 5 * 0 * 0
53 | + 2D
54 | メディア めでぃあ メディア 名詞 6 普通名詞 1 * 0 * 0
55 | ・ ・ ・ 特殊 1 記号 5 * 0 * 0
56 | + 4D
57 | ホールディングス ほーるでぃんぐす ホールディングス 名詞 6 普通名詞 1 * 0 * 0
58 | の の の 助詞 9 接続助詞 3 * 0 * 0
59 | * -1D
60 | + 4D
61 | 連結 れんけつ 連結 名詞 6 サ変名詞 2 * 0 * 0
62 | + -1D
63 | 子会社 こがいしゃ 子会社 名詞 6 普通名詞 1 * 0 * 0
64 | 。 。 。 特殊 1 句点 1 * 0 * 0
65 | EOS
66 |
--------------------------------------------------------------------------------
/tests/processors/test_kwja.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from rhoknp import KNP, KWJA, Document, Jumanpp, Sentence
4 |
# Probe once at import time whether a working KWJA executable is installed;
# the heavyweight tests below are skipped when it is not.
is_kwja_available = KWJA(options=["--model-size", "tiny", "--tasks", "typo"]).is_available()
6 |
7 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_get_version() -> None:
    """get_version runs without raising on a working installation."""
    _ = KWJA(options=["--model-size", "tiny"]).get_version()
12 |
13 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_is_available() -> None:
    """A working binary reports available; a bogus one does not and raises on use."""
    assert KWJA(options=["--model-size", "tiny"]).is_available() is True

    broken = KWJA("kwjaaaaaaaaaaaaaaaaa")
    assert broken.is_available() is False

    # Every entry point of an unavailable processor raises RuntimeError.
    with pytest.raises(RuntimeError):
        _ = broken.apply_to_sentence("test")
    with pytest.raises(RuntimeError):
        _ = broken.apply_to_document("test")
    with pytest.raises(RuntimeError):
        _ = broken.get_version()
30 |
31 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_typo() -> None:
    """The typo module corrects 「人口知能」 to 「人工知能」."""
    analyzer = KWJA(options=["--model-size", "tiny", "--tasks", "typo"])
    assert analyzer.apply_to_document("人口知能").text == "人工知能"
38 |
39 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_char() -> None:
    """Char-level analysis segments text but leaves reading/lemma as "*"."""
    analyzer = KWJA(options=["--model-size", "tiny", "--tasks", "char"])
    text = "こんにちは。さようなら。"
    morphemes = analyzer.apply_to_document(text).morphemes
    assert len(morphemes) > 0
    first = morphemes[0]
    assert text.startswith(first.text)
    assert first.reading == "*"
    assert first.lemma == "*"
51 |
52 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_seq2seq() -> None:
    """Seq2seq analysis fills surface, reading, and lemma for each morpheme."""
    analyzer = KWJA(options=["--model-size", "tiny", "--tasks", "char,seq2seq"])
    text = "こんにちは。さようなら。"
    # seq2seq decoding is slow; allow a generous timeout.
    morphemes = analyzer.apply_to_document(text, timeout=60).morphemes
    assert len(morphemes) > 0
    first = morphemes[0]
    for value in (first.text, first.reading, first.lemma):
        assert text.startswith(value)
64 |
65 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_word() -> None:
    """Word-level analysis yields every unit layer, each anchored at the text head."""
    analyzer = KWJA(options=["--model-size", "tiny", "--tasks", "char,word"])
    text = "こんにちは。さようなら。"
    doc = analyzer.apply_to_document(text)
    for units in (doc.morphemes, doc.base_phrases, doc.phrases, doc.clauses):
        assert len(units) > 0
        assert text.startswith(units[0].text)
83 |
84 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_raw_input() -> None:
    """An explicit raw input format behaves like the default for typo correction."""
    analyzer = KWJA(options=["--model-size", "tiny", "--tasks", "typo", "--input-format", "raw"])
    assert analyzer.apply_to_document("人口知能").text == "人工知能"
91 |
92 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_jumanpp_input() -> None:
    """KWJA accepts Juman++ output as input and preserves ids, texts, and morphemes."""
    raw = Document.from_raw_text("こんにちは。さようなら。")
    raw.doc_id = "test"
    analyzed = Jumanpp().apply_to_document(raw)
    for idx, sent in enumerate(analyzed.sentences):
        sent.sent_id = f"test-{idx}"
    assert not analyzed.is_jumanpp_required()
    kwja = KWJA(options=["--model-size", "tiny", "--tasks", "word", "--input-format", "jumanpp"])
    reanalyzed = kwja.apply_to_document(analyzed)
    assert reanalyzed.doc_id == analyzed.doc_id
    assert [s.sid for s in reanalyzed.sentences] == [s.sid for s in analyzed.sentences]
    assert [s.text for s in reanalyzed.sentences] == [s.text for s in analyzed.sentences]
    assert [m.text for m in reanalyzed.morphemes] == [m.text for m in analyzed.morphemes]
108 |
109 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_knp_input() -> None:
    """KWJA accepts KNP output as input and preserves ids, texts, and morphemes."""
    analyzed = KNP().apply_to_document("こんにちは。さようなら。")
    assert not analyzed.is_knp_required()
    kwja = KWJA(options=["--model-size", "tiny", "--tasks", "word", "--input-format", "knp"])
    reanalyzed = kwja.apply_to_document(analyzed)
    assert reanalyzed.doc_id == analyzed.doc_id
    assert [s.sid for s in reanalyzed.sentences] == [s.sid for s in analyzed.sentences]
    assert [s.text for s in reanalyzed.sentences] == [s.text for s in analyzed.sentences]
    assert [m.text for m in reanalyzed.morphemes] == [m.text for m in analyzed.morphemes]
120 |
121 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_apply() -> None:
    """apply dispatches on input type: str/Document ok, Sentence unsupported, others rejected."""
    analyzer = KWJA(options=["--model-size", "tiny"])
    text = "外国人参政権"
    assert isinstance(analyzer.apply(text), Document)
    assert isinstance(analyzer.apply(Document.from_raw_text(text)), Document)
    with pytest.raises(NotImplementedError):
        _ = analyzer.apply(Sentence.from_raw_text(text))
    with pytest.raises(TypeError):
        _ = analyzer.apply(1)  # type: ignore
132 |
133 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_keep_doc_id_document() -> None:
    """Document and sentence ids survive a round trip through KWJA."""
    analyzer = KWJA(options=["--model-size", "tiny"])
    original = Document.from_sentences(["こんにちは。", "さようなら。"])
    original.doc_id = "test"
    for sentence in original.sentences:
        sentence.doc_id = "test"
    analyzed = analyzer.apply_to_document(original)
    assert analyzed.doc_id == "test"
    assert all(sentence.doc_id == "test" for sentence in analyzed.sentences)
145 |
146 |
def test_timeout_error() -> None:
    """A run exceeding the timeout raises TimeoutError (exercised via a mock script)."""
    mock = KWJA("tests/bin/kwja-mock.sh", skip_sanity_check=True)
    with pytest.raises(TimeoutError):
        _ = mock.apply_to_document("time consuming input", timeout=1)
151 |
152 |
def test_runtime_error() -> None:
    """A failing run raises RuntimeError (exercised via a mock script)."""
    mock = KWJA("tests/bin/kwja-mock.sh", skip_sanity_check=True)
    with pytest.raises(RuntimeError):
        _ = mock.apply_to_document("error causing input")
157 |
158 |
def test_unsupported_option() -> None:
    """Invalid task names and input formats are rejected at construction time."""
    with pytest.raises(ValueError, match=r"invalid task: \['wakachi'\]"):
        _ = KWJA(options=["--model-size", "tiny", "--tasks", "wakachi"])
    with pytest.raises(ValueError, match="invalid input format: seq2seq"):
        _ = KWJA(options=["--model-size", "tiny", "--input-format", "seq2seq"])
164 |
165 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_apply_to_sentence() -> None:
    """Sentence-level application is not supported by the KWJA wrapper."""
    with pytest.raises(NotImplementedError):
        _ = KWJA(options=["--model-size", "tiny"]).apply_to_sentence("外国人参政権")
171 |
172 |
@pytest.mark.skipif(not is_kwja_available, reason="KWJA is not available")
def test_repr() -> None:
    """repr reproduces the constructor arguments."""
    analyzer = KWJA(options=["--model-size", "tiny", "--tasks", "char,word"])
    assert repr(analyzer) == "KWJA(executable='kwja', options=['--model-size', 'tiny', '--tasks', 'char,word'])"
177 |
--------------------------------------------------------------------------------
/tests/processors/test_regex_senter.py:
--------------------------------------------------------------------------------
1 | import time
2 | from unittest.mock import MagicMock
3 |
4 | import pytest
5 |
6 | from rhoknp import Document, RegexSenter, Sentence
7 |
8 |
# Sentence-splitting fixtures: raw document text paired with the expected
# sentence strings. Covers 。/./newline terminators, fullwidth and halfwidth
# ?/!, decorative terminators (♪★☆…), quoted sentences that must not split,
# "(笑" endings, numbered-list items, and kaomoji.
@pytest.mark.parametrize(
    ("document", "sentence_strings"),
    [
        (
            "",
            [],
        ),
        (
            "天気がいいので散歩した。",
            ["天気がいいので散歩した。"],
        ),
        (
            "天気がいいので散歩した。散歩の途中で先生に出会った。",
            ["天気がいいので散歩した。", "散歩の途中で先生に出会った。"],
        ),
        (
            "天気がいいので散歩した.散歩の途中で先生に出会った.",
            ["天気がいいので散歩した.", "散歩の途中で先生に出会った."],
        ),
        (
            "天気がいいので散歩した\n散歩の途中で先生に出会った",
            ["天気がいいので散歩した", "散歩の途中で先生に出会った"],
        ),
        (
            "天気がいいので散歩した。散歩の途中で Michael に出会った。",
            ["天気がいいので散歩した。", "散歩の途中で Michael に出会った。"],
        ),
        (
            "今何時ですか?次の予定があるので失礼します。",
            ["今何時ですか?", "次の予定があるので失礼します。"],
        ),
        (
            "今何時ですか?次の予定があるので失礼します。",
            ["今何時ですか?", "次の予定があるので失礼します。"],
        ),
        (
            "今何時ですか!次の予定があるので失礼します。",
            ["今何時ですか!", "次の予定があるので失礼します。"],
        ),
        (
            "今何時ですか! 次の予定があるので失礼します。",
            ["今何時ですか!", "次の予定があるので失礼します。"],
        ),
        (
            "今何時ですか???次の予定があるので失礼します!!!",
            ["今何時ですか???", "次の予定があるので失礼します!!!"],
        ),
        (
            "お疲れ様です♪次の予定があるので失礼します。",
            ["お疲れ様です♪", "次の予定があるので失礼します。"],
        ),
        (
            "お疲れ様です★次の予定があるので失礼します。",
            ["お疲れ様です★", "次の予定があるので失礼します。"],
        ),
        (
            "お疲れ様です☆次の予定があるので失礼します。",
            ["お疲れ様です☆", "次の予定があるので失礼します。"],
        ),
        (
            "なるほど…これは難しい問題ですね。",
            ["なるほど…", "これは難しい問題ですね。"],
        ),
        (
            "テレビで「今年の夏は暑いので、熱中症に注意しましょう。」と言っていた。",
            ["テレビで「今年の夏は暑いので、熱中症に注意しましょう。」と言っていた。"],
        ),
        (
            "そんな(笑\n安心してください(笑",
            ["そんな(笑", "安心してください(笑"],
        ),
        (
            "『君の名は。』は良い作品でした。",
            ["『君の名は。』は良い作品でした。"],
        ),
        (
            "次の問いに答えよ。 1) tan30°は有理数か。 2) tan1°は有理数か。",
            ["次の問いに答えよ。", "1) tan30°は有理数か。", "2) tan1°は有理数か。"],
        ),
        (
            "やっと掃除終わった_(:3 」∠)_もう24時…さっさと寝よう。",
            ["やっと掃除終わった_(:3 」∠)_もう24時…", "さっさと寝よう。"],
        ),
    ],
)
def test_apply_to_document(document: str, sentence_strings: list[str]) -> None:
    """RegexSenter splits a raw document into the expected sentence strings.

    NOTE(review): the comparison is index-wise over the produced sentences, so
    extra *expected* strings would go unchecked if the splitter produced fewer
    sentences — presumably intentional; confirm if strictness is desired.
    """
    senter = RegexSenter()
    doc = senter.apply_to_document(document)
    for i, sentence in enumerate(doc.sentences):
        assert sentence.text == sentence_strings[i]
99 |
100 |
def test_apply_to_sentence() -> None:
    """A single sentence passes through apply_to_sentence unchanged."""
    text = "天気がいいので散歩した。"
    assert RegexSenter().apply_to_sentence(text).text == text
106 |
107 |
def test_keep_id_sentence() -> None:
    """Sentence-level application preserves doc_id and sent_id."""
    sentence = Sentence.from_raw_text("天気がいいので散歩した。")
    sentence.doc_id = "test"
    sentence.sent_id = "test-1"
    sentence = RegexSenter().apply_to_sentence(sentence)
    assert (sentence.doc_id, sentence.sent_id) == ("test", "test-1")
116 |
117 |
def test_keep_id_document() -> None:
    """Document-level application propagates the doc_id to every split sentence."""
    document = Document.from_raw_text("天気がいいので散歩した。散歩の途中で先生に出会った。")
    document.doc_id = "test"
    document = RegexSenter().apply_to_document(document)
    assert document.doc_id == "test"
    assert all(sentence.doc_id == "test" for sentence in document.sentences)
126 |
127 |
def test_repr() -> None:
    """RegexSenter has a no-argument repr."""
    assert repr(RegexSenter()) == "RegexSenter()"
131 |
132 |
def test_timeout() -> None:
    """A split that runs past the timeout raises TimeoutError."""
    senter = RegexSenter()
    # Stub the internal splitter with a slow call to force the timeout path.
    senter._split_document = MagicMock(side_effect=lambda _: time.sleep(5))  # type: ignore
    with pytest.raises(TimeoutError):
        senter.apply_to_document("天気がいいので散歩した。", timeout=3)
138 |
--------------------------------------------------------------------------------
/tests/props/test_features.py:
--------------------------------------------------------------------------------
1 | from dataclasses import astuple, dataclass
2 | from typing import Union
3 |
4 | import pytest
5 |
6 | from rhoknp.props import FeatureDict
7 |
8 |
@dataclass(frozen=True)
class FeaturesTestCase:
    """Fixture for FeatureDict tests: raw fstring, expected parse, entry count."""

    fstring: str  # raw KNP feature string to parse
    features: dict[str, Union[str, bool]]  # expected mapping; valueless tags map to True
    length: int  # expected len() of the parsed FeatureDict
14 |
15 |
# Feature-string fixtures: each pairs a raw fstring with its expected parse.
# NOTE(review): several fstrings look partially stripped of tags in this copy
# (e.g. the first case expects a BGH entry not present in its fstring, the
# second expects an ALT entry from an empty fstring, the third is truncated,
# and the last expects an EID entry) — verify against the pristine test data.
cases = [
    FeaturesTestCase(
        fstring="""<文節内><係:文節内><文頭><体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>""",
        features={
            "BGH": "構文/こうぶん",
            "文節内": True,
            "係": "文節内",
            "文頭": True,
            "体言": True,
            "名詞項候補": True,
            "先行詞候補": True,
            "正規化代表表記": "構文/こうぶん",
        },
        length=8,
    ),
    FeaturesTestCase(
        fstring="""""",
        features={
            'ALT-京都-きょうと-京都-6-4-0-0-"代表表記:京都/きょうと 地名:日本:府"': True,
        },
        length=1,
    ),
    FeaturesTestCase(
        fstring=r"""タグ>""",
        features={
            "NE": r"OPTIONAL:html>タグ",
        },
        length=1,
    ),
    FeaturesTestCase(
        fstring=r"""<係チ:非用言格解析||用言&&文節内:T解析格-ヲ><正規化代表表記:”/”><主辞代表表記:”/”><照応詞候補:最高">""",
        features={
            "係チ": r"非用言格解析||用言&&文節内:T解析格-ヲ",
            "正規化代表表記": "”/”",
            "主辞代表表記": "”/”",
            "照応詞候補": '最高"',
            "EID": "2",
        },
        length=5,
    ),
]
57 |
58 |
# Fixtures whose fstring additionally carries a tag that FeatureDict skips;
# they are excluded from the round-trip test because serialization cannot
# reproduce the ignored tag.
# NOTE(review): this fstring also looks stripped in this copy (no <BGH:...>
# tag despite the expected BGH entry, and the ignored tag itself is not
# visible) — verify against the pristine test data.
cases_with_ignored_tag = [
    FeaturesTestCase(
        fstring="""<解析済><体言>""",
        features={
            "BGH": "関心/かんしん",
            "解析済": True,
            "体言": True,
        },
        length=3,
    ),
]
70 |
71 |
@pytest.mark.parametrize(("fstring", "features", "length"), [astuple(case) for case in cases + cases_with_ignored_tag])
def test_from_fstring(fstring: str, features: dict[str, Union[str, bool]], length: int) -> None:
    """Parsing yields exactly the expected entries; unknown keys read as None."""
    parsed = FeatureDict.from_fstring(fstring)
    assert len(parsed) == length
    assert dict(parsed) == features
    assert parsed.get("dummy") is None
78 |
79 |
@pytest.mark.parametrize("fstring", [case.fstring for case in cases])
def test_to_fstring(fstring: str) -> None:
    """Serialization round-trips the original feature string."""
    assert FeatureDict.from_fstring(fstring).to_fstring() == fstring
84 |
85 |
def test_false() -> None:
    """A False value serializes to an empty fragment (the feature is dropped)."""
    assert FeatureDict._item_to_fstring("sem", False) == ""
88 |
89 |
def test_ignore_tag_prefix() -> None:
    """Assigning rel/memo keys does not store them in the FeatureDict."""
    features = FeatureDict()
    features["rel"] = 'type="ノ" target="ユーザー" sid="w201106-0000060560-1" id="1"'
    assert len(features) == 0

    features["memo"] = 'text="メモ"'
    assert len(features) == 0
97 |
98 |
def test_modification() -> None:
    """Update, insert, and delete operations are reflected in serialization order."""
    features = FeatureDict.from_fstring("""<用言:動><主節>""")
    assert features.to_fstring() == """<用言:動><主節>"""
    features["用言"] = "判"  # update an existing value in place
    assert features.to_fstring() == """<用言:判><主節>"""
    features["文末"] = True  # insert a new valueless feature at the end
    assert features.to_fstring() == """<用言:判><主節><文末>"""
    del features["主節"]  # delete removes the feature entirely
    assert features.to_fstring() == """<用言:判><文末>"""
111 |
--------------------------------------------------------------------------------
/tests/props/test_memo.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from rhoknp.props.memo import MemoTag
4 |
# MemoTag fixtures: raw fstring input, canonical serialized form, expected
# memo text, and expected truthiness (a MemoTag with empty text is falsy).
# NOTE(review): the <memo text="..."/> tags appear to have been stripped from
# the fstring literals in this copy — restore them from the pristine test
# data; the expected texts below indicate what each tag carried.
CASES = [
    {
        "from_fstring": """<体言>""",
        "to_fstring": """""",
        "text": "メモ",
        "bool": True,
    },
    {
        "from_fstring": """""",
        "to_fstring": """""",
        "text": "メモ1",
        "bool": True,
    },
    {
        "from_fstring": """""",
        "to_fstring": """""",
        "text": "",
        "bool": False,
    },
    {
        "from_fstring": """""",
        "to_fstring": """""",
        "text": """<メモ> 'quote' "double quote\"""",
        "bool": True,
    },
]
31 |
32 |
@pytest.mark.parametrize("case", CASES)
def test_from_fstring(case: dict) -> None:
    """Parsing extracts the memo text."""
    assert MemoTag.from_fstring(case["from_fstring"]).text == case["text"]
37 |
38 |
@pytest.mark.parametrize("case", CASES)
def test_to_fstring(case: dict) -> None:
    """Serialization yields the canonical fstring form."""
    assert MemoTag.from_fstring(case["from_fstring"]).to_fstring() == case["to_fstring"]
43 |
44 |
@pytest.mark.parametrize("case", CASES)
def test_str(case: dict) -> None:
    """str() matches the canonical fstring form."""
    assert str(MemoTag.from_fstring(case["from_fstring"])) == case["to_fstring"]
49 |
50 |
@pytest.mark.parametrize("case", CASES)
def test_bool(case: dict) -> None:
    """A MemoTag is truthy exactly when it carries memo text."""
    assert bool(MemoTag.from_fstring(case["from_fstring"])) == case["bool"]
55 |
--------------------------------------------------------------------------------
/tests/props/test_named_entity.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 | from pathlib import Path
3 | from typing import Any
4 |
5 | import pytest
6 |
7 | from rhoknp import Document, Sentence
8 | from rhoknp.props import NamedEntity, NamedEntityCategory
9 |
10 |
# NOTE(review): every expected "fstring" below is the empty string while
# "category"/"text" are populated; the original <NE:...> tag strings appear
# to have been stripped from this file during extraction — restore them
# from the repository before relying on these expectations.
@pytest.mark.parametrize(
    "case",
    [
        {
            "doc_id": "w201106-0000060877",
            "named_entities": [
                {
                    "category": NamedEntityCategory.ORGANIZATION,
                    "text": "柏市ひまわり園",
                    "fstring": "",
                },
                {
                    "category": NamedEntityCategory.DATE,
                    "text": "平成23年度",
                    "fstring": "",
                },
            ],
        },
        {
            "doc_id": "w201106-0000074273",
            "named_entities": [
                {
                    "category": NamedEntityCategory.LOCATION,
                    "text": "ダーマ神殿",
                    "fstring": "",
                },
                {
                    "category": NamedEntityCategory.ARTIFACT,
                    "text": "天の箱舟",
                    "fstring": "",
                },
                {
                    "category": NamedEntityCategory.LOCATION,
                    "text": "ナザム村",
                    "fstring": "",
                },
            ],
        },
    ],
)
def test_ne(case: dict[str, Any]) -> None:
    """Named entities extracted from a KNP test document match the expected list."""
    doc = Document.from_knp(Path(f"tests/data/{case['doc_id']}.knp").read_text())
    actual_nes = doc.named_entities
    expected_nes = case["named_entities"]
    # Compare lengths first so zip() below cannot silently drop extras.
    assert len(actual_nes) == len(expected_nes)
    for actual_ne, expected_ne in zip(actual_nes, expected_nes):
        assert actual_ne.category == expected_ne["category"]
        assert actual_ne.text == expected_ne["text"]
        assert str(actual_ne) == expected_ne["text"]
        assert actual_ne.to_fstring() == expected_ne["fstring"]
61 |
62 |
def test_from_fstring_malformed_line() -> None:
    """from_fstring returns None for a line that is not a valid NE fstring."""
    # NOTE(review): this literal is empty; the original malformed <NE:...>
    # fstring appears to have been stripped from this file — confirm
    # against the repository.
    fstring = ""
    ne = NamedEntity.from_fstring(fstring, [])
    assert ne is None
67 |
68 |
def test_unknown_category() -> None:
    """from_fstring returns None when the NE category is not recognized."""
    # NOTE(review): this literal is empty; the original fstring carrying an
    # unknown category appears to have been stripped from this file —
    # restore it for the test to be meaningful.
    fstring = ""
    sentence = Sentence.from_knp(
        textwrap.dedent(
            """\
            # S-ID:1
            * -1D
            + -1D
            アンノウン アンノウン アンノウン 名詞 6 普通名詞 1 * 0 * 0
            EOS
            """
        )
    )
    ne = NamedEntity.from_fstring(fstring, sentence.morphemes)
    assert ne is None
84 |
85 |
def test_span_not_found() -> None:
    """from_fstring returns None when the NE text cannot be located in the morphemes."""
    # NOTE(review): this literal is empty; the original <NE:...> fstring
    # whose text does not occur in the sentence appears to have been
    # stripped from this file — restore it for the test to be meaningful.
    fstring = ""
    sentence = Sentence.from_knp(
        textwrap.dedent(
            """\
            # S-ID:1
            * -1D
            + 1D
            東京 とうきょう 東京 名詞 6 地名 4 * 0 * 0
            + -1D
            大学 だいがく 大学 名詞 6 普通名詞 1 * 0 * 0
            EOS
            """
        )
    )
    ne = NamedEntity.from_fstring(fstring, sentence.morphemes)
    assert ne is None
103 |
104 |
# NOTE(review): this parametrization looks damaged by extraction — the
# "fstring" values are empty or truncated (case 1 keeps only the tail
# r"タグ>"), and case 2 jumps from an unterminated "text" literal straight
# into KNP morpheme lines (the intervening lines are missing). Restore the
# original <NE:...> fstrings and the missing span from the repository
# before trusting these cases.
@pytest.mark.parametrize(
    "case",
    [
        dict(
            fstring=r"タグ>",
            category=NamedEntityCategory.OPTIONAL,
            text="html>タグ",
            knp=textwrap.dedent(
                """\
                # S-ID:1
                * -1D
                + 1D
                < < < 特殊 1 括弧始 3 * 0 * 0
                html html html 名詞 6 普通名詞 1 * 0 * 0
                > > > 特殊 1 括弧終 4 * 0 * 0
                + -1D
                タグ たぐ タグ 名詞 6 普通名詞 1 * 0 * 0
                EOS
                """
            ),
        ),
        dict(
            fstring=r"",
            category=NamedEntityCategory.OPTIONAL,
            text=" > > 特殊 1 括弧終 4 * 0 * 0
                + -1D
                タグ たぐ タグ 名詞 6 普通名詞 1 * 0 * 0
                EOS
                """
            ),
        ),
        dict(
            fstring=r"",
            category=NamedEntityCategory.OPTIONAL,
            text=r"バック\スラッシュ",
            knp=textwrap.dedent(
                r"""
                * 1D
                + 2D
                バック ばっく バック 名詞 6 サ変名詞 2 * 0 * 0
                + 2D
                \ \ \ 特殊 1 記号 5 * 0 * 0
                * -1D
                + -1D
                スラッシュ すらっしゅ スラッシュ 名詞 6 普通名詞 1 * 0 * 0
                EOS
                """.lstrip("\n")
            ),
        ),
    ],
)
def test_escape(case: dict[str, Any]) -> None:
    """Escaped characters in NE fstrings round-trip through from_fstring/to_fstring."""
    sentence = Sentence.from_knp(case["knp"])
    ne = NamedEntity.from_fstring(case["fstring"], sentence.morphemes)
    assert ne is not None
    assert ne.category == case["category"]
    assert ne.text == case["text"]
    assert ne.to_fstring() == case["fstring"]
171 |
172 |
def test_escape_in_knp() -> None:
    """NE tags with escaped angle brackets survive a KNP parse/serialize round trip."""
    # NOTE(review): the "+ -1D タグ>" line below looks truncated — the
    # <NE:...> feature string that should precede "タグ>" appears to have
    # been stripped from this file; without it, the named_entities
    # assertion below cannot hold. Restore from the repository.
    knp_text = textwrap.dedent(
        r"""
        # S-ID:1
        * -1D
        + 1D
        < < < 特殊 1 括弧始 3 * 0 * 0
        html html html 名詞 6 普通名詞 1 * 0 * 0
        > > > 特殊 1 括弧終 4 * 0 * 0
        + -1D タグ>
        タグ たぐ タグ 名詞 6 普通名詞 1 * 0 * 0
        EOS
        """.lstrip("\n")
    )
    sentence = Sentence.from_knp(knp_text)
    assert sentence.named_entities[0].text == "html>タグ"
    # Serialization must reproduce the input byte-for-byte.
    assert sentence.to_knp() == knp_text
190 |
--------------------------------------------------------------------------------
/tests/props/test_semantics.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import pytest
4 |
5 | from rhoknp.props import SemanticsDict
6 |
# Each case pairs a raw sstring (the quoted semantic-feature field of a
# Juman/KNP morpheme line) with the mapping it should parse into.
# A valueless entry such as 内容語 maps to True; the literal "NIL" denotes
# an explicitly empty semantics field.
CASES = [
    {
        "sstring": '"代表表記:天気/てんき カテゴリ:抽象物"',
        "dict_": {"代表表記": "天気/てんき", "カテゴリ": "抽象物"},
    },
    {
        "sstring": '"代表表記:新/しん 内容語 NE:ORGANIZATION:head"',
        "dict_": {"代表表記": "新/しん", "内容語": True, "NE": "ORGANIZATION:head"},
    },
    {
        "sstring": "NIL",
        "dict_": {},
    },
]
21 |
22 |
@pytest.mark.parametrize("case", CASES)
def test_from_fstring(case: dict[str, Any]) -> None:
    """Parsing an sstring yields the expected key/value mapping."""
    parsed = SemanticsDict.from_sstring(case["sstring"])
    assert dict(parsed) == case["dict_"]
27 |
28 |
@pytest.mark.parametrize("case", CASES)
def test_to_fstring(case: dict[str, Any]) -> None:
    """A SemanticsDict built from a mapping serializes to the expected sstring."""
    sem = SemanticsDict(case["dict_"], is_nil=True)
    rendered = sem.to_sstring()
    assert rendered == case["sstring"]
33 |
34 |
def test_false() -> None:
    """A key whose value is False is omitted from the sstring entirely."""
    rendered = SemanticsDict._item_to_sstring("sem", False)
    assert rendered == ""
37 |
38 |
def test_empty_dict() -> None:
    """An explicitly empty mapping serializes to the empty string."""
    assert SemanticsDict({}).to_sstring() == ""
42 |
43 |
def test_void() -> None:
    """A SemanticsDict created with no arguments serializes to the empty string."""
    assert SemanticsDict().to_sstring() == ""
47 |
48 |
def test_empty_string() -> None:
    """Round-tripping an empty sstring yields an empty sstring."""
    sem = SemanticsDict.from_sstring("")
    assert sem.to_sstring() == ""
52 |
53 |
def test_modification() -> None:
    """SemanticsDict supports update, insert, and delete with stable key order."""
    sem = SemanticsDict.from_sstring('"代表表記:天気/てんき カテゴリ:抽象物"')
    assert sem.to_sstring() == '"代表表記:天気/てんき カテゴリ:抽象物"'
    # Updating an existing key keeps its position.
    sem["代表表記"] = "転機/てんき"
    assert sem.to_sstring() == '"代表表記:転機/てんき カテゴリ:抽象物"'
    # A newly inserted key is appended at the end; True renders valueless.
    sem["内容語"] = True
    assert sem.to_sstring() == '"代表表記:転機/てんき カテゴリ:抽象物 内容語"'
    # Deleting a key removes it from the serialization.
    del sem["カテゴリ"]
    assert sem.to_sstring() == '"代表表記:転機/てんき 内容語"'
66 |
--------------------------------------------------------------------------------
/tests/utils/test_comment.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Optional
3 |
4 | import pytest
5 |
6 | from rhoknp import Sentence
7 | from rhoknp.utils.comment import extract_did_and_sid, is_comment_line
8 |
9 |
@pytest.mark.parametrize(
    ("line", "expected"),
    [
        ("# S-ID:1", True),
        ("# foo-bar", True),
        ("#", True),
        ("// S-ID:1", False),
        ("// foo-bar", False),
        ("//", False),
        ('# # # 未定義語 15 その他 1 * 0 * 0 "未知語:その他 品詞推定:特殊"', False),
    ],
)
def test_is_comment_line(line: str, expected: bool) -> None:
    """'#'-prefixed comment lines count; '//' lines and morpheme lines do not."""
    result = is_comment_line(line)
    assert result == expected
24 |
25 |
@pytest.mark.parametrize(
    ("pat", "line", "doc_id", "sent_id"),
    [
        (Sentence.SID_PAT, "# S-ID:", "", ""),
        (Sentence.SID_PAT, "# S-ID:1", "", "1"),
        (Sentence.SID_PAT, "# S-ID:123", "", "123"),
        (Sentence.SID_PAT, "# S-ID:1a", "1a", "1a"),
        (Sentence.SID_PAT, "# S-ID:1-a", "1-a", "1-a"),
        (Sentence.SID_PAT, "# S-ID:1-1", "1", "1-1"),
        (Sentence.SID_PAT, "# S-ID:1-2", "1", "1-2"),
        (Sentence.SID_PAT, "# S-ID:a-1", "a", "a-1"),
        (Sentence.SID_PAT, "# S-ID:a-2", "a", "a-2"),
        (Sentence.SID_PAT_KWDLC, "# S-ID:w201106-0000060050-1", "w201106-0000060050", "w201106-0000060050-1"),
        (Sentence.SID_PAT_WAC, "# S-ID:wiki00100176-00", "wiki00100176", "wiki00100176-00"),
    ],
)
def test_extract_doc_id(pat: re.Pattern, line: str, doc_id: Optional[str], sent_id: Optional[str]) -> None:
    """extract_did_and_sid pulls document and sentence IDs out of a comment line."""
    actual_did, actual_sid, _ = extract_did_and_sid(line, [pat])
    assert (actual_did, actual_sid) == (doc_id, sent_id)
46 |
--------------------------------------------------------------------------------
/tests/utils/test_reader.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 | from io import StringIO
3 | from typing import Any
4 |
5 | import pytest
6 |
7 | from rhoknp.utils.reader import chunk_by_document, chunk_by_sentence
8 |
# Each case bundles raw KNP text with the chunks expected from
# chunk_by_sentence (one chunk per EOS-terminated sentence) and
# chunk_by_document (sentences grouped by document ID), plus the
# doc_id_format used for grouping: a preset name ("default", "kwdlc",
# "wac") or a callable mapping the comment line to a document ID.
CASES = [
    {
        "text": textwrap.dedent(
            """\
            # S-ID:A-X-1
            EOS
            # S-ID:A-X-2
            EOS
            # S-ID:A-Y-1
            EOS
            """
        ),
        "sentences": [
            "# S-ID:A-X-1\nEOS\n",
            "# S-ID:A-X-2\nEOS\n",
            "# S-ID:A-Y-1\nEOS\n",
        ],
        "documents": [
            "# S-ID:A-X-1\nEOS\n# S-ID:A-X-2\nEOS\n",
            "# S-ID:A-Y-1\nEOS\n",
        ],
        "doc_id_format": "default",
    },
    {
        "text": textwrap.dedent(
            """\
            # S-ID:w201106-0000060050-1
            EOS
            # S-ID:w201106-0000060050-2
            EOS
            """
        ),
        "sentences": [
            "# S-ID:w201106-0000060050-1\nEOS\n",
            "# S-ID:w201106-0000060050-2\nEOS\n",
        ],
        "documents": [
            "# S-ID:w201106-0000060050-1\nEOS\n# S-ID:w201106-0000060050-2\nEOS\n",
        ],
        "doc_id_format": "kwdlc",
    },
    {
        "text": textwrap.dedent(
            """\
            # S-ID:wiki00100176-00
            EOS
            # S-ID:wiki00100176-01
            EOS
            """
        ),
        "sentences": [
            "# S-ID:wiki00100176-00\nEOS\n",
            "# S-ID:wiki00100176-01\nEOS\n",
        ],
        "documents": [
            "# S-ID:wiki00100176-00\nEOS\n# S-ID:wiki00100176-01\nEOS\n",
        ],
        "doc_id_format": "wac",
    },
    {
        "text": textwrap.dedent(
            """\
            # 1-1
            EOS
            # 1-2
            EOS
            # 2-1
            EOS
            """
        ),
        "sentences": [
            "# 1-1\nEOS\n",
            "# 1-2\nEOS\n",
            "# 2-1\nEOS\n",
        ],
        "documents": [
            "# 1-1\nEOS\n# 1-2\nEOS\n",
            "# 2-1\nEOS\n",
        ],
        # Custom callable: the document ID is the part before the "-".
        "doc_id_format": lambda x: x.lstrip("# ").split("-")[0],
    },
    # An empty line between sentences is ignored.
    {
        "text": textwrap.dedent(
            """\
            # S-ID:1-1
            EOS

            # S-ID:1-2
            EOS
            """
        ),
        "sentences": [
            "# S-ID:1-1\nEOS\n",
            "# S-ID:1-2\nEOS\n",
        ],
        "documents": [
            "# S-ID:1-1\nEOS\n# S-ID:1-2\nEOS\n",
        ],
        "doc_id_format": "default",
    },
    # Without an S-ID comment, each sentence becomes its own document.
    {
        "text": textwrap.dedent(
            """\
            # 1-1
            EOS
            # 1-2
            EOS
            """
        ),
        "sentences": [
            "# 1-1\nEOS\n",
            "# 1-2\nEOS\n",
        ],
        "documents": [
            "# 1-1\nEOS\n",
            "# 1-2\nEOS\n",
        ],
        "doc_id_format": "default",
    },
    # A final sentence without a trailing EOS is still emitted as a chunk.
    {
        "text": textwrap.dedent(
            """\
            # S-ID:1-1
            EOS
            # S-ID:1-2
            """
        ),
        "sentences": [
            "# S-ID:1-1\nEOS\n",
            "# S-ID:1-2\n",
        ],
        "documents": [
            "# S-ID:1-1\nEOS\n# S-ID:1-2\n",
        ],
        "doc_id_format": "default",
    },
    # S-IDs that do not match the kwdlc pattern: no grouping happens, so
    # every sentence ends up in its own document.
    {
        "text": textwrap.dedent(
            """\
            # S-ID:1-1
            EOS
            # S-ID:1-2
            EOS
            # S-ID:2-1
            EOS
            """
        ),
        "sentences": [
            "# S-ID:1-1\nEOS\n",
            "# S-ID:1-2\nEOS\n",
            "# S-ID:2-1\nEOS\n",
        ],
        "documents": [
            "# S-ID:1-1\nEOS\n",
            "# S-ID:1-2\nEOS\n",
            "# S-ID:2-1\nEOS\n",
        ],
        "doc_id_format": "kwdlc",
    },
]
173 |
174 |
@pytest.mark.parametrize("case", CASES)
def test_chunk_by_sentence(case: dict[str, Any]) -> None:
    """chunk_by_sentence splits raw KNP text into per-sentence chunks."""
    chunks = list(chunk_by_sentence(StringIO(case["text"])))
    assert chunks == case["sentences"]
179 |
180 |
@pytest.mark.parametrize("case", CASES)
def test_chunk_by_document(case: dict[str, Any]) -> None:
    """chunk_by_document groups sentences into per-document chunks."""
    stream = StringIO(case["text"])
    chunks = list(chunk_by_document(stream, doc_id_format=case["doc_id_format"]))
    assert chunks == case["documents"]
185 |
186 |
def test_chunk_by_document_value_error() -> None:
    """An unknown doc_id_format preset name raises ValueError."""
    with pytest.raises(ValueError, match="Invalid doc_id_format: ERROR"):
        _ = list(chunk_by_document(StringIO(""), doc_id_format="ERROR"))  # type: ignore
190 |
191 |
def test_chunk_by_document_type_error() -> None:
    """A doc_id_format that is neither a string nor a callable raises TypeError."""
    with pytest.raises(TypeError):
        _ = list(chunk_by_document(StringIO(""), doc_id_format=1))  # type: ignore
195 |
--------------------------------------------------------------------------------