├── .github
├── dependabot.yml
└── workflows
│ ├── build-demo.yml
│ ├── codeql.yml
│ ├── dependency-review.yml
│ ├── java-unittest.yml
│ ├── nodejs-unittest.yml
│ ├── py-unittest.yml
│ ├── scorecard.yml
│ └── style-check.yml
├── .gitignore
├── .markdownlint.yaml
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── budoux
├── __init__.py
├── html_processor.py
├── main.py
├── models
│ ├── ja.json
│ ├── ja_knbc.json
│ ├── th.json
│ ├── zh-hans.json
│ └── zh-hant.json
├── parser.py
├── py.typed
├── skip_nodes.json
└── utils.py
├── bump_version.py
├── data
└── finetuning
│ └── ja
│ ├── train.txt
│ └── val.txt
├── demo
├── package-lock.json
├── package.json
├── src
│ ├── app.ts
│ └── worker.ts
├── static
│ └── index.html
└── tsconfig.json
├── example.png
├── java
├── .gitignore
├── README.md
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── google
│ │ └── budoux
│ │ ├── HTMLProcessor.java
│ │ └── Parser.java
│ └── test
│ └── java
│ └── com
│ └── google
│ └── budoux
│ ├── HTMLProcessorTest.java
│ └── ParserTest.java
├── javascript
├── .npmignore
├── .prettierrc.json
├── README.md
├── bin
│ └── budoux.js
├── eslint.config.mjs
├── karma.conf.js
├── package-lock.json
├── package.json
├── scripts
│ ├── check-cli-version.js
│ └── copy-data.js
├── src
│ ├── cli.ts
│ ├── dom-browser.ts
│ ├── dom.ts
│ ├── html_processor.ts
│ ├── index.ts
│ ├── parser.ts
│ ├── tests
│ │ ├── index.browser.ts
│ │ ├── index.node.ts
│ │ ├── models
│ │ │ └── separate_right_before_a.json
│ │ ├── test_cli.ts
│ │ ├── test_html_processor.ts
│ │ ├── test_parser.ts
│ │ ├── test_webcomponents.ts
│ │ ├── testutils-browser.ts
│ │ └── testutils.ts
│ └── webcomponents
│ │ ├── budoux-base.ts
│ │ ├── budoux-ja.ts
│ │ ├── budoux-th.ts
│ │ ├── budoux-zh-hans.ts
│ │ └── budoux-zh-hant.ts
└── tsconfig.json
├── pyproject.toml
├── scripts
├── README.md
├── __init__.py
├── build_model.py
├── encode_data.py
├── finetune.py
├── prepare_knbc.py
├── prepare_wisesight.py
├── tests
│ ├── test_build_model.py
│ ├── test_encode_data.py
│ ├── test_finetune.py
│ ├── test_prepare_knbc.py
│ ├── test_train.py
│ └── test_translate_model.py
├── train.py
└── translate_model.py
├── setup.cfg
├── setup.py
└── tests
├── in
├── 1.in
├── 2.in
└── 3.in
├── quality
└── ja.tsv
├── test_html_processor.py
├── test_main.py
├── test_parser.py
└── test_quality.py
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: pip
9 | directory: /
10 | schedule:
11 | interval: daily
12 |
13 | - package-ecosystem: github-actions
14 | directory: /
15 | schedule:
16 | interval: daily
17 |
18 | - package-ecosystem: npm
19 | directory: /demo
20 | schedule:
21 | interval: daily
22 |
23 | - package-ecosystem: npm
24 | directory: /javascript
25 | schedule:
26 | interval: daily
27 |
28 | - package-ecosystem: maven
29 | directory: /java
30 | schedule:
31 | interval: daily
32 |
--------------------------------------------------------------------------------
/.github/workflows/build-demo.yml:
--------------------------------------------------------------------------------
1 | name: Build Demo
2 | on:
3 | push:
4 | branches: [ "main" ]
5 | permissions:
6 | contents: read
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Harden Runner
13 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
14 | with:
15 | egress-policy: audit
16 |
17 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
18 | - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
19 | with:
20 | node-version: '16'
21 | - run: npm install
22 | working-directory: ./javascript
23 | - run: npm install
24 | working-directory: ./demo
25 | - run: npm run build
26 | working-directory: ./demo
27 | - name: Upload static files as artifact
28 | id: deployment
29 | uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3.0.1
30 | with:
31 | path: ./demo/static/
32 | deploy:
33 | needs: build
34 | permissions:
35 | pages: write
36 | id-token: write
37 | environment:
38 | name: github-pages
39 | url: ${{ steps.deployment.outputs.page_url }}
40 | runs-on: ubuntu-latest
41 | steps:
42 | - name: Deploy to GitHub Pages
43 | id: deployment
44 | uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5
45 |
--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ "main" ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ "main" ]
20 | schedule:
21 | - cron: '42 20 * * 3'
22 |
23 | permissions:
24 | contents: read
25 |
26 | jobs:
27 | analyze:
28 | name: Analyze
29 | runs-on: ubuntu-latest
30 | permissions:
31 | actions: read
32 | contents: read
33 | security-events: write
34 |
35 | strategy:
36 | fail-fast: false
37 | matrix:
38 | language: [ 'java', 'javascript', 'python' ]
39 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
40 | # Use only 'java' to analyze code written in Java, Kotlin or both
41 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
42 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
43 |
44 | steps:
45 | - name: Harden Runner
46 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
47 | with:
48 | egress-policy: audit
49 |
50 | - name: Checkout repository
51 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
52 |
53 | # Initializes the CodeQL tools for scanning.
54 | - name: Initialize CodeQL
55 | uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9
56 | with:
57 | languages: ${{ matrix.language }}
58 | # If you wish to specify custom queries, you can do so here or in a config file.
59 | # By default, queries listed here will override any specified in a config file.
60 | # Prefix the list here with "+" to use these queries and those in the config file.
61 |
62 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
63 | # queries: security-extended,security-and-quality
64 |
65 |
66 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
67 | # If this step fails, then you should remove it and run the build manually (see below)
68 | - name: Autobuild
69 | uses: github/codeql-action/autobuild@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9
70 |
71 | # ℹ️ Command-line programs to run using the OS shell.
72 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
73 |
74 | # If the Autobuild fails above, remove it and uncomment the following three lines.
75 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
76 |
77 | # - run: |
78 | # echo "Run, Build Application using script"
79 | # ./location_of_script_within_repo/buildscript.sh
80 |
81 | - name: Perform CodeQL Analysis
82 | uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9
83 | with:
84 | category: "/language:${{matrix.language}}"
85 |
--------------------------------------------------------------------------------
/.github/workflows/dependency-review.yml:
--------------------------------------------------------------------------------
1 | # Dependency Review Action
2 | #
3 | # This Action will scan dependency manifest files that change as part of a Pull Request,
4 | # surfacing known-vulnerable versions of the packages declared or updated in the PR.
5 | # Once installed, if the workflow run is marked as required,
6 | # PRs introducing known-vulnerable packages will be blocked from merging.
7 | #
8 | # Source repository: https://github.com/actions/dependency-review-action
9 | name: 'Dependency Review'
10 | on: [pull_request]
11 |
12 | permissions:
13 | contents: read
14 |
15 | jobs:
16 | dependency-review:
17 | runs-on: ubuntu-latest
18 | steps:
19 | - name: Harden Runner
20 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
21 | with:
22 | egress-policy: audit
23 |
24 | - name: 'Checkout Repository'
25 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
26 | - name: 'Dependency Review'
27 | uses: actions/dependency-review-action@3b139cfc5fae8b618d3eae3675e383bb1769c019 # v4.5.0
28 |
--------------------------------------------------------------------------------
/.github/workflows/java-unittest.yml:
--------------------------------------------------------------------------------
1 | name: Unittest for Java
2 | on:
3 | push:
4 | paths:
5 | - 'java/**'
6 | pull_request:
7 | paths:
8 | - 'java/**'
9 | permissions:
10 | contents: read
11 |
12 | jobs:
13 | java-unittest:
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | os: [ubuntu-latest, macos-latest, windows-latest]
19 | steps:
20 | - name: Harden Runner
21 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
22 | with:
23 | egress-policy: audit
24 |
25 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
26 | - name: Set up JDK 17
27 | uses: actions/setup-java@7a6d8a8234af8eb26422e24e3006232cccaa061b # v4.6.0
28 | with:
29 | java-version: '17'
30 | distribution: 'temurin'
31 | - name: Build with Maven
32 | run: mvn --batch-mode --update-snapshots -f ./java/pom.xml package
33 |
--------------------------------------------------------------------------------
/.github/workflows/nodejs-unittest.yml:
--------------------------------------------------------------------------------
1 | name: Unittest for NodeJS
2 | on:
3 | push:
4 | paths:
5 | - 'javascript/**'
6 | pull_request:
7 | paths:
8 | - 'javascript/**'
9 | permissions:
10 | contents: read
11 |
12 | jobs:
13 | nodejs-unittest:
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | os: [ubuntu-latest, macos-latest, windows-latest]
19 | node-version: [18, 20]
20 |
21 | steps:
22 | - name: Harden Runner
23 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
24 | with:
25 | egress-policy: audit
26 |
27 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
28 | - name: Setup Node ${{ matrix.node-version }}
29 | uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
30 | with:
31 | node-version: ${{ matrix.node-version }}
32 | - name: Install Dependencies
33 | run: npm install
34 | working-directory: ./javascript
35 | - name: Create symlink
36 | run: npm link
37 | working-directory: ./javascript
38 | - name: Build package
39 | run: npm run build --if-present
40 | working-directory: ./javascript
41 | - name: Run testcases
42 | run: npm test
43 | working-directory: ./javascript
44 |
--------------------------------------------------------------------------------
/.github/workflows/py-unittest.yml:
--------------------------------------------------------------------------------
1 | name: Unittest for Python
2 | on:
3 | push:
4 | paths-ignore:
5 | - 'javascript/**'
6 | - 'java/**'
7 | pull_request:
8 | paths-ignore:
9 | - 'javascript/**'
10 | - 'java/**'
11 | permissions:
12 | contents: read
13 |
14 | jobs:
15 | python-unittest:
16 | runs-on: ${{ matrix.os }}
17 | strategy:
18 | fail-fast: false
19 | matrix:
20 | os: [ubuntu-latest, macos-latest, windows-latest]
21 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
22 | steps:
23 | - name: Harden Runner
24 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
25 | with:
26 | egress-policy: audit
27 |
28 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
29 | - name: Setup python ${{ matrix.python-version }}
30 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
31 | with:
32 | python-version: ${{ matrix.python-version }}
33 | - name: Install requirements
34 | run: |
35 | python -m pip install --upgrade pip
36 | python -m pip install ".[dev]"
37 | - name: Run unittest
38 | run: pytest ./tests
39 | - name: Install Jax
40 | if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.9' }}
41 | run: pip install ".[jaxcpu]"
42 | - name: Run unittest with Jax
43 | if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.9' }}
44 | run: pytest ./scripts/tests
45 |
--------------------------------------------------------------------------------
/.github/workflows/scorecard.yml:
--------------------------------------------------------------------------------
1 | # This workflow uses actions that are not certified by GitHub. They are provided
2 | # by a third-party and are governed by separate terms of service, privacy
3 | # policy, and support documentation.
4 |
5 | name: Scorecard supply-chain security
6 | on:
7 | # For Branch-Protection check. Only the default branch is supported. See
8 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
9 | branch_protection_rule:
10 | # To guarantee Maintained check is occasionally updated. See
11 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
12 | schedule:
13 | - cron: '37 11 * * 2'
14 | push:
15 | branches: [ "main" ]
16 |
17 | # Declare default permissions as read only.
18 | permissions: read-all
19 |
20 | jobs:
21 | analysis:
22 | name: Scorecard analysis
23 | runs-on: ubuntu-latest
24 | permissions:
25 | # Needed to upload the results to code-scanning dashboard.
26 | security-events: write
27 | # Needed to publish results and get a badge (see publish_results below).
28 | id-token: write
29 | # Uncomment the permissions below if installing in a private repository.
30 | # contents: read
31 | # actions: read
32 |
33 | steps:
34 | - name: Harden Runner
35 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
36 | with:
37 | egress-policy: audit
38 |
39 | - name: "Checkout code"
40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
41 | with:
42 | persist-credentials: false
43 |
44 | - name: "Run analysis"
45 | uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0
46 | with:
47 | results_file: results.sarif
48 | results_format: sarif
49 | # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
50 | # - you want to enable the Branch-Protection check on a *public* repository, or
51 | # - you are installing Scorecard on a *private* repository
52 | # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
53 | # repo_token: ${{ secrets.SCORECARD_TOKEN }}
54 |
55 | # Public repositories:
56 | # - Publish results to OpenSSF REST API for easy access by consumers
57 | # - Allows the repository to include the Scorecard badge.
58 | # - See https://github.com/ossf/scorecard-action#publishing-results.
59 | # For private repositories:
60 | # - `publish_results` will always be set to `false`, regardless
61 | # of the value entered here.
62 | publish_results: true
63 |
64 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
65 | # format to the repository Actions tab.
66 | - name: "Upload artifact"
67 | uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
68 | with:
69 | name: SARIF file
70 | path: results.sarif
71 | retention-days: 5
72 |
73 | # Upload the results to GitHub's code scanning dashboard.
74 | - name: "Upload to code-scanning"
75 | uses: github/codeql-action/upload-sarif@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9
76 | with:
77 | sarif_file: results.sarif
78 |
--------------------------------------------------------------------------------
/.github/workflows/style-check.yml:
--------------------------------------------------------------------------------
1 | name: Style Check
2 | on: [push, pull_request]
3 | permissions:
4 | contents: read
5 | jobs:
6 | python-style-check:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - name: Harden Runner
10 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
11 | with:
12 | egress-policy: audit
13 |
14 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
15 | - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
16 | with:
17 | python-version: '3.10'
18 | - name: Install dependencies
19 | run: |
20 | pip install --upgrade pip
21 | pip install ".[dev]"
22 | pip install ".[jaxcpu]"
23 | - name: Run isort
24 | run: |
25 | isort --diff --check .
26 | - name: Run yapf
27 | run: |
28 | yapf --diff --recursive budoux tests scripts
29 | - name: Run mypy
30 | run: |
31 | mypy budoux tests scripts
32 | - name: Run flake8
33 | if: ${{ always() }}
34 | uses: suo/flake8-github-action@3e87882219642e01aa8a6bbd03b4b0adb8542c2a
35 | with:
36 | checkName: python-style-check
37 | env:
38 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
39 | typescript-style-check:
40 | runs-on: ubuntu-latest
41 | steps:
42 | - name: Harden Runner
43 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
44 | with:
45 | egress-policy: audit
46 |
47 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
48 | - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
49 | with:
50 | node-version: '20'
51 | - run: npm install
52 | working-directory: ./javascript
53 | - run: npm run lint
54 | working-directory: ./javascript
55 | java-style-check:
56 | runs-on: ubuntu-latest
57 | steps:
58 | - name: Harden Runner
59 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
60 | with:
61 | egress-policy: audit
62 |
63 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
64 | - uses: actions/setup-java@7a6d8a8234af8eb26422e24e3006232cccaa061b # v4.6.0
65 | with:
66 | java-version: '17'
67 | distribution: 'temurin'
68 | - name: Google Java Format
69 | uses: axel-op/googlejavaformat-action@dbff853fb823671ec5781365233bf86543b13215
70 | with:
71 | args: "--replace"
72 | skip-commit: true
73 | - name: Print diffs
74 | run: git --no-pager diff --exit-code
75 | markdown-style-check:
76 | runs-on: ubuntu-latest
77 | steps:
78 | - name: Harden Runner
79 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
80 | with:
81 | egress-policy: audit
82 |
83 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
84 | - name: markdownlint
85 | uses: nosborn/github-action-markdown-cli@9b5e871c11cc0649c5ac2526af22e23525fa344d
86 | with:
87 | files: '**/*.md'
88 | config_file: .markdownlint.yaml
89 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .venv
3 | /dist
4 | __pycache__
5 | *.pyc
6 | *.log
7 | *.egg-info
8 | *.coverage
9 | cov.xml
10 |
11 | # Python related files
12 | build/
13 |
14 | # JavaScript related files
15 | node_modules
16 | demo/static/app.js
17 | demo/static/worker.js
18 | javascript/bundle
19 | javascript/dist
20 | javascript/module
21 | javascript/src/data
22 |
23 | # Generated files by scripts
24 | source.txt
25 | encoded_data.txt
26 | weights.txt
27 |
28 | .vscode/
29 |
--------------------------------------------------------------------------------
/.markdownlint.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | default: true
15 |
16 | MD013:
17 | code_blocks: false
18 | MD010: false
19 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement (CLA). You (or your employer) retain the copyright to your
10 | contribution; this simply gives us permission to use and redistribute your
11 | contributions as part of the project. Head over to
12 | <https://cla.developers.google.com/> to see your current agreements on file or
13 | to sign a new one.
14 |
15 | You generally only need to submit a CLA once, so if you've already submitted one
16 | (even if it was for a different project), you probably don't need to do it
17 | again.
18 |
19 | ## Code Reviews
20 |
21 | All submissions, including submissions by project members, require review. We
22 | use GitHub pull requests for this purpose. Consult
23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
24 | information on using pull requests.
25 |
26 | ## Community Guidelines
27 |
28 | This project follows
29 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
30 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include budoux/unicode_blocks.json
2 | include budoux/skip_nodes.json
3 | include budoux/py.typed
4 | recursive-include budoux/models *.json
5 |
--------------------------------------------------------------------------------
/budoux/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """BudouX module."""
15 |
16 | from . import parser
17 |
18 | __version__ = "0.7.0"
19 |
20 | Parser = parser.Parser
21 | load_default_japanese_parser = parser.load_default_japanese_parser
22 | load_default_simplified_chinese_parser = parser.load_default_simplified_chinese_parser
23 | load_default_traditional_chinese_parser = parser.load_default_traditional_chinese_parser
24 | load_default_thai_parser = parser.load_default_thai_parser
25 |
--------------------------------------------------------------------------------
/budoux/html_processor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """HTML processor."""
15 |
16 | import json
17 | import os
18 | import queue
19 | import typing
20 | from html.parser import HTMLParser
21 |
22 | from .utils import SEP
23 |
24 | HTMLAttr = typing.List[typing.Tuple[str, typing.Union[str, None]]]
25 | PARENT_CSS_STYLE = 'word-break: keep-all; overflow-wrap: anywhere;'
26 | with open(
27 | os.path.join(os.path.dirname(__file__), 'skip_nodes.json'),
28 | encoding='utf-8') as f:
29 | SKIP_NODES: typing.Set[str] = set(json.load(f))
30 |
31 |
32 | class ElementState(object):
33 | """Represents the state for an element.
34 |
35 | Attributes:
36 | tag (str): The tag name.
37 | to_skip (bool): Whether the content should be skipped or not.
38 | """
39 |
40 | def __init__(self, tag: str, to_skip: bool) -> None:
41 | self.tag = tag
42 | self.to_skip = to_skip
43 |
44 |
45 | class TextContentExtractor(HTMLParser):
46 | """An HTML parser to extract text content.
47 |
48 | Attributes:
49 | output (str): Accumulated text content.
50 | """
51 | output = ''
52 |
53 | def handle_data(self, data: str) -> None:
54 | self.output += data
55 |
56 |
57 | class HTMLChunkResolver(HTMLParser):
58 | """An HTML parser to resolve the given HTML string and semantic chunks.
59 |
60 | Attributes:
61 | output (str): The HTML string to output.
62 | """
63 | output = ''
64 |
65 | def __init__(self, chunks: typing.List[str], separator: str):
66 | """Initializes the parser.
67 |
68 | Args:
69 | chunks (List[str]): The chunks to resolve.
70 | separator (str): The separator string.
71 | """
72 | HTMLParser.__init__(self)
73 | self.chunks_joined = SEP.join(chunks)
74 | self.separator = separator
75 | self.to_skip = False
76 | self.scan_index = 0
77 | self.element_stack: queue.LifoQueue[ElementState] = queue.LifoQueue()
78 |
79 | def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
80 | attr_pairs = []
81 | for attr in attrs:
82 | if attr[1] is None:
83 | attr_pairs.append(' ' + attr[0])
84 | else:
85 | attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
86 | encoded_attrs = ''.join(attr_pairs)
87 | self.element_stack.put(ElementState(tag, self.to_skip))
88 | if tag.upper() in SKIP_NODES:
89 | if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
90 | self.scan_index += 1
91 | self.output += self.separator
92 | self.to_skip = True
93 | self.output += '<%s%s>' % (tag, encoded_attrs)
94 |
95 | def handle_endtag(self, tag: str) -> None:
96 | self.output += '%s>' % (tag)
97 | while not self.element_stack.empty():
98 | state = self.element_stack.get_nowait()
99 | if state.tag == tag:
100 | self.to_skip = state.to_skip
101 | break
102 | # If the close tag doesn't match the open tag, remove it and keep looking.
103 | # This means that close tags close their corresponding open tags.
104 | # e.g., `abc def ` or `
abcdef
` are both valid
105 | # HTML as per the HTML spec.
106 | # Note the HTML "adoption agency algorithm" isn't fully supported.
107 | # See https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
108 |
109 | def handle_data(self, data: str) -> None:
110 | for char in data:
111 | if not char == self.chunks_joined[self.scan_index]:
112 | if not self.to_skip:
113 | self.output += self.separator
114 | self.scan_index += 1
115 | self.output += char
116 | self.scan_index += 1
117 |
118 |
119 | def get_text(html: str) -> str:
120 | """Gets the text content from the input HTML string.
121 |
122 | Args:
123 | html (str): Input HTML string.
124 |
125 | Returns:
126 | The text content.
127 | """
128 | text_content_extractor = TextContentExtractor()
129 | text_content_extractor.feed(html)
130 | return text_content_extractor.output
131 |
132 |
133 | def resolve(phrases: typing.List[str],
134 | html: str,
135 | separator: str = '\u200b') -> str:
136 | """Wraps phrases in the HTML string with non-breaking markup.
137 |
138 | Args:
139 | phrases (List[str]): The phrases included in the HTML string.
140 | html (str): The HTML string to resolve.
141 | separator (str, optional): The separator string.
142 |
143 | Returns:
144 | The HTML string with phrases wrapped in non-breaking markup.
145 | """
146 | resolver = HTMLChunkResolver(phrases, separator)
147 | resolver.feed(html)
148 | result = '%s ' % (PARENT_CSS_STYLE, resolver.output)
149 | return result
150 |
--------------------------------------------------------------------------------
/budoux/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2021 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """BudouX Script to provide CLI for user."""
16 | import argparse
17 | import json
18 | import os
19 | import shutil
20 | import sys
21 | import textwrap
22 | import typing
23 | from pathlib import Path
24 |
25 | # TODO: replace with importlib.resources when py3.8 support is dropped.
26 | import importlib_resources
27 |
28 | import budoux
29 |
30 | ArgList = typing.Optional[typing.List[str]]
31 | models: Path = importlib_resources.files('budoux') / "models"
32 | langs = dict((model.stem, model) for model in models.glob("*.json"))
33 |
34 |
35 | class BudouxHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
36 | argparse.RawDescriptionHelpFormatter):
37 | pass
38 |
39 |
40 | def check_file(path: str) -> str:
41 | """Check if a given filepath exists or not.
42 |
43 | Args:
44 | path (str): Model path
45 |
46 | Raises:
47 | FileNotFoundError: Raise if given path does not exist.
48 |
49 | Returns:
50 | str: A model path.
51 | """
52 | if os.path.isfile(path):
53 | return path
54 | else:
55 | raise argparse.ArgumentTypeError(f"'{path}' is not found.")
56 |
57 |
58 | def check_lang(lang: str) -> Path:
59 | """Check if given language exists or not.
60 |
61 | Args:
62 | lang (str): language code (e.g.: 'ja')
63 |
64 | Raises:
65 | argparse.ArgumentTypeError: Raise if no model for given language exists.
66 |
67 | Returns:
68 | The model path.
69 | """
70 | if lang in langs:
71 | return langs[lang]
72 | else:
73 | raise argparse.ArgumentTypeError(
74 | f"'{lang}' does not exist in builtin models. (supported languages: {list(langs.keys())})"
75 | )
76 |
77 |
def parse_args(test: ArgList = None) -> argparse.Namespace:
  """Parse commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None.

  Returns:
    argparse.Namespace: Parsed data of args.
  """

  def make_formatter(prog: str) -> BudouxHelpFormatter:
    # Size the help output to the current terminal.
    terminal_width = shutil.get_terminal_size(fallback=(120, 50)).columns
    return BudouxHelpFormatter(prog, width=terminal_width, max_help_position=30)

  epilog_lines = ["supported languages of `-l`, `--lang`:", *langs.keys()]
  parser = argparse.ArgumentParser(
      prog="budoux",
      formatter_class=make_formatter,
      description=("BudouX is the successor to Budou,\n"
                   "the machine learning powered line break organizer tool."),
      epilog="\n- ".join(epilog_lines))

  parser.add_argument("text", metavar="TXT", nargs="?", type=str, help="text")
  parser.add_argument("-H", "--html", action="store_true", help="HTML mode")

  # `-m` and `-l` both select the model, so they must not be combined.
  model_select_group = parser.add_mutually_exclusive_group()
  model_select_group.add_argument(
      "-m",
      "--model",
      metavar="JSON",
      type=check_file,
      default=check_lang('ja'),
      help="custom model file path")
  model_select_group.add_argument(
      "-l",
      "--lang",
      metavar="LANG",
      type=check_lang,
      help="language of custom model")

  parser.add_argument(
      "-s",
      "--sep",
      metavar="STR",
      type=str,
      default="\n",
      help="output phrase separator in TEXT mode")
  parser.add_argument(
      "-d",
      "--delim",
      metavar="STR",
      type=str,
      default="---",
      help="output sentence delimiter in TEXT mode")
  parser.add_argument(
      "-V",
      "--version",
      action="version",
      version=f"%(prog)s {budoux.__version__}")

  return parser.parse_args(test) if test is not None else parser.parse_args()
151 |
152 |
def _main(test: ArgList = None) -> str:
  """Runs the CLI core and returns the segmented result as a string.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None.

  Returns:
    str: The segmented output (HTML string in HTML mode, joined phrases in
    TEXT mode).
  """
  args = parse_args(test=test)
  # `--lang` wins when given; otherwise `--model` (defaulting to 'ja').
  model_path = args.lang or args.model
  with open(model_path, 'r', encoding='utf-8') as f:
    model = json.load(f)
  parser = budoux.Parser(model)

  if args.html:
    source_html = args.text if args.text is not None else sys.stdin.read()
    return parser.translate_html_string(source_html)

  # TEXT mode: read lines from the argument or stdin, segment each one.
  raw_lines = (
      sys.stdin.readlines() if args.text is None else args.text.splitlines())
  stripped = [line.rstrip() for line in raw_lines]
  joined_per_line = [args.sep.join(parser.parse(line)) for line in stripped]
  sentence_delim = "\n" + args.delim + "\n"
  return sentence_delim.join(joined_per_line)
177 |
178 |
def main(test: ArgList = None) -> None:
  """CLI entry point: prints the segmented result to stdout.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None.
  """
  try:
    print(_main(test))
  except KeyboardInterrupt:
    # sys.exit, not the site-module builtin exit(): the latter is absent when
    # Python runs without the site module (e.g. `python -S`) or is frozen.
    sys.exit(0)
184 |
185 |
# Allows running this module directly (`python -m budoux.main` / script entry).
if __name__ == "__main__":
  main()
188 |
--------------------------------------------------------------------------------
/budoux/parser.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """BudouX parser to provide semantic chunks."""
15 |
16 | import json
17 | import os
18 | import typing
19 |
20 | from .html_processor import get_text, resolve
21 |
# Absolute path to the model JSON files bundled alongside this module.
MODEL_DIR = os.path.join(os.path.dirname(__file__), 'models')
23 |
24 |
class Parser:
  """BudouX's Parser.

  The main parser object with a variety of class methods to provide semantic
  chunks and markups from the given input string.

  Attributes:
    model: A dict mapping a feature (str) and its score (int).
  """

  def __init__(self, model: typing.Dict[str, typing.Dict[str, int]]):
    """Initializes the parser.

    Args:
      model (Dict[str, Dict[str, int]]): A dict mapping a feature and its score.
    """
    self.model = model

  def parse(self, sentence: str) -> typing.List[str]:
    """Parses the input sentence and returns a list of semantic chunks.

    Args:
      sentence (str): An input sentence.

    Returns:
      A list of semantic chunks (List[str]).
    """
    if not sentence:
      return []
    # Half the total model weight acts as the decision threshold: a break is
    # inserted only when the summed feature scores exceed it.
    threshold = sum(sum(group.values()) for group in self.model.values()) * 0.5
    chunks = [sentence[0]]
    length = len(sentence)
    for pos in range(1, length):
      # Collect the (feature-group, key) pairs applicable at this position:
      # UW* are single characters, BW* bigrams, TW* trigrams around `pos`.
      feature_keys = []
      if pos > 2:
        feature_keys.append(('UW1', sentence[pos - 3]))
        feature_keys.append(('TW1', sentence[pos - 3:pos]))
      if pos > 1:
        feature_keys.append(('UW2', sentence[pos - 2]))
        feature_keys.append(('BW1', sentence[pos - 2:pos]))
        feature_keys.append(('TW2', sentence[pos - 2:pos + 1]))
      feature_keys.append(('UW3', sentence[pos - 1]))
      feature_keys.append(('UW4', sentence[pos]))
      feature_keys.append(('BW2', sentence[pos - 1:pos + 1]))
      if pos + 1 < length:
        feature_keys.append(('UW5', sentence[pos + 1]))
        feature_keys.append(('BW3', sentence[pos:pos + 2]))
        feature_keys.append(('TW3', sentence[pos - 1:pos + 2]))
      if pos + 2 < length:
        feature_keys.append(('UW6', sentence[pos + 2]))
        feature_keys.append(('TW4', sentence[pos:pos + 3]))

      score = sum(
          self.model.get(group, {}).get(key, 0) for group, key in feature_keys)
      if score - threshold > 0:
        # Strong enough evidence for a break: start a new chunk here.
        chunks.append(sentence[pos])
      else:
        chunks[-1] += sentence[pos]
    return chunks

  def translate_html_string(self, html: str) -> str:
    """Translates the given HTML string with markups for semantic line breaks.

    Args:
      html (str): An input html string.

    Returns:
      The translated HTML string (str).
    """
    # TODO: Align with the JavaScript API regarding the parent element addition.
    phrases = self.parse(get_text(html))
    return resolve(phrases, html)
103 |
104 |
def load_default_japanese_parser() -> Parser:
  """Loads a parser equipped with the default Japanese model.

  Returns:
    A parser (:obj:`budoux.Parser`).
  """
  model_path = os.path.join(MODEL_DIR, 'ja.json')
  with open(model_path, encoding='utf-8') as f:
    return Parser(json.load(f))
114 |
115 |
def load_default_simplified_chinese_parser() -> Parser:
  """Loads a parser equipped with the default Simplified Chinese model.

  Returns:
    A parser (:obj:`budoux.Parser`).
  """
  model_path = os.path.join(MODEL_DIR, 'zh-hans.json')
  with open(model_path, encoding='utf-8') as f:
    return Parser(json.load(f))
125 |
126 |
def load_default_traditional_chinese_parser() -> Parser:
  """Loads a parser equipped with the default Traditional Chinese model.

  Returns:
    A parser (:obj:`budoux.Parser`).
  """
  model_path = os.path.join(MODEL_DIR, 'zh-hant.json')
  with open(model_path, encoding='utf-8') as f:
    return Parser(json.load(f))
136 |
137 |
def load_default_thai_parser() -> Parser:
  """Loads a parser equipped with the default Thai model.

  Returns:
    A parser (:obj:`budoux.Parser`).
  """
  model_path = os.path.join(MODEL_DIR, 'th.json')
  with open(model_path, encoding='utf-8') as f:
    return Parser(json.load(f))
147 |
--------------------------------------------------------------------------------
/budoux/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/budoux/py.typed
--------------------------------------------------------------------------------
/budoux/skip_nodes.json:
--------------------------------------------------------------------------------
1 | [
2 | "ABBR",
3 | "BUTTON",
4 | "CODE",
5 | "IFRAME",
6 | "INPUT",
7 | "META",
8 | "NOBR",
9 | "SCRIPT",
10 | "STYLE",
11 | "TEXTAREA",
12 | "TIME",
13 | "VAR"
14 | ]
15 |
--------------------------------------------------------------------------------
/budoux/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Utilities for BudouX."""
15 |
# U+2581 LOWER ONE EIGHTH BLOCK — marks phrase boundaries in training data.
SEP = '▁'
"""The separator string to specify breakpoints."""
18 |
--------------------------------------------------------------------------------
/bump_version.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 | import json
17 | import re
18 | import subprocess
19 |
20 |
def _rewrite_version(path, pattern, new_version):
  """Rewrites the version matched by `pattern` in the file at `path`.

  `pattern` must capture three groups (prefix, old version, suffix); only the
  middle group is replaced with `new_version`.
  """
  # Explicit UTF-8: these source files may contain non-ASCII text, and the
  # platform default encoding (e.g. cp1252 on Windows) would corrupt them.
  with open(path, 'r', encoding='utf-8') as f:
    content = f.read()
  new_content = re.sub(pattern, rf'\g<1>{new_version}\g<3>', content)
  with open(path, 'w', encoding='utf-8') as f:
    f.write(new_content)


def main():
  """Bumps the version number across the Python, JavaScript, and Java ports."""
  parser = argparse.ArgumentParser(description='Bump the version number.')
  parser.add_argument(
      'new_version', type=str, help='The new version number (e.g., 1.2.3)')
  args = parser.parse_args()
  new_version = args.new_version

  # Updates Python port version number.
  _rewrite_version('budoux/__init__.py',
                   r'(__version__\s+=\s+[\'"])([\.\d]+)([\'"])', new_version)

  # Updates JavaScript port version number. `npm version` errors out if the
  # version is unchanged, so it is skipped in that case.
  package_json_path = 'javascript/package.json'
  with open(package_json_path, 'r', encoding='utf-8') as f:
    package_data = json.load(f)
  current_version = package_data.get('version')

  if current_version != new_version:
    npm_command = ['npm', 'version', new_version, '--no-git-tag-version']
    subprocess.run(npm_command, cwd='javascript', check=True)
  else:
    print(f"JavaScript version is already {new_version}, skipping npm version.")

  # The CLI hard-codes its own version constant; keep it in sync.
  _rewrite_version('javascript/src/cli.ts',
                   r'(const\s+CLI_VERSION\s+=\s+[\'"])([\.\d]+)([\'"])',
                   new_version)

  # Updates Java port version number via Maven.
  mvn_command = [
      'mvn', 'versions:set', f'-DnewVersion={new_version}',
      '-DgenerateBackupPoms=false'
  ]
  subprocess.run(mvn_command, cwd='java', check=True)
63 |
64 |
# Allows running as `python bump_version.py <new_version>`.
if __name__ == "__main__":
  main()
67 |
--------------------------------------------------------------------------------
/data/finetuning/ja/train.txt:
--------------------------------------------------------------------------------
1 | 指定された▁時間以上▁アプリケーションを▁利用する▁ことは▁できません。
2 | これ以上▁その機器を▁利用する▁場合は▁注意してください。
3 | それ以上▁コップを▁振ると▁こぼれます。
4 | ファイルは▁そのまま▁ご利用いただけます。
5 | 彼は▁そのまま▁行こうとした。
6 | ご利用▁いただき▁ありがとう▁ございます。
7 | フィードバック▁ありがとう▁ございます。
8 | 貴重な▁ご意見▁ありがとう▁ございます。
9 | この本は▁あらゆる▁トピックを▁カバーします。
10 | ドアを▁ありと▁あらゆる▁力を▁込めて▁開けます。
11 | 身の▁回りの▁あらゆる▁ものを▁化学式で▁表す。
12 | 当機は▁まもなく▁着陸態勢に▁入ります。
13 | まもなくして▁彼女が▁来た。
14 | まもなく▁電車が▁到着します。
15 | ようやく▁日が▁暮れた。
16 | やっと▁ようやく▁公開できそうです。
17 | あいつが▁ようやく▁来た。
18 | 夕方▁ようやく▁完成した。
19 | あれが▁入ったのは▁たまたまです。
20 | たまたま▁手に▁入れる▁ことが▁できた。
21 | 彼が▁たまたま▁持っていた。
22 | 全部▁まとめて▁提出します。
23 | 論点を▁まとめる。
24 | 思った▁とおりに▁書く。
25 | 言われた▁とおりに▁動きます。
26 | まるで▁水晶の▁ように▁すきとおって▁いた。
27 | 彼の▁すきとおる▁肌
28 | 冷たさを▁もつ▁青い▁空
29 | 当日券のみ▁有効です。
30 | 該当する方▁のみ▁入場できます。
31 | あの▁青い▁空と▁白い▁雲のみが▁見える。
32 | 白い▁つぶが▁ちりのように▁舞う
33 | つぶつぶの▁食感
34 | 煙が▁どんどん▁広がっていく
35 | さあ▁どんどん▁食べてくれ
36 | そこが▁ちがうと▁思う
37 | はじまりが▁ちがうから▁おわりも▁ちがう
38 | 日が▁しずむまでに▁終わらせよう
39 | うまく▁言葉に▁できない
40 | それは▁子どもの▁遊び場です。
41 | ふだん▁どおりに▁やれば▁大丈夫。
42 | この▁おもちゃを▁ください。
43 | 映画に▁感情移入する。
44 | 制度に▁甘えがちな▁場面
45 | 可能性が▁浮かび▁上がる
--------------------------------------------------------------------------------
/data/finetuning/ja/val.txt:
--------------------------------------------------------------------------------
1 | それ以上▁モニターは▁増やせません
2 | 今回の▁発表は▁以上に▁なります。
3 | そのままに▁しておけば▁良い。
4 | そのまま▁お送りください。
5 | たくさんの▁お便り▁ありがとう▁ございます。
6 | 彼は▁あらゆる▁服を▁持っています。
7 | 係の▁者が▁まもなく▁来ます。
8 | 山の▁頂が▁ようやく▁見えた。
9 | たまたま▁聞こえてきた▁歌声。
10 |
--------------------------------------------------------------------------------
/demo/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "budoux-demo",
3 | "version": "0.1.2",
4 | "description": "A demo app for BudouX",
5 | "main": "static/app.js",
6 | "scripts": {
7 | "build:app": "esbuild src/app.ts --bundle --minify --outfile=static/app.js",
8 | "build:worker": "esbuild src/worker.ts --bundle --minify --outfile=static/worker.js",
9 | "build": "npm run build:app && npm run build:worker",
10 | "watch:app": "esbuild src/app.ts --watch --bundle --minify --outfile=static/app.js",
11 | "watch:worker": "esbuild src/worker.ts --watch --bundle --minify --outfile=static/worker.js",
12 | "watch": "concurrently \"npm run watch:app\" \"npm run watch:worker\"",
13 | "serve": "http-server static",
14 | "dev": "concurrently \"npm run serve\" \"npm run watch\"",
15 | "start": "npm run dev"
16 | },
17 | "keywords": [],
18 | "author": "Shuhei Iitsuka",
19 | "license": "Apache-2.0",
20 | "dependencies": {
21 | "budoux": "file:../javascript",
22 | "dompurify": "^3.2.5"
23 | },
24 | "devDependencies": {
25 | "@types/dompurify": "^3.2.0",
26 | "concurrently": "^9.1.2",
27 | "esbuild": "^0.19.5",
28 | "http-server": "^14.1.1",
29 | "typescript": "^5.2.2"
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/demo/src/app.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
import DOMPurify from 'dompurify';
import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser, loadDefaultThaiParser } from 'budoux';

// One pre-loaded parser per supported language, keyed by the <select> values.
const parsers = new Map([
  ['ja', loadDefaultJapaneseParser()],
  ['zh-hans', loadDefaultSimplifiedChineseParser()],
  ['zh-hant', loadDefaultTraditionalChineseParser()],
  ['th', loadDefaultThaiParser()]
]);
// Sample sentences shown when the URL carries no `q` parameter.
const defaultInputs = new Map([
  ['ja', 'Google の使命は、世界中の情報を整理 し、世界中の人がアクセス できて使えるようにすることです。'],
  ['zh-hans', '我们的使命是整合 全球信息,供大众使用 ,让人人受益。'],
  ['zh-hant', '我們的使命是匯整 全球資訊,供大眾使用 ,使人人受惠。'],
  ['th', 'พันธกิจของเราคือการจัดระเบียบข้อมูลในโลกนี้และทำให้เข้าถึงได้ง่ายในทุกที่และมีประโยชน์']
])
// Cached references to the demo page's controls.
const inputTextElement = document.getElementById('input') as HTMLTextAreaElement;
const outputContainerElement = document.getElementById('output') as HTMLElement;
const fontSizeElement = document.getElementById('fontsize') as HTMLInputElement;
const brCheckElement = document.getElementById('wbr2br') as HTMLInputElement;
const modelSelectElement = document.getElementById('model') as HTMLSelectElement;
const url = new URL(document.location.href);
// Demonstrates running BudouX off the main thread; responses are only logged.
const worker = new Worker('./worker.js');
worker.onmessage = (e: MessageEvent) => {
  console.log('response from worker:', e);
};


/**
 * Runs the BudouX model to process the input text and render the processed HTML.
 */
const run = () => {
  // Sanitize first: the textarea content is untrusted HTML.
  outputContainerElement.innerHTML = DOMPurify.sanitize(inputTextElement.value);
  const model = modelSelectElement.value;
  worker.postMessage({'sentence': outputContainerElement.textContent, 'model': model});
  const parser = parsers.get(model);
  if (!parser) return;
  // Inserts zero-width spaces (U+200B) at phrase boundaries in place.
  parser.applyToElement(outputContainerElement);
  const renderWithBR = brCheckElement.checked;
  if (renderWithBR) {
    // Makes the invisible break points visible.
    // NOTE(review): the replacement string below looks garbled by extraction —
    // given the "Replace ZWSP with BR" checkbox, presumably '<br>' upstream;
    // confirm against the repository.
    outputContainerElement.innerHTML = DOMPurify.sanitize(
      outputContainerElement.innerHTML.replace(/\u200b/g, ' '));
  }
  // Keeps the shareable URL in sync with the current input.
  url.searchParams.set('q', inputTextElement.value);
  window.history.replaceState('', '', url.toString());
};

/**
 * Initializes the app.
 */
const init = () => {
  // Restores language and input text from the URL query parameters.
  const lang = url.searchParams.get('lang');
  if (lang) modelSelectElement.value = lang;
  const input = url.searchParams.get('q') || defaultInputs.get(modelSelectElement.value);
  if (input) inputTextElement.value = input;
  run();
}

fontSizeElement.addEventListener('input', () => {
  outputContainerElement.style.fontSize = `${fontSizeElement.value}rem`;
})

inputTextElement.addEventListener('input', () => {
  run();
});

brCheckElement.addEventListener('input', () => {
  run();
});

// Switching languages records the choice in the URL and resets the sample text.
modelSelectElement.addEventListener('change', () => {
  url.searchParams.set('lang', modelSelectElement.value);
  window.history.pushState('', '', url.toString());
  const input = defaultInputs.get(modelSelectElement.value);
  if (input) inputTextElement.value = input;
  run();
})

init();
--------------------------------------------------------------------------------
/demo/src/worker.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2023 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
import { Parser, jaModel, zhHansModel, zhHantModel } from 'budoux';

// Pre-built parsers keyed by model name. NOTE(review): unlike app.ts, no Thai
// model is registered here, so 'th' requests are silently dropped below —
// confirm whether that is intentional.
const parsers: { [key: string]: Parser } = {
  'ja': new Parser(jaModel),
  'zh-hans': new Parser(zhHansModel),
  'zh-hant': new Parser(zhHantModel),
};

// Handles a {model, sentence} message and posts back the parsed phrase list.
onmessage = (e: MessageEvent) => {
  const model: string = e.data['model'];
  // Ignore requests for models this worker doesn't have.
  if (!Object.keys(parsers).includes(model)) return;
  const parser = parsers[model];
  const result = parser.parse(e.data['sentence']);
  console.log('It works in Web Worker, too!', result);
  postMessage(result);
};
33 |
--------------------------------------------------------------------------------
/demo/static/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | BudouX demo
7 |
57 |
58 |
59 |
70 |
71 |
72 | Language
73 |
74 | Japanese
75 | Simplified Chinese
76 | Traditional Chinese
77 | Thai
78 |
79 |
80 |
81 |
82 | Font size
83 |
84 |
85 |
86 |
87 | Replace ZWSP with BR
88 |
89 |
90 |
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/demo/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "es2017",
4 | "module": "commonjs",
5 | "esModuleInterop": true,
6 | "forceConsistentCasingInFileNames": true,
7 | "strict": true,
8 | "skipLibCheck": true,
9 | "resolveJsonModule": true
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/example.png
--------------------------------------------------------------------------------
/java/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | src/main/resources
3 |
--------------------------------------------------------------------------------
/java/README.md:
--------------------------------------------------------------------------------
1 | # BudouX Java Module
2 |
3 | BudouX is a standalone, small, and language-neutral phrase segmenter tool that
4 | provides beautiful and legible line breaks.
5 |
6 | For more details about the project, please refer to the [project README](https://github.com/google/budoux/).
7 |
8 | ## Demo
9 |
10 |
11 |
12 | ## Usage
13 |
14 | ### Simple usage
15 |
16 | You can get a list of phrases by feeding a sentence to the parser.
17 | The easiest way to get a parser is to load the default parser for each language.
18 |
19 | ```java
20 | import com.google.budoux.Parser;
21 |
22 | public class App
23 | {
24 | public static void main( String[] args )
25 | {
26 | Parser parser = Parser.loadDefaultJapaneseParser();
27 | System.out.println(parser.parse("今日は良い天気ですね。"));
28 | // [今日は, 良い, 天気ですね。]
29 | }
30 | }
31 | ```
32 |
33 | #### Supported languages and their default parsers
34 |
35 | - Japanese: `Parser.loadDefaultJapaneseParser()`
36 | - Simplified Chinese: `Parser.loadDefaultSimplifiedChineseParser()`
37 | - Traditional Chinese: `Parser.loadDefaultTraditionalChineseParser()`
38 | - Thai: `Parser.loadDefaultThaiParser()`
39 |
40 | ### Working with HTML
41 |
42 | If you want to use the result in a website, you can use the `translateHTMLString`
43 | method to get an HTML string that wraps phrases with non-breaking markup,
44 | specifically, zero-width space (U+200B).
45 |
46 | ```java
47 | System.out.println(parser.translateHTMLString("今日は良い天気 ですね。"));
48 | //今日は\u200b良い\u200b天気 ですね。
49 | ```
50 |
51 | Please note that separators are denoted as `\u200b` in the example above for
52 | illustrative purposes, but the actual output is an invisible string as it's a
53 | zero-width space.
54 |
55 | ## Caveat
56 |
57 | BudouX supports HTML inputs and outputs HTML strings with markup applied to wrap
58 | phrases, but it's not meant to be used as an HTML sanitizer.
59 | **BudouX doesn't sanitize any inputs.**
60 | Malicious HTML inputs yield malicious HTML outputs.
61 | Please use it with an appropriate sanitizer library if you don't trust the input.
62 |
63 | ## Author
64 |
65 | [Shuhei Iitsuka](https://tushuhei.com)
66 |
67 | ## Disclaimer
68 |
69 | This is not an officially supported Google product.
70 |
--------------------------------------------------------------------------------
/java/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
17 |
19 | 4.0.0
20 |
21 |
22 | org.sonatype.oss
23 | oss-parent
24 | 9
25 |
26 |
27 | com.google.budoux
28 | budoux
29 | 0.7.0
30 |
31 | BudouX
32 | https://google.github.io/budoux/
33 |
34 |
35 | UTF-8
36 | 1.8
37 | 1.8
38 |
39 |
40 |
41 | junit
42 | junit
43 | 4.13.2
44 | test
45 |
46 |
47 | com.google.code.gson
48 | gson
49 | 2.13.0
50 |
51 |
52 | org.jsoup
53 | jsoup
54 | 1.19.1
55 |
56 |
57 |
58 |
59 |
60 |
61 | org.apache.maven.plugins
62 | maven-javadoc-plugin
63 | 3.11.2
64 |
65 |
66 |
67 |
68 |
69 |
70 | maven-clean-plugin
71 | 3.4.1
72 |
73 |
74 |
75 | maven-resources-plugin
76 | 3.3.1
77 |
78 |
79 | copy-data
80 | generate-resources
81 |
82 | copy-resources
83 |
84 |
85 | ${basedir}/src/main/resources
86 |
87 |
88 | ../budoux
89 |
90 | models/*.json
91 | skip_nodes.json
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 | maven-compiler-plugin
101 | 3.14.0
102 |
103 |
104 | maven-surefire-plugin
105 | 3.5.3
106 |
107 |
108 | maven-jar-plugin
109 | 3.4.2
110 |
111 |
112 | maven-install-plugin
113 | 3.1.4
114 |
115 |
116 | maven-deploy-plugin
117 | 3.1.4
118 |
119 |
120 |
121 | maven-site-plugin
122 | 3.21.0
123 |
124 |
125 | maven-project-info-reports-plugin
126 | 3.9.0
127 |
128 |
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/budoux/HTMLProcessor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2023 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.budoux;
18 |
19 | import com.google.gson.Gson;
20 | import com.google.gson.JsonIOException;
21 | import com.google.gson.JsonSyntaxException;
22 | import java.io.IOException;
23 | import java.io.InputStream;
24 | import java.io.InputStreamReader;
25 | import java.io.Reader;
26 | import java.nio.charset.StandardCharsets;
27 | import java.util.ArrayDeque;
28 | import java.util.Arrays;
29 | import java.util.HashSet;
30 | import java.util.List;
31 | import java.util.Locale;
32 | import java.util.Set;
33 | import java.util.stream.Collectors;
34 | import org.jsoup.Jsoup;
35 | import org.jsoup.nodes.Comment;
36 | import org.jsoup.nodes.Document;
37 | import org.jsoup.nodes.Element;
38 | import org.jsoup.nodes.Node;
39 | import org.jsoup.nodes.TextNode;
40 | import org.jsoup.select.NodeVisitor;
41 |
/** Processes phrases into an HTML string wrapping them in no-breaking markup. */
final class HTMLProcessor {
  // Element names (upper case) whose text content must not receive separators.
  // NOTE(review): generic type parameters appear stripped from this file by
  // extraction; presumably Set<String> — confirm against the repository.
  private static final Set skipNodes;
  // Inline style that makes phrases break only at the inserted separators.
  private static final String STYLE = "word-break: keep-all; overflow-wrap: anywhere;";

  private HTMLProcessor() {}

  static {
    // Loads the skip-node element names bundled as /skip_nodes.json.
    Gson gson = new Gson();
    InputStream inputStream = HTMLProcessor.class.getResourceAsStream("/skip_nodes.json");
    try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
      String[] skipNodesStrings = gson.fromJson(reader, String[].class);
      skipNodes = new HashSet<>(Arrays.asList(skipNodesStrings));
    } catch (JsonSyntaxException | JsonIOException | IOException e) {
      // The resource ships with the jar; failing to read it is a packaging bug.
      throw new AssertionError(e);
    }
  }

  /**
   * A `NodeVisitor` subclass that concatenates all `TextNode`s to a string.
   *
   * It also converts `<br>` to `\n`.
   */
  private static class TextizeNodeVisitor implements NodeVisitor {
    private StringBuilder output = new StringBuilder();

    public String getString() {
      return output.toString();
    }

    @Override
    public void head(Node node, int depth) {
      if (node instanceof Element) {
        final String nodeName = node.nodeName();
        if (nodeName.equals("br")) {
          output.append('\n');
        }
      } else if (node instanceof TextNode) {
        // getWholeText keeps whitespace exactly as authored.
        output.append(((TextNode) node).getWholeText());
      }
    }

    @Override
    public void tail(Node node, int depth) {}
  }

  /**
   * Re-emits the HTML while walking the DOM in document order, inserting
   * `separator` wherever the joined phrase string has a phrase boundary.
   */
  private static class PhraseResolvingNodeVisitor implements NodeVisitor {
    // Sentinel joining the phrases; assumed never to occur in the input text.
    private static final char SEP = '\uFFFF';
    private final String phrasesJoined;
    private final String separator;
    private final StringBuilder output = new StringBuilder();
    // Position in phrasesJoined matched so far.
    private Integer scanIndex = 0;
    private boolean toSkip = false;
    // Saved skip states of enclosing elements, restored in tail().
    // NOTE(review): presumably ArrayDeque<Boolean>; generics stripped by extraction.
    private final ArrayDeque elementStack = new ArrayDeque<>();

    /**
     * Constructs a PhraseResolvingNodeVisitor.
     *
     * @param phrases a list of phrase strings.
     * @param separator the separator string.
     */
    PhraseResolvingNodeVisitor(List phrases, String separator) {
      this.separator = separator;
      this.phrasesJoined = String.join(Character.toString(SEP), phrases);
    }

    /**
     * Returns the resolved output string.
     *
     * @return the output string.
     */
    public StringBuilder getOutput() {
      return output;
    }

    @Override
    public void head(Node node, int depth) {
      if (node.nodeName().equals("body")) {
        return;
      }
      if (node instanceof Element) {
        elementStack.push(toSkip);
        String attributesEncoded =
            node.attributes().asList().stream()
                .map(attribute -> " " + attribute)
                .collect(Collectors.joining(""));
        final String nodeName = node.nodeName();
        if (nodeName.equals("br")) {
          // ` ` is converted to `\n`, see `TextizeNodeVisitor.head`.
          // Assume phrasesJoined.charAt(scanIndex) == '\n'.
          scanIndex++;
        } else if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) {
          // Consume a phrase boundary that falls exactly at this skip element.
          if (!toSkip
              && scanIndex < phrasesJoined.length()
              && phrasesJoined.charAt(scanIndex) == SEP) {
            output.append(separator);
            scanIndex++;
          }
          toSkip = true;
        }
        output.append(String.format("<%s%s>", nodeName, attributesEncoded));
      } else if (node instanceof TextNode) {
        String data = ((TextNode) node).getWholeText();
        for (int i = 0; i < data.length(); i++) {
          char c = data.charAt(i);
          if (c != phrasesJoined.charAt(scanIndex)) {
            // Assume phrasesJoined.charAt(scanIndex) == SEP.
            if (!toSkip) {
              output.append(separator);
            }
            scanIndex++;
          }
          scanIndex++;
          output.append(c);
        }
      }
    }

    @Override
    public void tail(Node node, int depth) {
      if (node.nodeName().equals("body") || node instanceof TextNode || node instanceof Comment) {
        return;
      }
      // assume node instanceof Element;
      toSkip = elementStack.pop();
      Element element = (Element) node;
      if (element.tag().isSelfClosing()) {
        return;
      }
      // NOTE(review): this format string looks garbled by extraction —
      // a closing tag is presumably "</%s>" upstream; confirm.
      output.append(String.format("%s>", node.nodeName()));
    }
  }

  /**
   * Wraps phrases in the HTML string with non-breaking markup.
   *
   * @param phrases the phrases included in the HTML string.
   * @param html the HTML string to resolve.
   * @return the HTML string of phrases wrapped in non-breaking markup.
   */
  public static String resolve(List phrases, String html) {
    return resolve(phrases, html, "\u200b");
  }

  /**
   * Wraps phrases in the HTML string with non-breaking markup.
   *
   * @param phrases the phrases included in the HTML string.
   * @param html the HTML string to resolve.
   * @param separator the separator string.
   * @return the HTML string of phrases wrapped in non-breaking markup.
   */
  public static String resolve(List phrases, String html, String separator) {
    Document doc = Jsoup.parseBodyFragment(html);
    PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases, separator);
    doc.body().traverse(nodeVisitor);
    // NOTE(review): this format string also looks garbled by extraction —
    // upstream presumably wraps the output in <span style="%s">%s</span>;
    // as written, the second argument is ignored by String.format. Confirm.
    return String.format("%s ", STYLE, nodeVisitor.getOutput());
  }

  /**
   * Gets the text content from the input HTML string.
   *
   * @param html an HTML string.
   * @return the text content.
   */
  public static String getText(String html) {
    Document doc = Jsoup.parseBodyFragment(html);
    TextizeNodeVisitor nodeVisitor = new TextizeNodeVisitor();
    doc.body().traverse(nodeVisitor);
    return nodeVisitor.getString();
  }
}
214 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/budoux/Parser.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2023 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.budoux;
18 |
19 | import com.google.gson.Gson;
20 | import com.google.gson.JsonIOException;
21 | import com.google.gson.JsonSyntaxException;
22 | import com.google.gson.reflect.TypeToken;
23 | import java.io.IOException;
24 | import java.io.InputStream;
25 | import java.io.InputStreamReader;
26 | import java.io.Reader;
27 | import java.lang.reflect.Type;
28 | import java.nio.charset.StandardCharsets;
29 | import java.util.ArrayList;
30 | import java.util.List;
31 | import java.util.Map;
32 | import java.util.Optional;
33 |
34 | /**
35 | * The BudouX parser that translates the input sentence into phrases.
36 | *
37 | * You can create a parser instance by invoking {@code new Parser(model)} with the model data you
38 | * want to use. You can also create a parser by specifying the model file path with {@code
39 | * Parser.loadByFileName(modelFileName)}.
40 | *
41 | *
In most cases, it's sufficient to use the default parser for the language. For example, you
42 | * can create a default Japanese parser as follows.
43 | *
44 | *
45 | * Parser parser = Parser.loadDefaultJapaneseParser();
46 | *
47 | */
48 | public class Parser {
49 | private final Map> model;
50 |
51 | /**
52 | * Constructs a BudouX parser.
53 | *
54 | * @param model the model data.
55 | */
56 | public Parser(Map> model) {
57 | this.model = model;
58 | }
59 |
60 | /**
61 | * Loads the default Japanese parser.
62 | *
63 | * @return a BudouX parser with the default Japanese model.
64 | */
65 | public static Parser loadDefaultJapaneseParser() {
66 | return loadByFileName("/models/ja.json");
67 | }
68 |
69 | /**
70 | * Loads the default Simplified Chinese parser.
71 | *
72 | * @return a BudouX parser with the default Simplified Chinese model.
73 | */
74 | public static Parser loadDefaultSimplifiedChineseParser() {
75 | return loadByFileName("/models/zh-hans.json");
76 | }
77 |
78 | /**
79 | * Loads the default Traditional Chinese parser.
80 | *
81 | * @return a BudouX parser with the default Traditional Chinese model.
82 | */
83 | public static Parser loadDefaultTraditionalChineseParser() {
84 | return loadByFileName("/models/zh-hant.json");
85 | }
86 |
87 | /**
88 | * Loads the default Thai parser.
89 | *
90 | * @return a BudouX parser with the default Thai model.
91 | */
92 | public static Parser loadDefaultThaiParser() {
93 | return loadByFileName("/models/th.json");
94 | }
95 |
96 | /**
97 | * Loads a parser by specifying the model file path.
98 | *
99 | * @param modelFileName the model file path.
100 | * @return a BudouX parser.
101 | */
102 | public static Parser loadByFileName(String modelFileName) {
103 | Gson gson = new Gson();
104 | Type type = new TypeToken>>() {}.getType();
105 | InputStream inputStream = Parser.class.getResourceAsStream(modelFileName);
106 | try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
107 | Map> model = gson.fromJson(reader, type);
108 | return new Parser(model);
109 | } catch (JsonIOException | JsonSyntaxException | IOException e) {
110 | throw new AssertionError(e);
111 | }
112 | }
113 |
114 | /**
115 | * Gets the score for the specified feature of the given sequence.
116 | *
117 | * @param featureKey the feature key to examine.
118 | * @param sequence the sequence to look up the score.
119 | * @return the contribution score to support a phrase break.
120 | */
121 | private int getScore(String featureKey, String sequence) {
122 | return Optional.ofNullable(this.model.get(featureKey))
123 | .map(group -> group.get(sequence))
124 | .orElse(0);
125 | }
126 |
127 | /**
128 | * Parses a sentence into phrases.
129 | *
130 | * @param sentence the sentence to break by phrase.
131 | * @return a list of phrases.
132 | */
133 | public List parse(String sentence) {
134 | if (sentence.isEmpty()) {
135 | return new ArrayList<>();
136 | }
137 | List result = new ArrayList<>();
138 | result.add(String.valueOf(sentence.charAt(0)));
139 | int totalScore =
140 | this.model.values().stream()
141 | .mapToInt(group -> group.values().stream().mapToInt(Integer::intValue).sum())
142 | .sum();
143 | for (int i = 1; i < sentence.length(); i++) {
144 | int score = -totalScore;
145 | if (i - 2 > 0) {
146 | score += 2 * this.getScore("UW1", sentence.substring(i - 3, i - 2));
147 | }
148 | if (i - 1 > 0) {
149 | score += 2 * this.getScore("UW2", sentence.substring(i - 2, i - 1));
150 | }
151 | score += 2 * this.getScore("UW3", sentence.substring(i - 1, i));
152 | score += 2 * this.getScore("UW4", sentence.substring(i, i + 1));
153 | if (i + 1 < sentence.length()) {
154 | score += 2 * this.getScore("UW5", sentence.substring(i + 1, i + 2));
155 | }
156 | if (i + 2 < sentence.length()) {
157 | score += 2 * this.getScore("UW6", sentence.substring(i + 2, i + 3));
158 | }
159 | if (i > 1) {
160 | score += 2 * this.getScore("BW1", sentence.substring(i - 2, i));
161 | }
162 | score += 2 * this.getScore("BW2", sentence.substring(i - 1, i + 1));
163 | if (i + 1 < sentence.length()) {
164 | score += 2 * this.getScore("BW3", sentence.substring(i, i + 2));
165 | }
166 | if (i - 2 > 0) {
167 | score += 2 * this.getScore("TW1", sentence.substring(i - 3, i));
168 | }
169 | if (i - 1 > 0) {
170 | score += 2 * this.getScore("TW2", sentence.substring(i - 2, i + 1));
171 | }
172 | if (i + 1 < sentence.length()) {
173 | score += 2 * this.getScore("TW3", sentence.substring(i - 1, i + 2));
174 | }
175 | if (i + 2 < sentence.length()) {
176 | score += 2 * this.getScore("TW4", sentence.substring(i, i + 3));
177 | }
178 | if (score > 0) {
179 | result.add("");
180 | }
181 | result.set(result.size() - 1, result.get(result.size() - 1) + sentence.charAt(i));
182 | }
183 | return result;
184 | }
185 |
186 | /**
187 | * Translates an HTML string with phrases wrapped in no-breaking markup.
188 | *
189 | * @param html an HTML string.
190 | * @return the translated HTML string with no-breaking markup.
191 | */
192 | public String translateHTMLString(String html) {
193 | String sentence = HTMLProcessor.getText(html);
194 | List phrases = parse(sentence);
195 | return HTMLProcessor.resolve(phrases, html, "\u200b");
196 | }
197 | }
198 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/budoux/HTMLProcessorTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2023 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.budoux;
18 |
19 | import static org.junit.Assert.assertEquals;
20 |
21 | import java.util.Arrays;
22 | import java.util.List;
23 | import org.junit.Test;
24 | import org.junit.runner.RunWith;
25 | import org.junit.runners.JUnit4;
26 |
/** Unit tests for {@link HTMLProcessor}. */
@RunWith(JUnit4.class)
public class HTMLProcessorTest {
  // NOTE(review): many string literals in this file look truncated — the HTML markup
  // they originally contained (e.g. <span ...>, <a>, <b>, <img>) appears to have been
  // stripped when this copy was made; two literals even span a raw line break and are
  // not valid Java as written. Verify every literal against the upstream repository.

  // Prefix/suffix that HTMLProcessor.resolve wraps around its output
  // (presumably a styled <span> open/close pair — TODO confirm upstream).
  String pre = "";
  String post = " ";

  // Builds the full expected output by surrounding `input` with the resolver's wrapper.
  private String wrap(String input) {
    return this.pre + input + this.post;
  }

  // Plain text, no markup: a separator goes between the two phrases.
  @Test
  public void testResolveWithSimpleTextInput() {
    List phrases = Arrays.asList("abc", "def");
    String html = "abcdef";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcdef"), result);
  }

  // Input containing standard HTML markup around the phrase boundary.
  @Test
  public void testResolveWithStandardHTMLInput() {
    List phrases = Arrays.asList("abc", "def");
    String html = "abcd ef";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcd ef"), result);
  }

  // A self-closing element (img) preceding the text should pass through unchanged.
  @Test
  public void testResolveWithImg() {
    List phrases = Arrays.asList("abc", "def");
    String html = " abcdef";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap(" abcdef"), result);
  }

  // Input containing an unpaired closing tag (literal garbled in this copy — see NOTE above).
  @Test
  public void testResolveWithUnpairedClose() {
    List phrases = Arrays.asList("abc", "def");
    String html = "abcdef
";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcdef
"), result);
  }

  // Elements on the skip list must not receive separators inside them.
  @Test
  public void testResolveWithNodesToSkip() {
    List phrases = Arrays.asList("abc", "def", "ghi");
    String html = "abcde fghi";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcde fghi"), result);
  }

  // A phrase break immediately before a skipped node.
  @Test
  public void testResolveWithNodesBreakBeforeSkip() {
    List phrases = Arrays.asList("abc", "def", "ghi", "jkl");
    String html = "abcdefghi jkl";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcdefghi jkl"), result);
  }

  // A phrase break immediately after a skipped node.
  @Test
  public void testResolveWithAfterSkip() {
    List phrases = Arrays.asList("abc", "def", "ghi", "jkl");
    String html = "abcdef ghijkl";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcdef ghijkl"), result);
  }

  // As above, with a self-closing element inside the skipped region.
  @Test
  public void testResolveWithAfterSkipWithImg() {
    List phrases = Arrays.asList("abc", "def", "ghi", "jkl");
    String html = "abcd ef ghijkl";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcd ef ghijkl"), result);
  }

  // A single phrase: no separator should be inserted anywhere.
  @Test
  public void testResolveWithNothingToSplit() {
    List phrases = Arrays.asList("abcdef");
    String html = "abcdef";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcdef"), result);
  }

  // A br element reads back as "\n" from getText (see the assertion below) and the
  // original markup is reproduced by resolve.
  @Test
  public void testResolveBR() {
    String html = " 1 2 ";
    String text = HTMLProcessor.getText(html);
    assertEquals(" 1 \n 2 ", text);
    List phrases = Arrays.asList(" 1 \n 2 ");
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap(" 1 2 "), result);
  }

  // getText strips markup and concatenates the text content.
  @Test
  public void testGetText() {
    String html = "Hello W orld !";
    String result = HTMLProcessor.getText(html);
    assertEquals("Hello World!", result);
  }

  // Whitespace inside a text node is preserved verbatim.
  @Test
  public void testGetTextWhiteSpace() {
    String html = " H e ";
    String result = HTMLProcessor.getText(html);
    assertEquals(" H e ", result);
  }

  // Whitespace spanning element boundaries (literal garbled in this copy — see NOTE above).
  @Test
  public void testGetTextWhiteSpaceAcrossElements() {
    String html = " 1
 2
";
    String result = HTMLProcessor.getText(html);
    assertEquals(" 1 2 ", result);
  }

  // A skipped node appearing at the very end of the input.
  @Test
  public void testResolveSkipNodeAtTheEnd() {
    List phrases = Arrays.asList("abc", "def", "ghi", "jkl");
    String html = "abcdefghijkl ";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcdefghijkl "), result);
  }

  // Comment nodes are ignored by the resolver.
  @Test
  public void testResolveWithComments() {
    List phrases = Arrays.asList("abc", "def", "ghi", "jkl");
    String html = "abcdefghijkl";
    String result = HTMLProcessor.resolve(phrases, html, "");
    assertEquals(this.wrap("abcdefghijkl"), result);
  }
}
156 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/budoux/ParserTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2023 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.budoux;
18 |
19 | import static org.junit.Assert.assertEquals;
20 |
21 | import java.util.Arrays;
22 | import java.util.HashMap;
23 | import java.util.List;
24 | import java.util.Map;
25 | import org.junit.Test;
26 | import org.junit.runner.RunWith;
27 | import org.junit.runners.JUnit4;
28 |
29 | /** Unit tests for {@link Parser}. */
30 | @RunWith(JUnit4.class)
31 | public class ParserTest {
32 |
33 | @Test
34 | public void testParse() {
35 | Map> model = new HashMap<>();
36 | Map uw4 = new HashMap<>();
37 | uw4.put("a", 100);
38 | model.put("UW4", uw4);
39 | Parser parser = new Parser(model);
40 | List result = parser.parse("xyzabc");
41 | List expected = Arrays.asList("xyz", "abc");
42 | assertEquals(expected, result);
43 | }
44 |
45 | @Test
46 | public void testLoadDefaultJapaneseParser() {
47 | Parser parser = Parser.loadDefaultJapaneseParser();
48 | List result = parser.parse("今日は天気です。");
49 | List expected = Arrays.asList("今日は", "天気です。");
50 | assertEquals(expected, result);
51 | }
52 |
53 | @Test
54 | public void testTranslateHTMLString() {
55 | Map> model = new HashMap<>();
56 | Map uw4 = new HashMap<>();
57 | uw4.put("a", 100);
58 | model.put("UW4", uw4);
59 | Parser parser = new Parser(model);
60 | String html = "xyza bc";
61 | String result = parser.translateHTMLString(html);
62 | assertEquals(
63 | "xyz\u200ba bc ",
65 | result);
66 | }
67 |
68 | @Test
69 | public void testNewline() {
70 | Parser parser = Parser.loadDefaultJapaneseParser();
71 | List result = parser.parse(" 1 \n 2 ");
72 | List expected = Arrays.asList(" 1 \n 2 ");
73 | assertEquals(expected, result);
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/javascript/.npmignore:
--------------------------------------------------------------------------------
1 | scripts
2 |
--------------------------------------------------------------------------------
/javascript/.prettierrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "bracketSpacing": false,
3 | "singleQuote": true,
4 | "trailingComma": "es5",
5 | "arrowParens": "avoid"
6 | }
7 |
--------------------------------------------------------------------------------
/javascript/README.md:
--------------------------------------------------------------------------------
1 |
2 | # BudouX JavaScript module
3 |
4 | BudouX is a standalone, small, and language-neutral phrase segmenter tool that
5 | provides beautiful and legible line breaks.
6 |
7 | For more details about the project, please refer to the [project README](https://github.com/google/budoux/).
8 |
9 | ## Demo
10 |
11 |
12 |
13 | ## Install
14 |
15 | ```shellsession
16 | $ npm install budoux
17 | ```
18 |
19 | ## Usage
20 |
21 | ### Simple usage
22 |
23 | You can get a list of phrases by feeding a sentence to the parser.
24 | The easiest way to get a parser is to load the default parser for each language.
25 |
26 | **Japanese:**
27 |
28 | ```javascript
29 | import { loadDefaultJapaneseParser } from 'budoux';
30 | const parser = loadDefaultJapaneseParser();
31 | console.log(parser.parse('今日は天気です。'));
32 | // ['今日は', '天気です。']
33 | ```
34 |
35 | **Simplified Chinese:**
36 |
37 | ```javascript
38 | import { loadDefaultSimplifiedChineseParser } from 'budoux';
39 | const parser = loadDefaultSimplifiedChineseParser();
40 | console.log(parser.parse('是今天的天气。'));
41 | // ['是', '今天', '的', '天气。']
42 | ```
43 |
44 | **Traditional Chinese:**
45 |
46 | ```javascript
47 | import { loadDefaultTraditionalChineseParser } from 'budoux';
48 | const parser = loadDefaultTraditionalChineseParser();
49 | console.log(parser.parse('是今天的天氣。'));
50 | // ['是', '今天', '的', '天氣。']
51 | ```
52 |
53 | **Thai:**
54 |
55 | ```javascript
56 | import { loadDefaultThaiParser } from 'budoux';
57 | const parser = loadDefaultThaiParser();
58 | console.log(parser.parse('วันนี้อากาศดี'));
59 | // ['วัน', 'นี้', 'อากาศ', 'ดี']
60 | ```
61 |
62 | ### Translating an HTML string
63 |
64 | You can also translate an HTML string to wrap phrases with non-breaking markup,
65 | specifically, zero-width spaces (U+200B).
66 |
67 | ```javascript
68 | console.log(parser.translateHTMLString('今日はとても天気 です。'));
69 | // 今日は\u200bとても\u200b天気 です。
70 | ```
71 |
72 | Please note that separators are denoted as `\u200b` in the example above for
73 | illustrative purposes, but the actual output is an invisible string as it's a
74 | zero-width space.
75 |
76 | ### Applying to an HTML element
77 |
78 | You can also feed an HTML element to the parser to apply the process.
79 |
80 | ```javascript
81 | const ele = document.querySelector('p.budou-this');
82 | console.log(ele.outerHTML);
83 | // 今日はとても天気 です。
84 | parser.applyToElement(ele);
85 | console.log(ele.outerHTML);
86 | // 今日は\u200bとても\u200b天気 です。
87 | ```
88 |
89 | Internally, the `applyToElement` calls the [`HTMLProcessor`]'s `applyToElement`
90 | function with the zero-width space as the separator.
91 |
92 | You can use the [`HTMLProcessor`] class directly if desired.
93 | For example:
94 |
95 | ```javascript
96 | import { HTMLProcessor } from 'budoux';
97 | const ele = document.querySelector('p.budou-this');
98 | const htmlProcessor = new HTMLProcessor(parser, {
99 | separator: ' '
100 | });
101 | htmlProcessor.applyToElement(ele);
102 | ```
103 |
104 | [`HTMLProcessor`]: https://github.com/google/budoux/blob/main/javascript/src/html_processor.ts
105 |
106 | ### Loading a custom model
107 |
108 | You can load your own custom model as follows.
109 |
110 | ```javascript
111 | import { Parser } from 'budoux';
112 | const model = JSON.parse('{"UW4": {"a": 133}}'); // Content of the custom model JSON file.
113 | const parser = new Parser(model);
114 | parser.parse('xyzabc'); // ['xyz', 'abc']
115 | ```
116 |
117 | ### Working with Web Worker
118 |
119 | If you'd like to use BudouX inside a Web worker script, construct a parser without
120 | `HTMLProcessor`, i.e. use the pure `Parser` instance.
121 | Refer to [worker.ts](https://github.com/google/budoux/blob/main/demo/src/worker.ts)
122 | for a working demo.
123 |
124 | ```javascript
125 | import { Parser, jaModel } from 'budoux';
126 | const parser = new Parser(jaModel);
127 | parser.parse('今日は天気です'); // ['今日は', '天気です']
128 | ```
129 |
130 | ## Web components
131 |
132 | BudouX also offers Web components to integrate the parser with your website quickly.
133 | All you have to do is wrap sentences with:
134 |
135 | - `<budoux-ja>` for Japanese
136 | - `<budoux-zh-hans>` for Simplified Chinese
137 | - `<budoux-zh-hant>` for Traditional Chinese
138 | - `<budoux-th>` for Thai
139 |
140 | ```html
141 | <budoux-ja>今日は天気です。</budoux-ja>
142 | <budoux-zh-hans>今天是晴天。</budoux-zh-hans>
143 | <budoux-zh-hant>今天是晴天。</budoux-zh-hant>
144 | <budoux-th>วันนี้อากาศดี</budoux-th>
145 | ```
146 |
147 | In order to enable the custom element, you can simply add this line to load the bundle.
148 |
149 | ```html
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 | ```
162 |
163 | Otherwise, if you wish to bundle the component with the rest of your source code,
164 | you can import the component as shown below.
165 |
166 | ```javascript
167 | // For Japanese
168 | import 'budoux/module/webcomponents/budoux-ja';
169 |
170 | // For Simplified Chinese
171 | import 'budoux/module/webcomponents/budoux-zh-hans';
172 |
173 | // For Traditional Chinese
174 | import 'budoux/module/webcomponents/budoux-zh-hant';
175 |
176 | // For Thai
177 | import 'budoux/module/webcomponents/budoux-th';
178 | ```
179 |
180 | **Note:** BudouX Web Components directly manipulate the input HTML content
181 | instead of outputting the result to a shadow DOM. This design was chosen because
182 | the goal of BudouX Web Components is to simply insert zero-width spaces (ZWSPs)
183 | into the content, and isolating the style from the rest of the document could
184 | introduce unexpected side effects for developers.
185 |
186 | Consequently, cloning or editing the element might lead to duplicated ZWSPs
187 | between phrases. This is because BudouX Web Components cannot distinguish
188 | between characters that originate in the source and those that are inserted by
189 | BudouX itself once connected to the document. Duplicating ZWSPs will not cause
190 | any severe problems in controlling line breaks, and they are invisible anyway,
191 | but this is the reason we do not support other separator characters for these
192 | components.
193 |
194 | ### CLI
195 |
196 | You can also format inputs on your terminal with `budoux` command.
197 |
198 | ```shellsession
199 | $ budoux 本日は晴天です。
200 | 本日は
201 | 晴天です。
202 | ```
203 |
204 | ```shellsession
205 | $ echo $'本日は晴天です。\n明日は曇りでしょう。' | budoux
206 | 本日は
207 | 晴天です。
208 | ---
209 | 明日は
210 | 曇りでしょう。
211 | ```
212 |
213 | ```shellsession
214 | $ budoux 本日は晴天です。 -H
215 | 本日は\u200b晴天です。
216 | ```
217 |
218 | Please note that separators are denoted as `\u200b` in the example above for
219 | illustrative purposes, but the actual output is an invisible string as it's a
220 | zero-width space.
221 |
222 | If you want to see help, run `budoux -h`.
223 |
224 | ```shellsession
225 | $ budoux -h
226 | Usage: budoux [-h] [-H] [-d STR] [-m JSON] [-V] [TXT]
227 |
228 | BudouX is the successor to Budou, the machine learning powered line break organizer tool.
229 |
230 | Arguments:
231 | txt text
232 |
233 | Options:
234 | -H, --html HTML mode (default: false)
235 | -d, --delim output delimiter in TEXT mode (default: "---")
236 | -m, --model custom model file path
237 | -V, --version output the version number
238 | -h, --help display help for command
239 | ```
240 |
241 | ## Caveat
242 |
243 | BudouX supports HTML inputs and outputs HTML strings with markup applied to wrap
244 | phrases, but it's not meant to be used as an HTML sanitizer.
245 | **BudouX doesn't sanitize any inputs.**
246 | Malicious HTML inputs yield malicious HTML outputs.
247 | Please use it with an appropriate sanitizer library if you don't trust the input.
248 |
249 | ## Author
250 |
251 | [Shuhei Iitsuka](https://tushuhei.com)
252 |
253 | ## Disclaimer
254 |
255 | This is not an officially supported Google product.
256 |
--------------------------------------------------------------------------------
/javascript/bin/budoux.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | /**
3 | * @license
4 | * Copyright 2021 Google LLC
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * https://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
// Entry point for the `budoux` command: hand the raw process arguments to the compiled CLI.
const cliModule = require('../dist/cli');
cliModule.cli(process.argv);
20 |
--------------------------------------------------------------------------------
/javascript/eslint.config.mjs:
--------------------------------------------------------------------------------
import prettier from "eslint-plugin-prettier";
import tsParser from "@typescript-eslint/parser";
import path from "node:path";
import { fileURLToPath } from "node:url";
import js from "@eslint/js";
import { FlatCompat } from "@eslint/eslintrc";

// ESM has no __filename/__dirname builtins; derive them from import.meta.url.
const configFilePath = fileURLToPath(import.meta.url);
const configDirPath = path.dirname(configFilePath);

// Bridges legacy "extends"-style shareable configs into flat-config objects.
const compat = new FlatCompat({
    baseDirectory: configDirPath,
    recommendedConfig: js.configs.recommended,
    allConfig: js.configs.all
});

// Rules applied to every linted file.
const generalRules = {
    "prettier/prettier": "error",
    "block-scoped-var": "error",
    eqeqeq: "error",
    "no-var": "error",
    "prefer-const": "error",
    "eol-last": "error",
    "prefer-arrow-callback": "error",
    "no-trailing-spaces": "error",

    quotes: ["warn", "single", {
        avoidEscape: true,
    }],

    // Forbid focused tests (describe.only / it.only) from being committed.
    "no-restricted-properties": ["error", {
        object: "describe",
        property: "only",
    }, {
        object: "it",
        property: "only",
    }],
};

// Relaxations for TypeScript sources on top of the recommended preset.
const typescriptRules = {
    "@typescript-eslint/no-non-null-assertion": "off",
    "@typescript-eslint/no-use-before-define": "off",
    "@typescript-eslint/no-warning-comments": "off",
    "@typescript-eslint/no-empty-function": "off",
    "@typescript-eslint/no-var-requires": "off",
    "@typescript-eslint/explicit-function-return-type": "off",
    "@typescript-eslint/explicit-module-boundary-types": "off",
    "@typescript-eslint/ban-types": "off",
    "@typescript-eslint/camelcase": "off",
    "node/no-empty-function": "off",
    "node/no-missing-import": "off",
    "node/no-unsupported-features/es-syntax": "off",
    "node/no-missing-require": "off",
    "node/shebang": "off",
    "no-dupe-class-members": "off",
    "require-atomic-updates": "off",
};

// Scope the recommended TypeScript preset to TypeScript files only.
const typescriptRecommended = compat
    .extends("plugin:@typescript-eslint/recommended")
    .map(config => ({
        ...config,
        files: ["**/*.ts", "**/*.tsx"],
    }));

export default [{
    ignores: ["**/bundle", "**/dist", "**/module", "src/data"],
}, ...compat.extends("eslint:recommended", "prettier"), {
    plugins: {
        prettier,
    },
    rules: generalRules,
}, ...typescriptRecommended, {
    files: ["**/*.ts", "**/*.tsx"],

    languageOptions: {
        parser: tsParser,
        ecmaVersion: 2018,
        sourceType: "module",
    },

    rules: typescriptRules,
}];
--------------------------------------------------------------------------------
/javascript/karma.conf.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2023 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | module.exports = function (config) {
18 | config.set({
19 | basePath: '',
20 | frameworks: ['jasmine'],
21 | files: ['bundle/tests/*.js'],
22 | reporters: ['progress'],
23 | port: 9876,
24 | colors: true,
25 | logLevel: config.LOG_INFO,
26 | autoWatch: false,
27 | browsers: ['ChromeHeadless'],
28 | singleRun: true,
29 | concurrency: Infinity,
30 | });
31 | };
32 |
--------------------------------------------------------------------------------
/javascript/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "budoux",
3 | "version": "0.7.0",
4 | "description": "A small chunk segmenter.",
5 | "author": "Shuhei Iitsuka",
6 | "license": "Apache-2.0",
7 | "repository": {
8 | "type": "git",
9 | "url": "https://github.com/google/budoux.git",
10 | "directory": "javascript"
11 | },
12 | "main": "./dist/index.js",
13 | "module": "./module/index.js",
14 | "exports": {
15 | ".": {
16 | "import": "./module/index.js",
17 | "require": "./dist/index.js"
18 | }
19 | },
20 | "browser": {
21 | "./dist/dom.js": "./dist/dom-browser.js",
22 | "./module/dom.js": "./module/dom-browser.js",
23 | "./dist/tests/testutils.js": "./dist/tests/testutils-browser.js",
24 | "./module/tests/testutils.js": "./module/tests/testutils-browser.js"
25 | },
26 | "bin": {
27 | "budoux": "./bin/budoux.js"
28 | },
29 | "sideEffects": [
30 | "./module/webcomponents/*",
31 | "./module/tests/*"
32 | ],
33 | "scripts": {
34 | "build": "npm run build:esm && npm run build:cjs",
35 | "build:cjs": "tsc && cp -r src/tests/models/ dist/tests/models/",
36 | "build:esm": "tsc --outDir module --module ES2020 && cp -r src/tests/models/ module/tests/models/",
37 | "bundle": "npm run bundle:webcomponents && npm run bundle:test",
38 | "bundle:test": "esbuild module/tests/index.browser.js --bundle --sourcemap --outfile=bundle/tests/index.browser.js",
39 | "bundle:webcomponents": "npm run bundle:webcomponents:ja && npm run bundle:webcomponents:zh-hans && npm run bundle:webcomponents:zh-hant && npm run bundle:webcomponents:th",
40 | "bundle:webcomponents:ja": "esbuild module/webcomponents/budoux-ja.js --bundle --minify --sourcemap --outfile=bundle/budoux-ja.min.js",
41 | "bundle:webcomponents:zh-hans": "esbuild module/webcomponents/budoux-zh-hans.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hans.min.js",
42 | "bundle:webcomponents:zh-hant": "esbuild module/webcomponents/budoux-zh-hant.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hant.min.js",
43 | "bundle:webcomponents:th": "esbuild module/webcomponents/budoux-th.js --bundle --minify --sourcemap --outfile=bundle/budoux-th.min.js",
44 | "clean": "rm -rf dist module src/data",
45 | "copy": "node ./scripts/copy-data.js",
46 | "prebuild": "npm run clean && npm run copy",
47 | "prepare": "npm run clean && npm run copy && npm run build && npm run bundle",
48 | "pretest": "npm run build && npm run bundle:test",
49 | "test": "npm run test:jasmine && npm run test:karma && npm run test:cli-version",
50 | "test:cli-version": "node ./scripts/check-cli-version.js",
51 | "test:jasmine": "jasmine dist/tests/index.node.js",
52 | "test:karma": "karma start",
53 | "lint": "eslint src/** --no-error-on-unmatched-pattern",
54 | "fix": "eslint src/** --no-error-on-unmatched-pattern --fix"
55 | },
56 | "devDependencies": {
57 | "@eslint/eslintrc": "^3.1.0",
58 | "@eslint/js": "^9.9.0",
59 | "@types/jasmine": "^5.1.0",
60 | "@types/node": "^22.0.0",
61 | "@typescript-eslint/eslint-plugin": "^8.0.1",
62 | "esbuild": "^0.25.0",
63 | "eslint": "^9.9.0",
64 | "eslint-config-prettier": "^10.0.1",
65 | "eslint-plugin-prettier": "^5.0.0",
66 | "jasmine": "^5.0.2",
67 | "jasmine-core": "^5.0.1",
68 | "karma": "^6.4.2",
69 | "karma-chrome-launcher": "^3.2.0",
70 | "karma-jasmine": "^5.1.0",
71 | "typescript": "^5.1.6"
72 | },
73 | "dependencies": {
74 | "commander": "^13.0.0",
75 | "linkedom": "^0.18.7"
76 | },
77 | "overrides": {
78 | "tough-cookie": "^5.0.0-rc.4"
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/javascript/scripts/check-cli-version.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
const assert = require('assert');
const path = require('path');
const childProcess = require('child_process');
// Renamed from `package`: that identifier is a reserved word in strict mode
// (and under ESLint's default rules); behavior is unchanged.
const packageJson = require('../package.json');

const packageVersion = packageJson.version;

/**
 * Runs the budoux CLI binary in a child process.
 * @param {string[]} args Arguments to pass to bin/budoux.js.
 * @return {Promise<{error: ?Error, stdout: string, stderr: string}>} Resolves
 *     with the captured process result; never rejects (errors are captured).
 */
const runCli = args =>
  new Promise(resolve => {
    childProcess.execFile(
      'node',
      [path.resolve(__dirname, '..', 'bin', 'budoux.js'), ...args],
      (error, stdout, stderr) => {
        resolve({error, stdout, stderr});
      }
    );
  });

/**
 * Asserts that the CLI version output for the given flag matches the
 * "version" field in package.json.
 * @param {string} flag The version flag to test ('-V' or '--version').
 * @return {Promise<void>} Resolves when the check has run.
 */
const checkVersionFlag = flag =>
  runCli([flag]).then(({stdout}) => {
    assert.equal(
      stdout.replace('\n', ''),
      packageVersion,
      `Package version and CLI version output (${flag}) should match.`
    );
  });

checkVersionFlag('-V');
checkVersionFlag('--version');
53 |
--------------------------------------------------------------------------------
/javascript/scripts/copy-data.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
const path = require('path');
const fs = require('fs');

// Repository root: two directories above javascript/scripts/.
const PROJECT_ROOT = path.join(__dirname, '..', '..');
const DATA_DIR = path.join(PROJECT_ROOT, 'javascript', 'src', 'data');
fs.mkdirSync(path.join(DATA_DIR, 'models'), {recursive: true});

/**
 * Converts every JSON model under budoux/models into a TypeScript module
 * that exports the model object, written to javascript/src/data/models.
 */
const copyModels = () => {
  const modelsDirPath = path.join(PROJECT_ROOT, 'budoux', 'models');
  for (const file of fs.readdirSync(modelsDirPath)) {
    const pieces = file.split('.');
    const ext = pieces.pop();
    if (ext !== 'json') continue;
    const body = pieces.join('.');
    const sourcePath = path.join(modelsDirPath, file);
    const targetPath = path.join(DATA_DIR, 'models', `${body}.ts`);
    const content = fs.readFileSync(sourcePath);
    fs.writeFileSync(
      targetPath,
      `export const model: {[key:string]: {[key:string]: number}} = ${content}`
    );
  }
};

const main = () => {
  copyModels();
};

main();
46 |
--------------------------------------------------------------------------------
/javascript/src/cli.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {readFileSync} from 'fs';
18 | import * as path from 'path';
19 | import * as readline from 'readline';
20 | import {Command} from 'commander';
21 | import {
22 | HTMLProcessingParser,
23 | loadDefaultParsers,
24 | loadDefaultJapaneseParser,
25 | } from './index.js';
26 |
// CLI version reported by -V/--version. Keep in sync with the "version"
// field in package.json; scripts/check-cli-version.js verifies the match.
const CLI_VERSION = '0.7.0';
// Lang code -> default parser map, built once at module load for -l lookups.
const defaultParsers = loadDefaultParsers();
29 |
30 | /**
31 | * Run the command line interface program.
32 | * @param argv process.argv.
33 | */
34 | export const cli = (argv: string[]) => {
35 | const program = new Command('budoux');
36 |
37 | program.usage('[-h] [-H] [-d STR] [-s STR] [-m JSON] [-l LANG] [-V] [TXT]');
38 | program.description(
39 | 'BudouX is the successor to Budou, the machine learning powered line break organizer tool.'
40 | );
41 | program
42 | .option('-H, --html', 'HTML mode', false)
43 | .option(
44 | '-d, --delim ',
45 | 'output sentence delimiter in TEXT mode',
46 | '---'
47 | )
48 | .option('-s, --sep ', 'output phrase separator in TEXT mode', '\n')
49 | .option('-m, --model ', 'model file path')
50 | .option(
51 | '-l, --lang ',
52 | `language model to use. -m and --model will be prioritized if any.\navailable languages: ${[
53 | ...defaultParsers.keys(),
54 | ].join(', ')}`
55 | )
56 | .argument('[txt]', 'text')
57 | .allowExcessArguments();
58 |
59 | program.version(CLI_VERSION);
60 |
61 | program.parse(argv);
62 |
63 | const options = program.opts();
64 | const {lang, model, delim, sep, html} = options as {
65 | html: boolean;
66 | delim: string;
67 | sep: string;
68 | model?: string;
69 | lang?: string;
70 | };
71 | const {args} = program;
72 |
73 | const parser = model
74 | ? loadCustomParser(model)
75 | : lang && defaultParsers.has(lang)
76 | ? defaultParsers.get(lang)!
77 | : loadDefaultJapaneseParser();
78 |
79 | switch (args.length) {
80 | case 0: {
81 | const rl = readline.createInterface({
82 | input: process.stdin,
83 | });
84 |
85 | let stdin = '';
86 | rl.on('line', line => {
87 | stdin += line + '\n';
88 | });
89 | process.stdin.on('end', () => {
90 | outputParsedTexts(parser, html, delim, sep, [stdin]);
91 | });
92 | break;
93 | }
94 | case 1: {
95 | outputParsedTexts(parser, html, delim, sep, args);
96 | break;
97 | }
98 | default: {
99 | throw new Error(
100 | 'Too many arguments. Please, pass the only one argument.'
101 | );
102 | }
103 | }
104 | };
105 |
106 | /**
107 | * Run the command line interface program.
108 | * @param parser A parser.
109 | * @param html A flag of html output mode.
110 | * @param delim A delimiter to separate output sentence.
111 | * @param sep A separator to separate output phrases.
112 | * @param args string array to parse. Array should have only one element.
113 | */
114 | const outputParsedTexts = (
115 | parser: HTMLProcessingParser,
116 | html: boolean,
117 | delim: string,
118 | sep: string,
119 | args: string[]
120 | ) => {
121 | if (html) {
122 | const text = args[0];
123 | const output = parser.translateHTMLString(text);
124 | console.log(output);
125 | } else {
126 | const splitedTextsByNewLine = args[0]
127 | .split(/\r?\n/)
128 | .filter(text => text !== '');
129 | splitedTextsByNewLine.forEach((text, index) => {
130 | const parsedTexts = parser.parse(text);
131 | console.log(parsedTexts.join(sep));
132 | if (index + 1 !== splitedTextsByNewLine.length) console.log(delim);
133 | });
134 | }
135 | };
136 |
137 | /**
138 | * Loads a parser equipped with custom model.
139 | * @return A parser with the loaded model.
140 | */
141 | const loadCustomParser = (modelPath: string) => {
142 | const file = readFileSync(path.resolve(modelPath)).toString();
143 | const model = JSON.parse(file);
144 | return new HTMLProcessingParser(model);
145 | };
146 |
--------------------------------------------------------------------------------
/javascript/src/dom-browser.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | /**
18 | * Parses an html string and returns a parsed html document.
19 | * @param html An HTML string.
20 | * @return A Document.
21 | */
22 | export const parseFromString = (html: string) => {
23 | return new window.DOMParser().parseFromString(html, 'text/html');
24 | };
25 |
--------------------------------------------------------------------------------
/javascript/src/dom.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {DOMParser} from 'linkedom';
18 |
19 | /**
20 | * Parses an html string and returns a parsed html document.
21 | * @param html An HTML string.
22 | * @return A Document.
23 | */
24 | export const parseFromString = (html: string) => {
25 | return new DOMParser().parseFromString(
26 | `${html}`,
27 | 'text/html'
28 | );
29 | };
30 |
--------------------------------------------------------------------------------
/javascript/src/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {model as jaModel} from './data/models/ja.js';
18 | import {model as zhHansModel} from './data/models/zh-hans.js';
19 | import {model as zhHantModel} from './data/models/zh-hant.js';
20 | import {model as thModel} from './data/models/th.js';
21 | import {HTMLProcessingParser} from './html_processor.js';
22 |
23 | export {Parser} from './parser.js';
24 | export {HTMLProcessor, HTMLProcessingParser} from './html_processor.js';
25 | export {jaModel, zhHansModel, zhHantModel};
26 |
27 | /**
28 | * Loads a parser equipped with the default Japanese model.
29 | * @return A parser with the default Japanese model.
30 | */
31 | export const loadDefaultJapaneseParser = () => {
32 | return new HTMLProcessingParser(jaModel);
33 | };
34 |
35 | /**
36 | * Loads a parser equipped with the default Simplified Chinese model.
37 | * @return A parser with the default Simplified Chinese model.
38 | */
39 | export const loadDefaultSimplifiedChineseParser = () => {
40 | return new HTMLProcessingParser(zhHansModel);
41 | };
42 |
43 | /**
44 | * Loads a parser equipped with the default Traditional Chinese model.
45 | * @return A parser with the default Traditional Chinese model.
46 | */
47 | export const loadDefaultTraditionalChineseParser = () => {
48 | return new HTMLProcessingParser(zhHantModel);
49 | };
50 |
51 | /**
52 | * Loads a parser equipped with the default Thai model.
53 | * @returns A parser with the default Thai model.
54 | */
55 | export const loadDefaultThaiParser = () => {
56 | return new HTMLProcessingParser(thModel);
57 | };
58 | /**
59 | * Loads available default parsers.
60 | * @return A map between available lang codes and their default parsers.
61 | */
62 | export const loadDefaultParsers = () => {
63 | return new Map([
64 | ['ja', loadDefaultJapaneseParser()],
65 | ['zh-hans', loadDefaultSimplifiedChineseParser()],
66 | ['zh-hant', loadDefaultTraditionalChineseParser()],
67 | ['th', loadDefaultThaiParser()],
68 | ]);
69 | };
70 |
--------------------------------------------------------------------------------
/javascript/src/parser.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | /**
18 | * Base BudouX parser.
19 | */
20 | export class Parser {
21 | /** BudouX model data */
22 | private readonly model: Map>;
23 | private readonly baseScore: number;
24 |
25 | /**
26 | * Constructs a BudouX parser.
27 | * @param model A model data.
28 | */
29 | constructor(model: {[key: string]: {[key: string]: number}}) {
30 | this.model = new Map(
31 | Object.entries(model).map(([k, v]) => [k, new Map(Object.entries(v))])
32 | );
33 | this.baseScore =
34 | -0.5 *
35 | [...this.model.values()]
36 | .map(group => [...group.values()])
37 | .flat()
38 | .reduce((prev, curr) => prev + curr, 0);
39 | }
40 |
41 | /**
42 | * Parses the input sentence and returns a list of semantic chunks.
43 | *
44 | * @param sentence An input sentence.
45 | * @return The retrieved chunks.
46 | */
47 | parse(sentence: string): string[] {
48 | if (sentence === '') return [];
49 | const boundaries = this.parseBoundaries(sentence);
50 | const result = [];
51 | let start = 0;
52 | for (const boundary of boundaries) {
53 | result.push(sentence.slice(start, boundary));
54 | start = boundary;
55 | }
56 | result.push(sentence.slice(start));
57 | return result;
58 | }
59 |
60 | /**
61 | * Parses the input sentence and returns a list of boundaries.
62 | *
63 | * @param sentence An input sentence.
64 | * @return The list of boundaries.
65 | */
66 | parseBoundaries(sentence: string): number[] {
67 | const result = [];
68 |
69 | for (let i = 1; i < sentence.length; i++) {
70 | let score = this.baseScore;
71 | // NOTE: Score values in models may be negative.
72 | /* eslint-disable */
73 | score += this.model.get('UW1')?.get(sentence.substring(i - 3, i - 2)) || 0;
74 | score += this.model.get('UW2')?.get(sentence.substring(i - 2, i - 1)) || 0;
75 | score += this.model.get('UW3')?.get(sentence.substring(i - 1, i)) || 0;
76 | score += this.model.get('UW4')?.get(sentence.substring(i, i + 1)) || 0;
77 | score += this.model.get('UW5')?.get(sentence.substring(i + 1, i + 2)) || 0;
78 | score += this.model.get('UW6')?.get(sentence.substring(i + 2, i + 3)) || 0;
79 | score += this.model.get('BW1')?.get(sentence.substring(i - 2, i)) || 0;
80 | score += this.model.get('BW2')?.get(sentence.substring(i - 1, i + 1)) || 0;
81 | score += this.model.get('BW3')?.get(sentence.substring(i, i + 2)) || 0;
82 | score += this.model.get('TW1')?.get(sentence.substring(i - 3, i)) || 0;
83 | score += this.model.get('TW2')?.get(sentence.substring(i - 2, i + 1)) || 0;
84 | score += this.model.get('TW3')?.get(sentence.substring(i - 1, i + 2)) || 0;
85 | score += this.model.get('TW4')?.get(sentence.substring(i, i + 3)) || 0;
86 | /* eslint-enable */
87 | if (score > 0) result.push(i);
88 | }
89 | return result;
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/javascript/src/tests/index.browser.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2023 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import './test_html_processor.js';
18 | import './test_parser.js';
19 | import './test_webcomponents.js';
20 |
--------------------------------------------------------------------------------
/javascript/src/tests/index.node.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2023 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import './test_cli.js';
18 | import './test_html_processor.js';
19 | import './test_parser.js';
20 |
--------------------------------------------------------------------------------
/javascript/src/tests/models/separate_right_before_a.json:
--------------------------------------------------------------------------------
1 | {"UW4": {"a": 1001}}
2 |
--------------------------------------------------------------------------------
/javascript/src/tests/test_cli.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {cli} from '../cli.js';
18 | import {execFile, ExecFileException} from 'child_process';
19 | import * as path from 'path';
20 | import stream from 'stream';
21 | import {loadDefaultParsers} from '../index.js';
22 |
23 | type execFileCallBack = {
24 | error: ExecFileException | null;
25 | stdout: string;
26 | stderr: string;
27 | };
28 |
29 | const runCli = (args: string[], stdin?: string): Promise => {
30 | return new Promise(resolve => {
31 | const binPath = path.resolve('./bin/budoux.js');
32 | const child = execFile(
33 | 'node',
34 | [binPath, ...args],
35 | (error, stdout, stderr) => {
36 | resolve({
37 | error,
38 | stdout,
39 | stderr,
40 | });
41 | }
42 | );
43 |
44 | if (stdin) {
45 | const stdinStream = new stream.Readable();
46 | stdinStream.push(stdin);
47 | stdinStream.push(null);
48 | if (child.stdin) {
49 | stdinStream.pipe(child.stdin);
50 | }
51 | }
52 | });
53 | };
54 |
// End-to-end tests: each case spawns the compiled CLI (bin/budoux.js) via
// runCli and asserts on its stdout/stderr.
describe('cli', () => {
  it('should output the wrapped HTML sentence when execute budoux command with --html option.', async () => {
    const inputText = '今日は天気です。';
    const argv = ['--html', inputText];
    // NOTE(review): this expected string appears to have lost its HTML markup
    // (likely a wrapping <span ...> element — angle-bracketed text was
    // stripped by extraction); restore from upstream before relying on it.
    const expectedStdOut =
      '今日は\u200B天気です。 ';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the wrapped HTML sentence when execute budoux command with -H option alias.', async () => {
    const inputText = '今日は天気です。';
    const argv = ['-H', inputText];
    // NOTE(review): same stripped-markup caveat as the --html case above.
    const expectedStdOut =
      '今日は\u200B天気です。 ';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with custom model when execute budoux command with --model option.', async () => {
    const inputText = 'abcdeabcd';
    // Test model that forces a break right before every 'a'.
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = ['--model', customModelPath, inputText];
    const expectedStdOut = 'abcde\nabcd';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with custom model when execute budoux command with -m option alias.', async () => {
    const inputText = 'abcdeabcd';
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = ['-m', customModelPath, inputText];
    const expectedStdOut = 'abcde\nabcd';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should use the corresponding language model when the -l parameter is given.', async () => {
    const inputTextHans = '我们的使命是整合全球信息,供大众使用,让人人受益。';
    // Expected output is computed with the same default model the CLI uses.
    const expectedStdOut = loadDefaultParsers()
      .get('zh-hans')!
      .parse(inputTextHans)
      .join('\n');
    const argv = ['-l', 'zh-hans', inputTextHans];
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should use the corresponding language model when the --lang parameter is given.', async () => {
    const inputTextHans = '我們的使命是匯整全球資訊,供大眾使用,使人人受惠。';
    const expectedStdOut = loadDefaultParsers()
      .get('zh-hant')!
      .parse(inputTextHans)
      .join('\n');
    const argv = ['--lang', 'zh-hant', inputTextHans];
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should prioritize -m and --model over -l and --lang', async () => {
    const inputTextHans = '我們的使a命';
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = [
      '--model',
      customModelPath,
      '--lang',
      'zh-hant',
      inputTextHans,
    ];
    // The custom "break before 'a'" model must win over the zh-hant model.
    const expectedStdOut = '我們的使\na命';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with separater when execute budoux command with --delim option.', async () => {
    const inputText = '今日は天気です。\n明日は雨かな?';
    const argv = ['--delim', '###', inputText];
    const expectedStdOut = '今日は\n天気です。\n###\n明日は\n雨かな?';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with separater when execute budoux command with -d option alias.', async () => {
    const inputText = '今日は天気です。\n明日は雨かな?';
    const argv = ['-d', '###', inputText];
    const expectedStdOut = '今日は\n天気です。\n###\n明日は\n雨かな?';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with separater when execute budoux with stdin inputed by pipe', async () => {
    // No positional argument: text arrives through stdin; '---' is the
    // default delimiter.
    const {stdout} = await runCli([], '今日は天気です。\n明日は雨かな?');
    const expectedStdOut = '今日は\n天気です。\n---\n明日は\n雨かな?';
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output phrases with the separator specified by -s option', async () => {
    const inputText = '今日は天気です。';
    const argv = ['-s', '/', inputText];
    const expectedStdOut = '今日は/天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output phrases with the separator specified by --sep option', async () => {
    const inputText = '今日は天気です。';
    const argv = ['--sep', '/', inputText];
    const expectedStdOut = '今日は/天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the error message when get more than one text argument.', () => {
    // These cases call cli() in-process (not via runCli) to assert the throw.
    const argv = [
      'node',
      'budoux',
      '今日は天気です。',
      '明日は晴れるでしょう。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  it('should output the error message when get extra option argument.', () => {
    const argv = [
      'node',
      'budoux',
      '--delim',
      '---',
      '',
      '今日は天気です。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  // NOTE(review): this test title duplicates the previous one; consider
  // renaming to mention the --model variant it actually exercises.
  it('should output the error message when get extra option argument.', () => {
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = [
      'node',
      'budoux',
      '--model',
      customModelPath,
      '',
      '今日は天気です。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  it('should output the unknown option error when execute budoux command with -v option.', async () => {
    // -v is intentionally not an alias of -V/--version.
    const {stderr} = await runCli(['-v']);

    expect(stderr).toBe("error: unknown option '-v'\n");
  });
});
236 |
--------------------------------------------------------------------------------
/javascript/src/tests/test_parser.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {Parser} from '../parser.js';
18 |
19 | describe('Parser.parse', () => {
20 | const TEST_SENTENCE = 'abcdeabcd';
21 |
22 | it('should separate if a strong feature item supports.', () => {
23 | const model = {
24 | UW4: {a: 10000}, // means "should separate right before 'a'".
25 | };
26 | const parser = new Parser(model);
27 | const result = parser.parse(TEST_SENTENCE);
28 | expect(result).toEqual(['abcde', 'abcd']);
29 | });
30 |
31 | it('should separate even if it makes a phrase of one character.', () => {
32 | const model = {
33 | UW4: {b: 10000}, // means "should separate right before 'b'".
34 | };
35 | const parser = new Parser(model);
36 | const result = parser.parse(TEST_SENTENCE);
37 | expect(result).toEqual(['a', 'bcdea', 'bcd']);
38 | });
39 |
40 | it('should return an empty list when the input is a blank string.', () => {
41 | const parser = new Parser({});
42 | const result = parser.parse('');
43 | expect(result).toEqual([]);
44 | });
45 | });
46 |
--------------------------------------------------------------------------------
/javascript/src/tests/test_webcomponents.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import '../webcomponents/budoux-ja.js';
18 |
// Browser-only tests for the <budoux-ja> custom element.
describe('Web Components', () => {
  // Wait until the budoux-ja custom element definition is registered.
  beforeAll(async () => {
    await window.customElements.whenDefined('budoux-ja');
  });

  // Reset the document body between tests.
  beforeEach(() => {
    window.document.body.innerText = '';
  });

  it('should process the provided text.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = '今日は良い天気です。';
    window.document.body.appendChild(budouxElement);

    expect(budouxElement.innerHTML).toBe('今日は\u200B良い\u200B天気です。');
  });

  it('should react to text content changes after attached.', resolve => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = '今日は良い天気です。';
    window.document.body.appendChild(budouxElement);

    // The observer fires after the element re-processes the new text.
    const observer = new window.MutationObserver(() => {
      expect(budouxElement.innerHTML).toBe('明日は\u200B晴れるかな?');
      resolve();
    });
    observer.observe(budouxElement, {
      childList: true,
    });
    budouxElement.textContent = '明日は晴れるかな?';
  });

  it('should work with HTML inputs.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.appendChild(window.document.createTextNode('昨日は'));
    const b = window.document.createElement('b');
    b.textContent = '雨';
    budouxElement.appendChild(b);
    budouxElement.appendChild(window.document.createTextNode('でした。'));
    window.document.body.appendChild(budouxElement);
    // NOTE(review): this expected string appears to have lost its <b> markup
    // (plausibly originally '昨日は\u200B<b>雨</b>でした。' — angle-bracketed
    // text was stripped by extraction); confirm against upstream.
    expect(budouxElement.innerHTML).toBe('昨日は\u200B雨 でした。');
  });

  it('should have wrapping styles to control line breaks.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = 'Hello world';
    window.document.body.appendChild(budouxElement);
    const styles = budouxElement.computedStyleMap();
    expect(styles.get('word-break')?.toString()).toBe('keep-all');
    expect(styles.get('overflow-wrap')?.toString()).toBe('anywhere');
  });
});
71 |
--------------------------------------------------------------------------------
/javascript/src/tests/testutils-browser.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2025 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | /**
18 | * Sets an innerHTML on a given Element.
19 | * @param element An Element.
20 | * @param html An HTML string to set.
21 | */
22 | export const setInnerHtml = (element: Element, html: string) => {
23 | element.innerHTML = html;
24 | };
25 |
26 | /**
27 | * Creates an HTML document.
28 | * @returns Document
29 | */
30 | export const createDocument = () => {
31 | return window.document;
32 | };
33 |
/**
 * Whether the running environment is a Web browser.
 * This is the browser variant of the test utilities, so it is always true.
 */
export const isBrowser = true;
38 |
--------------------------------------------------------------------------------
/javascript/src/tests/testutils.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2025 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {parseHTML} from 'linkedom';
18 |
19 | /**
20 | * Sets an innerHTML on a given Element.
21 | * @param element An Element.
22 | * @param html An HTML string to set.
23 | */
24 | export const setInnerHtml = (element: Element, html: string) => {
25 | element.innerHTML = html;
26 | };
27 |
28 | /**
29 | * Creates an HTML document.
30 | * @returns Document
31 | */
32 | export const createDocument = () => {
33 | const {document} = parseHTML('');
34 | return document;
35 | };
36 |
/**
 * Whether the running environment is a Web browser.
 * This is the Node.js (linkedom-backed) variant, so it is always false.
 */
export const isBrowser = false;
41 |
--------------------------------------------------------------------------------
/javascript/src/webcomponents/budoux-base.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {applyWrapStyle, type HTMLProcessingParser} from '../html_processor.js';
18 |
// Watch the component's entire subtree for text and child-node changes so the
// parser can re-run whenever the content is updated. Attribute changes are
// handled separately via attributeChangedCallback.
const MUTATION_OBSERVER_OPTIONS = {
  attributes: false,
  characterData: true,
  childList: true,
  subtree: true,
};
25 |
/**
 * Base BudouX Web component.
 *
 * Subclasses supply a language-specific `parser`; this base class observes
 * its own DOM subtree and re-applies the parser whenever the content changes.
 */
export abstract class BudouXBaseElement extends HTMLElement {
  // Language-specific parser provided by each concrete subclass.
  abstract parser: HTMLProcessingParser;
  // Observes this element's subtree so content edits re-trigger parsing.
  observer: MutationObserver;

  /**
   * Base BudouX Web component constructor.
   */
  constructor() {
    super();

    this.observer = new MutationObserver(this.sync.bind(this));
    this.observer.observe(this, MUTATION_OBSERVER_OPTIONS);
  }

  connectedCallback() {
    // Apply the wrapping styles (word-break / overflow-wrap) and parse the
    // initial content once the element is attached to the document.
    applyWrapStyle(this);
    this.sync();
  }

  attributeChangedCallback() {
    this.sync();
  }

  sync() {
    // Disconnect first: applyToElement mutates this element's DOM, and
    // reacting to our own mutations would cause an infinite observe/sync loop.
    this.observer.disconnect();
    this.parser.applyToElement(this);
    this.observer.observe(this, MUTATION_OBSERVER_OPTIONS);
  }
}
58 |
--------------------------------------------------------------------------------
/javascript/src/webcomponents/budoux-ja.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {
18 | type HTMLProcessingParser,
19 | loadDefaultJapaneseParser,
20 | } from '../index.js';
21 | import {BudouXBaseElement} from './budoux-base.js';
22 |
23 | /**
24 | * BudouX Japanese Web component.
25 | */
26 | export class BudouXJapaneseElement extends BudouXBaseElement {
27 | parser: HTMLProcessingParser;
28 |
29 | /**
30 | * BudouX Japanese Web component constructor.
31 | */
32 | constructor() {
33 | super();
34 | this.parser = loadDefaultJapaneseParser();
35 | }
36 | }
37 |
38 | customElements.define('budoux-ja', BudouXJapaneseElement);
39 |
--------------------------------------------------------------------------------
/javascript/src/webcomponents/budoux-th.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2023 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {type HTMLProcessingParser, loadDefaultThaiParser} from '../index.js';
18 | import {BudouXBaseElement} from './budoux-base.js';
19 |
20 | /**
21 | * BudouX Thai Web component.
22 | */
23 | export class BudouXThaiElement extends BudouXBaseElement {
24 | parser: HTMLProcessingParser;
25 |
26 | /**
27 | * BudouX Thai Web component constructor.
28 | */
29 | constructor() {
30 | super();
31 | this.parser = loadDefaultThaiParser();
32 | }
33 | }
34 |
35 | customElements.define('budoux-th', BudouXThaiElement);
36 |
--------------------------------------------------------------------------------
/javascript/src/webcomponents/budoux-zh-hans.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2021 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {
18 | type HTMLProcessingParser,
19 | loadDefaultSimplifiedChineseParser,
20 | } from '../index.js';
21 | import {BudouXBaseElement} from './budoux-base.js';
22 |
23 | /**
24 | * BudouX Simplified Chinese Web component.
25 | */
26 | export class BudouXSimplifiedChineseElement extends BudouXBaseElement {
27 | parser: HTMLProcessingParser;
28 |
29 | /**
30 | * BudouX Simplified Chinese Web component constructor.
31 | */
32 | constructor() {
33 | super();
34 | this.parser = loadDefaultSimplifiedChineseParser();
35 | }
36 | }
37 |
38 | customElements.define('budoux-zh-hans', BudouXSimplifiedChineseElement);
39 |
--------------------------------------------------------------------------------
/javascript/src/webcomponents/budoux-zh-hant.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @license
3 | * Copyright 2022 Google LLC
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import {
18 | type HTMLProcessingParser,
19 | loadDefaultTraditionalChineseParser,
20 | } from '../index.js';
21 | import {BudouXBaseElement} from './budoux-base.js';
22 |
23 | /**
24 | * BudouX Traditional Chinese Web component.
25 | */
26 | export class BudouXTraditionalChineseElement extends BudouXBaseElement {
27 | parser: HTMLProcessingParser;
28 |
29 | /**
30 | * BudouX Traditional Chinese Web component constructor.
31 | */
32 | constructor() {
33 | super();
34 | this.parser = loadDefaultTraditionalChineseParser();
35 | }
36 | }
37 |
38 | customElements.define('budoux-zh-hant', BudouXTraditionalChineseElement);
39 |
--------------------------------------------------------------------------------
/javascript/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "allowUnreachableCode": false,
4 | "allowUnusedLabels": false,
5 | "lib": ["es6", "dom", "dom.iterable"],
6 | "target": "es2017",
7 | "module": "commonjs",
8 | "moduleResolution": "node",
9 | "noEmitOnError": true,
10 | "noFallthroughCasesInSwitch": true,
11 | "noImplicitReturns": true,
12 | "pretty": true,
13 | "resolveJsonModule": true,
14 | "declaration": true,
15 | "sourceMap": true,
16 | "esModuleInterop": true,
17 | "forceConsistentCasingInFileNames": true,
18 | "strict": true,
19 | "skipLibCheck": true,
20 | "outDir": "./dist"
21 | },
22 | "exclude": [
23 | "node_modules"
24 | ],
25 | "include": ["src/**/*.ts"]
26 | }
27 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["wheel", "setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/scripts/__init__.py
--------------------------------------------------------------------------------
/scripts/build_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Builds a model from the learned weights.
15 |
16 | This script outputs a model file in JSON format from the learned weights file
17 | output by the `train.py` script.
18 | """
19 |
20 | import argparse
21 | import json
22 | import typing
23 |
24 |
def aggregate_scores(
    weights: typing.List[str]) -> typing.Dict[str, typing.Dict[str, float]]:
  """Exports the model by aggregating the weight scores.

  Each line has the form `<group>:<content>\\t<score>`; scores for the same
  feature are summed and blank lines are ignored.

  Args:
    weights (List[str]): The lines of exported weight score file.

  Returns:
    model (Dict[string, Dict[string, float]]) The exported model.
  """
  decision_trees: typing.Dict[str, typing.Dict[str, float]] = {}
  for line in weights:
    line = line.strip()
    if not line:
      continue
    cols = line.split('\t')
    # Only the first colon separates the group from the content; the content
    # itself may contain colons.
    feature_group, feature_content = cols[0].split(':', 1)
    bucket = decision_trees.setdefault(feature_group, {})
    bucket[feature_content] = bucket.get(feature_content, 0) + float(cols[1])
  return decision_trees
47 |
48 |
def round_model(model: typing.Dict[str, typing.Dict[str, float]],
                scale: int) -> typing.Dict[str, typing.Dict[str, int]]:
  """Rounds the scores in the model to integer after scaling.

  Args:
    model (Dict[str, Dict[str, float]]): The model to round scores.
    scale (int, optional): A scale factor to multiply scores.

  Returns:
    model_rounded (Dict[str, Dict[str, int]]) The rounded model. Features
    whose scaled score truncates to zero are dropped, and groups left with no
    features are omitted entirely.
  """
  model_rounded: typing.Dict[str, typing.Dict[str, int]] = {}
  for feature_group, features in model.items():
    for feature_content, score in features.items():
      scaled = int(score * scale)
      if scaled == 0:
        # Zero scores carry no information; keeping them would only bloat
        # the emitted JSON model.
        continue
      model_rounded.setdefault(feature_group, {})[feature_content] = scaled
  return model_rounded
68 |
69 |
def parse_args(
    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None, in which case `sys.argv` is parsed.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'weight_file', help='A file path for the learned weights.')
  parser.add_argument(
      '-o',
      '--outfile',
      help='A file path to export a model file. (default: model.json)',
      default='model.json',
      type=str)
  parser.add_argument(
      '--scale',
      help='A scale factor for the output scores',
      default=1000,
      type=int)
  # argparse treats `args=None` as "parse sys.argv", so a single call covers
  # both the production and the test paths.
  return parser.parse_args(test)
100 |
101 |
def main() -> None:
  """Builds a JSON model file from a learned weights file."""
  args = parse_args()
  with open(args.weight_file) as f:
    weight_lines = f.readlines()
  model_rounded = round_model(aggregate_scores(weight_lines), args.scale)
  # Compact separators and ensure_ascii=False keep the emitted model small.
  with open(args.outfile, 'w', encoding='utf-8') as f:
    json.dump(model_rounded, f, ensure_ascii=False, separators=(',', ':'))
  print('Model file is exported as', args.outfile)
114 |
115 |
116 | if __name__ == '__main__':
117 | main()
118 |
--------------------------------------------------------------------------------
/scripts/encode_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Encodes the training data with extracted features."""
15 |
16 | import argparse
17 | import functools
18 | import itertools
19 | import multiprocessing
20 | import os
21 | import sys
22 | import typing
23 |
24 | # module hack
25 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
26 | sys.path.insert(0, os.path.abspath(LIB_PATH))
27 |
28 | from budoux import utils # noqa (module hack)
29 |
ArgList = typing.Optional[typing.List[str]]
DEFAULT_OUTPUT_FILENAME = 'encoded_data.txt'

INVALID = '▔'
"""The invalid feature string."""


def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
                w6: str) -> typing.List[str]:
  """Generates a feature from characters around (w1-6).

  Args:
    w1 (str): The character 3 characters before the break point.
    w2 (str): The character 2 characters before the break point.
    w3 (str): The character right before the break point.
    w4 (str): The character right after the break point.
    w5 (str): The character 2 characters after the break point.
    w6 (str): The character 3 characters after the break point.

  Returns:
    The feature (list[str]). Features that would include the INVALID
    placeholder are omitted.
  """
  # Unigram (UW), bigram (BW), and trigram (TW) windows around the break point.
  candidates = {
      'UW1': w1,
      'UW2': w2,
      'UW3': w3,
      'UW4': w4,
      'UW5': w5,
      'UW6': w6,
      'BW1': w2 + w3,
      'BW2': w3 + w4,
      'BW3': w4 + w5,
      'TW1': w1 + w2 + w3,
      'TW2': w2 + w3 + w4,
      'TW3': w3 + w4 + w5,
      'TW4': w4 + w5 + w6,
  }
  # Windows touching an out-of-range position (marked INVALID) carry no
  # signal, so they are filtered out instead of being emitted.
  return [
      f'{name}:{value}'
      for name, value in candidates.items()
      if INVALID not in value
  ]
72 |
73 |
def parse_args(test: ArgList = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None.

  Returns:
    argparse.Namespace: Parsed data of args.
  """
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument(
      'source_data',
      help='''File path of the source training data to extract features.''')
  parser.add_argument(
      '-o',
      '--outfile',
      help='''Output file path for the encoded training data.
      (default: encoded_data.txt)''',
      default=DEFAULT_OUTPUT_FILENAME)
  parser.add_argument(
      '--processes',
      type=int,
      help='''Number of processes to use.
      (default: the number of CPUs in the system)''',
      default=None)
  parser.add_argument(
      '--scale',
      type=int,
      help='''Weight scale for the entries. The value should be a unsigned
      integer. (default: 1)''',
      default=1)
  # argparse treats `args=None` as "parse sys.argv", so a single call covers
  # both the production and the test paths.
  return parser.parse_args(test)
109 |
110 |
def process(i: int, sentence: str, sep_indices: typing.Set[int],
            scale: int) -> str:
  """Outputs an encoded line of features from the given index.

  Args:
    i (int): index
    sentence (str): A sentence
    sep_indices (typing.Set[int]): A set of separator indices.
    scale (int): A weight scale for the entries.

  Returns:
    A tab-separated line (str): the signed label followed by the features.
  """
  # Positions outside the sentence are replaced with the INVALID marker so
  # that get_feature drops the affected windows.
  window = (
      sentence[i - 3] if i > 2 else INVALID,
      sentence[i - 2] if i > 1 else INVALID,
      sentence[i - 1],
      sentence[i] if i < len(sentence) else INVALID,
      sentence[i + 1] if i + 1 < len(sentence) else INVALID,
      sentence[i + 2] if i + 2 < len(sentence) else INVALID,
  )
  feature = get_feature(*window)
  # The label is +scale for a break point and -scale otherwise.
  label = scale if i in sep_indices else -scale
  return '\t'.join(['%d' % label] + feature)
129 |
130 |
def normalize_input(data: str) -> typing.Tuple[str, typing.Set[int]]:
  """Normalizes the input to one line with separators.

  Args:
    data(str): Source input

  Returns:
    typing.Tuple[str, typing.Set[int]]: A tuple of the sentence and the
    separator indices.
  """
  # Newlines are treated the same as explicit separators.
  chunks = data.replace('\n', utils.SEP).strip().split(utils.SEP)
  # A separator sits after each chunk, so its indices are the running totals
  # of the chunk lengths (accumulate's default operation is addition).
  sep_indices = set(itertools.accumulate(len(chunk) for chunk in chunks))
  return (''.join(chunks), sep_indices)
146 |
147 |
def main(test: ArgList = None) -> None:
  """Encodes a source training data file into feature lines.

  Args:
    test (ArgList, optional): Commandline args for testing. Defaults to None.
  """
  args = parse_args(test)
  source_filename: str = args.source_data
  entries_filename: str = args.outfile
  processes = None if args.processes is None else int(args.processes)
  scale: int = args.scale
  with open(source_filename, encoding=sys.getdefaultencoding()) as f:
    data = f.read()
  sentence, sep_indices = normalize_input(data)
  # Every character boundary (index 1 .. len(sentence)) is a break-point
  # candidate; the candidates are encoded in parallel worker processes.
  with multiprocessing.Pool(processes) as p:
    func = functools.partial(
        process, sentence=sentence, sep_indices=sep_indices, scale=scale)
    lines = p.map(func, range(1, len(sentence) + 1))

  with open(entries_filename, 'w', encoding=sys.getdefaultencoding()) as f:
    for line in lines:
      f.write(line + '\n')

  # \033[92m / \033[0m wrap the message in green for terminal output.
  print('\033[92mEncoded training data is out at: %s\033[0m' % entries_filename)
167 |
168 |
169 | if __name__ == '__main__':
170 | main()
171 |
--------------------------------------------------------------------------------
/scripts/finetune.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Finetunes a BudouX model with the given training dataset.
15 |
16 | Example usage:
17 |
18 | $ python finetune.py train_data.txt base_model.json -o weights.txt --val_data=val_data.txt
19 | """
20 |
21 | import argparse
22 | import array
23 | import json
24 | import typing
25 | from collections import OrderedDict
26 |
27 | from jax import Array, grad, jit
28 | from jax import numpy as jnp
29 |
# Machine epsilon; used to keep precision/recall/F-score denominators nonzero.
EPSILON = float(jnp.finfo(float).eps)
# Default output file name for the finetuned weights.
DEFAULT_OUTPUT_NAME = 'finetuned-weights.txt'
# Default number of gradient-descent iterations.
DEFAULT_NUM_ITERS = 1000
# Default iteration span between metric log outputs.
DEFAULT_LOG_SPAN = 100
# Default learning rate for gradient descent.
DEFAULT_LEARNING_RATE = 0.01
35 |
36 |
class NormalizedModel(typing.NamedTuple):
  """A model flattened to parallel feature and weight sequences."""
  features: typing.List[str]  # Feature identifiers in '<group>:<content>' form.
  weights: Array  # Weight vector normalized to unit std, then zero mean.
40 |
41 |
class Dataset(typing.NamedTuple):
  """An encoded dataset of inputs and target labels."""
  X: Array  # Feature indicator matrix with entries in {-1, +1}.
  Y: Array  # Binary target labels.
45 |
46 |
class Metrics(typing.NamedTuple):
  """Evaluation metrics for a binary classifier."""
  tp: int  # True positives.
  tn: int  # True negatives.
  fp: int  # False positives.
  fn: int  # False negatives.
  accuracy: float
  precision: float
  recall: float
  fscore: float  # Harmonic mean of precision and recall (F1).
  loss: float  # Cross entropy loss.
57 |
58 |
def load_model(file_path: str) -> NormalizedModel:
  """Loads a model as a pair of a features list and a normalized weight vector.

  Args:
    file_path: A file path for the model JSON file.

  Returns:
    A normalized model, which is a pair of a list of feature identifiers and a
    normalized weight vector (scaled to unit std, then shifted to zero mean).
  """
  with open(file_path) as f:
    raw_model = json.load(f)
  # Flatten the nested {group: {content: score}} mapping into an ordered
  # '<group>:<content>' -> score mapping so features and weights stay aligned.
  flattened = OrderedDict()
  for category, scores in raw_model.items():
    for item, score in scores.items():
      flattened['%s:%s' % (category, item)] = score
  weights = jnp.array(list(flattened.values()))
  weights = weights / weights.std()
  weights = weights - weights.mean()
  return NormalizedModel(list(flattened.keys()), weights)
80 |
81 |
def load_dataset(file_path: str, model: NormalizedModel) -> Dataset:
  """Loads a dataset from the given file path.

  Args:
    file_path: A file path for the encoded data file.
    model: A normalized model.

  Returns:
    A dataset of inputs (X) and outputs (Y).
  """
  xs = []
  ys = array.array('B')
  with open(file_path) as f:
    for row in f:
      cols = row.strip().split('\t')
      if len(cols) < 2:
        continue
      # The first column is the label; '1' marks a positive example.
      ys.append(cols[0] == '1')
      # Build the feature set once per row. Previously `set(cols[1:])` was
      # constructed inside the generator expression, which rebuilt the set
      # for every feature in the model (O(len(features) * len(cols)) per row).
      row_features = set(cols[1:])
      xs.append(tuple(k in row_features for k in model.features))
  # Map the {0, 1} feature indicators to {-1, +1}.
  X = jnp.array(xs) * 2 - 1
  Y = jnp.array(ys)
  return Dataset(X, Y)
104 |
105 |
def cross_entropy_loss(weights: Array, x: Array, y: Array) -> Array:
  """Calculates a cross entropy loss with a prediction by a sigmoid function.

  Args:
    weights: A weight vector.
    x: An input array.
    y: A target output array.

  Returns:
    A cross entropy loss.
  """
  logits = x.dot(weights)
  # Sigmoid prediction in (0, 1).
  pred = 1 / (1 + jnp.exp(-logits))
  # Standard binary cross entropy, averaged over examples.
  log_likelihood = y * jnp.log(pred) + (1 - y) * jnp.log(1 - pred)
  return -jnp.mean(log_likelihood)
119 |
120 |
def get_metrics(weights: Array, dataset: Dataset) -> Metrics:
  """Gets evaluation metrics from the learned weight vector and the dataset.

  Args:
    weights: A weight vector.
    dataset: A dataset.

  Returns:
    result (Metrics): The metrics over the given weights and the dataset.
  """
  # A positive margin counts as a positive prediction.
  pred = dataset.X.dot(weights) > 0
  actual = dataset.Y
  # Confusion-matrix counts.
  tp: int = jnp.sum((pred == 1) & (actual == 1))  # type: ignore
  tn: int = jnp.sum((pred == 0) & (actual == 0))  # type: ignore
  fp: int = jnp.sum((pred == 1) & (actual == 0))  # type: ignore
  fn: int = jnp.sum((pred == 0) & (actual == 1))  # type: ignore
  loss: float = cross_entropy_loss(weights, dataset.X,
                                   dataset.Y)  # type: ignore
  # EPSILON keeps the denominators nonzero when a class is empty.
  accuracy = (tp + tn) / (tp + tn + fp + fn)
  precision = tp / (tp + fp + EPSILON)
  recall = tp / (tp + fn + EPSILON)
  fscore = 2 * precision * recall / (precision + recall + EPSILON)
  return Metrics(
      tp=tp,
      tn=tn,
      fp=fp,
      fn=fn,
      accuracy=accuracy,
      precision=precision,
      recall=recall,
      fscore=fscore,
      loss=loss,
  )
154 |
155 |
def fit(weights: Array,
        train_dataset: Dataset,
        iters: int,
        learning_rate: float,
        log_span: int,
        val_dataset: typing.Optional[Dataset] = None) -> Array:
  """Updates the weights with the given dataset.

  Runs full-batch gradient descent on the cross entropy loss, printing train
  (and optionally validation) metrics every `log_span` iterations.

  Args:
    weights: A weight vector.
    train_dataset: A train dataset.
    iters: A number of iterations.
    learning_rate: A learning rate.
    log_span: A span to log metrics.
    val_dataset: A validation dataset (optional).

  Returns:
    An updated weight vector.
  """
  # JIT-compile the gradient of the loss with respect to the weights
  # (argnums=0 selects the `weights` argument).
  grad_loss = jit(grad(cross_entropy_loss, argnums=0))
  for t in range(iters):
    # One full-batch gradient-descent step.
    weights = weights - learning_rate * grad_loss(weights, train_dataset.X,
                                                  train_dataset.Y)
    # Only log metrics every `log_span` iterations.
    if (t + 1) % log_span != 0:
      continue
    metrics_train = jit(get_metrics)(weights, train_dataset)
    print()
    print('iter:\t%d' % (t + 1))
    print()
    print('train accuracy:\t%.5f' % metrics_train.accuracy)
    print('train prec.:\t%.5f' % metrics_train.precision)
    print('train recall:\t%.5f' % metrics_train.recall)
    print('train fscore:\t%.5f' % metrics_train.fscore)
    print('train loss:\t%.5f' % metrics_train.loss)
    print()

    # Validation metrics are only reported when a validation set is given.
    if val_dataset is None:
      continue
    metrics_val = jit(get_metrics)(weights, val_dataset)
    print('val accuracy:\t%.5f' % metrics_val.accuracy)
    print('val prec.:\t%.5f' % metrics_val.precision)
    print('val recall:\t%.5f' % metrics_val.recall)
    print('val fscore:\t%.5f' % metrics_val.fscore)
    print('val loss:\t%.5f' % metrics_val.loss)
    print()
  return weights
202 |
203 |
def write_weights(file_path: str, weights: Array,
                  features: typing.List[str]) -> None:
  """Writes learned weights and corresponding features to a file.

  Args:
    file_path: A file path for the weights file.
    weights: A weight vector.
    features: A list of feature identifiers.
  """
  # One `<feature>\t<weight>` row per feature, newline-joined with no
  # trailing newline.
  rows = [
      '%s\t%.6f' % (feature, weights[index])
      for index, feature in enumerate(features)
  ]
  with open(file_path, 'w') as f:
    f.write('\n'.join(rows))
217 |
218 |
def parse_args(
    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None, in which case `sys.argv` is parsed.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'train_data', help='File path for the encoded training data.')
  parser.add_argument('base_model', help='File path for the base model file.')
  parser.add_argument(
      '-o',
      '--output',
      help=f'File path for the output weights. (default: {DEFAULT_OUTPUT_NAME})',
      type=str,
      default=DEFAULT_OUTPUT_NAME)
  parser.add_argument(
      '--val-data', help='File path for the encoded validation data.', type=str)
  parser.add_argument(
      '--iters',
      help=f'Number of iterations for training. (default: {DEFAULT_NUM_ITERS})',
      type=int,
      default=DEFAULT_NUM_ITERS)
  parser.add_argument(
      '--log-span',
      help=f'Iteration span to print metrics. (default: {DEFAULT_LOG_SPAN})',
      type=int,
      default=DEFAULT_LOG_SPAN)
  parser.add_argument(
      '--learning-rate',
      help=f'Learning rate. (default: {DEFAULT_LEARNING_RATE})',
      type=float,
      default=DEFAULT_LEARNING_RATE)
  # argparse treats `args=None` as "parse sys.argv", so a single call covers
  # both the production and the test paths.
  return parser.parse_args(test)
262 |
263 |
def main() -> None:
  """Finetunes the base model on the training data and writes the weights."""
  args = parse_args()
  model = load_model(args.base_model)
  train_dataset = load_dataset(args.train_data, model)
  # The validation dataset is optional; metrics for it are only logged when
  # it is provided.
  val_dataset = (
      load_dataset(args.val_data, model) if args.val_data else None)
  weights = fit(
      model.weights,
      train_dataset,
      iters=args.iters,
      log_span=args.log_span,
      learning_rate=args.learning_rate,
      val_dataset=val_dataset)
  write_weights(args.output, weights, model.features)
285 |
286 |
287 | if __name__ == '__main__':
288 | main()
289 |
--------------------------------------------------------------------------------
/scripts/prepare_knbc.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Prepares a dataset from the KNBC corpus.
15 |
16 | Before running this script, you need to download the KNBC corpus by running:
17 |
18 | $ curl -o knbc.tar.bz2 https://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2
19 | $ tar -xf knbc.tar.bz2
20 |
21 | Now you should have a directory named `KNBC_v1.0_090925_utf8`.
22 | Run the following to generate a dataset named `source_knbc.txt`.
23 |
24 | $ python scripts/prepare_knbc.py KNBC_v1.0_090925_utf8 -o source_knbc.txt
25 | """
26 |
27 | import argparse
28 | import os
29 | import sys
30 | import typing
31 | from html.parser import HTMLParser
32 |
33 | # module hack
34 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
35 | sys.path.insert(0, os.path.abspath(LIB_PATH))
36 |
37 | from budoux import utils # noqa (module hack)
38 |
# The set of values accepted by the --granularity command line flag.
GRANULARITY_OPTIONS = {'phrase', 'tag', 'word'}
# Static type for a granularity value, mirroring GRANULARITY_OPTIONS.
Granularity = typing.Literal['phrase', 'tag', 'word']
41 |
42 |
class KNBCHTMLParser(HTMLParser):
  """Parses the HTML files in the KNBC corpus to collect chunks.

  Attributes:
    chunks: The collected chunks.
    row: The current row index.
    col: The current column index.
    current_word: The current word to process.
    on_split_row: Whether the scan is on the splitting row.
    granularity: Granularity of the output chunks.
  """

  # Cell IDs the KNBC morph HTML uses to mark segmentation boundaries.
  BUNSETSU_SPLIT_ID = 'bnst-kugiri'  # Bunsetsu (phrase) boundary row.
  TAG_SPLIT_ID = 'tag-kugiri'  # Tag (sub-phrase) boundary row.

  def __init__(self, granularity: Granularity) -> None:
    """Initializes the HTML parser for the KNBC corpus.

    Args:
      granularity: Granularity of the output chunks.
    """
    super().__init__()
    self.chunks = ['']
    self.row = 0
    self.col = 0
    self.current_word = ''
    self.on_split_row = False
    self.granularity = granularity

  def handle_starttag(
      self, tag: str,
      attributes: typing.List[typing.Tuple[str, typing.Optional[str]]]) -> None:
    """Tracks table position and flags rows that mark a segment boundary."""
    # A new table row resets the per-row scanning state.
    if tag == 'tr':
      self.row += 1
      self.col = 0
      self.current_word = ''
      self.on_split_row = False

    if tag == 'td':
      self.col += 1
      for name, value in attributes:
        bunsetsu_row = name == 'id' and value == self.BUNSETSU_SPLIT_ID
        tag_row = name == 'id' and value == self.TAG_SPLIT_ID
        # Phrase (bunsetsu) boundaries always split; tag boundaries split
        # only when 'tag' granularity was requested.
        if bunsetsu_row or (self.granularity == 'tag' and tag_row):
          self.on_split_row = True

  def handle_endtag(self, tag: str) -> None:
    """Closes a table row: opens a new chunk or appends the row's word."""
    if tag != 'tr':  # Skip all tags but TR.
      return None
    if self.row < 3:  # Skip the first two rows.
      return None
    if self.on_split_row:
      # Boundary row: start a fresh, empty chunk. list.append returns None,
      # so this matches the method's None return.
      return self.chunks.append('')
    # Rows with exactly 5 cells carry a word (presumably morpheme rows in
    # this corpus layout — TODO confirm against the corpus HTML).
    if self.col == 5:
      if self.granularity == 'word' and self.chunks[-1]:
        # Word granularity: each word row becomes its own chunk.
        self.chunks.append('')
      self.chunks[-1] += self.current_word

  def handle_data(self, data: str) -> None:
    """Remembers the text of the row's first cell (the surface word)."""
    if self.col == 1:
      self.current_word = data
104 |
105 |
def break_before_sequence(chunks: typing.List[str],
                          sequence: str) -> typing.List[str]:
  """Breaks chunks before a specified character sequence appears.

  Args:
    chunks (List[str]): Chunks to break.
    sequence (str): A character sequence to break chunks before.

  Returns:
    Processed chunks.
  """
  # Rejoin with the separator, inject an extra separator before every
  # occurrence of the sequence, then split again.
  joined = utils.SEP.join(chunks)
  rechunked = joined.replace(sequence, utils.SEP + sequence).split(utils.SEP)
  # Drop any empty pieces produced by adjacent separators.
  return [piece for piece in rechunked if piece]
121 |
122 |
def postprocess(chunks: typing.List[str]) -> typing.List[str]:
  """Applies some processes to modify the extracted chunks.

  Args:
    chunks (List[str]): Source chunks.

  Returns:
    Processed chunks.
  """
  # Force a break opportunity before each of these sequences.
  for sequence in ('(', 'もら'):
    chunks = break_before_sequence(chunks, sequence)
  return chunks
135 |
136 |
def parse_args(
    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None, which parses sys.argv. Added for consistency
      with the parse_args functions in the sibling scripts (build_model,
      encode_data, finetune) so this parser is unit-testable too.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  DEFAULT_OUT_PATH = 'source.txt'
  DEFAULT_GRANULARITY = 'phrase'
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument('source_dir', help='Path to the KNBC corpus directory.')
  parser.add_argument(
      '-o',
      '--outfile',
      help=f'File path to the output dataset. (default: {DEFAULT_OUT_PATH})',
      default=DEFAULT_OUT_PATH)
  parser.add_argument(
      '--granularity',
      help=f'''Granularity of the output chunks. (default: {DEFAULT_GRANULARITY})
The value should be one of "phrase", "tag", or "word".
"phrase" is equivalent to Bunsetu-based segmentation.
"tag" provides more granular segmentation than "phrase".
"word" is equivalent to word-based segmentation.

e.g. 携帯ユーザーの仲間入りをするかです。
phrase: 携帯ユーザーの / 仲間入りを / するかです。
tag: 携帯 / ユーザーの / 仲間 / 入りを / するかです。
word: 携帯 / ユーザー / の / 仲間 / 入り / を / する / か / です / 。
''',
      choices=GRANULARITY_OPTIONS,
      default=DEFAULT_GRANULARITY)
  # parse_args(None) falls back to sys.argv, preserving original behavior.
  return parser.parse_args(test)
164 |
165 |
def main() -> None:
  """Builds a segmentation dataset from the KNBC corpus morph HTML files.

  Reads every `*-morph.html` file under `<source_dir>/html`, extracts chunks
  at the requested granularity, and writes one segmented sentence per line
  to the output file.
  """
  args = parse_args()
  source_dir = args.source_dir
  outfile = args.outfile
  granularity = args.granularity
  html_dir = os.path.join(source_dir, 'html')
  # The corpus is distributed as UTF-8 (see the module docstring); pin the
  # encoding instead of relying on the locale default.
  with open(outfile, 'w', encoding='utf-8') as f:
    for file in sorted(os.listdir(html_dir)):
      if not file.endswith('-morph.html'):
        continue
      parser = KNBCHTMLParser(granularity)
      # Use a context manager so the source file handle is closed promptly
      # instead of being leaked until garbage collection.
      with open(os.path.join(html_dir, file), encoding='utf-8') as source:
        parser.feed(source.read())
      chunks = postprocess(parser.chunks)
      # Skip sentences that contain no break point.
      if len(chunks) < 2:
        continue
      f.write(utils.SEP.join(chunks) + '\n')
  print('\033[92mTraining data is output to: %s\033[0m' % (outfile))


if __name__ == '__main__':
  main()
189 |
--------------------------------------------------------------------------------
/scripts/prepare_wisesight.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Prepares a dataset from the Wisesight corpus.
15 |
16 | Before running this script, you need to download the Wisesight corpus by running:
17 |
18 | $ curl -o wisesight-1000-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-1000-samples-tokenised.label
19 | $ curl -o wisesight-160-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-160-samples-tokenised.label
20 |
21 | Then run this command as follows over each file.
22 |
23 | $ python scripts/prepare_wisesight.py wisesight-1000-samples-tokenised.label -o source_train.txt
24 | $ python scripts/prepare_wisesight.py wisesight-160-samples-tokenised.label -o source_val.txt
25 | """
import argparse
import re
import typing

import regex
30 |
31 |
def parse_args(
    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None, which parses sys.argv. Added for consistency
      with the parse_args functions in the sibling scripts (build_model,
      encode_data, finetune) so this parser is unit-testable too.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  DEFAULT_OUT_PATH = 'source.txt'
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'source_filepath', help='Path to a Wisesight corpus label file.')
  parser.add_argument(
      '-o',
      '--outfile',
      help=f'File path to the output dataset. (default: {DEFAULT_OUT_PATH})',
      default=DEFAULT_OUT_PATH)
  # parse_args(None) falls back to sys.argv, preserving original behavior.
  return parser.parse_args(test)
44 |
45 |
def main() -> None:
  """Cleans a Wisesight label file and writes it as a BudouX source dataset.

  Strips URLs, hashtags, and emojis from each line, normalizes the `|`
  token separators, and replaces them with the BudouX separator character.
  """
  args = parse_args()
  source_filepath = args.source_filepath
  target_filepath = args.outfile

  # Compile every pattern once; the originals recompiled the emoji pattern
  # (and re-looked-up the others) on every input line.
  url_pattern = re.compile(r'https?://[^ ]+')
  hashtag_pattern = re.compile(r'#[^ ]+')
  emoji_pattern = regex.compile(r'\p{Emoji_Presentation=Yes}+')
  repeated_sep_pattern = re.compile(r'\|+')
  trailing_sep_pattern = re.compile(r'(\|\s)*\|$')

  with open(target_filepath, 'w') as outfile:
    with open(source_filepath) as infile:
      for line in infile:
        line = line.strip()
        line = url_pattern.sub('', line)  # Remove URLs
        line = hashtag_pattern.sub('', line)  # Remove hashtags
        line = emoji_pattern.sub('', line)  # Remove emojis
        line = repeated_sep_pattern.sub('|', line)  # Collapse separator runs
        line = trailing_sep_pattern.sub('', line)  # Drop trailing separators
        outfile.write(line.replace('|', '▁') + '\n')  # Replace the separators.
  print('\033[92mTraining data is output to: %s\033[0m' % (target_filepath))


if __name__ == '__main__':
  main()
67 |
--------------------------------------------------------------------------------
/scripts/tests/test_build_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tests the model build script."""
15 |
16 | import os
17 | import sys
18 | import unittest
19 |
20 | # module hack
21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
22 | sys.path.insert(0, os.path.abspath(LIB_PATH))
23 |
24 | from scripts import build_model # noqa (module hack)
25 |
26 |
class TestAggregateScores(unittest.TestCase):
  """Tests the aggregate_scores function of the model build script."""

  def test_standard(self) -> None:
    lines = [
        'AB:x\t2.893\n', 'BC:y\t0.123\n', 'AB:y\t2.123\n', 'BC:y\t1.234\n'
    ]
    expected = {'AB': {'x': 2.893, 'y': 2.123}, 'BC': {'y': 1.357}}
    self.assertDictEqual(
        build_model.aggregate_scores(lines), expected,
        'should group scores by feature type.')

  def test_blank_line(self) -> None:
    lines = [
        '\n', 'AB:x\t2.893\n', 'BC:y\t0.123\n', '\n', 'AB:y\t2.123\n',
        'BC:y\t1.234\n'
    ]
    expected = {'AB': {'x': 2.893, 'y': 2.123}, 'BC': {'y': 1.357}}
    self.assertDictEqual(
        build_model.aggregate_scores(lines), expected,
        'should skip blank lines.')

  def test_colon(self) -> None:
    result = build_model.aggregate_scores(['AB::\t8.123'])
    self.assertDictEqual(
        result, {'AB': {
            ':': 8.123
        }}, 'should consider the first colon only as a delimiter.')
67 |
68 |
class TestRoundModel(unittest.TestCase):
  """Tests the round_model function of the model build script."""

  def test_standard(self) -> None:
    source = {'AB': {'x': 1.0002, 'y': 4.1237}, 'BC': {'z': 2.1111}}
    expected = {'AB': {'x': 1000, 'y': 4123}, 'BC': {'z': 2111}}
    self.assertDictEqual(
        build_model.round_model(source, 1000), expected,
        'should scale and round scores to integer.')

  def test_insignificant_score(self) -> None:
    source = {'AB': {'x': 0.0009, 'y': 4.1237}, 'BC': {'z': 2.1111}}
    expected = {'AB': {'y': 4123}, 'BC': {'z': 2111}}
    self.assertDictEqual(
        build_model.round_model(source, 1000), expected,
        'should remove insignificant scores lower than 1.')
111 |
112 |
class TestArgParse(unittest.TestCase):
  """Tests the command line argument parser of the model build script."""

  def test_cmdargs_invalid_option(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args(['-v'])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args(['-h'])
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_input(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    parsed = build_model.parse_args(['weight.txt'])
    self.assertEqual(parsed.weight_file, 'weight.txt')
    self.assertEqual(parsed.outfile, 'model.json')
    self.assertEqual(parsed.scale, 1000)

  def test_cmdargs_with_scale(self) -> None:
    parsed = build_model.parse_args(
        ['weight.txt', '-o', 'foo.json', '--scale', '200'])
    self.assertEqual(parsed.weight_file, 'weight.txt')
    self.assertEqual(parsed.outfile, 'foo.json')
    self.assertEqual(parsed.scale, 200)
144 |
--------------------------------------------------------------------------------
/scripts/tests/test_encode_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tests the data encoder script."""
15 |
16 | import os
17 | import sys
18 | import typing
19 | import unittest
20 |
21 | # module hack
22 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
23 | sys.path.insert(0, os.path.abspath(LIB_PATH))
24 |
25 | from budoux import utils # noqa (module hack)
26 | from scripts import encode_data # noqa (module hack)
27 |
28 |
class TestGetFeature(unittest.TestCase):
  """Tests the get_feature function of the data encoder script."""

  def test_standard(self) -> None:
    """Checks all n-gram features are extracted from six context characters."""
    feature = encode_data.get_feature('a', 'b', 'c', 'd', 'e', 'f')
    self.assertSetEqual(
        set(feature),
        {
            # Unigram of Words (UW)
            'UW1:a',
            'UW2:b',
            'UW3:c',
            'UW4:d',
            'UW5:e',
            'UW6:f',

            # Bigram of Words (BW)
            'BW1:bc',
            'BW2:cd',
            'BW3:de',

            # Trigram of Words (TW)
            'TW1:abc',
            'TW2:bcd',
            'TW3:cde',
            'TW4:def',
        },
        'Features should be extracted.')

  def test_with_invalid(self) -> None:
    """Checks features covering an INVALID placeholder are omitted."""

    def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool:
      # Returns True when any extracted feature starts with the prefix.
      for item in feature:
        if item.startswith(prefix):
          return True
      return False

    # The third context character is the INVALID marker.
    feature = encode_data.get_feature('a', 'a', encode_data.INVALID, 'a', 'a',
                                      'a')
    self.assertFalse(
        find_by_prefix('UW3:', feature),
        'Should omit the Unigram feature when the character is invalid.')
    self.assertFalse(
        find_by_prefix('BW2:', feature),
        'Should omit the Bigram feature that covers an invalid character.')
73 |
74 |
class TestArgParse(unittest.TestCase):
  """Tests the command line argument parser of the data encoder script."""

  def test_cmdargs_invalid_option(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args(['-v'])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args(['-h'])
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_source(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    parsed = encode_data.parse_args(['source.txt'])
    self.assertEqual(parsed.source_data, 'source.txt')
    self.assertEqual(parsed.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertIsNone(parsed.processes)
    self.assertEqual(parsed.scale, 1)

  def test_cmdargs_with_outfile(self) -> None:
    parsed = encode_data.parse_args(['source.txt', '-o', 'out.txt'])
    self.assertEqual(parsed.source_data, 'source.txt')
    self.assertEqual(parsed.outfile, 'out.txt')
    self.assertIsNone(parsed.processes)
    self.assertEqual(parsed.scale, 1)

  def test_cmdargs_with_processes(self) -> None:
    parsed = encode_data.parse_args(['source.txt', '--processes', '8'])
    self.assertEqual(parsed.source_data, 'source.txt')
    self.assertEqual(parsed.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertEqual(parsed.processes, 8)
    self.assertEqual(parsed.scale, 1)

  def test_cmdargs_with_scale(self) -> None:
    parsed = encode_data.parse_args(['source.txt', '--scale', '20'])
    self.assertEqual(parsed.source_data, 'source.txt')
    self.assertEqual(parsed.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertIsNone(parsed.processes)
    self.assertEqual(parsed.scale, 20)
125 |
126 |
class TestProcess(unittest.TestCase):
  """Tests the process function of the data encoder script."""

  # Sample sentence with break points recorded at indices 7, 10, and 13.
  sentence = '六本木ヒルズでお昼を食べる。'
  sep_indices = {7, 10, 13}

  def test_on_negative_point_with_scale(self) -> None:
    encoded = encode_data.process(8, self.sentence, self.sep_indices, 16)
    weight, *features = encoded.split('\t')
    self.assertEqual(weight, '-16')
    self.assertIn('UW2:で', set(features))

  def test_on_positive_point_with_scale(self) -> None:
    encoded = encode_data.process(7, self.sentence, self.sep_indices, 13)
    weight, *features = encoded.split('\t')
    self.assertEqual(weight, '13')
    self.assertIn('UW3:で', set(features))
147 |
148 |
class TestNormalizeInput(unittest.TestCase):
  """Tests the normalize_input function of the data encoder script."""

  def test_standard_input(self) -> None:
    sentence, sep_indices = encode_data.normalize_input(
        f'ABC{utils.SEP}DE{utils.SEP}FGHI')
    self.assertEqual(sentence, 'ABCDEFGHI')
    self.assertEqual(sep_indices, {3, 5, 9})

  def test_with_linebreaks(self) -> None:
    sentence, sep_indices = encode_data.normalize_input(
        f'AB\nCDE{utils.SEP}FG')
    self.assertEqual(sentence, 'ABCDEFG')
    self.assertEqual(sep_indices, {2, 5, 7})

  def test_doubled_seps(self) -> None:
    sentence, sep_indices = encode_data.normalize_input(
        f'ABC{utils.SEP}{utils.SEP}DE\n\nFG')
    self.assertEqual(sentence, 'ABCDEFG')
    self.assertEqual(sep_indices, {3, 5, 7})


if __name__ == '__main__':
  unittest.main()
172 |
--------------------------------------------------------------------------------
/scripts/tests/test_finetune.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tests the finetune script."""
15 |
16 | import os
17 | import sys
18 | import tempfile
19 | import unittest
20 |
21 | from jax import numpy as jnp
22 |
23 | # module hack
24 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
25 | sys.path.insert(0, os.path.abspath(LIB_PATH))
26 |
27 | from scripts import finetune # noqa (module hack)
28 |
29 |
class TestArgParse(unittest.TestCase):
  """Tests the command line argument parser of the finetune script."""

  def test_cmdargs_invalid_option(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(['-v'])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(['-h'])
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_data(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_no_base_model(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(['encoded.txt'])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    parsed = finetune.parse_args(['encoded.txt', 'model.json'])
    self.assertEqual(parsed.train_data, 'encoded.txt')
    self.assertEqual(parsed.base_model, 'model.json')
    self.assertEqual(parsed.iters, finetune.DEFAULT_NUM_ITERS)
    self.assertEqual(parsed.log_span, finetune.DEFAULT_LOG_SPAN)
    self.assertEqual(parsed.learning_rate, finetune.DEFAULT_LEARNING_RATE)
    self.assertIsNone(parsed.val_data)

  def test_cmdargs_with_values(self) -> None:
    parsed = finetune.parse_args([
        'encoded.txt', 'model.json', '--iters', '50', '--log-span', '10',
        '--learning-rate', '0.1', '--val-data', 'val.txt'
    ])
    self.assertEqual(parsed.train_data, 'encoded.txt')
    self.assertEqual(parsed.base_model, 'model.json')
    self.assertEqual(parsed.iters, 50)
    self.assertEqual(parsed.log_span, 10)
    self.assertEqual(parsed.learning_rate, 0.1)
    self.assertEqual(parsed.val_data, 'val.txt')
76 |
77 |
class TestLoadModel(unittest.TestCase):
  """Tests the load_model function of the finetune script."""

  def setUp(self) -> None:
    # NamedTemporaryFile().name is used only to obtain a unique path; the
    # handle itself is discarded. NOTE(review): this pattern may not work on
    # Windows, where an open temp file cannot be reopened by name — confirm.
    self.model_file_path = tempfile.NamedTemporaryFile().name
    with open(self.model_file_path, 'w') as f:
      f.write('{"UW1": {"a": 12, "b": 23}, "TW3": {"xyz": 47}}')

  def test_extracted_keys(self) -> None:
    """Feature names should be flattened to "group:key" strings in order."""
    result = finetune.load_model(self.model_file_path).features
    self.assertListEqual(result, ['UW1:a', 'UW1:b', 'TW3:xyz'])

  def test_value_variance(self) -> None:
    # Loaded weights are expected to be normalized to (near) unit variance.
    result = finetune.load_model(self.model_file_path).weights.var()
    self.assertAlmostEqual(float(result), 1, places=5)

  def test_value_mean(self) -> None:
    # Loaded weights are expected to be centered (sum close to zero).
    result = finetune.load_model(self.model_file_path).weights.sum()
    self.assertAlmostEqual(float(result), 0, places=5)

  def test_value_order(self) -> None:
    # Normalization must preserve the relative order of the raw scores
    # (12 < 23 < 47).
    result = finetune.load_model(self.model_file_path).weights.tolist()
    self.assertGreater(result[1], result[0])
    self.assertGreater(result[2], result[1])
101 |
102 |
class TestLoadDataset(unittest.TestCase):
  """Tests the load_dataset function of the finetune script."""

  def setUp(self) -> None:
    # Each line is "<weight>\t<feature>...". Per the expectations below, a
    # positive weight maps to Y=True and a negative one to Y=False.
    self.entries_file_path = tempfile.NamedTemporaryFile().name
    with open(self.entries_file_path, 'w') as f:
      f.write(('1\tfoo\tbar\n'
               '-1\tfoo\n'
               '1\tfoo\tbar\tbaz\n'
               '1\tbar\tfoo\n'
               '-1\tbaz\tqux\n'))
    # Only 'foo' and 'bar' are known features; 'baz' and 'qux' have no
    # column in X.
    self.model = finetune.NormalizedModel(['foo', 'bar'], jnp.array([23, -37]))

  def test_y(self) -> None:
    # Labels follow the sign of the first column.
    result = finetune.load_dataset(self.entries_file_path, self.model)
    expected = [True, False, True, True, False]
    self.assertListEqual(result.Y.tolist(), expected)

  def test_x(self) -> None:
    # One column per known feature: +1 when the feature is present on the
    # line, -1 when it is absent.
    result = finetune.load_dataset(self.entries_file_path, self.model)
    expected = [[1, 1], [1, -1], [1, 1], [1, 1], [-1, -1]]
    self.assertListEqual(result.X.tolist(), expected)
124 |
125 |
class TestFit(unittest.TestCase):
  """Tests the fit function of the finetune script."""

  def test_health(self) -> None:
    """Checks training can flip the model's predictions to match the labels."""
    w = jnp.array([.9, .5, -.3])
    X = jnp.array([[-1, 1, 1], [1, -1, 1], [1, 1, -1]])
    # The current result is x.dot(w) = [-0.7, 0.1, 1.1] => [False, True, True]
    # It tests if the method can learn a new weight that inverses the result.
    Y = jnp.array([True, False, False])
    dataset = finetune.Dataset(X, Y)
    w = finetune.fit(w, dataset, iters=1000, learning_rate=.01, log_span=100)
    self.assertGreater(X.dot(w).tolist()[0], 0)  # x.dot(w) > 0 => True.
137 |
138 |
class TestWriteWeights(unittest.TestCase):
  """Tests the write_weights function of the finetune script."""

  def test_write_weights(self) -> None:
    out_path = tempfile.NamedTemporaryFile().name
    finetune.write_weights(out_path, jnp.array([0.012, 0.238, -0.1237]),
                           ['foo', 'bar', 'baz'])
    with open(out_path) as f:
      content = f.read()
    # Each feature is written with its weight at 6 decimal places.
    self.assertEqual(content, 'foo\t0.012000\nbar\t0.238000\nbaz\t-0.123700')
149 |
--------------------------------------------------------------------------------
/scripts/tests/test_prepare_knbc.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tests the prepare KNBC script."""
15 |
16 | import os
17 | import sys
18 | import unittest
19 |
20 | # module hack
21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
22 | sys.path.insert(0, os.path.abspath(LIB_PATH))
23 |
24 | from scripts import prepare_knbc # noqa (module hack)
25 |
26 |
class TestBreakBeforeSequence(unittest.TestCase):
  """Tests the break_before_sequence function of the KNBC prepare script."""

  def test_standard(self) -> None:
    result = prepare_knbc.break_before_sequence(['abcdef', 'ghi'], 'de')
    self.assertListEqual(result, ['abc', 'def', 'ghi'])

  def test_sequence_on_top(self) -> None:
    # A sequence already at a chunk boundary adds no new break.
    result = prepare_knbc.break_before_sequence(['abcdef', 'ghi'], 'gh')
    self.assertListEqual(result, ['abcdef', 'ghi'])

  def test_multiple_hit(self) -> None:
    # Every occurrence of the sequence introduces a break.
    result = prepare_knbc.break_before_sequence(['abcabc', 'def'], 'bc')
    self.assertListEqual(result, ['a', 'bca', 'bc', 'def'])
43 |
44 |
class TestKNBCHTMLParser(unittest.TestCase):
  """Tests the KNBCHTMLParser against a small sample document.

  NOTE(review): the fixture below appears to have lost its HTML table
  markup in an export/extraction step; verify it still matches the KNBC
  morph-HTML layout that KNBCHTMLParser expects before relying on it.
  """
  example_html = '''



HA HB HC HD HE
文節区切り
abc
de
タグ区切り
fgh
ijkl
文節区切り
mn



'''

  def test_parse_phrase(self) -> None:
    # Phrase granularity splits only on 文節区切り (bunsetsu) rows.
    parser = prepare_knbc.KNBCHTMLParser('phrase')
    parser.feed(self.example_html)
    self.assertListEqual(parser.chunks, ['abcdefghijkl', 'mn'])

  def test_parse_tag(self) -> None:
    # Tag granularity splits on both 文節区切り and タグ区切り rows.
    parser = prepare_knbc.KNBCHTMLParser('tag')
    parser.feed(self.example_html)
    self.assertListEqual(parser.chunks, ['abcde', 'fghijkl', 'mn'])

  def test_parse_word(self) -> None:
    # Word granularity makes every word row its own chunk.
    parser = prepare_knbc.KNBCHTMLParser('word')
    parser.feed(self.example_html)
    self.assertListEqual(parser.chunks, ['abc', 'de', 'fgh', 'ijkl', 'mn'])
78 |
--------------------------------------------------------------------------------
/scripts/tests/test_translate_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tests the model translator script."""
15 |
16 | import os
17 | import sys
18 | import unittest
19 |
20 | # module hack
21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
22 | sys.path.insert(0, os.path.abspath(LIB_PATH))
23 |
24 | from scripts import translate_model # noqa (module hack)
25 |
26 |
class TestNormalize(unittest.TestCase):
  """Tests the normalize function of the model translator script."""

  def test_old_format_input(self) -> None:
    flat = {'a:x': 48, 'a:y': 21, 'b:x': 2, 'b:z': 89}
    nested = {'a': {'x': 48, 'y': 21}, 'b': {'x': 2, 'z': 89}}
    self.assertDictEqual(translate_model.normalize(flat), nested)

  def test_new_format_input(self) -> None:
    # An already-nested model passes through unchanged.
    nested = {'a': {'x': 48, 'y': 21}, 'b': {'x': 2, 'z': 89}}
    self.assertDictEqual(translate_model.normalize(nested), nested)

  def test_broken_input1(self) -> None:
    # Mixed flat and nested entries are rejected.
    with self.assertRaises(Exception) as cm:
      translate_model.normalize({'a:x': 23, 'b': {'x': 37, 'y': 18}})
    self.assertIn('Unsupported model format', str(cm.exception))

  def test_broken_input2(self) -> None:
    # Nesting deeper than two levels is rejected.
    with self.assertRaises(Exception) as cm:
      translate_model.normalize({'b': {'x': 37, 'y': {'z': 123}}})
    self.assertIn('Unsupported model format', str(cm.exception))
51 |
52 |
class TestTranslateICU(unittest.TestCase):
  """Tests the translate_icu function of the model translator script."""

  def test_standard(self) -> None:
    """Checks groups are emitted in sorted order as Keys/Values pairs."""
    model = {}
    # Inserted out of alphabetical order on purpose: the output must be
    # sorted by group name.
    model['b'] = {'x': 47, 'z': 13}
    model['a'] = {'x': 12, 'y': 88}
    expect = '''
jaml {
  aKeys {
    "x",
    "y",
  }
  aValues:intvector {
    12,
    88,
  }
  bKeys {
    "x",
    "z",
  }
  bValues:intvector {
    47,
    13,
  }
}
'''.strip()
    result = translate_model.translate_icu(model)
    self.assertEqual(result, expect)
81 |
--------------------------------------------------------------------------------
/scripts/translate_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Translates a model JSON file to another format, such as ICU Resource Bundle.
15 |
16 | Example usage:
17 |
18 | $ python translate_model.py --format=icu model.json > icurb.txt
19 |
20 | You can also use this script to update the model files older than v0.5.0 to make
21 | it work with the latest version.
22 |
23 | $ python translate_model.py --format=json old-model.json > new-model.json
24 | """
25 |
26 | import argparse
27 | import itertools
28 | import json
29 | import typing
30 |
31 | ArgList = typing.Optional[typing.List[str]]
32 |
33 |
def translate_icu(model: typing.Dict[str, typing.Dict[str, int]]) -> str:
  """Translates a model to the ICU Resource Bundle format.

  The output is intended to update the data in:
  https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/adaboost/jaml.txt

  Args:
    model: A model.
  Returns:
    A model string formatted in the ICU Resource Bundle format.
  """
  indent = '  '
  # Collect the output line by line and join once at the end; groups are
  # emitted in alphabetical order.
  lines = ['jaml {']
  for group_name, members in sorted(model.items()):
    lines.append(f'{indent}{group_name}Keys {{')
    lines.extend(f'{indent * 2}"{key}",' for key in members)
    lines.append(f'{indent}}}')
    lines.append(f'{indent}{group_name}Values:intvector {{')
    lines.extend(f'{indent * 2}{value},' for value in members.values())
    lines.append(f'{indent}}}')
  lines.append('}')
  return '\n'.join(lines)
58 |
59 |
def normalize(
    model: typing.Dict[str,
                       typing.Any]) -> typing.Dict[str, typing.Dict[str, int]]:
  """Updates a model to the latest format. Does nothing if it's updated already.

  The old (pre-v0.5.0) format is a flat mapping from 'group:key' strings to
  integer scores; the new format nests keys under their group name.

  Args:
    model: A model.
  Returns:
    An updated model.
  Raises:
    Exception: If the model is in neither the old nor the new format.
  """
  is_old_format = all(isinstance(v, int) for v in model.values())
  if is_old_format:
    output: typing.Dict[str, typing.Dict[str, int]] = {}
    sorted_items = sorted(model.items(), key=lambda x: x[0])
    # groupby requires its input sorted by the same key (the group prefix).
    groups = itertools.groupby(sorted_items, key=lambda x: x[0].split(':')[0])
    for group_name, items in groups:
      output[group_name] = {key.split(':')[-1]: score for key, score in items}
    return output
  # Validate the new format explicitly instead of with `assert`, which is
  # silently stripped when Python runs with the -O flag.
  try:
    scores_are_ints = all(
        isinstance(v, int)
        for groups in model.values()
        for v in groups.values())
  except AttributeError as e:
    # A group value is not a dict (e.g. a plain int mixed with nested groups).
    raise Exception('Unsupported model format:', e)
  if not scores_are_ints:
    raise Exception('Unsupported model format:', 'Scores should be integers')
  return model
89 |
90 |
def main() -> None:
  """CLI entry point: reads a model JSON file and prints it in the target format.

  Reads the file path and `--format` from the command line, normalizes the
  model to the latest format, and writes the result to stdout.
  """
  DEFAULT_FORMAT = 'json'
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'model', help='File path for the JSON format model file.', type=str)
  parser.add_argument(
      '--format',
      help=f'Target format (default: {DEFAULT_FORMAT})',
      type=str,
      default=DEFAULT_FORMAT,
      choices={DEFAULT_FORMAT, 'icu'})
  args = parser.parse_args()
  model_path: str = args.model
  # Renamed from `format`, which shadowed the builtin of the same name.
  target_format: str = args.format
  # Model files are JSON, which is UTF-8 by specification; be explicit so the
  # platform default encoding cannot break non-ASCII keys.
  with open(model_path, encoding='utf-8') as f:
    model = json.load(f)
  model = normalize(model)
  if target_format == 'json':
    print(json.dumps(model, ensure_ascii=False, separators=(',', ':')))
  elif target_format == 'icu':
    print(translate_icu(model))
  # argparse's `choices` guarantees no other value reaches this point, so the
  # previous dead `else: pass` branch was removed.
115 |
116 |
117 | if __name__ == '__main__':
118 | main()
119 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = budoux
3 | version = attr: budoux.__init__.__version__
4 | description = BudouX is the successor of Budou
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown
7 | license = Apache-2.0
8 | author = Shuhei Iitsuka
9 | author_email = tushuhei@google.com
10 | classifiers =
11 | Development Status :: 3 - Alpha
12 | Operating System :: OS Independent
13 | License :: OSI Approved :: Apache Software License
14 | Programming Language :: Python :: 3.9
15 | Programming Language :: Python :: 3.10
16 | Programming Language :: Python :: 3.11
17 | Programming Language :: Python :: 3.12
18 | Programming Language :: Python :: 3.13
19 |
20 | [options]
python_requires = >= 3.9
22 | packages = find:
23 | include_package_data = True
24 | test_suite = tests
25 | install_requires =
26 | importlib-resources
27 |
28 | [options.extras_require]
29 | dev =
30 | build
31 | flake8
32 | isort
33 | mypy==1.15.0
34 | pytest
35 | regex
36 | toml
37 | twine
38 | types-regex
39 | types-setuptools
40 | yapf
41 |
42 | jaxcpu =
43 | jax==0.5.2
44 |
45 | [options.entry_points]
46 | console_scripts =
47 | budoux = budoux.main:main
48 |
49 | [yapf]
50 | based_on_style = yapf
51 |
52 | [flake8]
53 | # E124: closing bracket does not match visual indentation
54 | # E126: over-indentation
55 | # E501: line too long
56 | # BLK100: black formattable
57 | ignore = E124,E126,E501,BLK100
58 | indent-size = 2
59 |
60 | [mypy]
61 | python_version = 3.10
62 | pretty = True
63 | strict = True
64 | allow_untyped_calls = True
65 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# All package metadata and options live in setup.cfg; this stub exists only
# so that legacy tooling can still invoke `python setup.py ...`.
from setuptools import setup

setup()
18 |
--------------------------------------------------------------------------------
/tests/in/1.in:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/in/2.in:
--------------------------------------------------------------------------------
1 | これはテストです。
2 |
--------------------------------------------------------------------------------
/tests/in/3.in:
--------------------------------------------------------------------------------
1 | これはテスト です。
2 |
--------------------------------------------------------------------------------
/tests/quality/ja.tsv:
--------------------------------------------------------------------------------
1 | # label sentence
2 | init 今日は▁とても▁良い▁天気です。
3 | init これ以上▁利用する▁場合は▁教えてください。
4 | init 食器は▁そのまま▁入れて▁大丈夫です。
5 | gh152 ダウンロード▁ありがとう▁ございます。
6 | gh152 ご利用▁ありがとう▁ございました。
7 | gh157 要点を▁まとめる▁必要が▁ある。
8 | gh160 目指すのは▁あらゆる▁人に▁便利な▁ソフトウェア
9 | gh160 商品が▁まもなく▁到着します。
10 | gh160 プロジェクトが▁ようやく▁日の▁目を▁見る。
11 | gh160 明け方に▁ようやく▁目覚めると、
12 | gh160 明け方▁ようやく▁目覚めると、
13 | gh160 これは▁たまたま▁見つけた▁宝物
14 | gh160 歩いていて▁たまたま▁目に▁入った▁光景
15 | gh216 あなたの▁意図した▁とおりに▁情報を▁伝える。
16 | gh220 あの▁イーハトーヴォの▁すきとおった▁風、▁夏でも▁底に▁冷たさを▁もつ▁青い▁そら、▁うつくしい▁森で▁飾られた▁モリーオ市、▁郊外の▁ぎらぎら▁ひかる▁草の▁波。
17 | gh387 購入された▁お客様のみ▁入れます。
18 | gh387 購入された▁お客様のみ▁入場できます。
19 | gh387 パワーのみ▁有効だ
20 | b320113958 小さな▁つぶや▁空気中の▁ちり
21 | b320113958 光が▁どんどん▁空▁いっぱいに▁広がる
22 | b320113958 太陽の▁位置が▁ちがうから
23 | b320113958 太陽が▁しずむころに▁帰る
24 | b320113958 多すぎると▁うまく▁いかない
25 | b320113958 世界の▁子どもの▁命や▁権利
26 | b320113958 「ふだん▁どおり」を▁保つ
27 | b320113958 おもちゃや▁遊びに▁使える
28 | b320113958 コントロールできない▁ほど▁感情移入してしまう
29 | b320113958 いつも▁甘えがちに▁なる
30 | b320113958 存在が▁浮かび▁上がった。
31 |
--------------------------------------------------------------------------------
/tests/test_html_processor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | "Tests the HTML Processor."
15 |
16 | import os
17 | import sys
18 | import unittest
19 |
20 | # module hack
21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
22 | sys.path.insert(0, os.path.abspath(LIB_PATH))
23 |
24 | from budoux import html_processor # noqa (module hack)
25 |
26 |
class TestTextContentExtractor(unittest.TestCase):
  """Tests extracting plain text content from an HTML string."""

  def test_output(self) -> None:
    # NOTE(review): this fixture literal is split across physical lines,
    # which suggests the HTML markup was stripped from this copy of the
    # file; confirm against the repository version before editing.
    input = 'Hello , World
'
    expected = 'Hello, World'
    extractor = html_processor.TextContentExtractor()
    extractor.feed(input)
    self.assertEqual(
        extractor.output, expected,
        'Text content should be extacted from the given HTML string.')
37 |
38 |
class TestHTMLChunkResolver(unittest.TestCase):
  """Tests the resolver that re-inserts break opportunities into HTML."""

  # NOTE(review): several fixture literals below are split across physical
  # lines, which suggests the HTML markup was stripped from this copy of the
  # file; confirm against the repository version before editing.

  def test_output(self) -> None:
    input = 'abcde f
'
    expected = 'abcde f
'
    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '')
    resolver.feed(input)
    self.assertEqual(resolver.output, expected,
                     'WBR tags should be inserted as specified by chunks.')

  def test_unpaired(self) -> None:
    input = 'abcdef
'
    expected = 'abcdef
'
    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '')
    resolver.feed(input)
    self.assertEqual(resolver.output, expected,
                     'Unpaired close tag should not cause errors.')

  def test_nobr(self) -> None:
    input = 'abcde f
'
    expected = 'abcde f
'
    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '')
    resolver.feed(input)
    self.assertEqual(resolver.output, expected,
                     'WBR tags should not be inserted if in NOBR.')

  def test_after_nobr(self) -> None:
    input = 'abxy abcdef
'
    expected = 'abxy abcdef
'
    resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '')
    resolver.feed(input)
    self.assertEqual(resolver.output, expected,
                     'WBR tags should be inserted if after NOBR.')

  def test_img_in_nobr(self) -> None:
    input = 'abx y abcdef
'
    expected = 'abx y abcdef
'
    resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '')
    resolver.feed(input)
    self.assertEqual(resolver.output, expected,
                     'IMG should not affect surrounding NOBR.')
80 |
81 |
class TestResolve(unittest.TestCase):
  """Tests `html_processor.resolve`, which applies phrase chunks to HTML."""

  # NOTE(review): the expected strings below look like their HTML tags were
  # stripped in this copy of the file (e.g. a wrapping element around the
  # output); confirm against the repository version before editing.

  def test_with_simple_text_input(self) -> None:
    chunks = ['abc', 'def']
    html = 'abcdef'
    result = html_processor.resolve(chunks, html)
    expected = 'abc\u200bdef '
    self.assertEqual(result, expected)

  def test_with_standard_html_input(self) -> None:
    chunks = ['abc', 'def']
    html = 'abcd ef'
    result = html_processor.resolve(chunks, html)
    expected = 'abc\u200bd ef '
    self.assertEqual(result, expected)

  def test_with_nodes_to_skip(self) -> None:
    chunks = ['abc', 'def', 'ghi']
    html = "abcde fghi"
    result = html_processor.resolve(chunks, html)
    expected = 'abcde f\u200bghi '
    self.assertEqual(result, expected)

  def test_with_break_before_skip(self) -> None:
    chunks = ['abc', 'def', 'ghi', 'jkl']
    html = "abcdefghi jkl"
    result = html_processor.resolve(chunks, html)
    expected = 'abc\u200bdefghi \u200bjkl '
    self.assertEqual(result, expected)

  def test_with_nothing_to_split(self) -> None:
    chunks = ['abcdef']
    html = 'abcdef'
    result = html_processor.resolve(chunks, html)
    expected = 'abcdef '
    self.assertEqual(result, expected)
118 |
--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tests the BudouX CLI."""
15 |
16 | import io
17 | import sys
18 | import unittest
19 | from os.path import abspath, dirname, join
20 |
21 | # module hack
22 | LIB_PATH = join(dirname(__file__), '..')
23 | sys.path.insert(0, abspath(LIB_PATH))
24 |
25 | from budoux import main # noqa (module hack)
26 |
# Force UTF-8 on the standard streams so the Japanese/Chinese test strings
# round-trip regardless of the platform's default encoding.
if isinstance(sys.stdin, io.TextIOWrapper):
  sys.stdin.reconfigure(encoding='utf-8')

if isinstance(sys.stdout, io.TextIOWrapper):
  sys.stdout.reconfigure(encoding='utf-8')
32 |
33 |
class TestCommonOption(unittest.TestCase):
  """Tests CLI flags shared across modes (-v, -h, -V)."""

  def _assert_exit_code(self, cmdargs: 'list[str]', expected: int) -> None:
    # parse_args terminates via SystemExit; capture it and check the code.
    with self.assertRaises(SystemExit) as ctx:
      main.parse_args(cmdargs)
    self.assertEqual(ctx.exception.code, expected)

  def test_cmdargs_invalid_option(self) -> None:
    self._assert_exit_code(['-v'], 2)

  def test_cmdargs_help(self) -> None:
    self._assert_exit_code(['-h'], 0)

  def test_cmdargs_version(self) -> None:
    self._assert_exit_code(['-V'], 0)
56 |
57 |
class TestModelOption(unittest.TestCase):
  """Tests the model selection options (-m and -l)."""

  def _assert_usage_error(self, cmdargs: 'list[str]') -> None:
    # Invalid model/language arguments exit with the usage-error code 2.
    with self.assertRaises(SystemExit) as ctx:
      main.parse_args(cmdargs)
    self.assertEqual(ctx.exception.code, 2)

  def test_cmdargs_invalid_json(self) -> None:
    self._assert_usage_error(['-m', '404.json'])

  def test_cmdargs_invalid_lang_1(self) -> None:
    self._assert_usage_error(['-l', 'aa'])

  def test_cmdargs_invalid_lang_2(self) -> None:
    self._assert_usage_error(['-l', 'ja-abc'])

  def test_cmdargs_lang_ja(self) -> None:
    result = main._main(['-l', 'ja', '今日は良い天気ですね。'])
    self.assertEqual(result, '今日は\n良い\n天気ですね。')

  def test_cmdargs_lang_zh_hans(self) -> None:
    result = main._main(['-l', 'zh-hans', '今天天气晴朗。'])
    self.assertEqual(result, '今天\n天气\n晴朗。')
92 |
93 |
class TestTextArguments(unittest.TestCase):
  """Tests passing input text to the CLI as positional arguments."""

  def test_cmdargs_single_text(self) -> None:
    cmdargs = ['これはテストです。']
    output = main._main(cmdargs)

    self.assertEqual(output, "これは\nテストです。")

  def test_cmdargs_single_multiline_text(self) -> None:
    cmdargs = ["これはテストです。\n今日は晴天です。"]
    output = main._main(cmdargs)

    # '---' is the default delimiter between input lines.
    self.assertEqual(output, "これは\nテストです。\n---\n今日は\n晴天です。")

  def test_cmdargs_single_multiline_text_with_delimiter(self) -> None:
    cmdargs = ["これはテストです。\n今日は晴天です。", "-d", "@"]
    output = main._main(cmdargs)

    self.assertEqual(output, "これは\nテストです。\n@\n今日は\n晴天です。")

  def test_cmdargs_single_multiline_text_with_empty_delimiter(self) -> None:
    cmdargs = ["これはテストです。\n今日は晴天です。", "-d", ""]
    output = main._main(cmdargs)

    self.assertEqual(output, "これは\nテストです。\n\n今日は\n晴天です。")

  def test_cmdargs_multi_text(self) -> None:
    # Multiple positional text arguments are rejected with exit code 2.
    cmdargs = ['これはテストです。', '今日は晴天です。']
    with self.assertRaises(SystemExit) as cm:
      main.main(cmdargs)

    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_single_html(self) -> None:
    # NOTE(review): the expected HTML below appears to have lost its markup
    # in this copy of the file; confirm against the repository version.
    cmdargs = ['-H', '今日はとても天気 です。']
    output = main._main(cmdargs)

    self.assertEqual(
        output, ''
        '今日は\u200bとても\u200b天気 です。 ')

  def test_cmdargs_multi_html(self) -> None:
    # Multiple HTML arguments are rejected with exit code 2.
    cmdargs = ['-H', '今日はとても天気 です。', 'これはテスト です。']
    with self.assertRaises(SystemExit) as cm:
      main._main(cmdargs)

    self.assertEqual(cm.exception.code, 2)
141 |
142 |
class TestStdin(unittest.TestCase):
  """Tests feeding input to the CLI via standard input."""

  def test_cmdargs_blank_stdin(self) -> None:
    with open(
        join(abspath(dirname(__file__)), "in/1.in"),
        "r",
        encoding=sys.getdefaultencoding()) as f:
      # Point sys.stdin at the fixture file for the duration of the call.
      sys.stdin = f
      output = main._main([])

    self.assertEqual(output, "")

  def test_cmdargs_text_stdin(self) -> None:
    with open(
        join(abspath(dirname(__file__)), "in/2.in"),
        "r",
        encoding=sys.getdefaultencoding()) as f:
      sys.stdin = f
      output = main._main([])

    self.assertEqual(output, "これは\nテストです。")

  def test_cmdargs_html_stdin(self) -> None:
    # NOTE(review): the expected HTML below appears to have lost its markup
    # in this copy of the file; confirm against the repository version.
    with open(
        join(abspath(dirname(__file__)), "in/3.in"),
        "r",
        encoding=sys.getdefaultencoding()) as f:
      sys.stdin = f
      output = main._main(["-H"])

    self.assertEqual(
        output, ''
        'これは\u200bテスト です。\u200b\n'
        ' ')
177 |
178 |
# Allow running this test module directly: `python tests/test_main.py`.
if __name__ == '__main__':
  unittest.main()
181 |
--------------------------------------------------------------------------------
/tests/test_parser.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tests the BudouX parser."""
15 |
16 | import os
17 | import sys
18 | import unittest
19 |
20 | # module hack
21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
22 | sys.path.insert(0, os.path.abspath(LIB_PATH))
23 |
24 | from budoux import parser # noqa (module hack)
25 |
26 |
class TestParser(unittest.TestCase):
  """Tests the core BudouX parser with hand-crafted toy models."""

  # A toy sentence shared by the feature-based separation tests below.
  TEST_SENTENCE = 'abcdeabcd'

  def test_parse(self) -> None:
    p = parser.Parser({
        'UW4': {
            'a': 10000
        },  # means "should separate right before 'a'".
    })
    chunks = p.parse(TestParser.TEST_SENTENCE)
    self.assertListEqual(chunks, ['abcde', 'abcd'],
                         'Should separate if a strong feature item supports.')

    p = parser.Parser({
        'UW4': {
            'b': 10000
        },  # means "should separate right before 'b'".
    })
    chunks = p.parse(TestParser.TEST_SENTENCE)
    self.assertListEqual(
        chunks, ['a', 'bcdea', 'bcd'],
        'Should separate even if it makes the first character a sole phrase.')

    p = parser.Parser({})
    chunks = p.parse('')
    self.assertListEqual(chunks, [],
                         'Should return a blank list when the input is blank.')

  def test_translate_html_string(self) -> None:
    # NOTE(review): the HTML fixtures below appear to have had their markup
    # stripped in this copy of the file (some literals are split across
    # physical lines); confirm against the repository version before editing.
    p = parser.Parser({
        'UW4': {
            'a': 10000
        },  # means "should separate right before 'a'".
    })

    input_html = 'xyzabcd'
    expected_html = (
        ''
        'xyz\u200babcd ')
    output_html = p.translate_html_string(input_html)
    self.assertEqual(
        output_html, expected_html,
        'Should output a html string with a SPAN parent with proper style attributes.'
    )

    input_html = 'xyzxyzabc'
    # TODO: Because the content for skip elements are included, this test tries
    # to break before "alert". We may want to distinguish "skip from the
    # content" and "skip breaking" in future.
    expected_html = (
        ''
        'xyz\u200bxyz\u200babc ')
    output_html = p.translate_html_string(input_html)
    self.assertEqual(output_html, expected_html,
                     'Should pass script tags as is.')

    input_html = 'xyzabc
abc'
    expected_html = (
        ''
        'xyz\u200babc
\u200babc ')
    output_html = p.translate_html_string(input_html)
    self.assertEqual(output_html, expected_html,
                     'Should skip some specific tags.')

    input_html = 'xyzabc abc'
    expected_html = (
        ''
        'xyz\u200babc \u200babc ')
    output_html = p.translate_html_string(input_html)
    self.assertEqual(output_html, expected_html,
                     'Should not ruin attributes of child elements.')

    input_html = 'xyza🇯🇵🇵🇹abc'
    expected_html = (
        ''
        'xyz\u200ba🇯🇵🇵🇹\u200babc ')
    output_html = p.translate_html_string(input_html)
    self.assertEqual(output_html, expected_html, 'Should work with emojis.')
106 |
class TestDefaultParser(unittest.TestCase):
  """Smoke tests for the parsers backed by the bundled default models."""

  def test_load_default_japanese_parser(self) -> None:
    expected_phrases = [
        'Google の',
        '使命は、',
        '世界中の',
        '情報を',
        '整理し、',
        '世界中の',
        '人が',
        'アクセスできて',
        '使えるように',
        'する',
        'ことです。',
    ]
    p_ja = parser.load_default_japanese_parser()
    self.assertListEqual(
        p_ja.parse('Google の使命は、世界中の情報を整理し、世界中の人がアクセスできて使えるようにすることです。'),
        expected_phrases)

  def test_load_default_simplified_chinese_parser(self) -> None:
    expected_phrases = [
        '我们',
        '的',
        '使命',
        '是',
        '整合',
        '全球',
        '信息,',
        '供',
        '大众',
        '使用,',
        '让',
        '人',
        '人',
        '受益。',
    ]
    p_hans = parser.load_default_simplified_chinese_parser()
    self.assertListEqual(
        p_hans.parse('我们的使命是整合全球信息,供大众使用,让人人受益。'), expected_phrases)

  def test_load_default_traditional_chinese_parser(self) -> None:
    expected_phrases = [
        '我們',
        '的',
        '使命',
        '是',
        '匯整',
        '全球',
        '資訊,',
        '供',
        '大眾',
        '使用,',
        '使',
        '人',
        '人',
        '受惠。',
    ]
    p_hant = parser.load_default_traditional_chinese_parser()
    self.assertListEqual(
        p_hant.parse('我們的使命是匯整全球資訊,供大眾使用,使人人受惠。'), expected_phrases)
165 |
166 |
# Allow running this test module directly: `python tests/test_parser.py`.
if __name__ == '__main__':
  unittest.main()
169 |
--------------------------------------------------------------------------------
/tests/test_quality.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Quality regression test."""
15 |
16 | import os
17 | import sys
18 | import unittest
19 |
20 | # module hack
21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
22 | sys.path.insert(0, os.path.abspath(LIB_PATH))
23 |
24 | from budoux import load_default_japanese_parser, utils # noqa (module hack)
25 |
26 |
class TestQuality(unittest.TestCase):
  """Checks that known-good Japanese segmentations do not regress."""

  def test_ja(self) -> None:
    # Each non-comment row in ja.tsv is `label<TAB>sentence`, where the
    # sentence carries the expected break markers (utils.SEP).
    segmenter = load_default_japanese_parser()
    data_path = os.path.join(os.path.dirname(__file__), 'quality', 'ja.tsv')
    with open(data_path, 'r', encoding='utf-8') as f:
      rows = [line.split('\t') for line in f if line[0] != '#']
    expected_sentences = [row[1].strip() for row in rows if len(row) > 1]
    failures = []
    for expected in expected_sentences:
      # Strip the markers, re-segment, and compare against the original.
      plain = expected.replace(utils.SEP, '')
      actual = utils.SEP.join(segmenter.parse(plain))
      if actual != expected:
        failures.append((expected, actual))
    self.assertEqual(
        len(failures), 0, 'Failing sentences:\n{}'.format('\n'.join(
            f'expected:{exp}\tactual:{act}' for exp, act in failures)))
43 |
--------------------------------------------------------------------------------