├── .github ├── dependabot.yml └── workflows │ ├── build-demo.yml │ ├── codeql.yml │ ├── dependency-review.yml │ ├── java-unittest.yml │ ├── nodejs-unittest.yml │ ├── py-unittest.yml │ ├── scorecard.yml │ └── style-check.yml ├── .gitignore ├── .markdownlint.yaml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── budoux ├── __init__.py ├── html_processor.py ├── main.py ├── models │ ├── ja.json │ ├── ja_knbc.json │ ├── th.json │ ├── zh-hans.json │ └── zh-hant.json ├── parser.py ├── py.typed ├── skip_nodes.json └── utils.py ├── bump_version.py ├── data └── finetuning │ └── ja │ ├── train.txt │ └── val.txt ├── demo ├── package-lock.json ├── package.json ├── src │ ├── app.ts │ └── worker.ts ├── static │ └── index.html └── tsconfig.json ├── example.png ├── java ├── .gitignore ├── README.md ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── google │ │ └── budoux │ │ ├── HTMLProcessor.java │ │ └── Parser.java │ └── test │ └── java │ └── com │ └── google │ └── budoux │ ├── HTMLProcessorTest.java │ └── ParserTest.java ├── javascript ├── .npmignore ├── .prettierrc.json ├── README.md ├── bin │ └── budoux.js ├── eslint.config.mjs ├── karma.conf.js ├── package-lock.json ├── package.json ├── scripts │ ├── check-cli-version.js │ └── copy-data.js ├── src │ ├── cli.ts │ ├── dom-browser.ts │ ├── dom.ts │ ├── html_processor.ts │ ├── index.ts │ ├── parser.ts │ ├── tests │ │ ├── index.browser.ts │ │ ├── index.node.ts │ │ ├── models │ │ │ └── separate_right_before_a.json │ │ ├── test_cli.ts │ │ ├── test_html_processor.ts │ │ ├── test_parser.ts │ │ ├── test_webcomponents.ts │ │ ├── testutils-browser.ts │ │ └── testutils.ts │ └── webcomponents │ │ ├── budoux-base.ts │ │ ├── budoux-ja.ts │ │ ├── budoux-th.ts │ │ ├── budoux-zh-hans.ts │ │ └── budoux-zh-hant.ts └── tsconfig.json ├── pyproject.toml ├── scripts ├── README.md ├── __init__.py ├── build_model.py ├── encode_data.py ├── finetune.py ├── prepare_knbc.py ├── prepare_wisesight.py ├── tests │ ├── 
test_build_model.py │ ├── test_encode_data.py │ ├── test_finetune.py │ ├── test_prepare_knbc.py │ ├── test_train.py │ └── test_translate_model.py ├── train.py └── translate_model.py ├── setup.cfg ├── setup.py └── tests ├── in ├── 1.in ├── 2.in └── 3.in ├── quality └── ja.tsv ├── test_html_processor.py ├── test_main.py ├── test_parser.py └── test_quality.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: pip 9 | directory: / 10 | schedule: 11 | interval: daily 12 | 13 | - package-ecosystem: github-actions 14 | directory: / 15 | schedule: 16 | interval: daily 17 | 18 | - package-ecosystem: npm 19 | directory: /demo 20 | schedule: 21 | interval: daily 22 | 23 | - package-ecosystem: npm 24 | directory: /javascript 25 | schedule: 26 | interval: daily 27 | 28 | - package-ecosystem: maven 29 | directory: /java 30 | schedule: 31 | interval: daily 32 | -------------------------------------------------------------------------------- /.github/workflows/build-demo.yml: -------------------------------------------------------------------------------- 1 | name: Build Demo 2 | on: 3 | push: 4 | branches: [ "main" ] 5 | permissions: 6 | contents: read 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Harden Runner 13 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 14 | with: 15 | egress-policy: audit 16 | 17 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 18 | - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 19 | 
with: 20 | node-version: '16' 21 | - run: npm install 22 | working-directory: ./javascript 23 | - run: npm install 24 | working-directory: ./demo 25 | - run: npm run build 26 | working-directory: ./demo 27 | - name: Upload static files as artifact 28 | id: deployment 29 | uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3.0.1 30 | with: 31 | path: ./demo/static/ 32 | deploy: 33 | needs: build 34 | permissions: 35 | pages: write 36 | id-token: write 37 | environment: 38 | name: github-pages 39 | url: ${{ steps.deployment.outputs.page_url }} 40 | runs-on: ubuntu-latest 41 | steps: 42 | - name: Deploy to GitHub Pages 43 | id: deployment 44 | uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5 45 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '42 20 * * 3' 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | analyze: 28 | name: Analyze 29 | runs-on: ubuntu-latest 30 | permissions: 31 | actions: read 32 | contents: read 33 | security-events: write 34 | 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | language: [ 'java', 'javascript', 'python' ] 39 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 40 | # Use only 'java' to analyze code written in Java, Kotlin or both 41 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both 42 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 43 | 44 | steps: 45 | - name: Harden Runner 46 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 47 | with: 48 | egress-policy: audit 49 | 50 | - name: Checkout repository 51 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 52 | 53 | # Initializes the CodeQL tools for scanning. 54 | - name: Initialize CodeQL 55 | uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v2.20.1 56 | with: 57 | languages: ${{ matrix.language }} 58 | # If you wish to specify custom queries, you can do so here or in a config file. 59 | # By default, queries listed here will override any specified in a config file. 60 | # Prefix the list here with "+" to use these queries and those in the config file. 
61 | 62 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 63 | # queries: security-extended,security-and-quality 64 | 65 | 66 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 67 | # If this step fails, then you should remove it and run the build manually (see below) 68 | - name: Autobuild 69 | uses: github/codeql-action/autobuild@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v2.20.1 70 | 71 | # ℹ️ Command-line programs to run using the OS shell. 72 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 73 | 74 | # If the Autobuild fails above, remove it and uncomment the following three lines. 75 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 76 | 77 | # - run: | 78 | # echo "Run, Build Application using script" 79 | # ./location_of_script_within_repo/buildscript.sh 80 | 81 | - name: Perform CodeQL Analysis 82 | uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v2.20.1 83 | with: 84 | category: "/language:${{matrix.language}}" 85 | -------------------------------------------------------------------------------- /.github/workflows/dependency-review.yml: -------------------------------------------------------------------------------- 1 | # Dependency Review Action 2 | # 3 | # This Action will scan dependency manifest files that change as part of a Pull Request, 4 | # surfacing known-vulnerable versions of the packages declared or updated in the PR. 5 | # Once installed, if the workflow run is marked as required, 6 | # PRs introducing known-vulnerable packages will be blocked from merging. 
7 | # 8 | # Source repository: https://github.com/actions/dependency-review-action 9 | name: 'Dependency Review' 10 | on: [pull_request] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | dependency-review: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Harden Runner 20 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 21 | with: 22 | egress-policy: audit 23 | 24 | - name: 'Checkout Repository' 25 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 26 | - name: 'Dependency Review' 27 | uses: actions/dependency-review-action@3b139cfc5fae8b618d3eae3675e383bb1769c019 # v4.5.0 28 | -------------------------------------------------------------------------------- /.github/workflows/java-unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest for Java 2 | on: 3 | push: 4 | paths: 5 | - 'java/**' 6 | pull_request: 7 | paths: 8 | - 'java/**' 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | java-unittest: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | steps: 20 | - name: Harden Runner 21 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 22 | with: 23 | egress-policy: audit 24 | 25 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 26 | - name: Set up JDK 17 27 | uses: actions/setup-java@7a6d8a8234af8eb26422e24e3006232cccaa061b # v4.6.0 28 | with: 29 | java-version: '17' 30 | distribution: 'temurin' 31 | - name: Build with Maven 32 | run: mvn --batch-mode --update-snapshots -f ./java/pom.xml package 33 | -------------------------------------------------------------------------------- /.github/workflows/nodejs-unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest for NodeJS 2 | on: 3 | push: 4 | paths: 5 | - 
'javascript/**' 6 | pull_request: 7 | paths: 8 | - 'javascript/**' 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | nodejs-unittest: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | node-version: [18, 20] 20 | 21 | steps: 22 | - name: Harden Runner 23 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 24 | with: 25 | egress-policy: audit 26 | 27 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 28 | - name: Setup Node ${{ matrix.node-version }} 29 | uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 30 | with: 31 | node-version: ${{ matrix.node-version }} 32 | - name: Install Dependencies 33 | run: npm install 34 | working-directory: ./javascript 35 | - name: Create symlink 36 | run: npm link 37 | working-directory: ./javascript 38 | - name: Build package 39 | run: npm run build --if-present 40 | working-directory: ./javascript 41 | - name: Run testcases 42 | run: npm test 43 | working-directory: ./javascript 44 | -------------------------------------------------------------------------------- /.github/workflows/py-unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest for Python 2 | on: 3 | push: 4 | paths-ignore: 5 | - 'javascript/**' 6 | - 'java/**' 7 | pull_request: 8 | paths-ignore: 9 | - 'javascript/**' 10 | - 'java/**' 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | python-unittest: 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ubuntu-latest, macos-latest, windows-latest] 21 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 22 | steps: 23 | - name: Harden Runner 24 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 25 | with: 26 | egress-policy: audit 27 | 28 | - uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 29 | - name: Setup python ${{ matrix.python-version }} 30 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | - name: Install requirements 34 | run: | 35 | python -m pip install --upgrade pip 36 | python -m pip install ".[dev]" 37 | - name: Run unittest 38 | run: pytest ./tests 39 | - name: Install Jax 40 | if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.9' }} 41 | run: pip install ".[jaxcpu]" 42 | - name: Run unittest with Jax 43 | if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.9' }} 44 | run: pytest ./scripts/tests 45 | -------------------------------------------------------------------------------- /.github/workflows/scorecard.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. They are provided 2 | # by a third-party and are governed by separate terms of service, privacy 3 | # policy, and support documentation. 4 | 5 | name: Scorecard supply-chain security 6 | on: 7 | # For Branch-Protection check. Only the default branch is supported. See 8 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection 9 | branch_protection_rule: 10 | # To guarantee Maintained check is occasionally updated. See 11 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained 12 | schedule: 13 | - cron: '37 11 * * 2' 14 | push: 15 | branches: [ "main" ] 16 | 17 | # Declare default permissions as read only. 18 | permissions: read-all 19 | 20 | jobs: 21 | analysis: 22 | name: Scorecard analysis 23 | runs-on: ubuntu-latest 24 | permissions: 25 | # Needed to upload the results to code-scanning dashboard. 26 | security-events: write 27 | # Needed to publish results and get a badge (see publish_results below). 
28 | id-token: write 29 | # Uncomment the permissions below if installing in a private repository. 30 | # contents: read 31 | # actions: read 32 | 33 | steps: 34 | - name: Harden Runner 35 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 36 | with: 37 | egress-policy: audit 38 | 39 | - name: "Checkout code" 40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 41 | with: 42 | persist-credentials: false 43 | 44 | - name: "Run analysis" 45 | uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 46 | with: 47 | results_file: results.sarif 48 | results_format: sarif 49 | # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: 50 | # - you want to enable the Branch-Protection check on a *public* repository, or 51 | # - you are installing Scorecard on a *private* repository 52 | # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. 53 | # repo_token: ${{ secrets.SCORECARD_TOKEN }} 54 | 55 | # Public repositories: 56 | # - Publish results to OpenSSF REST API for easy access by consumers 57 | # - Allows the repository to include the Scorecard badge. 58 | # - See https://github.com/ossf/scorecard-action#publishing-results. 59 | # For private repositories: 60 | # - `publish_results` will always be set to `false`, regardless 61 | # of the value entered here. 62 | publish_results: true 63 | 64 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF 65 | # format to the repository Actions tab. 66 | - name: "Upload artifact" 67 | uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 68 | with: 69 | name: SARIF file 70 | path: results.sarif 71 | retention-days: 5 72 | 73 | # Upload the results to GitHub's code scanning dashboard. 
74 | - name: "Upload to code-scanning" 75 | uses: github/codeql-action/upload-sarif@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 76 | with: 77 | sarif_file: results.sarif 78 | -------------------------------------------------------------------------------- /.github/workflows/style-check.yml: -------------------------------------------------------------------------------- 1 | name: Style Check 2 | on: [push, pull_request] 3 | permissions: 4 | contents: read 5 | jobs: 6 | python-style-check: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Harden Runner 10 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 11 | with: 12 | egress-policy: audit 13 | 14 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 15 | - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 16 | with: 17 | python-version: '3.10' 18 | - name: Install dependencies 19 | run: | 20 | pip install --upgrade pip 21 | pip install ".[dev]" 22 | pip install ".[jaxcpu]" 23 | - name: Run isort 24 | run: | 25 | isort --diff --check . 
26 | - name: Run yapf 27 | run: | 28 | yapf --diff --recursive budoux tests scripts 29 | - name: Run mypy 30 | run: | 31 | mypy budoux tests scripts 32 | - name: Run flake8 33 | if: ${{ always() }} 34 | uses: suo/flake8-github-action@3e87882219642e01aa8a6bbd03b4b0adb8542c2a 35 | with: 36 | checkName: python-style-check 37 | env: 38 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 39 | typescript-style-check: 40 | runs-on: ubuntu-latest 41 | steps: 42 | - name: Harden Runner 43 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 44 | with: 45 | egress-policy: audit 46 | 47 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 48 | - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 49 | with: 50 | node-version: '20' 51 | - run: npm install 52 | working-directory: ./javascript 53 | - run: npm run lint 54 | working-directory: ./javascript 55 | java-style-check: 56 | runs-on: ubuntu-latest 57 | steps: 58 | - name: Harden Runner 59 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 60 | with: 61 | egress-policy: audit 62 | 63 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 64 | - uses: actions/setup-java@7a6d8a8234af8eb26422e24e3006232cccaa061b # v4.6.0 65 | with: 66 | java-version: '17' 67 | distribution: 'temurin' 68 | - name: Google Java Format 69 | uses: axel-op/googlejavaformat-action@dbff853fb823671ec5781365233bf86543b13215 70 | with: 71 | args: "--replace" 72 | skip-commit: true 73 | - name: Print diffs 74 | run: git --no-pager diff --exit-code 75 | markdown-style-check: 76 | runs-on: ubuntu-latest 77 | steps: 78 | - name: Harden Runner 79 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 80 | with: 81 | egress-policy: audit 82 | 83 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 84 | - name: markdownlint 85 | uses: 
nosborn/github-action-markdown-cli@9b5e871c11cc0649c5ac2526af22e23525fa344d 86 | with: 87 | files: '**/*.md' 88 | config_file: .markdownlint.yaml 89 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .venv 3 | /dist 4 | __pycache__ 5 | *.pyc 6 | *.log 7 | *.egg-info 8 | *.coverage 9 | cov.xml 10 | 11 | # Python related files 12 | build/ 13 | 14 | # JavaScript related files 15 | node_modules 16 | demo/static/app.js 17 | demo/static/worker.js 18 | javascript/bundle 19 | javascript/dist 20 | javascript/module 21 | javascript/src/data 22 | 23 | # Generated files by scripts 24 | source.txt 25 | encoded_data.txt 26 | weights.txt 27 | 28 | .vscode/ 29 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | default: true 15 | 16 | MD013: 17 | code_blocks: false 18 | MD010: false 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. 
There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement (CLA). You (or your employer) retain the copyright to your 10 | contribution; this simply gives us permission to use and redistribute your 11 | contributions as part of the project. Head over to 12 | to see your current agreements on file or 13 | to sign a new one. 14 | 15 | You generally only need to submit a CLA once, so if you've already submitted one 16 | (even if it was for a different project), you probably don't need to do it 17 | again. 18 | 19 | ## Code Reviews 20 | 21 | All submissions, including submissions by project members, require review. We 22 | use GitHub pull requests for this purpose. Consult 23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 24 | information on using pull requests. 25 | 26 | ## Community Guidelines 27 | 28 | This project follows 29 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include budoux/unicode_blocks.json 2 | include budoux/skip_nodes.json 3 | include budoux/py.typed 4 | recursive-include budoux/models *.json 5 | -------------------------------------------------------------------------------- /budoux/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """BudouX module.""" 15 | 16 | from . import parser 17 | 18 | __version__ = "0.7.0" 19 | 20 | Parser = parser.Parser 21 | load_default_japanese_parser = parser.load_default_japanese_parser 22 | load_default_simplified_chinese_parser = parser.load_default_simplified_chinese_parser 23 | load_default_traditional_chinese_parser = parser.load_default_traditional_chinese_parser 24 | load_default_thai_parser = parser.load_default_thai_parser 25 | -------------------------------------------------------------------------------- /budoux/html_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """HTML processor.""" 15 | 16 | import json 17 | import os 18 | import queue 19 | import typing 20 | from html.parser import HTMLParser 21 | 22 | from .utils import SEP 23 | 24 | HTMLAttr = typing.List[typing.Tuple[str, typing.Union[str, None]]] 25 | PARENT_CSS_STYLE = 'word-break: keep-all; overflow-wrap: anywhere;' 26 | with open( 27 | os.path.join(os.path.dirname(__file__), 'skip_nodes.json'), 28 | encoding='utf-8') as f: 29 | SKIP_NODES: typing.Set[str] = set(json.load(f)) 30 | 31 | 32 | class ElementState(object): 33 | """Represents the state for an element. 34 | 35 | Attributes: 36 | tag (str): The tag name. 37 | to_skip (bool): Whether the content should be skipped or not. 38 | """ 39 | 40 | def __init__(self, tag: str, to_skip: bool) -> None: 41 | self.tag = tag 42 | self.to_skip = to_skip 43 | 44 | 45 | class TextContentExtractor(HTMLParser): 46 | """An HTML parser to extract text content. 47 | 48 | Attributes: 49 | output (str): Accumulated text content. 50 | """ 51 | output = '' 52 | 53 | def handle_data(self, data: str) -> None: 54 | self.output += data 55 | 56 | 57 | class HTMLChunkResolver(HTMLParser): 58 | """An HTML parser to resolve the given HTML string and semantic chunks. 59 | 60 | Attributes: 61 | output (str): The HTML string to output. 62 | """ 63 | output = '' 64 | 65 | def __init__(self, chunks: typing.List[str], separator: str): 66 | """Initializes the parser. 67 | 68 | Args: 69 | chunks (List[str]): The chunks to resolve. 70 | separator (str): The separator string. 
71 | """ 72 | HTMLParser.__init__(self) 73 | self.chunks_joined = SEP.join(chunks) 74 | self.separator = separator 75 | self.to_skip = False 76 | self.scan_index = 0 77 | self.element_stack: queue.LifoQueue[ElementState] = queue.LifoQueue() 78 | 79 | def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None: 80 | attr_pairs = [] 81 | for attr in attrs: 82 | if attr[1] is None: 83 | attr_pairs.append(' ' + attr[0]) 84 | else: 85 | attr_pairs.append(' %s="%s"' % (attr[0], attr[1])) 86 | encoded_attrs = ''.join(attr_pairs) 87 | self.element_stack.put(ElementState(tag, self.to_skip)) 88 | if tag.upper() in SKIP_NODES: 89 | if not self.to_skip and self.chunks_joined[self.scan_index] == SEP: 90 | self.scan_index += 1 91 | self.output += self.separator 92 | self.to_skip = True 93 | self.output += '<%s%s>' % (tag, encoded_attrs) 94 | 95 | def handle_endtag(self, tag: str) -> None: 96 | self.output += '' % (tag) 97 | while not self.element_stack.empty(): 98 | state = self.element_stack.get_nowait() 99 | if state.tag == tag: 100 | self.to_skip = state.to_skip 101 | break 102 | # If the close tag doesn't match the open tag, remove it and keep looking. 103 | # This means that close tags close their corresponding open tags. 104 | # e.g., `abcdef` or `

abcdef

` are both valid 105 | # HTML as per the HTML spec. 106 | # Note the HTML "adoption agency algorithm" isn't fully supported. 107 | # See https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser 108 | 109 | def handle_data(self, data: str) -> None: 110 | for char in data: 111 | if not char == self.chunks_joined[self.scan_index]: 112 | if not self.to_skip: 113 | self.output += self.separator 114 | self.scan_index += 1 115 | self.output += char 116 | self.scan_index += 1 117 | 118 | 119 | def get_text(html: str) -> str: 120 | """Gets the text content from the input HTML string. 121 | 122 | Args: 123 | html (str): Input HTML string. 124 | 125 | Returns: 126 | The text content. 127 | """ 128 | text_content_extractor = TextContentExtractor() 129 | text_content_extractor.feed(html) 130 | return text_content_extractor.output 131 | 132 | 133 | def resolve(phrases: typing.List[str], 134 | html: str, 135 | separator: str = '\u200b') -> str: 136 | """Wraps phrases in the HTML string with non-breaking markup. 137 | 138 | Args: 139 | phrases (List[str]): The phrases included in the HTML string. 140 | html (str): The HTML string to resolve. 141 | separator (str, optional): The separator string. 142 | 143 | Returns: 144 | The HTML string with phrases wrapped in non-breaking markup. 145 | """ 146 | resolver = HTMLChunkResolver(phrases, separator) 147 | resolver.feed(html) 148 | result = '%s' % (PARENT_CSS_STYLE, resolver.output) 149 | return result 150 | -------------------------------------------------------------------------------- /budoux/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BudouX Script to provide CLI for user."""
import argparse
import json
import os
import shutil
import sys
import textwrap
import typing
from pathlib import Path

# TODO: replace with importlib.resources when py3.8 support is dropped.
import importlib_resources

import budoux

ArgList = typing.Optional[typing.List[str]]
# Map of language code (model file stem, e.g. 'ja') -> bundled model path.
models: Path = importlib_resources.files('budoux') / "models"
langs = dict((model.stem, model) for model in models.glob("*.json"))


class BudouxHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
  """Help formatter that shows defaults and keeps raw description text."""
  pass


def check_file(path: str) -> str:
  """Check if a given filepath exists or not.

  Args:
    path (str): Model path

  Raises:
    argparse.ArgumentTypeError: Raise if given path does not exist.

  Returns:
    str: A model path.
  """
  if os.path.isfile(path):
    return path
  else:
    raise argparse.ArgumentTypeError(f"'{path}' is not found.")


def check_lang(lang: str) -> Path:
  """Check if given language exists or not.

  Args:
    lang (str): language code (e.g.: 'ja')

  Raises:
    argparse.ArgumentTypeError: Raise if no model for given language exists.

  Returns:
    The model path.
  """
  if lang in langs:
    return langs[lang]
  else:
    raise argparse.ArgumentTypeError(
        f"'{lang}' does not exist in builtin models. (supported languages: {list(langs.keys())})"
    )


def parse_args(test: ArgList = None) -> argparse.Namespace:
  """Parse commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None.

  Returns:
    argparse.Namespace: Parsed data of args.
  """
  parser = argparse.ArgumentParser(
      prog="budoux",
      formatter_class=(lambda prog: BudouxHelpFormatter(
          prog,
          **{
              "width": shutil.get_terminal_size(fallback=(120, 50)).columns,
              "max_help_position": 30,
          },
      )),
      description=textwrap.dedent("""\
          BudouX is the successor to Budou,
          the machine learning powered line break organizer tool."""),
      epilog="\n- ".join(
          ["supported languages of `-l`, `--lang`:", *langs.keys()]))

  parser.add_argument("text", metavar="TXT", nargs="?", type=str, help="text")
  parser.add_argument(
      "-H",
      "--html",
      action="store_true",
      help="HTML mode",
  )
  # -m and -l are mutually exclusive ways to select the model.
  model_select_group = parser.add_mutually_exclusive_group()
  model_select_group.add_argument(
      "-m",
      "--model",
      metavar="JSON",
      type=check_file,
      default=check_lang('ja'),
      help="custom model file path",
  )
  model_select_group.add_argument(
      "-l",
      "--lang",
      metavar="LANG",
      type=check_lang,
      help="language of custom model",
  )
  parser.add_argument(
      "-s",
      "--sep",
      metavar="STR",
      type=str,
      default="\n",
      help="output phrase separator in TEXT mode",
  )
  parser.add_argument(
      "-d",
      "--delim",
      metavar="STR",
      type=str,
      default="---",
      help="output sentence delimiter in TEXT mode",
  )
  parser.add_argument(
      "-V",
      "--version",
      action="version",
      version="%(prog)s {}".format(budoux.__version__),
  )
  if test is not None:
    return parser.parse_args(test)
  else:
    return parser.parse_args()


def _main(test: ArgList = None) -> str:
  """Runs the CLI once and returns the formatted result string.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None.

  Returns:
    str: Parsed/annotated output for the requested mode.
  """
  args = parse_args(test=test)
  # --lang wins when given; otherwise --model (which defaults to 'ja').
  model_path = args.lang or args.model
  with open(model_path, 'r', encoding='utf-8') as f:
    model = json.load(f)

  parser = budoux.Parser(model)
  if args.html:
    if args.text is None:
      inputs_html = sys.stdin.read()
    else:
      inputs_html = args.text
    res = parser.translate_html_string(inputs_html)
  else:
    if args.text is None:
      inputs = [v.rstrip() for v in sys.stdin.readlines()]
    else:
      inputs = [v.rstrip() for v in args.text.splitlines()]
    outputs = [parser.parse(sentence) for sentence in inputs]
    combined_output = [args.sep.join(output) for output in outputs]
    ors = "\n" + args.delim + "\n"
    res = ors.join(combined_output)

  return res


def main(test: ArgList = None) -> None:
  """CLI entry point; prints the result and exits quietly on Ctrl-C."""
  try:
    print(_main(test))
  except KeyboardInterrupt:
    # FIX: use sys.exit instead of the site-module builtin exit(), which is
    # not guaranteed to exist (e.g. under `python -S`); sys is imported here.
    sys.exit(0)


if __name__ == "__main__":
  main()
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 | """BudouX parser to provide semantic chunks.""" 15 | 16 | import json 17 | import os 18 | import typing 19 | 20 | from .html_processor import get_text, resolve 21 | 22 | MODEL_DIR = os.path.join(os.path.dirname(__file__), 'models') 23 | 24 | 25 | class Parser: 26 | """BudouX's Parser. 27 | 28 | The main parser object with a variety of class methods to provide semantic 29 | chunks and markups from the given input string. 30 | 31 | Attributes: 32 | model: A dict mapping a feature (str) and its score (int). 33 | """ 34 | 35 | def __init__(self, model: typing.Dict[str, typing.Dict[str, int]]): 36 | """Initializes the parser. 37 | 38 | Args: 39 | model (Dict[str, Dict[str, int]]): A dict mapping a feature and its score. 40 | """ 41 | self.model = model 42 | 43 | def parse(self, sentence: str) -> typing.List[str]: 44 | """Parses the input sentence and returns a list of semantic chunks. 45 | 46 | Args: 47 | sentence (str): An input sentence. 48 | 49 | Returns: 50 | A list of semantic chunks (List[str]). 
51 | """ 52 | if sentence == '': 53 | return [] 54 | chunks = [sentence[0]] 55 | base_score = -sum(sum(g.values()) for g in self.model.values()) * 0.5 56 | for i in range(1, len(sentence)): 57 | score = base_score 58 | if i > 2: 59 | score += self.model.get('UW1', {}).get(sentence[i - 3], 0) 60 | if i > 1: 61 | score += self.model.get('UW2', {}).get(sentence[i - 2], 0) 62 | score += self.model.get('UW3', {}).get(sentence[i - 1], 0) 63 | score += self.model.get('UW4', {}).get(sentence[i], 0) 64 | if i + 1 < len(sentence): 65 | score += self.model.get('UW5', {}).get(sentence[i + 1], 0) 66 | if i + 2 < len(sentence): 67 | score += self.model.get('UW6', {}).get(sentence[i + 2], 0) 68 | 69 | if i > 1: 70 | score += self.model.get('BW1', {}).get(sentence[i - 2:i], 0) 71 | score += self.model.get('BW2', {}).get(sentence[i - 1:i + 1], 0) 72 | if i + 1 < len(sentence): 73 | score += self.model.get('BW3', {}).get(sentence[i:i + 2], 0) 74 | 75 | if i > 2: 76 | score += self.model.get('TW1', {}).get(sentence[i - 3:i], 0) 77 | if i > 1: 78 | score += self.model.get('TW2', {}).get(sentence[i - 2:i + 1], 0) 79 | if i + 1 < len(sentence): 80 | score += self.model.get('TW3', {}).get(sentence[i - 1:i + 2], 0) 81 | if i + 2 < len(sentence): 82 | score += self.model.get('TW4', {}).get(sentence[i:i + 3], 0) 83 | 84 | if score > 0: 85 | chunks.append(sentence[i]) 86 | else: 87 | chunks[-1] += sentence[i] 88 | return chunks 89 | 90 | def translate_html_string(self, html: str) -> str: 91 | """Translates the given HTML string with markups for semantic line breaks. 92 | 93 | Args: 94 | html (str): An input html string. 95 | 96 | Returns: 97 | The translated HTML string (str). 98 | """ 99 | # TODO: Align with the JavaScript API regarding the parent element addition. 
100 | text_content = get_text(html) 101 | chunks = self.parse(text_content) 102 | return resolve(chunks, html) 103 | 104 | 105 | def load_default_japanese_parser() -> Parser: 106 | """Loads a parser equipped with the default Japanese model. 107 | 108 | Returns: 109 | A parser (:obj:`budoux.Parser`). 110 | """ 111 | with open(os.path.join(MODEL_DIR, 'ja.json'), encoding='utf-8') as f: 112 | model = json.load(f) 113 | return Parser(model) 114 | 115 | 116 | def load_default_simplified_chinese_parser() -> Parser: 117 | """Loads a parser equipped with the default Simplified Chinese model. 118 | 119 | Returns: 120 | A parser (:obj:`budoux.Parser`). 121 | """ 122 | with open(os.path.join(MODEL_DIR, 'zh-hans.json'), encoding='utf-8') as f: 123 | model = json.load(f) 124 | return Parser(model) 125 | 126 | 127 | def load_default_traditional_chinese_parser() -> Parser: 128 | """Loads a parser equipped with the default Traditional Chinese model. 129 | 130 | Returns: 131 | A parser (:obj:`budoux.Parser`). 132 | """ 133 | with open(os.path.join(MODEL_DIR, 'zh-hant.json'), encoding='utf-8') as f: 134 | model = json.load(f) 135 | return Parser(model) 136 | 137 | 138 | def load_default_thai_parser() -> Parser: 139 | """Loads a parser equipped with the default Thai model. 140 | 141 | Returns: 142 | A parser (:obj:`budoux.Parser`). 
143 | """ 144 | with open(os.path.join(MODEL_DIR, 'th.json'), encoding='utf-8') as f: 145 | model = json.load(f) 146 | return Parser(model) 147 | -------------------------------------------------------------------------------- /budoux/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/budoux/py.typed -------------------------------------------------------------------------------- /budoux/skip_nodes.json: -------------------------------------------------------------------------------- 1 | [ 2 | "ABBR", 3 | "BUTTON", 4 | "CODE", 5 | "IFRAME", 6 | "INPUT", 7 | "META", 8 | "NOBR", 9 | "SCRIPT", 10 | "STYLE", 11 | "TEXTAREA", 12 | "TIME", 13 | "VAR" 14 | ] 15 | -------------------------------------------------------------------------------- /budoux/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for BudouX.""" 15 | 16 | SEP = '▁' 17 | """The separator string to specify breakpoints.""" 18 | -------------------------------------------------------------------------------- /bump_version.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import json 17 | import re 18 | import subprocess 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description='Bump the version number.') 23 | parser.add_argument( 24 | 'new_version', type=str, help='The new version number (e.g., 1.2.3)') 25 | args = parser.parse_args() 26 | new_version = args.new_version 27 | 28 | # Updates Python port version number 29 | init_file = 'budoux/__init__.py' 30 | with open(init_file, 'r') as f: 31 | content = f.read() 32 | new_content = re.sub(r'(__version__\s+=\s+[\'"])([\.\d]+)([\'"])', 33 | rf'\g<1>{new_version}\g<3>', content) 34 | with open(init_file, 'w') as f: 35 | f.write(new_content) 36 | 37 | # Updates JavaScript port version number 38 | package_json_path = 'javascript/package.json' 39 | with open(package_json_path, 'r') as f: 40 | package_data = json.load(f) 41 | current_version = package_data.get('version') 42 | 43 | if current_version != new_version: 44 | npm_command = ['npm', 'version', new_version, '--no-git-tag-version'] 45 | subprocess.run(npm_command, 
cwd='javascript', check=True) 46 | else: 47 | print(f"JavaScript version is already {new_version}, skipping npm version.") 48 | 49 | cli_file = 'javascript/src/cli.ts' 50 | with open(cli_file, 'r') as f: 51 | content = f.read() 52 | new_content = re.sub(r'(const\s+CLI_VERSION\s+=\s+[\'"])([\.\d]+)([\'"])', 53 | rf'\g<1>{new_version}\g<3>', content) 54 | with open(cli_file, 'w') as f: 55 | f.write(new_content) 56 | 57 | # Updates Java port version number 58 | mvn_command = [ 59 | 'mvn', 'versions:set', f'-DnewVersion={new_version}', 60 | '-DgenerateBackupPoms=false' 61 | ] 62 | subprocess.run(mvn_command, cwd='java', check=True) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /data/finetuning/ja/train.txt: -------------------------------------------------------------------------------- 1 | 指定された▁時間以上▁アプリケーションを▁利用する▁ことは▁できません。 2 | これ以上▁その機器を▁利用する▁場合は▁注意してください。 3 | それ以上▁コップを▁振ると▁こぼれます。 4 | ファイルは▁そのまま▁ご利用いただけます。 5 | 彼は▁そのまま▁行こうとした。 6 | ご利用▁いただき▁ありがとう▁ございます。 7 | フィードバック▁ありがとう▁ございます。 8 | 貴重な▁ご意見▁ありがとう▁ございます。 9 | この本は▁あらゆる▁トピックを▁カバーします。 10 | ドアを▁ありと▁あらゆる▁力を▁込めて▁開けます。 11 | 身の▁回りの▁あらゆる▁ものを▁化学式で▁表す。 12 | 当機は▁まもなく▁着陸態勢に▁入ります。 13 | まもなくして▁彼女が▁来た。 14 | まもなく▁電車が▁到着します。 15 | ようやく▁日が▁暮れた。 16 | やっと▁ようやく▁公開できそうです。 17 | あいつが▁ようやく▁来た。 18 | 夕方▁ようやく▁完成した。 19 | あれが▁入ったのは▁たまたまです。 20 | たまたま▁手に▁入れる▁ことが▁できた。 21 | 彼が▁たまたま▁持っていた。 22 | 全部▁まとめて▁提出します。 23 | 論点を▁まとめる。 24 | 思った▁とおりに▁書く。 25 | 言われた▁とおりに▁動きます。 26 | まるで▁水晶の▁ように▁すきとおって▁いた。 27 | 彼の▁すきとおる▁肌 28 | 冷たさを▁もつ▁青い▁空 29 | 当日券のみ▁有効です。 30 | 該当する方▁のみ▁入場できます。 31 | あの▁青い▁空と▁白い▁雲のみが▁見える。 32 | 白い▁つぶが▁ちりのように▁舞う 33 | つぶつぶの▁食感 34 | 煙が▁どんどん▁広がっていく 35 | さあ▁どんどん▁食べてくれ 36 | そこが▁ちがうと▁思う 37 | はじまりが▁ちがうから▁おわりも▁ちがう 38 | 日が▁しずむまでに▁終わらせよう 39 | うまく▁言葉に▁できない 40 | それは▁子どもの▁遊び場です。 41 | ふだん▁どおりに▁やれば▁大丈夫。 42 | この▁おもちゃを▁ください。 43 | 映画に▁感情移入する。 44 | 制度に▁甘えがちな▁場面 45 | 可能性が▁浮かび▁上がる -------------------------------------------------------------------------------- 
/data/finetuning/ja/val.txt: -------------------------------------------------------------------------------- 1 | それ以上▁モニターは▁増やせません 2 | 今回の▁発表は▁以上に▁なります。 3 | そのままに▁しておけば▁良い。 4 | そのまま▁お送りください。 5 | たくさんの▁お便り▁ありがとう▁ございます。 6 | 彼は▁あらゆる▁服を▁持っています。 7 | 係の▁者が▁まもなく▁来ます。 8 | 山の▁頂が▁ようやく▁見えた。 9 | たまたま▁聞こえてきた▁歌声。 10 | -------------------------------------------------------------------------------- /demo/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "budoux-demo", 3 | "version": "0.1.2", 4 | "description": "A demo app for BudouX", 5 | "main": "static/app.js", 6 | "scripts": { 7 | "build:app": "esbuild src/app.ts --bundle --minify --outfile=static/app.js", 8 | "build:worker": "esbuild src/worker.ts --bundle --minify --outfile=static/worker.js", 9 | "build": "npm run build:app && npm run build:worker", 10 | "watch:app": "esbuild src/app.ts --watch --bundle --minify --outfile=static/app.js", 11 | "watch:worker": "esbuild src/worker.ts --watch --bundle --minify --outfile=static/worker.js", 12 | "watch": "concurrently \"npm run watch:app\" \"npm run watch:worker\"", 13 | "serve": "http-server static", 14 | "dev": "concurrently \"npm run serve\" \"npm run watch\"", 15 | "start": "npm run dev" 16 | }, 17 | "keywords": [], 18 | "author": "Shuhei Iitsuka", 19 | "license": "Apache-2.0", 20 | "dependencies": { 21 | "budoux": "file:../javascript", 22 | "dompurify": "^3.2.5" 23 | }, 24 | "devDependencies": { 25 | "@types/dompurify": "^3.2.0", 26 | "concurrently": "^9.1.2", 27 | "esbuild": "^0.19.5", 28 | "http-server": "^14.1.1", 29 | "typescript": "^5.2.2" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /demo/src/app.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in 
compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import DOMPurify from 'dompurify'; 18 | import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser, loadDefaultThaiParser } from 'budoux'; 19 | 20 | const parsers = new Map([ 21 | ['ja', loadDefaultJapaneseParser()], 22 | ['zh-hans', loadDefaultSimplifiedChineseParser()], 23 | ['zh-hant', loadDefaultTraditionalChineseParser()], 24 | ['th', loadDefaultThaiParser()] 25 | ]); 26 | const defaultInputs = new Map([ 27 | ['ja', 'Google の使命は、世界中の情報を整理し、世界中の人がアクセスできて使えるようにすることです。'], 28 | ['zh-hans', '我们的使命是整合全球信息,供大众使用,让人人受益。'], 29 | ['zh-hant', '我們的使命是匯整全球資訊,供大眾使用,使人人受惠。'], 30 | ['th', 'พันธกิจของเราคือการจัดระเบียบข้อมูลในโลกนี้และทำให้เข้าถึงได้ง่ายในทุกที่และมีประโยชน์'] 31 | ]) 32 | const inputTextElement = document.getElementById('input') as HTMLTextAreaElement; 33 | const outputContainerElement = document.getElementById('output') as HTMLElement; 34 | const fontSizeElement = document.getElementById('fontsize') as HTMLInputElement; 35 | const brCheckElement = document.getElementById('wbr2br') as HTMLInputElement; 36 | const modelSelectElement = document.getElementById('model') as HTMLSelectElement; 37 | const url = new URL(document.location.href); 38 | const worker = new Worker('./worker.js'); 39 | worker.onmessage = (e: MessageEvent) => { 40 | console.log('response from worker:', e); 41 | }; 42 | 43 | 44 | /** 45 | * Runs the BudouX model to process the input text and render the processed HTML. 
46 | */ 47 | const run = () => { 48 | outputContainerElement.innerHTML = DOMPurify.sanitize(inputTextElement.value); 49 | const model = modelSelectElement.value; 50 | worker.postMessage({'sentence': outputContainerElement.textContent, 'model': model}); 51 | const parser = parsers.get(model); 52 | if (!parser) return; 53 | parser.applyToElement(outputContainerElement); 54 | const renderWithBR = brCheckElement.checked; 55 | if (renderWithBR) { 56 | outputContainerElement.innerHTML = DOMPurify.sanitize( 57 | outputContainerElement.innerHTML.replace(/\u200b/g, '
')); 58 | } 59 | url.searchParams.set('q', inputTextElement.value); 60 | window.history.replaceState('', '', url.toString()); 61 | }; 62 | 63 | /** 64 | * Initializes the app. 65 | */ 66 | const init = () => { 67 | const lang = url.searchParams.get('lang'); 68 | if (lang) modelSelectElement.value = lang; 69 | const input = url.searchParams.get('q') || defaultInputs.get(modelSelectElement.value); 70 | if (input) inputTextElement.value = input; 71 | run(); 72 | } 73 | 74 | fontSizeElement.addEventListener('input', () => { 75 | outputContainerElement.style.fontSize = `${fontSizeElement.value}rem`; 76 | }) 77 | 78 | inputTextElement.addEventListener('input', () => { 79 | run(); 80 | }); 81 | 82 | brCheckElement.addEventListener('input', () => { 83 | run(); 84 | }); 85 | 86 | modelSelectElement.addEventListener('change', () => { 87 | url.searchParams.set('lang', modelSelectElement.value); 88 | window.history.pushState('', '', url.toString()); 89 | const input = defaultInputs.get(modelSelectElement.value); 90 | if (input) inputTextElement.value = input; 91 | run(); 92 | }) 93 | 94 | init(); -------------------------------------------------------------------------------- /demo/src/worker.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | import { Parser, jaModel, zhHansModel, zhHantModel } from 'budoux'; 18 | 19 | const parsers: { [key: string]: Parser } = { 20 | 'ja': new Parser(jaModel), 21 | 'zh-hans': new Parser(zhHansModel), 22 | 'zh-hant': new Parser(zhHantModel), 23 | }; 24 | 25 | onmessage = (e: MessageEvent) => { 26 | const model: string = e.data['model']; 27 | if (!Object.keys(parsers).includes(model)) return; 28 | const parser = parsers[model]; 29 | const result = parser.parse(e.data['sentence']); 30 | console.log('It works in Web Worker, too!', result); 31 | postMessage(result); 32 | }; 33 | -------------------------------------------------------------------------------- /demo/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | BudouX demo 7 | 57 | 58 | 59 |
60 |
61 |

BudouX 🍇

62 |

A small, standalone, and language-neutral line break organizer.

63 |

64 | [GitHub] 65 | [PyPI] 66 | [NPM] 67 |

68 |
69 |
70 |
71 |

72 | 73 | 79 |

80 | 81 |
82 | 83 | 84 |
85 |
86 | 87 | 88 |
89 |

90 |
91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /demo/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2017", 4 | "module": "commonjs", 5 | "esModuleInterop": true, 6 | "forceConsistentCasingInFileNames": true, 7 | "strict": true, 8 | "skipLibCheck": true, 9 | "resolveJsonModule": true 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/example.png -------------------------------------------------------------------------------- /java/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | src/main/resources 3 | -------------------------------------------------------------------------------- /java/README.md: -------------------------------------------------------------------------------- 1 | # BudouX Java Module 2 | 3 | BudouX is a standalone, small, and language-neutral phrase segmenter tool that 4 | provides beautiful and legible line breaks. 5 | 6 | For more details about the project, please refer to the [project README](https://github.com/google/budoux/). 7 | 8 | ## Demo 9 | 10 | 11 | 12 | ## Usage 13 | 14 | ### Simple usage 15 | 16 | You can get a list of phrases by feeding a sentence to the parser. 17 | The easiest way is to get a parser is loading the default parser for each language. 
18 | 19 | ```java 20 | import com.google.budoux.Parser; 21 | 22 | public class App 23 | { 24 | public static void main( String[] args ) 25 | { 26 | Parser parser = Parser.loadDefaultJapaneseParser(); 27 | System.out.println(parser.parse("今日は良い天気ですね。")); 28 | // [今日は, 良い, 天気ですね。] 29 | } 30 | } 31 | ``` 32 | 33 | #### Supported languages and their default parsers 34 | 35 | - Japanese: `Parser.loadDefaultJapaneseParser()` 36 | - Simplified Chinese: `Parser.loadDefaultSimplifiedChineseParser()` 37 | - Traditional Chinese: `Parser.loadDefaultTraditionalChineseParser()` 38 | - Thai: `Parser.loadDefaultThaiParser()` 39 | 40 | ### Working with HTML 41 | 42 | If you want to use the result in a website, you can use the `translateHTMLString` 43 | method to get an HTML string that wraps phrases with non-breaking markup, 44 | speicifcally, zero-width space (U+200B). 45 | 46 | ```java 47 | System.out.println(parser.translateHTMLString("今日は良い天気ですね。")); 48 | //今日は\u200b良い\u200b天気ですね。 49 | ``` 50 | 51 | Please note that separators are denoted as `\u200b` in the example above for 52 | illustrative purposes, but the actual output is an invisible string as it's a 53 | zero-width space. 54 | 55 | ## Caveat 56 | 57 | BudouX supports HTML inputs and outputs HTML strings with markup applied to wrap 58 | phrases, but it's not meant to be used as an HTML sanitizer. 59 | **BudouX doesn't sanitize any inputs.** 60 | Malicious HTML inputs yield malicious HTML outputs. 61 | Please use it with an appropriate sanitizer library if you don't trust the input. 62 | 63 | ## Author 64 | 65 | [Shuhei Iitsuka](https://tushuhei.com) 66 | 67 | ## Disclaimer 68 | 69 | This is not an officially supported Google product. 
70 | -------------------------------------------------------------------------------- /java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 4.0.0 20 | 21 | 22 | org.sonatype.oss 23 | oss-parent 24 | 9 25 | 26 | 27 | com.google.budoux 28 | budoux 29 | 0.7.0 30 | 31 | BudouX 32 | https://google.github.io/budoux/ 33 | 34 | 35 | UTF-8 36 | 1.8 37 | 1.8 38 | 39 | 40 | 41 | junit 42 | junit 43 | 4.13.2 44 | test 45 | 46 | 47 | com.google.code.gson 48 | gson 49 | 2.13.0 50 | 51 | 52 | org.jsoup 53 | jsoup 54 | 1.19.1 55 | 56 | 57 | 58 | 59 | 60 | 61 | org.apache.maven.plugins 62 | maven-javadoc-plugin 63 | 3.11.2 64 | 65 | 66 | 67 | 68 | 69 | 70 | maven-clean-plugin 71 | 3.4.1 72 | 73 | 74 | 75 | maven-resources-plugin 76 | 3.3.1 77 | 78 | 79 | copy-data 80 | generate-resources 81 | 82 | copy-resources 83 | 84 | 85 | ${basedir}/src/main/resources 86 | 87 | 88 | ../budoux 89 | 90 | models/*.json 91 | skip_nodes.json 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | maven-compiler-plugin 101 | 3.14.0 102 | 103 | 104 | maven-surefire-plugin 105 | 3.5.3 106 | 107 | 108 | maven-jar-plugin 109 | 3.4.2 110 | 111 | 112 | maven-install-plugin 113 | 3.1.4 114 | 115 | 116 | maven-deploy-plugin 117 | 3.1.4 118 | 119 | 120 | 121 | maven-site-plugin 122 | 3.21.0 123 | 124 | 125 | maven-project-info-reports-plugin 126 | 3.9.0 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/budoux/HTMLProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.budoux; 18 | 19 | import com.google.gson.Gson; 20 | import com.google.gson.JsonIOException; 21 | import com.google.gson.JsonSyntaxException; 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.io.InputStreamReader; 25 | import java.io.Reader; 26 | import java.nio.charset.StandardCharsets; 27 | import java.util.ArrayDeque; 28 | import java.util.Arrays; 29 | import java.util.HashSet; 30 | import java.util.List; 31 | import java.util.Locale; 32 | import java.util.Set; 33 | import java.util.stream.Collectors; 34 | import org.jsoup.Jsoup; 35 | import org.jsoup.nodes.Comment; 36 | import org.jsoup.nodes.Document; 37 | import org.jsoup.nodes.Element; 38 | import org.jsoup.nodes.Node; 39 | import org.jsoup.nodes.TextNode; 40 | import org.jsoup.select.NodeVisitor; 41 | 42 | /** Processes phrases into an HTML string wrapping them in no-breaking markup. 
*/ 43 | final class HTMLProcessor { 44 | private static final Set skipNodes; 45 | private static final String STYLE = "word-break: keep-all; overflow-wrap: anywhere;"; 46 | 47 | private HTMLProcessor() {} 48 | 49 | static { 50 | Gson gson = new Gson(); 51 | InputStream inputStream = HTMLProcessor.class.getResourceAsStream("/skip_nodes.json"); 52 | try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) { 53 | String[] skipNodesStrings = gson.fromJson(reader, String[].class); 54 | skipNodes = new HashSet<>(Arrays.asList(skipNodesStrings)); 55 | } catch (JsonSyntaxException | JsonIOException | IOException e) { 56 | throw new AssertionError(e); 57 | } 58 | } 59 | 60 | /** 61 | * A `NodeVisitor` subclass that concatenates all `TextNode`s to a string. 62 | * 63 | *

It also converts `<br>` to `\n`. 64 | */ 65 | private static class TextizeNodeVisitor implements NodeVisitor { 66 | private StringBuilder output = new StringBuilder(); 67 | 68 | public String getString() { 69 | return output.toString(); 70 | } 71 | 72 | @Override 73 | public void head(Node node, int depth) { 74 | if (node instanceof Element) { 75 | final String nodeName = node.nodeName(); 76 | if (nodeName.equals("br")) { 77 | output.append('\n'); 78 | } 79 | } else if (node instanceof TextNode) { 80 | output.append(((TextNode) node).getWholeText()); 81 | } 82 | } 83 | 84 | @Override 85 | public void tail(Node node, int depth) {} 86 | } 87 | 88 | private static class PhraseResolvingNodeVisitor implements NodeVisitor { 89 | private static final char SEP = '\uFFFF'; 90 | private final String phrasesJoined; 91 | private final String separator; 92 | private final StringBuilder output = new StringBuilder(); 93 | private Integer scanIndex = 0; 94 | private boolean toSkip = false; 95 | private final ArrayDeque elementStack = new ArrayDeque<>(); 96 | 97 | /** 98 | * Constructs a PhraseResolvingNodeVisitor. 99 | * 100 | * @param phrases a list of phrase strings. 101 | * @param separator the separator string. 102 | */ 103 | PhraseResolvingNodeVisitor(List phrases, String separator) { 104 | this.separator = separator; 105 | this.phrasesJoined = String.join(Character.toString(SEP), phrases); 106 | } 107 | 108 | /** 109 | * Returns the resolved output string. 110 | * 111 | * @return the output string. 
112 | */ 113 | public StringBuilder getOutput() { 114 | return output; 115 | } 116 | 117 | @Override 118 | public void head(Node node, int depth) { 119 | if (node.nodeName().equals("body")) { 120 | return; 121 | } 122 | if (node instanceof Element) { 123 | elementStack.push(toSkip); 124 | String attributesEncoded = 125 | node.attributes().asList().stream() 126 | .map(attribute -> " " + attribute) 127 | .collect(Collectors.joining("")); 128 | final String nodeName = node.nodeName(); 129 | if (nodeName.equals("br")) { 130 | // `
` is converted to `\n`, see `TextizeNodeVisitor.head`. 131 | // Assume phrasesJoined.charAt(scanIndex) == '\n'. 132 | scanIndex++; 133 | } else if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) { 134 | if (!toSkip 135 | && scanIndex < phrasesJoined.length() 136 | && phrasesJoined.charAt(scanIndex) == SEP) { 137 | output.append(separator); 138 | scanIndex++; 139 | } 140 | toSkip = true; 141 | } 142 | output.append(String.format("<%s%s>", nodeName, attributesEncoded)); 143 | } else if (node instanceof TextNode) { 144 | String data = ((TextNode) node).getWholeText(); 145 | for (int i = 0; i < data.length(); i++) { 146 | char c = data.charAt(i); 147 | if (c != phrasesJoined.charAt(scanIndex)) { 148 | // Assume phrasesJoined.charAt(scanIndex) == SEP. 149 | if (!toSkip) { 150 | output.append(separator); 151 | } 152 | scanIndex++; 153 | } 154 | scanIndex++; 155 | output.append(c); 156 | } 157 | } 158 | } 159 | 160 | @Override 161 | public void tail(Node node, int depth) { 162 | if (node.nodeName().equals("body") || node instanceof TextNode || node instanceof Comment) { 163 | return; 164 | } 165 | // assume node instanceof Element; 166 | toSkip = elementStack.pop(); 167 | Element element = (Element) node; 168 | if (element.tag().isSelfClosing()) { 169 | return; 170 | } 171 | output.append(String.format("", node.nodeName())); 172 | } 173 | } 174 | 175 | /** 176 | * Wraps phrases in the HTML string with non-breaking markup. 177 | * 178 | * @param phrases the phrases included in the HTML string. 179 | * @param html the HTML string to resolve. 180 | * @return the HTML string of phrases wrapped in non-breaking markup. 181 | */ 182 | public static String resolve(List phrases, String html) { 183 | return resolve(phrases, html, "\u200b"); 184 | } 185 | 186 | /** 187 | * Wraps phrases in the HTML string with non-breaking markup. 188 | * 189 | * @param phrases the phrases included in the HTML string. 190 | * @param html the HTML string to resolve. 
191 | * @param separator the separator string. 192 | * @return the HTML string of phrases wrapped in non-breaking markup. 193 | */ 194 | public static String resolve(List phrases, String html, String separator) { 195 | Document doc = Jsoup.parseBodyFragment(html); 196 | PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases, separator); 197 | doc.body().traverse(nodeVisitor); 198 | return String.format("%s", STYLE, nodeVisitor.getOutput()); 199 | } 200 | 201 | /** 202 | * Gets the text content from the input HTML string. 203 | * 204 | * @param html an HTML string. 205 | * @return the text content. 206 | */ 207 | public static String getText(String html) { 208 | Document doc = Jsoup.parseBodyFragment(html); 209 | TextizeNodeVisitor nodeVisitor = new TextizeNodeVisitor(); 210 | doc.body().traverse(nodeVisitor); 211 | return nodeVisitor.getString(); 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/budoux/Parser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.budoux; 18 | 19 | import com.google.gson.Gson; 20 | import com.google.gson.JsonIOException; 21 | import com.google.gson.JsonSyntaxException; 22 | import com.google.gson.reflect.TypeToken; 23 | import java.io.IOException; 24 | import java.io.InputStream; 25 | import java.io.InputStreamReader; 26 | import java.io.Reader; 27 | import java.lang.reflect.Type; 28 | import java.nio.charset.StandardCharsets; 29 | import java.util.ArrayList; 30 | import java.util.List; 31 | import java.util.Map; 32 | import java.util.Optional; 33 | 34 | /** 35 | * The BudouX parser that translates the input sentence into phrases. 36 | * 37 | *

You can create a parser instance by invoking {@code new Parser(model)} with the model data you 38 | * want to use. You can also create a parser by specifying the model file path with {@code 39 | * Parser.loadByFileName(modelFileName)}. 40 | * 41 | *

In most cases, it's sufficient to use the default parser for the language. For example, you 42 | * can create a default Japanese parser as follows. 43 | * 44 | *

 45 |  * Parser parser = Parser.loadDefaultJapaneseParser();
 46 |  * 
47 | */ 48 | public class Parser { 49 | private final Map> model; 50 | 51 | /** 52 | * Constructs a BudouX parser. 53 | * 54 | * @param model the model data. 55 | */ 56 | public Parser(Map> model) { 57 | this.model = model; 58 | } 59 | 60 | /** 61 | * Loads the default Japanese parser. 62 | * 63 | * @return a BudouX parser with the default Japanese model. 64 | */ 65 | public static Parser loadDefaultJapaneseParser() { 66 | return loadByFileName("/models/ja.json"); 67 | } 68 | 69 | /** 70 | * Loads the default Simplified Chinese parser. 71 | * 72 | * @return a BudouX parser with the default Simplified Chinese model. 73 | */ 74 | public static Parser loadDefaultSimplifiedChineseParser() { 75 | return loadByFileName("/models/zh-hans.json"); 76 | } 77 | 78 | /** 79 | * Loads the default Traditional Chinese parser. 80 | * 81 | * @return a BudouX parser with the default Traditional Chinese model. 82 | */ 83 | public static Parser loadDefaultTraditionalChineseParser() { 84 | return loadByFileName("/models/zh-hant.json"); 85 | } 86 | 87 | /** 88 | * Loads the default Thai parser. 89 | * 90 | * @return a BudouX parser with the default Thai model. 91 | */ 92 | public static Parser loadDefaultThaiParser() { 93 | return loadByFileName("/models/th.json"); 94 | } 95 | 96 | /** 97 | * Loads a parser by specifying the model file path. 98 | * 99 | * @param modelFileName the model file path. 100 | * @return a BudouX parser. 
101 | */ 102 | public static Parser loadByFileName(String modelFileName) { 103 | Gson gson = new Gson(); 104 | Type type = new TypeToken>>() {}.getType(); 105 | InputStream inputStream = Parser.class.getResourceAsStream(modelFileName); 106 | try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) { 107 | Map> model = gson.fromJson(reader, type); 108 | return new Parser(model); 109 | } catch (JsonIOException | JsonSyntaxException | IOException e) { 110 | throw new AssertionError(e); 111 | } 112 | } 113 | 114 | /** 115 | * Gets the score for the specified feature of the given sequence. 116 | * 117 | * @param featureKey the feature key to examine. 118 | * @param sequence the sequence to look up the score. 119 | * @return the contribution score to support a phrase break. 120 | */ 121 | private int getScore(String featureKey, String sequence) { 122 | return Optional.ofNullable(this.model.get(featureKey)) 123 | .map(group -> group.get(sequence)) 124 | .orElse(0); 125 | } 126 | 127 | /** 128 | * Parses a sentence into phrases. 129 | * 130 | * @param sentence the sentence to break by phrase. 131 | * @return a list of phrases. 
132 | */ 133 | public List parse(String sentence) { 134 | if (sentence.isEmpty()) { 135 | return new ArrayList<>(); 136 | } 137 | List result = new ArrayList<>(); 138 | result.add(String.valueOf(sentence.charAt(0))); 139 | int totalScore = 140 | this.model.values().stream() 141 | .mapToInt(group -> group.values().stream().mapToInt(Integer::intValue).sum()) 142 | .sum(); 143 | for (int i = 1; i < sentence.length(); i++) { 144 | int score = -totalScore; 145 | if (i - 2 > 0) { 146 | score += 2 * this.getScore("UW1", sentence.substring(i - 3, i - 2)); 147 | } 148 | if (i - 1 > 0) { 149 | score += 2 * this.getScore("UW2", sentence.substring(i - 2, i - 1)); 150 | } 151 | score += 2 * this.getScore("UW3", sentence.substring(i - 1, i)); 152 | score += 2 * this.getScore("UW4", sentence.substring(i, i + 1)); 153 | if (i + 1 < sentence.length()) { 154 | score += 2 * this.getScore("UW5", sentence.substring(i + 1, i + 2)); 155 | } 156 | if (i + 2 < sentence.length()) { 157 | score += 2 * this.getScore("UW6", sentence.substring(i + 2, i + 3)); 158 | } 159 | if (i > 1) { 160 | score += 2 * this.getScore("BW1", sentence.substring(i - 2, i)); 161 | } 162 | score += 2 * this.getScore("BW2", sentence.substring(i - 1, i + 1)); 163 | if (i + 1 < sentence.length()) { 164 | score += 2 * this.getScore("BW3", sentence.substring(i, i + 2)); 165 | } 166 | if (i - 2 > 0) { 167 | score += 2 * this.getScore("TW1", sentence.substring(i - 3, i)); 168 | } 169 | if (i - 1 > 0) { 170 | score += 2 * this.getScore("TW2", sentence.substring(i - 2, i + 1)); 171 | } 172 | if (i + 1 < sentence.length()) { 173 | score += 2 * this.getScore("TW3", sentence.substring(i - 1, i + 2)); 174 | } 175 | if (i + 2 < sentence.length()) { 176 | score += 2 * this.getScore("TW4", sentence.substring(i, i + 3)); 177 | } 178 | if (score > 0) { 179 | result.add(""); 180 | } 181 | result.set(result.size() - 1, result.get(result.size() - 1) + sentence.charAt(i)); 182 | } 183 | return result; 184 | } 185 | 186 | /** 187 | * 
Translates an HTML string with phrases wrapped in no-breaking markup. 188 | * 189 | * @param html an HTML string. 190 | * @return the translated HTML string with no-breaking markup. 191 | */ 192 | public String translateHTMLString(String html) { 193 | String sentence = HTMLProcessor.getText(html); 194 | List phrases = parse(sentence); 195 | return HTMLProcessor.resolve(phrases, html, "\u200b"); 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/budoux/HTMLProcessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.budoux; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | 21 | import java.util.Arrays; 22 | import java.util.List; 23 | import org.junit.Test; 24 | import org.junit.runner.RunWith; 25 | import org.junit.runners.JUnit4; 26 | 27 | /** Unit tests for {@link HTMLProcessor}. 
*/ 28 | @RunWith(JUnit4.class) 29 | public class HTMLProcessorTest { 30 | String pre = ""; 31 | String post = ""; 32 | 33 | private String wrap(String input) { 34 | return this.pre + input + this.post; 35 | } 36 | 37 | @Test 38 | public void testResolveWithSimpleTextInput() { 39 | List phrases = Arrays.asList("abc", "def"); 40 | String html = "abcdef"; 41 | String result = HTMLProcessor.resolve(phrases, html, ""); 42 | assertEquals(this.wrap("abcdef"), result); 43 | } 44 | 45 | @Test 46 | public void testResolveWithStandardHTMLInput() { 47 | List phrases = Arrays.asList("abc", "def"); 48 | String html = "abcdef"; 49 | String result = HTMLProcessor.resolve(phrases, html, ""); 50 | assertEquals(this.wrap("abcdef"), result); 51 | } 52 | 53 | @Test 54 | public void testResolveWithImg() { 55 | List phrases = Arrays.asList("abc", "def"); 56 | String html = "abcdef"; 57 | String result = HTMLProcessor.resolve(phrases, html, ""); 58 | assertEquals(this.wrap("abcdef"), result); 59 | } 60 | 61 | @Test 62 | public void testResolveWithUnpairedClose() { 63 | List phrases = Arrays.asList("abc", "def"); 64 | String html = "abcdef

"; 65 | String result = HTMLProcessor.resolve(phrases, html, ""); 66 | assertEquals(this.wrap("abcdef

"), result); 67 | } 68 | 69 | @Test 70 | public void testResolveWithNodesToSkip() { 71 | List phrases = Arrays.asList("abc", "def", "ghi"); 72 | String html = "afghi"; 73 | String result = HTMLProcessor.resolve(phrases, html, ""); 74 | assertEquals(this.wrap("afghi"), result); 75 | } 76 | 77 | @Test 78 | public void testResolveWithNodesBreakBeforeSkip() { 79 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 80 | String html = "abcdefghijkl"; 81 | String result = HTMLProcessor.resolve(phrases, html, ""); 82 | assertEquals(this.wrap("abcdefghijkl"), result); 83 | } 84 | 85 | @Test 86 | public void testResolveWithAfterSkip() { 87 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 88 | String html = "abcdefghijkl"; 89 | String result = HTMLProcessor.resolve(phrases, html, ""); 90 | assertEquals(this.wrap("abcdefghijkl"), result); 91 | } 92 | 93 | @Test 94 | public void testResolveWithAfterSkipWithImg() { 95 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 96 | String html = "abcdefghijkl"; 97 | String result = HTMLProcessor.resolve(phrases, html, ""); 98 | assertEquals(this.wrap("abcdefghijkl"), result); 99 | } 100 | 101 | @Test 102 | public void testResolveWithNothingToSplit() { 103 | List phrases = Arrays.asList("abcdef"); 104 | String html = "abcdef"; 105 | String result = HTMLProcessor.resolve(phrases, html, ""); 106 | assertEquals(this.wrap("abcdef"), result); 107 | } 108 | 109 | @Test 110 | public void testResolveBR() { 111 | String html = " 1
2 "; 112 | String text = HTMLProcessor.getText(html); 113 | assertEquals(" 1 \n 2 ", text); 114 | List phrases = Arrays.asList(" 1 \n 2 "); 115 | String result = HTMLProcessor.resolve(phrases, html, ""); 116 | assertEquals(this.wrap(" 1
2 "), result); 117 | } 118 | 119 | @Test 120 | public void testGetText() { 121 | String html = "Hello !"; 122 | String result = HTMLProcessor.getText(html); 123 | assertEquals("Hello World!", result); 124 | } 125 | 126 | @Test 127 | public void testGetTextWhiteSpace() { 128 | String html = " H e "; 129 | String result = HTMLProcessor.getText(html); 130 | assertEquals(" H e ", result); 131 | } 132 | 133 | @Test 134 | public void testGetTextWhiteSpaceAcrossElements() { 135 | String html = "
1
2
"; 136 | String result = HTMLProcessor.getText(html); 137 | assertEquals(" 1 2 ", result); 138 | } 139 | 140 | @Test 141 | public void testResolveSkipNodeAtTheEnd() { 142 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 143 | String html = "abcdefghijkl"; 144 | String result = HTMLProcessor.resolve(phrases, html, ""); 145 | assertEquals(this.wrap("abcdefghijkl"), result); 146 | } 147 | 148 | @Test 149 | public void testResolveWithComments() { 150 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 151 | String html = "abcdefghijkl"; 152 | String result = HTMLProcessor.resolve(phrases, html, ""); 153 | assertEquals(this.wrap("abcdefghijkl"), result); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/budoux/ParserTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.budoux; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | 21 | import java.util.Arrays; 22 | import java.util.HashMap; 23 | import java.util.List; 24 | import java.util.Map; 25 | import org.junit.Test; 26 | import org.junit.runner.RunWith; 27 | import org.junit.runners.JUnit4; 28 | 29 | /** Unit tests for {@link Parser}. 
*/ 30 | @RunWith(JUnit4.class) 31 | public class ParserTest { 32 | 33 | @Test 34 | public void testParse() { 35 | Map> model = new HashMap<>(); 36 | Map uw4 = new HashMap<>(); 37 | uw4.put("a", 100); 38 | model.put("UW4", uw4); 39 | Parser parser = new Parser(model); 40 | List result = parser.parse("xyzabc"); 41 | List expected = Arrays.asList("xyz", "abc"); 42 | assertEquals(expected, result); 43 | } 44 | 45 | @Test 46 | public void testLoadDefaultJapaneseParser() { 47 | Parser parser = Parser.loadDefaultJapaneseParser(); 48 | List result = parser.parse("今日は天気です。"); 49 | List expected = Arrays.asList("今日は", "天気です。"); 50 | assertEquals(expected, result); 51 | } 52 | 53 | @Test 54 | public void testTranslateHTMLString() { 55 | Map> model = new HashMap<>(); 56 | Map uw4 = new HashMap<>(); 57 | uw4.put("a", 100); 58 | model.put("UW4", uw4); 59 | Parser parser = new Parser(model); 60 | String html = "xyzabc"; 61 | String result = parser.translateHTMLString(html); 62 | assertEquals( 63 | "xyz\u200babc", 65 | result); 66 | } 67 | 68 | @Test 69 | public void testNewline() { 70 | Parser parser = Parser.loadDefaultJapaneseParser(); 71 | List result = parser.parse(" 1 \n 2 "); 72 | List expected = Arrays.asList(" 1 \n 2 "); 73 | assertEquals(expected, result); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /javascript/.npmignore: -------------------------------------------------------------------------------- 1 | scripts 2 | -------------------------------------------------------------------------------- /javascript/.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "bracketSpacing": false, 3 | "singleQuote": true, 4 | "trailingComma": "es5", 5 | "arrowParens": "avoid" 6 | } 7 | -------------------------------------------------------------------------------- /javascript/README.md: -------------------------------------------------------------------------------- 1 | 2 
You can get a list of phrases by feeding a sentence to the parser.
The easiest way to get a parser is to load the default parser for each language.
66 | 67 | ```javascript 68 | console.log(parser.translateHTMLString('今日はとても天気です。')); 69 | // 今日は\u200bとても\u200b天気です。 70 | ``` 71 | 72 | Please note that separators are denoted as `\u200b` in the example above for 73 | illustrative purposes, but the actual output is an invisible string as it's a 74 | zero-width space. 75 | 76 | ### Applying to an HTML element 77 | 78 | You can also feed an HTML element to the parser to apply the process. 79 | 80 | ```javascript 81 | const ele = document.querySelector('p.budou-this'); 82 | console.log(ele.outerHTML); 83 | //

今日はとても天気です。

84 | parser.applyToElement(ele); 85 | console.log(ele.outerHTML); 86 | //

今日は\u200bとても\u200b天気です。

If you would like to use BudouX inside a Web Worker script, construct a parser without
`HTMLProcessor`, i.e. use the pure `Parser` instance.
You can also format inputs on your terminal with the `budoux` command.
197 | 198 | ```shellsession 199 | $ budoux 本日は晴天です。 200 | 本日は 201 | 晴天です。 202 | ``` 203 | 204 | ```shellsession 205 | $ echo $'本日は晴天です。\n明日は曇りでしょう。' | budoux 206 | 本日は 207 | 晴天です。 208 | --- 209 | 明日は 210 | 曇りでしょう。 211 | ``` 212 | 213 | ```shellsession 214 | $ budoux 本日は晴天です。 -H 215 | 本日は\u200b晴天です。 216 | ``` 217 | 218 | Please note that separators are denoted as `\u200b` in the example above for 219 | illustrative purposes, but the actual output is an invisible string as it's a 220 | zero-width space. 221 | 222 | If you want to see help, run `budoux -h`. 223 | 224 | ```shellsession 225 | $ budoux -h 226 | Usage: budoux [-h] [-H] [-d STR] [-m JSON] [-V] [TXT] 227 | 228 | BudouX is the successor to Budou, the machine learning powered line break organizer tool. 229 | 230 | Arguments: 231 | txt text 232 | 233 | Options: 234 | -H, --html HTML mode (default: false) 235 | -d, --delim output delimiter in TEXT mode (default: "---") 236 | -m, --model custom model file path 237 | -V, --version output the version number 238 | -h, --help display help for command 239 | ``` 240 | 241 | ## Caveat 242 | 243 | BudouX supports HTML inputs and outputs HTML strings with markup applied to wrap 244 | phrases, but it's not meant to be used as an HTML sanitizer. 245 | **BudouX doesn't sanitize any inputs.** 246 | Malicious HTML inputs yield malicious HTML outputs. 247 | Please use it with an appropriate sanitizer library if you don't trust the input. 248 | 249 | ## Author 250 | 251 | [Shuhei Iitsuka](https://tushuhei.com) 252 | 253 | ## Disclaimer 254 | 255 | This is not an officially supported Google product. 
256 | -------------------------------------------------------------------------------- /javascript/bin/budoux.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * @license 4 | * Copyright 2021 Google LLC 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | const {cli} = require('../dist/cli'); 19 | cli(process.argv); 20 | -------------------------------------------------------------------------------- /javascript/eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import prettier from "eslint-plugin-prettier"; 2 | import tsParser from "@typescript-eslint/parser"; 3 | import path from "node:path"; 4 | import { fileURLToPath } from "node:url"; 5 | import js from "@eslint/js"; 6 | import { FlatCompat } from "@eslint/eslintrc"; 7 | 8 | const __filename = fileURLToPath(import.meta.url); 9 | const __dirname = path.dirname(__filename); 10 | const compat = new FlatCompat({ 11 | baseDirectory: __dirname, 12 | recommendedConfig: js.configs.recommended, 13 | allConfig: js.configs.all 14 | }); 15 | 16 | export default [{ 17 | ignores: ["**/bundle", "**/dist", "**/module", "src/data"], 18 | }, ...compat.extends("eslint:recommended", "prettier"), { 19 | plugins: { 20 | prettier, 21 | }, 22 | 23 | rules: { 24 | "prettier/prettier": "error", 25 | "block-scoped-var": "error", 26 | eqeqeq: "error", 27 | "no-var": 
"error", 28 | "prefer-const": "error", 29 | "eol-last": "error", 30 | "prefer-arrow-callback": "error", 31 | "no-trailing-spaces": "error", 32 | 33 | quotes: ["warn", "single", { 34 | avoidEscape: true, 35 | }], 36 | 37 | "no-restricted-properties": ["error", { 38 | object: "describe", 39 | property: "only", 40 | }, { 41 | object: "it", 42 | property: "only", 43 | }], 44 | }, 45 | }, ...compat.extends("plugin:@typescript-eslint/recommended").map(config => ({ 46 | ...config, 47 | files: ["**/*.ts", "**/*.tsx"], 48 | })), { 49 | files: ["**/*.ts", "**/*.tsx"], 50 | 51 | languageOptions: { 52 | parser: tsParser, 53 | ecmaVersion: 2018, 54 | sourceType: "module", 55 | }, 56 | 57 | rules: { 58 | "@typescript-eslint/no-non-null-assertion": "off", 59 | "@typescript-eslint/no-use-before-define": "off", 60 | "@typescript-eslint/no-warning-comments": "off", 61 | "@typescript-eslint/no-empty-function": "off", 62 | "@typescript-eslint/no-var-requires": "off", 63 | "@typescript-eslint/explicit-function-return-type": "off", 64 | "@typescript-eslint/explicit-module-boundary-types": "off", 65 | "@typescript-eslint/ban-types": "off", 66 | "@typescript-eslint/camelcase": "off", 67 | "node/no-empty-function": "off", 68 | "node/no-missing-import": "off", 69 | "node/no-unsupported-features/es-syntax": "off", 70 | "node/no-missing-require": "off", 71 | "node/shebang": "off", 72 | "no-dupe-class-members": "off", 73 | "require-atomic-updates": "off", 74 | }, 75 | }]; -------------------------------------------------------------------------------- /javascript/karma.conf.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | module.exports = function (config) { 18 | config.set({ 19 | basePath: '', 20 | frameworks: ['jasmine'], 21 | files: ['bundle/tests/*.js'], 22 | reporters: ['progress'], 23 | port: 9876, 24 | colors: true, 25 | logLevel: config.LOG_INFO, 26 | autoWatch: false, 27 | browsers: ['ChromeHeadless'], 28 | singleRun: true, 29 | concurrency: Infinity, 30 | }); 31 | }; 32 | -------------------------------------------------------------------------------- /javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "budoux", 3 | "version": "0.7.0", 4 | "description": "A small chunk segmenter.", 5 | "author": "Shuhei Iitsuka", 6 | "license": "Apache-2.0", 7 | "repository": { 8 | "type": "git", 9 | "url": "https://github.com/google/budoux.git", 10 | "directory": "javascript" 11 | }, 12 | "main": "./dist/index.js", 13 | "module": "./module/index.js", 14 | "exports": { 15 | ".": { 16 | "import": "./module/index.js", 17 | "require": "./dist/index.js" 18 | } 19 | }, 20 | "browser": { 21 | "./dist/dom.js": "./dist/dom-browser.js", 22 | "./module/dom.js": "./module/dom-browser.js", 23 | "./dist/tests/testutils.js": "./dist/tests/testutils-browser.js", 24 | "./module/tests/testutils.js": "./module/tests/testutils-browser.js" 25 | }, 26 | "bin": { 27 | "budoux": "./bin/budoux.js" 28 | }, 29 | "sideEffects": [ 30 | "./module/webcomponents/*", 31 | "./module/tests/*" 32 | ], 33 | "scripts": { 34 | "build": "npm run build:esm && npm run build:cjs", 35 | "build:cjs": 
"tsc && cp -r src/tests/models/ dist/tests/models/", 36 | "build:esm": "tsc --outDir module --module ES2020 && cp -r src/tests/models/ module/tests/models/", 37 | "bundle": "npm run bundle:webcomponents && npm run bundle:test", 38 | "bundle:test": "esbuild module/tests/index.browser.js --bundle --sourcemap --outfile=bundle/tests/index.browser.js", 39 | "bundle:webcomponents": "npm run bundle:webcomponents:ja && npm run bundle:webcomponents:zh-hans && npm run bundle:webcomponents:zh-hant && npm run bundle:webcomponents:th", 40 | "bundle:webcomponents:ja": "esbuild module/webcomponents/budoux-ja.js --bundle --minify --sourcemap --outfile=bundle/budoux-ja.min.js", 41 | "bundle:webcomponents:zh-hans": "esbuild module/webcomponents/budoux-zh-hans.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hans.min.js", 42 | "bundle:webcomponents:zh-hant": "esbuild module/webcomponents/budoux-zh-hant.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hant.min.js", 43 | "bundle:webcomponents:th": "esbuild module/webcomponents/budoux-th.js --bundle --minify --sourcemap --outfile=bundle/budoux-th.min.js", 44 | "clean": "rm -rf dist module src/data", 45 | "copy": "node ./scripts/copy-data.js", 46 | "prebuild": "npm run clean && npm run copy", 47 | "prepare": "npm run clean && npm run copy && npm run build && npm run bundle", 48 | "pretest": "npm run build && npm run bundle:test", 49 | "test": "npm run test:jasmine && npm run test:karma && npm run test:cli-version", 50 | "test:cli-version": "node ./scripts/check-cli-version.js", 51 | "test:jasmine": "jasmine dist/tests/index.node.js", 52 | "test:karma": "karma start", 53 | "lint": "eslint src/** --no-error-on-unmatched-pattern", 54 | "fix": "eslint src/** --no-error-on-unmatched-pattern --fix" 55 | }, 56 | "devDependencies": { 57 | "@eslint/eslintrc": "^3.1.0", 58 | "@eslint/js": "^9.9.0", 59 | "@types/jasmine": "^5.1.0", 60 | "@types/node": "^22.0.0", 61 | "@typescript-eslint/eslint-plugin": "^8.0.1", 62 | 
"esbuild": "^0.25.0", 63 | "eslint": "^9.9.0", 64 | "eslint-config-prettier": "^10.0.1", 65 | "eslint-plugin-prettier": "^5.0.0", 66 | "jasmine": "^5.0.2", 67 | "jasmine-core": "^5.0.1", 68 | "karma": "^6.4.2", 69 | "karma-chrome-launcher": "^3.2.0", 70 | "karma-jasmine": "^5.1.0", 71 | "typescript": "^5.1.6" 72 | }, 73 | "dependencies": { 74 | "commander": "^13.0.0", 75 | "linkedom": "^0.18.7" 76 | }, 77 | "overrides": { 78 | "tough-cookie": "^5.0.0-rc.4" 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /javascript/scripts/check-cli-version.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | const assert = require('assert'); 18 | const path = require('path'); 19 | const childProcess = require('child_process'); 20 | const package = require('../package.json'); 21 | 22 | const packageVersion = package.version; 23 | const runCli = args => 24 | new Promise(resolve => { 25 | childProcess.execFile( 26 | 'node', 27 | [path.resolve(__dirname, '..', 'bin', 'budoux.js'), ...args], 28 | (error, stdout, stderr) => { 29 | resolve({ 30 | error, 31 | stdout, 32 | stderr, 33 | }); 34 | } 35 | ); 36 | }); 37 | 38 | runCli(['-V']).then(({stdout}) => { 39 | assert.equal( 40 | stdout.replace('\n', ''), 41 | packageVersion, 42 | 'Package version and CLI version output (-V) should match.' 43 | ); 44 | }); 45 | 46 | runCli(['--version']).then(({stdout}) => { 47 | assert.equal( 48 | stdout.replace('\n', ''), 49 | packageVersion, 50 | 'Package version and CLI version output (--version) should match.' 51 | ); 52 | }); 53 | -------------------------------------------------------------------------------- /javascript/scripts/copy-data.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
const path = require('path');
const fs = require('fs');

// Repository root: two levels up from javascript/scripts/.
const PROJECT_ROOT = path.join(__dirname, '..', '..');
// Generated TypeScript model modules are written under this directory.
const DATA_DIR = path.join(PROJECT_ROOT, 'javascript', 'src', 'data');
fs.mkdirSync(path.join(DATA_DIR, 'models'), {recursive: true});

/**
 * Converts every JSON model under budoux/models into a typed TypeScript
 * module in DATA_DIR/models so the models can be imported directly from
 * TypeScript sources.
 */
const copyModels = () => {
  const sourceDir = path.join(PROJECT_ROOT, 'budoux', 'models');
  for (const filename of fs.readdirSync(sourceDir)) {
    // Only JSON model files are converted; anything else is ignored.
    if (path.extname(filename) !== '.json') continue;
    const stem = path.basename(filename, '.json');
    const json = fs.readFileSync(path.join(sourceDir, filename));
    // Embed the raw JSON as the initializer of a typed export.
    fs.writeFileSync(
      path.join(DATA_DIR, 'models', `${stem}.ts`),
      `export const model: {[key:string]: {[key:string]: number}} = ${json}`
    );
  }
};

const main = () => {
  copyModels();
};

main();
import {readFileSync} from 'fs';
import * as path from 'path';
import * as readline from 'readline';
import {Command} from 'commander';
import {
  HTMLProcessingParser,
  loadDefaultParsers,
  loadDefaultJapaneseParser,
} from './index.js';

// Kept in sync with package.json; scripts/check-cli-version.js asserts that
// `budoux -V` prints the same version as the package manifest.
const CLI_VERSION = '0.7.0';
const defaultParsers = loadDefaultParsers();

/**
 * Run the command line interface program.
 * @param argv process.argv.
 */
export const cli = (argv: string[]) => {
  const program = new Command('budoux');

  program.usage('[-h] [-H] [-d STR] [-s STR] [-m JSON] [-l LANG] [-V] [TXT]');
  program.description(
    'BudouX is the successor to Budou, the machine learning powered line break organizer tool.'
  );
  // NOTE(review): the value placeholders in the option flag strings below
  // (e.g. '<str>', '<json>', '<lang>' — cf. the usage string's STR/JSON/LANG)
  // appear to have been stripped during text extraction; confirm against the
  // repository, since commander needs them for value-taking options.
  program
    .option('-H, --html', 'HTML mode', false)
    .option(
      '-d, --delim ',
      'output sentence delimiter in TEXT mode',
      '---'
    )
    .option('-s, --sep ', 'output phrase separator in TEXT mode', '\n')
    .option('-m, --model ', 'model file path')
    .option(
      '-l, --lang ',
      `language model to use. -m and --model will be prioritized if any.\navailable languages: ${[
        ...defaultParsers.keys(),
      ].join(', ')}`
    )
    .argument('[txt]', 'text')
    // Accept extra positionals here so the switch below can raise its own,
    // clearer "Too many arguments" error instead of commander's.
    .allowExcessArguments();

  program.version(CLI_VERSION);

  program.parse(argv);

  const options = program.opts();
  const {lang, model, delim, sep, html} = options as {
    html: boolean;
    delim: string;
    sep: string;
    model?: string;
    lang?: string;
  };
  const {args} = program;

  // Parser precedence: explicit model file (-m/--model), then a recognized
  // --lang, then the default Japanese parser.
  const parser = model
    ? loadCustomParser(model)
    : lang && defaultParsers.has(lang)
      ? defaultParsers.get(lang)!
      : loadDefaultJapaneseParser();

  switch (args.length) {
    case 0: {
      // No positional text: accumulate all of stdin line by line and process
      // it as a single input once the stream ends.
      const rl = readline.createInterface({
        input: process.stdin,
      });

      let stdin = '';
      rl.on('line', line => {
        stdin += line + '\n';
      });
      process.stdin.on('end', () => {
        outputParsedTexts(parser, html, delim, sep, [stdin]);
      });
      break;
    }
    case 1: {
      outputParsedTexts(parser, html, delim, sep, args);
      break;
    }
    default: {
      throw new Error(
        'Too many arguments. Please, pass the only one argument.'
      );
    }
  }
};

/**
 * Prints the parsed texts to stdout.
 * @param parser A parser.
 * @param html A flag of html output mode.
 * @param delim A delimiter to separate output sentence.
 * @param sep A separator to separate output phrases.
 * @param args string array to parse. Array should have only one element.
 */
const outputParsedTexts = (
  parser: HTMLProcessingParser,
  html: boolean,
  delim: string,
  sep: string,
  args: string[]
) => {
  if (html) {
    const text = args[0];
    const output = parser.translateHTMLString(text);
    console.log(output);
  } else {
    // TEXT mode: each non-empty input line is parsed independently; phrases
    // are joined with `sep` and lines are separated by a `delim` line.
    const splitedTextsByNewLine = args[0]
      .split(/\r?\n/)
      .filter(text => text !== '');
    splitedTextsByNewLine.forEach((text, index) => {
      const parsedTexts = parser.parse(text);
      console.log(parsedTexts.join(sep));
      // No trailing delimiter after the last line.
      if (index + 1 !== splitedTextsByNewLine.length) console.log(delim);
    });
  }
};
/**
 * Loads a parser equipped with custom model.
 * @return A parser with the loaded model.
 */
const loadCustomParser = (modelPath: string) => {
  // Resolve relative paths against the current working directory, then parse
  // the JSON model file into the structure HTMLProcessingParser expects.
  const file = readFileSync(path.resolve(modelPath)).toString();
  const model = JSON.parse(file);
  return new HTMLProcessingParser(model);
};

// --- javascript/src/dom-browser.ts ---

/**
 * Parses an html string and returns a parsed html document.
 * Browser variant: uses the native window.DOMParser (dom.ts is the Node
 * counterpart backed by linkedom).
 * @param html An HTML string.
 * @return A Document.
 */
export const parseFromString = (html: string) => {
  return new window.DOMParser().parseFromString(html, 'text/html');
};
import {DOMParser} from 'linkedom';

/**
 * Parses an html string and returns a parsed html document.
 * Node variant: backed by linkedom's DOMParser (dom-browser.ts is the
 * browser counterpart using window.DOMParser).
 * @param html An HTML string.
 * @return A Document.
 */
export const parseFromString = (html: string) => {
  // NOTE(review): the template literal wrapping `html` looks like it lost
  // surrounding document markup (e.g. a `<!DOCTYPE html>…` shell) during
  // text extraction — confirm against the repository before relying on it.
  return new DOMParser().parseFromString(
    `${html}`,
    'text/html'
  );
};
15 | */ 16 | 17 | import {model as jaModel} from './data/models/ja.js'; 18 | import {model as zhHansModel} from './data/models/zh-hans.js'; 19 | import {model as zhHantModel} from './data/models/zh-hant.js'; 20 | import {model as thModel} from './data/models/th.js'; 21 | import {HTMLProcessingParser} from './html_processor.js'; 22 | 23 | export {Parser} from './parser.js'; 24 | export {HTMLProcessor, HTMLProcessingParser} from './html_processor.js'; 25 | export {jaModel, zhHansModel, zhHantModel}; 26 | 27 | /** 28 | * Loads a parser equipped with the default Japanese model. 29 | * @return A parser with the default Japanese model. 30 | */ 31 | export const loadDefaultJapaneseParser = () => { 32 | return new HTMLProcessingParser(jaModel); 33 | }; 34 | 35 | /** 36 | * Loads a parser equipped with the default Simplified Chinese model. 37 | * @return A parser with the default Simplified Chinese model. 38 | */ 39 | export const loadDefaultSimplifiedChineseParser = () => { 40 | return new HTMLProcessingParser(zhHansModel); 41 | }; 42 | 43 | /** 44 | * Loads a parser equipped with the default Traditional Chinese model. 45 | * @return A parser with the default Traditional Chinese model. 46 | */ 47 | export const loadDefaultTraditionalChineseParser = () => { 48 | return new HTMLProcessingParser(zhHantModel); 49 | }; 50 | 51 | /** 52 | * Loads a parser equipped with the default Thai model. 53 | * @returns A parser with the default Thai model. 54 | */ 55 | export const loadDefaultThaiParser = () => { 56 | return new HTMLProcessingParser(thModel); 57 | }; 58 | /** 59 | * Loads available default parsers. 60 | * @return A map between available lang codes and their default parsers. 
61 | */ 62 | export const loadDefaultParsers = () => { 63 | return new Map([ 64 | ['ja', loadDefaultJapaneseParser()], 65 | ['zh-hans', loadDefaultSimplifiedChineseParser()], 66 | ['zh-hant', loadDefaultTraditionalChineseParser()], 67 | ['th', loadDefaultThaiParser()], 68 | ]); 69 | }; 70 | -------------------------------------------------------------------------------- /javascript/src/parser.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /** 18 | * Base BudouX parser. 19 | */ 20 | export class Parser { 21 | /** BudouX model data */ 22 | private readonly model: Map>; 23 | private readonly baseScore: number; 24 | 25 | /** 26 | * Constructs a BudouX parser. 27 | * @param model A model data. 28 | */ 29 | constructor(model: {[key: string]: {[key: string]: number}}) { 30 | this.model = new Map( 31 | Object.entries(model).map(([k, v]) => [k, new Map(Object.entries(v))]) 32 | ); 33 | this.baseScore = 34 | -0.5 * 35 | [...this.model.values()] 36 | .map(group => [...group.values()]) 37 | .flat() 38 | .reduce((prev, curr) => prev + curr, 0); 39 | } 40 | 41 | /** 42 | * Parses the input sentence and returns a list of semantic chunks. 43 | * 44 | * @param sentence An input sentence. 45 | * @return The retrieved chunks. 
46 | */ 47 | parse(sentence: string): string[] { 48 | if (sentence === '') return []; 49 | const boundaries = this.parseBoundaries(sentence); 50 | const result = []; 51 | let start = 0; 52 | for (const boundary of boundaries) { 53 | result.push(sentence.slice(start, boundary)); 54 | start = boundary; 55 | } 56 | result.push(sentence.slice(start)); 57 | return result; 58 | } 59 | 60 | /** 61 | * Parses the input sentence and returns a list of boundaries. 62 | * 63 | * @param sentence An input sentence. 64 | * @return The list of boundaries. 65 | */ 66 | parseBoundaries(sentence: string): number[] { 67 | const result = []; 68 | 69 | for (let i = 1; i < sentence.length; i++) { 70 | let score = this.baseScore; 71 | // NOTE: Score values in models may be negative. 72 | /* eslint-disable */ 73 | score += this.model.get('UW1')?.get(sentence.substring(i - 3, i - 2)) || 0; 74 | score += this.model.get('UW2')?.get(sentence.substring(i - 2, i - 1)) || 0; 75 | score += this.model.get('UW3')?.get(sentence.substring(i - 1, i)) || 0; 76 | score += this.model.get('UW4')?.get(sentence.substring(i, i + 1)) || 0; 77 | score += this.model.get('UW5')?.get(sentence.substring(i + 1, i + 2)) || 0; 78 | score += this.model.get('UW6')?.get(sentence.substring(i + 2, i + 3)) || 0; 79 | score += this.model.get('BW1')?.get(sentence.substring(i - 2, i)) || 0; 80 | score += this.model.get('BW2')?.get(sentence.substring(i - 1, i + 1)) || 0; 81 | score += this.model.get('BW3')?.get(sentence.substring(i, i + 2)) || 0; 82 | score += this.model.get('TW1')?.get(sentence.substring(i - 3, i)) || 0; 83 | score += this.model.get('TW2')?.get(sentence.substring(i - 2, i + 1)) || 0; 84 | score += this.model.get('TW3')?.get(sentence.substring(i - 1, i + 2)) || 0; 85 | score += this.model.get('TW4')?.get(sentence.substring(i, i + 3)) || 0; 86 | /* eslint-enable */ 87 | if (score > 0) result.push(i); 88 | } 89 | return result; 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /javascript/src/tests/index.browser.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import './test_html_processor.js'; 18 | import './test_parser.js'; 19 | import './test_webcomponents.js'; 20 | -------------------------------------------------------------------------------- /javascript/src/tests/index.node.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | import './test_cli.js'; 18 | import './test_html_processor.js'; 19 | import './test_parser.js'; 20 | -------------------------------------------------------------------------------- /javascript/src/tests/models/separate_right_before_a.json: -------------------------------------------------------------------------------- 1 | {"UW4": {"a": 1001}} 2 | -------------------------------------------------------------------------------- /javascript/src/tests/test_cli.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
import {cli} from '../cli.js';
import {execFile, ExecFileException} from 'child_process';
import * as path from 'path';
import stream from 'stream';
import {loadDefaultParsers} from '../index.js';

// NOTE(review): several expected-output string literals in this file appear
// to have had HTML tags stripped during text extraction (e.g. the --html
// expectations, which should contain the wrapping element emitted by
// translateHTMLString). Verify the literals against the repository.

/** Result of one CLI invocation via execFile. */
type execFileCallBack = {
  error: ExecFileException | null;
  stdout: string;
  stderr: string;
};

/**
 * Runs bin/budoux.js in a child Node process.
 * @param args CLI arguments.
 * @param stdin Optional text piped to the child's stdin.
 * @return The spawn error (null on success) and captured output streams.
 */
const runCli = (args: string[], stdin?: string): Promise<execFileCallBack> => {
  return new Promise(resolve => {
    const binPath = path.resolve('./bin/budoux.js');
    const child = execFile(
      'node',
      [binPath, ...args],
      (error, stdout, stderr) => {
        resolve({
          error,
          stdout,
          stderr,
        });
      }
    );

    if (stdin) {
      // Feed the requested text through a readable stream so the CLI's
      // stdin-reading branch (no positional argument) is exercised.
      const stdinStream = new stream.Readable();
      stdinStream.push(stdin);
      stdinStream.push(null);
      if (child.stdin) {
        stdinStream.pipe(child.stdin);
      }
    }
  });
};

describe('cli', () => {
  // --- HTML mode (-H/--html) ---
  it('should output the wrapped HTML sentence when execute budoux command with --html option.', async () => {
    const inputText = '今日は天気です。';
    const argv = ['--html', inputText];
    const expectedStdOut =
      '今日は\u200B天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the wrapped HTML sentence when execute budoux command with -H option alias.', async () => {
    const inputText = '今日は天気です。';
    const argv = ['-H', inputText];
    const expectedStdOut =
      '今日は\u200B天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Custom model (-m/--model) ---
  it('should output the separated sentence with custom model when execute budoux command with --model option.', async () => {
    const inputText = 'abcdeabcd';
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = ['--model', customModelPath, inputText];
    const expectedStdOut = 'abcde\nabcd';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with custom model when execute budoux command with -m option alias.', async () => {
    const inputText = 'abcdeabcd';
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = ['-m', customModelPath, inputText];
    const expectedStdOut = 'abcde\nabcd';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Language selection (-l/--lang); expectations are derived from the
  // corresponding default parser so they track the bundled models. ---
  it('should use the corresponding language model when the -l parameter is given.', async () => {
    const inputTextHans = '我们的使命是整合全球信息,供大众使用,让人人受益。';
    const expectedStdOut = loadDefaultParsers()
      .get('zh-hans')!
      .parse(inputTextHans)
      .join('\n');
    const argv = ['-l', 'zh-hans', inputTextHans];
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should use the corresponding language model when the --lang parameter is given.', async () => {
    const inputTextHans = '我們的使命是匯整全球資訊,供大眾使用,使人人受惠。';
    const expectedStdOut = loadDefaultParsers()
      .get('zh-hant')!
      .parse(inputTextHans)
      .join('\n');
    const argv = ['--lang', 'zh-hant', inputTextHans];
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should prioritize -m and --model over -l and --lang', async () => {
    const inputTextHans = '我們的使a命';
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = [
      '--model',
      customModelPath,
      '--lang',
      'zh-hant',
      inputTextHans,
    ];
    const expectedStdOut = '我們的使\na命';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Sentence delimiter (-d/--delim) between input lines ---
  it('should output the separated sentence with separater when execute budoux command with --delim option.', async () => {
    const inputText = '今日は天気です。\n明日は雨かな?';
    const argv = ['--delim', '###', inputText];
    const expectedStdOut = '今日は\n天気です。\n###\n明日は\n雨かな?';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with separater when execute budoux command with -d option alias.', async () => {
    const inputText = '今日は天気です。\n明日は雨かな?';
    const argv = ['-d', '###', inputText];
    const expectedStdOut = '今日は\n天気です。\n###\n明日は\n雨かな?';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Reading from stdin (no positional argument) ---
  it('should output the separated sentence with separater when execute budoux with stdin inputed by pipe', async () => {
    const {stdout} = await runCli([], '今日は天気です。\n明日は雨かな?');
    const expectedStdOut = '今日は\n天気です。\n---\n明日は\n雨かな?';
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Phrase separator (-s/--sep) ---
  it('should output phrases with the separator specified by -s option', async () => {
    const inputText = '今日は天気です。';
    const argv = ['-s', '/', inputText];
    const expectedStdOut = '今日は/天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output phrases with the separator specified by --sep option', async () => {
    const inputText = '今日は天気です。';
    const argv = ['--sep', '/', inputText];
    const expectedStdOut = '今日は/天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Error handling: these call cli() in-process to assert the thrown
  // message rather than spawning a child. ---
  it('should output the error message when get more than one text argument.', () => {
    const argv = [
      'node',
      'budoux',
      '今日は天気です。',
      '明日は晴れるでしょう。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  it('should output the error message when get extra option argument.', () => {
    const argv = [
      'node',
      'budoux',
      '--delim',
      '---',
      '',
      '今日は天気です。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  it('should output the error message when get extra option argument.', () => {
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = [
      'node',
      'budoux',
      '--model',
      customModelPath,
      '',
      '今日は天気です。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  it('should output the unknown option error when execute budoux command with -v option.', async () => {
    // -v is not an alias for -V; commander should reject it.
    const {stderr} = await runCli(['-v']);

    expect(stderr).toBe("error: unknown option '-v'\n");
  });
});
import {Parser} from '../parser.js';

describe('Parser.parse', () => {
  // 'a' occurs at indices 0 and 5; 'b' at indices 1 and 6.
  const TEST_SENTENCE = 'abcdeabcd';

  it('should separate if a strong feature item supports.', () => {
    const model = {
      UW4: {a: 10000}, // means "should separate right before 'a'".
    };
    const parser = new Parser(model);
    const result = parser.parse(TEST_SENTENCE);
    expect(result).toEqual(['abcde', 'abcd']);
  });

  it('should separate even if it makes a phrase of one character.', () => {
    const model = {
      UW4: {b: 10000}, // means "should separate right before 'b'".
    };
    const parser = new Parser(model);
    const result = parser.parse(TEST_SENTENCE);
    // Breaking before every 'b' yields a leading single-character phrase.
    expect(result).toEqual(['a', 'bcdea', 'bcd']);
  });

  it('should return an empty list when the input is a blank string.', () => {
    const parser = new Parser({});
    const result = parser.parse('');
    expect(result).toEqual([]);
  });
});
import '../webcomponents/budoux-ja.js';

// NOTE(review): some expected innerHTML literals below appear to have had
// HTML tags stripped during text extraction (e.g. the "HTML inputs" case,
// which builds a <b> child element); verify against the repository.
describe('Web Components', () => {
  beforeAll(async () => {
    // Ensure the custom element registered by the import above is ready.
    await window.customElements.whenDefined('budoux-ja');
  });

  beforeEach(() => {
    // Isolate each spec from elements appended by previous ones.
    window.document.body.innerText = '';
  });

  it('should process the provided text.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = '今日は良い天気です。';
    window.document.body.appendChild(budouxElement);

    // Zero-width spaces (\u200B) mark the break opportunities.
    expect(budouxElement.innerHTML).toBe('今日は\u200B良い\u200B天気です。');
  });

  it('should react to text content changes after attached.', resolve => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = '今日は良い天気です。';
    window.document.body.appendChild(budouxElement);

    // The element re-parses on mutation; observe its DOM to detect that.
    const observer = new window.MutationObserver(() => {
      expect(budouxElement.innerHTML).toBe('明日は\u200B晴れるかな?');
      resolve();
    });
    observer.observe(budouxElement, {
      childList: true,
    });
    budouxElement.textContent = '明日は晴れるかな?';
  });

  it('should work with HTML inputs.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.appendChild(window.document.createTextNode('昨日は'));
    const b = window.document.createElement('b');
    b.textContent = '雨';
    budouxElement.appendChild(b);
    budouxElement.appendChild(window.document.createTextNode('でした。'));
    window.document.body.appendChild(budouxElement);
    expect(budouxElement.innerHTML).toBe('昨日は\u200B雨でした。');
  });

  it('should have wrapping styles to control line breaks.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = 'Hello world';
    window.document.body.appendChild(budouxElement);
    const styles = budouxElement.computedStyleMap();
    expect(styles.get('word-break')?.toString()).toBe('keep-all');
    expect(styles.get('overflow-wrap')?.toString()).toBe('anywhere');
  });
});
import {parseHTML} from 'linkedom';

/**
 * Sets an innerHTML on a given Element.
 * @param element An Element.
 * @param html An HTML string to set.
 */
export const setInnerHtml = (element: Element, html: string) => {
  element.innerHTML = html;
};

/**
 * Creates an HTML document.
 * Node variant: backed by linkedom (testutils-browser.ts returns the real
 * window.document instead).
 * @returns Document
 */
export const createDocument = () => {
  // NOTE(review): the parseHTML('') argument may have lost document markup
  // during text extraction — confirm against the repository.
  const {document} = parseHTML('');
  return document;
};

/**
 * Whether the running environment is a Web browser.
 * This is the Node test-utils module, so always false.
 */
export const isBrowser = false;
15 | */ 16 | 17 | import {applyWrapStyle, type HTMLProcessingParser} from '../html_processor.js'; 18 | 19 | const MUTATION_OBSERVER_OPTIONS = { 20 | attributes: false, 21 | characterData: true, 22 | childList: true, 23 | subtree: true, 24 | }; 25 | 26 | /** 27 | * Base BudouX Web component. 28 | */ 29 | export abstract class BudouXBaseElement extends HTMLElement { 30 | abstract parser: HTMLProcessingParser; 31 | observer: MutationObserver; 32 | 33 | /** 34 | * Base BudouX Web component constructor. 35 | */ 36 | constructor() { 37 | super(); 38 | 39 | this.observer = new MutationObserver(this.sync.bind(this)); 40 | this.observer.observe(this, MUTATION_OBSERVER_OPTIONS); 41 | } 42 | 43 | connectedCallback() { 44 | applyWrapStyle(this); 45 | this.sync(); 46 | } 47 | 48 | attributeChangedCallback() { 49 | this.sync(); 50 | } 51 | 52 | sync() { 53 | this.observer.disconnect(); 54 | this.parser.applyToElement(this); 55 | this.observer.observe(this, MUTATION_OBSERVER_OPTIONS); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /javascript/src/webcomponents/budoux-ja.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | import { 18 | type HTMLProcessingParser, 19 | loadDefaultJapaneseParser, 20 | } from '../index.js'; 21 | import {BudouXBaseElement} from './budoux-base.js'; 22 | 23 | /** 24 | * BudouX Japanese Web component. 25 | */ 26 | export class BudouXJapaneseElement extends BudouXBaseElement { 27 | parser: HTMLProcessingParser; 28 | 29 | /** 30 | * BudouX Japanese Web component constructor. 31 | */ 32 | constructor() { 33 | super(); 34 | this.parser = loadDefaultJapaneseParser(); 35 | } 36 | } 37 | 38 | customElements.define('budoux-ja', BudouXJapaneseElement); 39 | -------------------------------------------------------------------------------- /javascript/src/webcomponents/budoux-th.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import {type HTMLProcessingParser, loadDefaultThaiParser} from '../index.js'; 18 | import {BudouXBaseElement} from './budoux-base.js'; 19 | 20 | /** 21 | * BudouX Thai Web component. 22 | */ 23 | export class BudouXThaiElement extends BudouXBaseElement { 24 | parser: HTMLProcessingParser; 25 | 26 | /** 27 | * BudouX Thai Web component constructor. 
28 | */ 29 | constructor() { 30 | super(); 31 | this.parser = loadDefaultThaiParser(); 32 | } 33 | } 34 | 35 | customElements.define('budoux-th', BudouXThaiElement); 36 | -------------------------------------------------------------------------------- /javascript/src/webcomponents/budoux-zh-hans.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import { 18 | type HTMLProcessingParser, 19 | loadDefaultSimplifiedChineseParser, 20 | } from '../index.js'; 21 | import {BudouXBaseElement} from './budoux-base.js'; 22 | 23 | /** 24 | * BudouX Simplified Chinese Web component. 25 | */ 26 | export class BudouXSimplifiedChineseElement extends BudouXBaseElement { 27 | parser: HTMLProcessingParser; 28 | 29 | /** 30 | * BudouX Simplified Chinese Web component constructor. 
31 | */ 32 | constructor() { 33 | super(); 34 | this.parser = loadDefaultSimplifiedChineseParser(); 35 | } 36 | } 37 | 38 | customElements.define('budoux-zh-hans', BudouXSimplifiedChineseElement); 39 | -------------------------------------------------------------------------------- /javascript/src/webcomponents/budoux-zh-hant.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2022 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import { 18 | type HTMLProcessingParser, 19 | loadDefaultTraditionalChineseParser, 20 | } from '../index.js'; 21 | import {BudouXBaseElement} from './budoux-base.js'; 22 | 23 | /** 24 | * BudouX Traditional Chinese Web component. 25 | */ 26 | export class BudouXTraditionalChineseElement extends BudouXBaseElement { 27 | parser: HTMLProcessingParser; 28 | 29 | /** 30 | * BudouX Traditional Chinese Web component constructor. 
31 | */ 32 | constructor() { 33 | super(); 34 | this.parser = loadDefaultTraditionalChineseParser(); 35 | } 36 | } 37 | 38 | customElements.define('budoux-zh-hant', BudouXTraditionalChineseElement); 39 | -------------------------------------------------------------------------------- /javascript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "allowUnreachableCode": false, 4 | "allowUnusedLabels": false, 5 | "lib": ["es6", "dom", "dom.iterable"], 6 | "target": "es2017", 7 | "module": "commonjs", 8 | "moduleResolution": "node", 9 | "noEmitOnError": true, 10 | "noFallthroughCasesInSwitch": true, 11 | "noImplicitReturns": true, 12 | "pretty": true, 13 | "resolveJsonModule": true, 14 | "declaration": true, 15 | "sourceMap": true, 16 | "esModuleInterop": true, 17 | "forceConsistentCasingInFileNames": true, 18 | "strict": true, 19 | "skipLibCheck": true, 20 | "outDir": "./dist" 21 | }, 22 | "exclude": [ 23 | "node_modules" 24 | ], 25 | "include": ["src/**/*.ts"] 26 | } 27 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["wheel", "setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/build_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with 
# the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Builds a model from the learned weights.

This script outputs a model file in JSON format from the learned weights file
output by the `train.py` script.
"""

import argparse
import json
import typing


def aggregate_scores(
    weights: typing.List[str]) -> typing.Dict[str, typing.Dict[str, float]]:
  """Exports the model by aggregating the weight scores.

  Args:
    weights (List[str]): The lines of the exported weight score file, each of
      the form `GROUP:CONTENT<TAB>SCORE`. Blank lines are skipped.

  Returns:
    model (Dict[str, Dict[str, float]]): The aggregated model, keyed first by
    feature group and then by feature content.
  """
  decision_trees: typing.Dict[str, typing.Dict[str, float]] = {}
  for row in weights:
    row = row.strip()
    if not row:
      continue
    # Split the row once (the previous version split the same row twice).
    columns = row.split('\t')
    # Split on the first colon only: the feature content itself may contain
    # colons (e.g. bigram features).
    feature_group, feature_content = columns[0].split(':', 1)
    score = float(columns[1])
    group = decision_trees.setdefault(feature_group, {})
    group[feature_content] = group.get(feature_content, 0) + score
  return decision_trees


def round_model(model: typing.Dict[str, typing.Dict[str, float]],
                scale: int) -> typing.Dict[str, typing.Dict[str, int]]:
  """Rounds the scores in the model to integers after scaling.

  Args:
    model (Dict[str, Dict[str, float]]): The model to round scores.
    scale (int): A scale factor to multiply scores.

  Returns:
    model_rounded (Dict[str, Dict[str, int]]): The rounded model. Features
    whose scaled score truncates to zero are dropped since they cannot affect
    the parser's output.
  """
  model_rounded: typing.Dict[str, typing.Dict[str, int]] = {}
  for feature_group, features in model.items():
    for feature_content, score in features.items():
      scaled_score = int(score * scale)
      if scaled_score != 0:
        model_rounded.setdefault(feature_group,
                                 {})[feature_content] = scaled_score
  return model_rounded


def parse_args(
    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'weight_file', help='A file path for the learned weights.')
  parser.add_argument(
      '-o',
      '--outfile',
      help='A file path to export a model file. (default: model.json)',
      default='model.json',
      type=str)
  parser.add_argument(
      '--scale',
      help='A scale factor for the output scores',
      default=1000,
      type=int)
  if test is None:
    return parser.parse_args()
  return parser.parse_args(test)


def main() -> None:
  """Reads the weights file, aggregates and rounds it, and writes JSON."""
  args = parse_args()
  weights_filename = args.weight_file
  model_filename = args.outfile
  scale = args.scale
  with open(weights_filename) as f:
    weights = f.readlines()
  model = aggregate_scores(weights)
  model_rounded = round_model(model, scale)
  with open(model_filename, 'w', encoding='utf-8') as f:
    # Compact separators keep the shipped model file small.
    json.dump(model_rounded, f, ensure_ascii=False, separators=(',', ':'))
  print('Model file is exported as', model_filename)


if __name__ == '__main__':
  main()
14 | """Encodes the training data with extracted features.""" 15 | 16 | import argparse 17 | import functools 18 | import itertools 19 | import multiprocessing 20 | import os 21 | import sys 22 | import typing 23 | 24 | # module hack 25 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..') 26 | sys.path.insert(0, os.path.abspath(LIB_PATH)) 27 | 28 | from budoux import utils # noqa (module hack) 29 | 30 | ArgList = typing.Optional[typing.List[str]] 31 | DEFAULT_OUTPUT_FILENAME = 'encoded_data.txt' 32 | 33 | INVALID = '▔' 34 | """The invalid feature string.""" 35 | 36 | 37 | def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, 38 | w6: str) -> typing.List[str]: 39 | """Generates a feature from characters around (w1-6). 40 | 41 | Args: 42 | w1 (str): The character 3 characters before the break point. 43 | w2 (str): The character 2 characters before the break point. 44 | w3 (str): The character right before the break point. 45 | w4 (str): The character right after the break point. 46 | w5 (str): The character 2 characters after the break point. 47 | w6 (str): The character 3 characters after the break point. 48 | 49 | Returns: 50 | The feature (list[str]). 51 | 52 | """ 53 | raw_feature = { 54 | 'UW1': w1, 55 | 'UW2': w2, 56 | 'UW3': w3, 57 | 'UW4': w4, 58 | 'UW5': w5, 59 | 'UW6': w6, 60 | 'BW1': w2 + w3, 61 | 'BW2': w3 + w4, 62 | 'BW3': w4 + w5, 63 | 'TW1': w1 + w2 + w3, 64 | 'TW2': w2 + w3 + w4, 65 | 'TW3': w3 + w4 + w5, 66 | 'TW4': w4 + w5 + w6, 67 | } 68 | for key, value in list(raw_feature.items()): 69 | if INVALID in value: 70 | del raw_feature[key] 71 | return [f'{item[0]}:{item[1]}' for item in raw_feature.items()] 72 | 73 | 74 | def parse_args(test: ArgList = None) -> argparse.Namespace: 75 | """Parses commandline arguments. 76 | 77 | Args: 78 | test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None. 79 | 80 | Returns: 81 | argparse.Namespace: Parsed data of args. 
82 | """ 83 | parser = argparse.ArgumentParser(description=__doc__) 84 | parser.add_argument( 85 | 'source_data', 86 | help='''File path of the source training data to extract features.''') 87 | parser.add_argument( 88 | '-o', 89 | '--outfile', 90 | help='''Output file path for the encoded training data. 91 | (default: encoded_data.txt)''', 92 | default=DEFAULT_OUTPUT_FILENAME) 93 | parser.add_argument( 94 | '--processes', 95 | type=int, 96 | help='''Number of processes to use. 97 | (default: the number of CPUs in the system)''', 98 | default=None) 99 | parser.add_argument( 100 | '--scale', 101 | type=int, 102 | help='''Weight scale for the entries. The value should be a unsigned 103 | integer. (default: 1)''', 104 | default=1) 105 | if test is None: 106 | return parser.parse_args() 107 | else: 108 | return parser.parse_args(test) 109 | 110 | 111 | def process(i: int, sentence: str, sep_indices: typing.Set[int], 112 | scale: int) -> str: 113 | """Outputs an encoded line of features from the given index. 114 | 115 | Args: 116 | i (int): index 117 | sentence (str): A sentence 118 | sep_indices (typing.Set[int]): A set of separator indices. 119 | scale (int): A weight scale for the entries. 120 | """ 121 | feature = get_feature(sentence[i - 3] if i > 2 else INVALID, 122 | sentence[i - 2] if i > 1 else INVALID, sentence[i - 1], 123 | sentence[i] if i < len(sentence) else INVALID, 124 | sentence[i + 1] if i + 1 < len(sentence) else INVALID, 125 | sentence[i + 2] if i + 2 < len(sentence) else INVALID) 126 | positive = i in sep_indices 127 | line = '\t'.join(['%d' % (scale) if positive else '%d' % (-scale)] + feature) 128 | return line 129 | 130 | 131 | def normalize_input(data: str) -> typing.Tuple[str, typing.Set[int]]: 132 | """Normalizes the input to one line with separators. 133 | 134 | Args: 135 | data(str): Source input 136 | 137 | Returns: 138 | typing.Tuple[str, typing.Set[int]]: A tuple of the sentence and the 139 | separator indices. 
140 | """ 141 | chunks = data.replace('\n', utils.SEP).strip().split(utils.SEP) 142 | chunk_lengths = [len(chunk) for chunk in chunks] 143 | sep_indices = set(itertools.accumulate(chunk_lengths, lambda x, y: x + y)) 144 | sentence = ''.join(chunks) 145 | return (sentence, sep_indices) 146 | 147 | 148 | def main(test: ArgList = None) -> None: 149 | args = parse_args(test) 150 | source_filename: str = args.source_data 151 | entries_filename: str = args.outfile 152 | processes = None if args.processes is None else int(args.processes) 153 | scale: int = args.scale 154 | with open(source_filename, encoding=sys.getdefaultencoding()) as f: 155 | data = f.read() 156 | sentence, sep_indices = normalize_input(data) 157 | with multiprocessing.Pool(processes) as p: 158 | func = functools.partial( 159 | process, sentence=sentence, sep_indices=sep_indices, scale=scale) 160 | lines = p.map(func, range(1, len(sentence) + 1)) 161 | 162 | with open(entries_filename, 'w', encoding=sys.getdefaultencoding()) as f: 163 | for line in lines: 164 | f.write(line + '\n') 165 | 166 | print('\033[92mEncoded training data is out at: %s\033[0m' % entries_filename) 167 | 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /scripts/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetunes a BudouX model with the given training dataset.

Example usage:

$ python finetune.py train_data.txt base_model.json -o weights.txt --val_data=val_data.txt
"""

import argparse
import array
import json
import typing
from collections import OrderedDict

from jax import Array, grad, jit
from jax import numpy as jnp

# Smallest representable float increment; used to guard divisions by zero.
EPSILON = float(jnp.finfo(float).eps)
DEFAULT_OUTPUT_NAME = 'finetuned-weights.txt'
DEFAULT_NUM_ITERS = 1000
DEFAULT_LOG_SPAN = 100
DEFAULT_LEARNING_RATE = 0.01


class NormalizedModel(typing.NamedTuple):
  # Feature identifiers ("GROUP:CONTENT"), aligned index-wise with `weights`.
  features: typing.List[str]
  # Normalized (zero-mean, unit-variance) weight vector.
  weights: Array


class Dataset(typing.NamedTuple):
  # Input matrix with entries in {-1, +1}.
  X: Array
  # Binary target vector.
  Y: Array


class Metrics(typing.NamedTuple):
  tp: int
  tn: int
  fp: int
  fn: int
  accuracy: float
  precision: float
  recall: float
  fscore: float
  loss: float


def load_model(file_path: str) -> NormalizedModel:
  """Loads a model as a pair of a features list and a normalized weight vector.

  Args:
    file_path: A file path for the model JSON file.

  Returns:
    A normalized model, which is a pair of a list of feature identifiers and a
    normalized weight vector.
  """
  with open(file_path) as f:
    model = json.load(f)
  model_flat = OrderedDict()
  for category in model:
    for item in model[category]:
      model_flat['%s:%s' % (category, item)] = model[category][item]
  weights = jnp.array(list(model_flat.values()))
  # Standardize so finetuning starts from a well-scaled point regardless of
  # the base model's score scale.
  weights = weights / weights.std()
  weights = weights - weights.mean()
  keys = list(model_flat.keys())
  return NormalizedModel(keys, weights)


def load_dataset(file_path: str, model: NormalizedModel) -> Dataset:
  """Loads a dataset from the given file path.

  Args:
    file_path: A file path for the encoded data file.
    model: A normalized model.

  Returns:
    A dataset of inputs (X) and outputs (Y).
  """
  xs = []
  ys = array.array('B')
  with open(file_path) as f:
    for row in f:
      cols = row.strip().split('\t')
      if len(cols) < 2:
        continue
      ys.append(cols[0] == '1')
      # Build the entry set once per row. Evaluating `set(cols[1:])` inside
      # the generator (as before) rebuilt it for every feature, making this
      # loop O(len(features) * len(cols)) per row.
      entries = set(cols[1:])
      xs.append(tuple(k in entries for k in model.features))
  # Map {absent, present} to {-1, +1}.
  X = jnp.array(xs) * 2 - 1
  Y = jnp.array(ys)
  return Dataset(X, Y)


def cross_entropy_loss(weights: Array, x: Array, y: Array) -> Array:
  """Calculates a cross entropy loss with a prediction by a sigmoid function.

  Args:
    weights: A weight vector.
    x: An input array.
    y: A target output array.

  Returns:
    A cross entropy loss.
  """
  pred = 1 / (1 + jnp.exp(-x.dot(weights)))
  return -jnp.mean(y * jnp.log(pred) + (1 - y) * jnp.log(1 - pred))


def get_metrics(weights: Array, dataset: Dataset) -> Metrics:
  """Gets evaluation metrics from the learned weight vector and the dataset.

  Args:
    weights: A weight vector.
    dataset: A dataset.

  Returns:
    result (Metrics): The metrics over the given weights and the dataset.
  """
  pred = dataset.X.dot(weights) > 0
  actual = dataset.Y
  tp: int = jnp.sum(jnp.logical_and(pred == 1, actual == 1))  # type: ignore
  tn: int = jnp.sum(jnp.logical_and(pred == 0, actual == 0))  # type: ignore
  fp: int = jnp.sum(jnp.logical_and(pred == 1, actual == 0))  # type: ignore
  fn: int = jnp.sum(jnp.logical_and(pred == 0, actual == 1))  # type: ignore
  loss: float = cross_entropy_loss(weights, dataset.X,
                                   dataset.Y)  # type: ignore
  accuracy = (tp + tn) / (tp + tn + fp + fn)
  # EPSILON avoids division by zero when a class is empty.
  precision = tp / (tp + fp + EPSILON)
  recall = tp / (tp + fn + EPSILON)
  fscore = 2 * precision * recall / (precision + recall + EPSILON)
  return Metrics(
      tp=tp,
      tn=tn,
      fp=fp,
      fn=fn,
      accuracy=accuracy,
      precision=precision,
      recall=recall,
      fscore=fscore,
      loss=loss,
  )


def fit(weights: Array,
        train_dataset: Dataset,
        iters: int,
        learning_rate: float,
        log_span: int,
        val_dataset: typing.Optional[Dataset] = None) -> Array:
  """Updates the weights with the given dataset.

  Args:
    weights: A weight vector.
    train_dataset: A train dataset.
    iters: A number of iterations.
    learning_rate: A learning rate.
    log_span: A span to log metrics.
    val_dataset: A validation dataset (optional).

  Returns:
    An updated weight vector.
  """
  grad_loss = jit(grad(cross_entropy_loss, argnums=0))
  for t in range(iters):
    # Plain (full-batch) gradient descent step.
    weights = weights - learning_rate * grad_loss(weights, train_dataset.X,
                                                  train_dataset.Y)
    if (t + 1) % log_span != 0:
      continue
    metrics_train = jit(get_metrics)(weights, train_dataset)
    print()
    print('iter:\t%d' % (t + 1))
    print()
    print('train accuracy:\t%.5f' % metrics_train.accuracy)
    print('train prec.:\t%.5f' % metrics_train.precision)
    print('train recall:\t%.5f' % metrics_train.recall)
    print('train fscore:\t%.5f' % metrics_train.fscore)
    print('train loss:\t%.5f' % metrics_train.loss)
    print()

    if val_dataset is None:
      continue
    metrics_val = jit(get_metrics)(weights, val_dataset)
    print('val accuracy:\t%.5f' % metrics_val.accuracy)
    print('val prec.:\t%.5f' % metrics_val.precision)
    print('val recall:\t%.5f' % metrics_val.recall)
    print('val fscore:\t%.5f' % metrics_val.fscore)
    print('val loss:\t%.5f' % metrics_val.loss)
    print()
  return weights


def write_weights(file_path: str, weights: Array,
                  features: typing.List[str]) -> None:
  """Writes learned weights and corresponding features to a file.

  Args:
    file_path: A file path for the weights file.
    weights: A weight vector.
    features: A list of feature identifiers.
  """
  with open(file_path, 'w') as f:
    f.write('\n'.join([
        '%s\t%.6f' % (feature, weights[i]) for i, feature in enumerate(features)
    ]))


def parse_args(
    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'train_data', help='File path for the encoded training data.')
  parser.add_argument('base_model', help='File path for the base model file.')
  parser.add_argument(
      '-o',
      '--output',
      help=f'File path for the output weights. (default: {DEFAULT_OUTPUT_NAME})',
      type=str,
      default=DEFAULT_OUTPUT_NAME)
  parser.add_argument(
      '--val-data', help='File path for the encoded validation data.', type=str)
  parser.add_argument(
      '--iters',
      help=f'Number of iterations for training. (default: {DEFAULT_NUM_ITERS})',
      type=int,
      default=DEFAULT_NUM_ITERS)
  parser.add_argument(
      '--log-span',
      help=f'Iteration span to print metrics. (default: {DEFAULT_LOG_SPAN})',
      type=int,
      default=DEFAULT_LOG_SPAN)
  parser.add_argument(
      '--learning-rate',
      help=f'Learning rate. (default: {DEFAULT_LEARNING_RATE})',
      type=float,
      default=DEFAULT_LEARNING_RATE)
  if test is None:
    return parser.parse_args()
  return parser.parse_args(test)


def main() -> None:
  args = parse_args()
  train_data_path: str = args.train_data
  base_model_path: str = args.base_model
  weights_path: str = args.output
  iters: int = args.iters
  log_span: int = args.log_span
  learning_rate: float = args.learning_rate
  val_data_path: typing.Optional[str] = args.val_data

  model = load_model(base_model_path)
  train_dataset = load_dataset(train_data_path, model)
  val_dataset = load_dataset(val_data_path, model) if val_data_path else None
  weights = fit(
      model.weights,
      train_dataset,
      iters=iters,
      log_span=log_span,
      learning_rate=learning_rate,
      val_dataset=val_dataset)
  write_weights(weights_path, weights, model.features)


if __name__ == '__main__':
  main()
"""Prepares a dataset from the KNBC corpus.

Before running this script, you need to download the KNBC corpus by running:

$ curl -o knbc.tar.bz2 https://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2
$ tar -xf knbc.tar.bz2

Now you should have a directory named `KNBC_v1.0_090925_utf8`.
Run the following to generate a dataset named `source_knbc.txt`.

$ python scripts/prepare_knbc.py KNBC_v1.0_090925_utf8 -o source_knbc.txt
"""

import argparse
import os
import sys
import typing
from html.parser import HTMLParser

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from budoux import utils  # noqa (module hack)

GRANULARITY_OPTIONS = {'phrase', 'tag', 'word'}
Granularity = typing.Literal['phrase', 'tag', 'word']


class KNBCHTMLParser(HTMLParser):
  """Parses the HTML files in the KNBC corpus to collect chunks.

  Attributes:
    chunks: The collected chunks.
    row: The current row index.
    col: The current column index.
    current_word: The current word to process.
    on_split_row: Whether the scan is on the splitting row.
    granularity: Granularity of the output chunks.
  """

  # Cell IDs the corpus HTML uses to mark segmentation boundary rows.
  BUNSETSU_SPLIT_ID = 'bnst-kugiri'
  TAG_SPLIT_ID = 'tag-kugiri'

  def __init__(self, granularity: Granularity) -> None:
    """Initializes the HTML parser for the KNBC corpus.

    Args:
      granularity: Granularity of the output chunks.
    """
    super().__init__()
    self.chunks = ['']
    self.row = 0
    self.col = 0
    self.current_word = ''
    self.on_split_row = False
    self.granularity = granularity

  def handle_starttag(
      self, tag: str,
      attributes: typing.List[typing.Tuple[str, typing.Optional[str]]]) -> None:
    # A new table row resets the per-row scanning state.
    if tag == 'tr':
      self.row += 1
      self.col = 0
      self.current_word = ''
      self.on_split_row = False

    if tag == 'td':
      self.col += 1
      # A cell carrying one of the split IDs marks this row as a boundary.
      # Tag boundaries only count when the requested granularity is 'tag'.
      for name, value in attributes:
        bunsetsu_row = name == 'id' and value == self.BUNSETSU_SPLIT_ID
        tag_row = name == 'id' and value == self.TAG_SPLIT_ID
        if bunsetsu_row or (self.granularity == 'tag' and tag_row):
          self.on_split_row = True

  def handle_endtag(self, tag: str) -> None:
    if tag != 'tr':  # Skip all tags but TR.
      return None
    if self.row < 3:  # Skip the first two rows.
      return None
    if self.on_split_row:
      # Boundary row: start accumulating a new chunk.
      return self.chunks.append('')
    if self.col == 5:
      # NOTE(review): a 5-column row appears to be a morpheme data row in
      # this corpus format — confirm against the KNBC HTML layout.
      if self.granularity == 'word' and self.chunks[-1]:
        self.chunks.append('')
      self.chunks[-1] += self.current_word

  def handle_data(self, data: str) -> None:
    # The first cell of a row holds the surface string of the morpheme.
    if self.col == 1:
      self.current_word = data


def break_before_sequence(chunks: typing.List[str],
                          sequence: str) -> typing.List[str]:
  """Breaks chunks before a specified character sequence appears.

  Args:
    chunks (List[str]): Chunks to break.
    sequence (str): A character sequence to break chunks before.

  Returns:
    Processed chunks.
  """
  chunks = utils.SEP.join(chunks).replace(sequence,
                                          utils.SEP + sequence).split(utils.SEP)
  chunks = [chunk for chunk in chunks if len(chunk) > 0]
  return chunks


def postprocess(chunks: typing.List[str]) -> typing.List[str]:
  """Applies some processes to modify the extracted chunks.

  Args:
    chunks (List[str]): Source chunks.

  Returns:
    Processed chunks.
  """
  chunks = break_before_sequence(chunks, '(')
  chunks = break_before_sequence(chunks, 'もら')
  return chunks


def parse_args() -> argparse.Namespace:
  """Parses commandline arguments.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  DEFAULT_OUT_PATH = 'source.txt'
  DEFAULT_GRANULARITY = 'phrase'
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument('source_dir', help='Path to the KNBC corpus directory.')
  parser.add_argument(
      '-o',
      '--outfile',
      help=f'File path to the output dataset. (default: {DEFAULT_OUT_PATH})',
      default=DEFAULT_OUT_PATH)
  parser.add_argument(
      '--granularity',
      help=f'''Granularity of the output chunks. (default: {DEFAULT_GRANULARITY})
The value should be one of "phrase", "tag", or "word".
"phrase" is equivalent to Bunsetu-based segmentation.
"tag" provides more granular segmentation than "phrase".
"word" is equivalent to word-based segmentation.

e.g. 携帯ユーザーの仲間入りをするかです。
phrase: 携帯ユーザーの / 仲間入りを / するかです。
tag: 携帯 / ユーザーの / 仲間 / 入りを / するかです。
word: 携帯 / ユーザー / の / 仲間 / 入り / を / する / か / です / 。
''',
      choices=GRANULARITY_OPTIONS,
      default=DEFAULT_GRANULARITY)
  return parser.parse_args()


def main() -> None:
  """Extracts chunks from every morph HTML file and writes the dataset."""
  args = parse_args()
  source_dir = args.source_dir
  outfile = args.outfile
  granularity = args.granularity
  html_dir = os.path.join(source_dir, 'html')
  with open(outfile, 'w') as f:
    for file in sorted(os.listdir(html_dir)):
      if not file.endswith('-morph.html'):
        continue
      parser = KNBCHTMLParser(granularity)
      # Close the corpus file promptly; the previous version leaked the
      # file handle by never closing it.
      with open(os.path.join(html_dir, file)) as html_file:
        data = html_file.read()
      parser.feed(data)
      chunks = postprocess(parser.chunks)
      # Files that yield fewer than two chunks carry no break information.
      if len(chunks) < 2:
        continue
      f.write(utils.SEP.join(chunks) + '\n')
  print('\033[92mTraining data is output to: %s\033[0m' % (outfile))


if __name__ == '__main__':
  main()
15 | 16 | Before running this script, you need to download the Wisesight corpus by running: 17 | 18 | $ curl -o wisesight-1000-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-1000-samples-tokenised.label 19 | $ curl -o wisesight-160-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-160-samples-tokenised.label 20 | 21 | Then run this command as follows over each file. 22 | 23 | $ python scripts/prepare_wisesight.py wisesight-1000-samples-tokenised.label -o source_train.txt 24 | $ python scripts/prepare_wisesight.py wisesight-160-samples-tokenised.label -o source_val.txt 25 | """ 26 | import argparse 27 | import re 28 | 29 | import regex 30 | 31 | 32 | def parse_args() -> argparse.Namespace: 33 | DEFAULT_OUT_PATH = 'source.txt' 34 | parser = argparse.ArgumentParser( 35 | description=__doc__, formatter_class=argparse.RawTextHelpFormatter) 36 | parser.add_argument( 37 | 'source_filepath', help='Path to a Wisesight corpus label file.') 38 | parser.add_argument( 39 | '-o', 40 | '--outfile', 41 | help=f'File path to the output dataset. 
(default: {DEFAULT_OUT_PATH})', 42 | default=DEFAULT_OUT_PATH) 43 | return parser.parse_args() 44 | 45 | 46 | def main() -> None: 47 | args = parse_args() 48 | source_filepath = args.source_filepath 49 | target_filepath = args.outfile 50 | 51 | with open(target_filepath, 'w') as outfile: 52 | with open(source_filepath) as infile: 53 | for line in infile: 54 | line = line.strip() 55 | line = re.sub(r'https?://[^ ]+', '', line) # Remove URLs 56 | line = re.sub(r'#[^ ]+', '', line) # Remove hashtags 57 | line = regex.compile(r'\p{Emoji_Presentation=Yes}+').sub( 58 | '', line) # Remove emojis 59 | line = re.sub(r'\|+', '|', line) # Remove consecutive separators 60 | line = re.sub(r'(\|\s)*\|$', '', line) # Remove redundant spaces 61 | outfile.write(line.replace('|', '▁') + '\n') # Replace the separators. 62 | print('\033[92mTraining data is output to: %s\033[0m' % (target_filepath)) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /scripts/tests/test_build_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
"""Tests the model build script."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from scripts import build_model  # noqa (module hack)


class TestAggregateScores(unittest.TestCase):
  """Tests build_model.aggregate_scores over raw weight lines."""

  def test_standard(self) -> None:
    weights = [
        'AB:x\t2.893\n', 'BC:y\t0.123\n', 'AB:y\t2.123\n', 'BC:y\t1.234\n'
    ]
    model = build_model.aggregate_scores(weights)
    self.assertDictEqual(model, {
        'AB': {
            'x': 2.893,
            'y': 2.123
        },
        'BC': {
            'y': 1.357
        }
    }, 'should group scores by feature type.')

  def test_blank_line(self) -> None:
    weights = [
        '\n', 'AB:x\t2.893\n', 'BC:y\t0.123\n', '\n', 'AB:y\t2.123\n',
        'BC:y\t1.234\n'
    ]
    model = build_model.aggregate_scores(weights)
    self.assertDictEqual(model, {
        'AB': {
            'x': 2.893,
            'y': 2.123
        },
        'BC': {
            'y': 1.357
        }
    }, 'should skip blank lines.')

  def test_colon(self) -> None:
    # Feature values may themselves contain a colon.
    weights = ['AB::\t8.123']
    model = build_model.aggregate_scores(weights)
    self.assertDictEqual(
        model, {'AB': {
            ':': 8.123
        }}, 'should consider the first colon only as a delimiter.')


class TestRoundModel(unittest.TestCase):
  """Tests build_model.round_model, which scales scores to integers."""

  def test_standard(self) -> None:
    model = {
        'AB': {
            'x': 1.0002,
            'y': 4.1237,
        },
        'BC': {
            'z': 2.1111,
        }
    }
    model_rounded = build_model.round_model(model, 1000)
    self.assertDictEqual(model_rounded, {
        'AB': {
            'x': 1000,
            'y': 4123
        },
        'BC': {
            'z': 2111
        }
    }, 'should scale and round scores to integer.')

  def test_insignificant_score(self) -> None:
    # 0.0009 * 1000 rounds below 1, so the entry should be dropped.
    model = {
        'AB': {
            'x': 0.0009,
            'y': 4.1237,
        },
        'BC': {
            'z': 2.1111,
        }
    }
    model_rounded = build_model.round_model(model, 1000)
    self.assertDictEqual(model_rounded, {
        'AB': {
            'y': 4123
        },
        'BC': {
            'z': 2111
        }
    }, 'should remove insignificant scores lower than 1.')


class TestArgParse(unittest.TestCase):
  """Tests command line argument handling of build_model."""

  def test_cmdargs_invalid_option(self) -> None:
    cmdargs = ['-v']
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    cmdargs = ['-h']
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_input(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    output = build_model.parse_args(['weight.txt'])
    self.assertEqual(output.weight_file, 'weight.txt')
    self.assertEqual(output.outfile, 'model.json')
    self.assertEqual(output.scale, 1000)

  def test_cmdargs_with_scale(self) -> None:
    output = build_model.parse_args(
        ['weight.txt', '-o', 'foo.json', '--scale', '200'])
    self.assertEqual(output.weight_file, 'weight.txt')
    self.assertEqual(output.outfile, 'foo.json')
    self.assertEqual(output.scale, 200)
--------------------------------------------------------------------------------
/scripts/tests/test_encode_data.py:
--------------------------------------------------------------------------------
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the data encoder script."""

import os
import sys
import typing
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from budoux import utils  # noqa (module hack)
from scripts import encode_data  # noqa (module hack)


class TestGetFeature(unittest.TestCase):
  """Tests encode_data.get_feature, the n-gram feature extractor."""

  def test_standard(self) -> None:
    feature = encode_data.get_feature('a', 'b', 'c', 'd', 'e', 'f')
    self.assertSetEqual(
        set(feature),
        {
            # Unigram of Words (UW)
            'UW1:a',
            'UW2:b',
            'UW3:c',
            'UW4:d',
            'UW5:e',
            'UW6:f',

            # Bigram of Words (BW)
            'BW1:bc',
            'BW2:cd',
            'BW3:de',

            # Trigram of Words (TW)
            'TW1:abc',
            'TW2:bcd',
            'TW3:cde',
            'TW4:def',
        },
        'Features should be extracted.')

  def test_with_invalid(self) -> None:

    def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool:
      # Returns True when any extracted feature starts with the prefix.
      for item in feature:
        if item.startswith(prefix):
          return True
      return False

    feature = encode_data.get_feature('a', 'a', encode_data.INVALID, 'a', 'a',
                                      'a')
    self.assertFalse(
        find_by_prefix('UW3:', feature),
        'Should omit the Unigram feature when the character is invalid.')
    self.assertFalse(
        find_by_prefix('BW2:', feature),
        'Should omit the Bigram feature that covers an invalid character.')


class TestArgParse(unittest.TestCase):
  """Tests command line argument handling of encode_data."""

  def test_cmdargs_invalid_option(self) -> None:
    cmdargs = ['-v']
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    cmdargs = ['-h']
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_source(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    cmdargs = ['source.txt']
    output = encode_data.parse_args(cmdargs)
    self.assertEqual(output.source_data, 'source.txt')
    self.assertEqual(output.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertIsNone(output.processes)
    self.assertEqual(output.scale, 1)

  def test_cmdargs_with_outfile(self) -> None:
    cmdargs = ['source.txt', '-o', 'out.txt']
    output = encode_data.parse_args(cmdargs)
    self.assertEqual(output.source_data, 'source.txt')
    self.assertEqual(output.outfile, 'out.txt')
    self.assertIsNone(output.processes)
    self.assertEqual(output.scale, 1)

  def test_cmdargs_with_processes(self) -> None:
    cmdargs = ['source.txt', '--processes', '8']
    output = encode_data.parse_args(cmdargs)
    self.assertEqual(output.source_data, 'source.txt')
    self.assertEqual(output.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertEqual(output.processes, 8)
    self.assertEqual(output.scale, 1)

  def test_cmdargs_with_scale(self) -> None:
    cmdargs = ['source.txt', '--scale', '20']
    output = encode_data.parse_args(cmdargs)
    self.assertEqual(output.source_data, 'source.txt')
    self.assertEqual(output.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertIsNone(output.processes)
    self.assertEqual(output.scale, 20)


class TestProcess(unittest.TestCase):
  """Tests encode_data.process over a sample sentence."""

  sentence = '六本木ヒルズでお昼を食べる。'
  sep_indices = {7, 10, 13}

  def test_on_negative_point_with_scale(self) -> None:
    # Index 8 is not a separator index, hence the negative weight.
    line = encode_data.process(8, self.sentence, self.sep_indices, 16)
    items = line.split('\t')
    weight = items[0]
    features = set(items[1:])
    self.assertEqual(weight, '-16')
    self.assertIn('UW2:で', features)

  def test_on_positive_point_with_scale(self) -> None:
    # Index 7 is a separator index, hence the positive weight.
    line = encode_data.process(7, self.sentence, self.sep_indices, 13)
    items = line.split('\t')
    weight = items[0]
    features = set(items[1:])
    self.assertEqual(weight, '13')
    self.assertIn('UW3:で', features)


class TestNormalizeInput(unittest.TestCase):
  """Tests encode_data.normalize_input separator extraction."""

  def test_standard_input(self) -> None:
    source = f'ABC{utils.SEP}DE{utils.SEP}FGHI'
    sentence, sep_indices = encode_data.normalize_input(source)
    self.assertEqual(sentence, 'ABCDEFGHI')
    self.assertEqual(sep_indices, {3, 5, 9})

  def test_with_linebreaks(self) -> None:
    source = f'AB\nCDE{utils.SEP}FG'
    sentence, sep_indices = encode_data.normalize_input(source)
    self.assertEqual(sentence, 'ABCDEFG')
    self.assertEqual(sep_indices, {2, 5, 7})

  def test_doubled_seps(self) -> None:
    source = f'ABC{utils.SEP}{utils.SEP}DE\n\nFG'
    sentence, sep_indices = encode_data.normalize_input(source)
    self.assertEqual(sentence, 'ABCDEFG')
    self.assertEqual(sep_indices, {3, 5, 7})


if __name__ == '__main__':
  unittest.main()
--------------------------------------------------------------------------------
/scripts/tests/test_finetune.py:
--------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the finetune script."""

import os
import sys
import tempfile
import unittest

from jax import numpy as jnp

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from scripts import finetune  # noqa (module hack)


class TestArgParse(unittest.TestCase):
  """Tests command line argument handling of finetune."""

  def test_cmdargs_invalid_option(self) -> None:
    cmdargs = ['-v']
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    cmdargs = ['-h']
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_data(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_no_base_model(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(['encoded.txt'])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    cmdargs = ['encoded.txt', 'model.json']
    output = finetune.parse_args(cmdargs)
    self.assertEqual(output.train_data, 'encoded.txt')
    self.assertEqual(output.base_model, 'model.json')
    self.assertEqual(output.iters, finetune.DEFAULT_NUM_ITERS)
    self.assertEqual(output.log_span, finetune.DEFAULT_LOG_SPAN)
    self.assertEqual(output.learning_rate, finetune.DEFAULT_LEARNING_RATE)
    self.assertEqual(output.val_data, None)

  def test_cmdargs_with_values(self) -> None:
    cmdargs = [
        'encoded.txt', 'model.json', '--iters', '50', '--log-span', '10',
        '--learning-rate', '0.1', '--val-data', 'val.txt'
    ]
    output = finetune.parse_args(cmdargs)
    self.assertEqual(output.train_data, 'encoded.txt')
    self.assertEqual(output.base_model, 'model.json')
    self.assertEqual(output.iters, 50)
    self.assertEqual(output.log_span, 10)
    self.assertEqual(output.learning_rate, 0.1)
    self.assertEqual(output.val_data, 'val.txt')


class TestLoadModel(unittest.TestCase):
  """Tests finetune.load_model's feature extraction and normalization."""

  def setUp(self) -> None:
    # NOTE(review): taking `.name` from a discarded NamedTemporaryFile is
    # fragile on some platforms (the file is deleted on close); kept as-is.
    self.model_file_path = tempfile.NamedTemporaryFile().name
    with open(self.model_file_path, 'w') as f:
      f.write('{"UW1": {"a": 12, "b": 23}, "TW3": {"xyz": 47}}')

  def test_extracted_keys(self) -> None:
    result = finetune.load_model(self.model_file_path).features
    self.assertListEqual(result, ['UW1:a', 'UW1:b', 'TW3:xyz'])

  def test_value_variance(self) -> None:
    # Weights are expected to be standardized to unit variance.
    result = finetune.load_model(self.model_file_path).weights.var()
    self.assertAlmostEqual(float(result), 1, places=5)

  def test_value_mean(self) -> None:
    # ... and to zero mean.
    result = finetune.load_model(self.model_file_path).weights.sum()
    self.assertAlmostEqual(float(result), 0, places=5)

  def test_value_order(self) -> None:
    # Normalization should preserve the relative order of the raw scores.
    result = finetune.load_model(self.model_file_path).weights.tolist()
    self.assertGreater(result[1], result[0])
    self.assertGreater(result[2], result[1])


class TestLoadDataset(unittest.TestCase):
  """Tests finetune.load_dataset's encoding of labeled feature lines."""

  def setUp(self) -> None:
    self.entries_file_path = tempfile.NamedTemporaryFile().name
    with open(self.entries_file_path, 'w') as f:
      f.write(('1\tfoo\tbar\n'
               '-1\tfoo\n'
               '1\tfoo\tbar\tbaz\n'
               '1\tbar\tfoo\n'
               '-1\tbaz\tqux\n'))
    self.model = finetune.NormalizedModel(['foo', 'bar'], jnp.array([23, -37]))

  def test_y(self) -> None:
    # Y encodes the sign of the leading weight column.
    result = finetune.load_dataset(self.entries_file_path, self.model)
    expected = [True, False, True, True, False]
    self.assertListEqual(result.Y.tolist(), expected)

  def test_x(self) -> None:
    # X is +1 where a model feature is present on the line, -1 otherwise.
    result = finetune.load_dataset(self.entries_file_path, self.model)
    expected = [[1, 1], [1, -1], [1, 1], [1, 1], [-1, -1]]
    self.assertListEqual(result.X.tolist(), expected)


class TestFit(unittest.TestCase):
  """Smoke-tests finetune.fit's ability to learn."""

  def test_health(self) -> None:
    w = jnp.array([.9, .5, -.3])
    X = jnp.array([[-1, 1, 1], [1, -1, 1], [1, 1, -1]])
    # The current result is x.dot(w) = [-0.7, 0.1, 1.1] => [False, True, True]
    # It tests if the method can learn a new weight that inverses the result.
    Y = jnp.array([True, False, False])
    dataset = finetune.Dataset(X, Y)
    w = finetune.fit(w, dataset, iters=1000, learning_rate=.01, log_span=100)
    self.assertGreater(X.dot(w).tolist()[0], 0)  # x.dot(w) > 0 => True.


class TestWriteWeights(unittest.TestCase):
  """Tests finetune.write_weights's TSV output format."""

  def test_write_weights(self) -> None:
    weights = jnp.array([0.012, 0.238, -0.1237])
    features = ['foo', 'bar', 'baz']
    weights_path = tempfile.NamedTemporaryFile().name
    finetune.write_weights(weights_path, weights, features)
    with open(weights_path) as f:
      result = f.read()
    self.assertEqual(result, 'foo\t0.012000\nbar\t0.238000\nbaz\t-0.123700')
--------------------------------------------------------------------------------
/scripts/tests/test_prepare_knbc.py:
--------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 | """Tests the prepare KNBC script.""" 15 | 16 | import os 17 | import sys 18 | import unittest 19 | 20 | # module hack 21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..') 22 | sys.path.insert(0, os.path.abspath(LIB_PATH)) 23 | 24 | from scripts import prepare_knbc # noqa (module hack) 25 | 26 | 27 | class TestBreakBeforeSequence(unittest.TestCase): 28 | 29 | def test_standard(self) -> None: 30 | chunks = ['abcdef', 'ghi'] 31 | result = prepare_knbc.break_before_sequence(chunks, 'de') 32 | self.assertListEqual(result, ['abc', 'def', 'ghi']) 33 | 34 | def test_sequence_on_top(self) -> None: 35 | chunks = ['abcdef', 'ghi'] 36 | result = prepare_knbc.break_before_sequence(chunks, 'gh') 37 | self.assertListEqual(result, ['abcdef', 'ghi']) 38 | 39 | def test_multiple_hit(self) -> None: 40 | chunks = ['abcabc', 'def'] 41 | result = prepare_knbc.break_before_sequence(chunks, 'bc') 42 | self.assertListEqual(result, ['a', 'bca', 'bc', 'def']) 43 | 44 | 45 | class TestKNBCHTMLParser(unittest.TestCase): 46 | example_html = ''' 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
HAHBHCHDHE
文節区切り
abc
de
タグ区切り
fgh
ijkl
文節区切り
mn
60 | 61 | 62 | ''' 63 | 64 | def test_parse_phrase(self) -> None: 65 | parser = prepare_knbc.KNBCHTMLParser('phrase') 66 | parser.feed(self.example_html) 67 | self.assertListEqual(parser.chunks, ['abcdefghijkl', 'mn']) 68 | 69 | def test_parse_tag(self) -> None: 70 | parser = prepare_knbc.KNBCHTMLParser('tag') 71 | parser.feed(self.example_html) 72 | self.assertListEqual(parser.chunks, ['abcde', 'fghijkl', 'mn']) 73 | 74 | def test_parse_word(self) -> None: 75 | parser = prepare_knbc.KNBCHTMLParser('word') 76 | parser.feed(self.example_html) 77 | self.assertListEqual(parser.chunks, ['abc', 'de', 'fgh', 'ijkl', 'mn']) 78 | -------------------------------------------------------------------------------- /scripts/tests/test_translate_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
"""Tests the model translator script."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from scripts import translate_model  # noqa (module hack)


class TestNormalize(unittest.TestCase):
  """Tests translate_model.normalize over old, new, and broken formats."""

  def test_old_format_input(self) -> None:
    model = {'a:x': 48, 'a:y': 21, 'b:x': 2, 'b:z': 89}
    expect = {'a': {'x': 48, 'y': 21}, 'b': {'x': 2, 'z': 89}}
    result = translate_model.normalize(model)
    self.assertDictEqual(result, expect)

  def test_new_format_input(self) -> None:
    # An already-nested model should pass through unchanged.
    model = {'a': {'x': 48, 'y': 21}, 'b': {'x': 2, 'z': 89}}
    result = translate_model.normalize(model)
    self.assertDictEqual(result, model)

  def test_broken_input1(self) -> None:
    # A mixture of the old flat format and the new nested format.
    model = {'a:x': 23, 'b': {'x': 37, 'y': 18}}
    with self.assertRaises(Exception) as cm:
      translate_model.normalize(model)
    self.assertTrue('Unsupported model format' in str(cm.exception))

  def test_broken_input2(self) -> None:
    # Nesting deeper than the supported two levels.
    model = {'b': {'x': 37, 'y': {'z': 123}}}
    with self.assertRaises(Exception) as cm:
      translate_model.normalize(model)
    self.assertTrue('Unsupported model format' in str(cm.exception))


class TestTranslateICU(unittest.TestCase):
  """Tests translate_model.translate_icu output formatting."""

  def test_standard(self) -> None:
    model = {}
    model['b'] = {'x': 47, 'z': 13}
    model['a'] = {'x': 12, 'y': 88}
    # Groups must be emitted in sorted order regardless of insertion order.
    expect = '''
jaml {
  aKeys {
    "x",
    "y",
  }
  aValues:intvector {
    12,
    88,
  }
  bKeys {
    "x",
    "z",
  }
  bValues:intvector {
    47,
    13,
  }
}
'''.strip()
    result = translate_model.translate_icu(model)
    self.assertEqual(result, expect)
--------------------------------------------------------------------------------
/scripts/translate_model.py:
# ----- /scripts/translate_model.py -----
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Translates a model JSON file to another format, such as ICU Resource Bundle.

Example usage:

$ python translate_model.py --format=icu model.json > icurb.txt

You can also use this script to update the model files older than v0.5.0 to make
it work with the latest version.

$ python translate_model.py --format=json old-model.json > new-model.json
"""

import argparse
import itertools
import json
import typing

ArgList = typing.Optional[typing.List[str]]


def translate_icu(model: typing.Dict[str, typing.Dict[str, int]]) -> str:
  """Translates a model to the ICU Resource Bundle format.

  The output is intended to update the data in:
  https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/adaboost/jaml.txt

  Args:
    model: A model.
  Returns:
    A model string formatted in the ICU Resource Bundle format.
  """
  indent = '  '
  output = 'jaml {\n'
  # Emit groups in sorted order so the output is deterministic.
  for group_name, members in sorted(model.items()):
    output += f'{indent}{group_name}Keys {{\n'
    for key in members.keys():
      output += f'{indent}{indent}"{key}",\n'
    output += f'{indent}}}\n'
    output += f'{indent}{group_name}Values:intvector {{\n'
    for val in members.values():
      output += f'{indent}{indent}{val},\n'
    output += f'{indent}}}\n'
  output += '}'
  return output


def normalize(
    model: typing.Dict[str,
                       typing.Any]) -> typing.Dict[str, typing.Dict[str, int]]:
  """Updates a model to the latest format. Does nothing if it's updated already.

  Args:
    model: A model.
  Returns:
    An updated model.
  Raises:
    Exception: If the model is neither in the old flat format nor in the new
      nested format with integer scores.
  """
  is_old_format = all(isinstance(v, int) for v in model.values())
  if is_old_format:
    # Old flat format {'GROUP:key': score}: regroup entries by the feature
    # group name, i.e. the part before the first colon.
    output = {}
    sorted_items = sorted(model.items(), key=lambda x: x[0])
    groups = itertools.groupby(sorted_items, key=lambda x: x[0].split(':')[0])
    for group_key, group_items in groups:
      output[group_key] = dict(
          (item[0].split(':')[-1], item[1]) for item in group_items)
    return output
  # Fix: validate explicitly instead of `assert` so the check also runs
  # under `python -O` (asserts are stripped by the optimizer).
  try:
    scores_are_int = all(
        isinstance(v, int)
        for groups in model.values()
        for v in groups.values())
  except AttributeError as e:
    # A mix of flat and nested entries: some value has no `.values()`.
    raise Exception('Unsupported model format:', e)
  if not scores_are_int:
    raise Exception('Unsupported model format:', 'Scores should be integers')
  return model


def main() -> None:
  """Entry point: reads a model file and prints it in the target format."""
  DEFAULT_FORMAT = 'json'
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'model', help='File path for the JSON format model file.', type=str)
  parser.add_argument(
      '--format',
      help=f'Target format (default: {DEFAULT_FORMAT})',
      type=str,
      default=DEFAULT_FORMAT,
      choices={DEFAULT_FORMAT, 'icu'})
  args = parser.parse_args()
  model_path: str = args.model
  format: str = args.format
  # Load the model, normalize it to the latest schema, and emit it.
  with open(model_path) as f:
    model = json.load(f)
  model = normalize(model)
  if format == 'json':
    # Compact JSON with non-ASCII characters preserved.
    print(json.dumps(model, ensure_ascii=False, separators=(',', ':')))
  elif format == 'icu':
    print(translate_icu(model))
  else:
    pass  # Unreachable: argparse restricts --format to {'json', 'icu'}.


if __name__ == '__main__':
  main()
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = budoux
version = attr: budoux.__init__.__version__
description = BudouX is the successor of Budou
long_description = file: README.md
long_description_content_type = text/markdown
license = Apache-2.0
author = Shuhei Iitsuka
author_email = tushuhei@google.com
classifiers =
    Development Status :: 3 - Alpha
    Operating System :: OS Independent
    License :: OSI Approved :: Apache Software License
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11
    Programming Language :: Python :: 3.12
    Programming Language :: Python :: 3.13

[options]
python_requires= >= 3.9
packages = find:
include_package_data = True
test_suite = tests
install_requires =
    importlib-resources

[options.extras_require]
dev =
    build
    flake8
    isort
    mypy==1.15.0
    pytest
    regex
    toml
    twine
    types-regex
    types-setuptools
    yapf

jaxcpu =
    jax==0.5.2

[options.entry_points]
console_scripts =
    budoux = budoux.main:main

[yapf]
based_on_style = yapf

[flake8]
# E124: closing bracket does not match visual indentation
# E126: over-indentation
# E501: line too long
# BLK100: black formattable
ignore = E124,E126,E501,BLK100
indent-size = 2

[mypy]
python_version = 
3.10 62 | pretty = True 63 | strict = True 64 | allow_untyped_calls = True 65 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from setuptools import setup 16 | 17 | setup() 18 | -------------------------------------------------------------------------------- /tests/in/1.in: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/in/2.in: -------------------------------------------------------------------------------- 1 | これはテストです。 2 | -------------------------------------------------------------------------------- /tests/in/3.in: -------------------------------------------------------------------------------- 1 | これはテストです。 2 | -------------------------------------------------------------------------------- /tests/quality/ja.tsv: -------------------------------------------------------------------------------- 1 | # label sentence 2 | init 今日は▁とても▁良い▁天気です。 3 | init これ以上▁利用する▁場合は▁教えてください。 4 | init 食器は▁そのまま▁入れて▁大丈夫です。 5 | gh152 ダウンロード▁ありがとう▁ございます。 6 | gh152 ご利用▁ありがとう▁ございました。 7 | gh157 要点を▁まとめる▁必要が▁ある。 8 | gh160 目指すのは▁あらゆる▁人に▁便利な▁ソフトウェア 9 | gh160 商品が▁まもなく▁到着します。 10 | gh160 プロジェクトが▁ようやく▁日の▁目を▁見る。 11 | 
gh160 明け方に▁ようやく▁目覚めると、 12 | gh160 明け方▁ようやく▁目覚めると、 13 | gh160 これは▁たまたま▁見つけた▁宝物 14 | gh160 歩いていて▁たまたま▁目に▁入った▁光景 15 | gh216 あなたの▁意図した▁とおりに▁情報を▁伝える。 16 | gh220 あの▁イーハトーヴォの▁すきとおった▁風、▁夏でも▁底に▁冷たさを▁もつ▁青い▁そら、▁うつくしい▁森で▁飾られた▁モリーオ市、▁郊外の▁ぎらぎら▁ひかる▁草の▁波。 17 | gh387 購入された▁お客様のみ▁入れます。 18 | gh387 購入された▁お客様のみ▁入場できます。 19 | gh387 パワーのみ▁有効だ 20 | b320113958 小さな▁つぶや▁空気中の▁ちり 21 | b320113958 光が▁どんどん▁空▁いっぱいに▁広がる 22 | b320113958 太陽の▁位置が▁ちがうから 23 | b320113958 太陽が▁しずむころに▁帰る 24 | b320113958 多すぎると▁うまく▁いかない 25 | b320113958 世界の▁子どもの▁命や▁権利 26 | b320113958 「ふだん▁どおり」を▁保つ 27 | b320113958 おもちゃや▁遊びに▁使える 28 | b320113958 コントロールできない▁ほど▁感情移入してしまう 29 | b320113958 いつも▁甘えがちに▁なる 30 | b320113958 存在が▁浮かび▁上がった。 31 | -------------------------------------------------------------------------------- /tests/test_html_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | "Tests the HTML Processor." 15 | 16 | import os 17 | import sys 18 | import unittest 19 | 20 | # module hack 21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..') 22 | sys.path.insert(0, os.path.abspath(LIB_PATH)) 23 | 24 | from budoux import html_processor # noqa (module hack) 25 | 26 | 27 | class TestTextContentExtractor(unittest.TestCase): 28 | 29 | def test_output(self) -> None: 30 | input = '

Hello, World

' 31 | expected = 'Hello, World' 32 | extractor = html_processor.TextContentExtractor() 33 | extractor.feed(input) 34 | self.assertEqual( 35 | extractor.output, expected, 36 | 'Text content should be extacted from the given HTML string.') 37 | 38 | 39 | class TestHTMLChunkResolver(unittest.TestCase): 40 | 41 | def test_output(self) -> None: 42 | input = '

abcdef

' 43 | expected = '

abcdef

' 44 | resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '') 45 | resolver.feed(input) 46 | self.assertEqual(resolver.output, expected, 47 | 'WBR tags should be inserted as specified by chunks.') 48 | 49 | def test_unpaired(self) -> None: 50 | input = '

abcdef

' 51 | expected = '

abcdef

' 52 | resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '') 53 | resolver.feed(input) 54 | self.assertEqual(resolver.output, expected, 55 | 'Unpaired close tag should not cause errors.') 56 | 57 | def test_nobr(self) -> None: 58 | input = '

abcdef

' 59 | expected = '

abcdef

' 60 | resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '') 61 | resolver.feed(input) 62 | self.assertEqual(resolver.output, expected, 63 | 'WBR tags should not be inserted if in NOBR.') 64 | 65 | def test_after_nobr(self) -> None: 66 | input = '

abxyabcdef

' 67 | expected = '

abxyabcdef

' 68 | resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '') 69 | resolver.feed(input) 70 | self.assertEqual(resolver.output, expected, 71 | 'WBR tags should be inserted if after NOBR.') 72 | 73 | def test_img_in_nobr(self) -> None: 74 | input = '

abxyabcdef

' 75 | expected = '

abxyabcdef

' 76 | resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '') 77 | resolver.feed(input) 78 | self.assertEqual(resolver.output, expected, 79 | 'IMG should not affect surrounding NOBR.') 80 | 81 | 82 | class TestResolve(unittest.TestCase): 83 | 84 | def test_with_simple_text_input(self) -> None: 85 | chunks = ['abc', 'def'] 86 | html = 'abcdef' 87 | result = html_processor.resolve(chunks, html) 88 | expected = 'abc\u200bdef' 89 | self.assertEqual(result, expected) 90 | 91 | def test_with_standard_html_input(self) -> None: 92 | chunks = ['abc', 'def'] 93 | html = 'abcdef' 94 | result = html_processor.resolve(chunks, html) 95 | expected = 'abc\u200bdef' 96 | self.assertEqual(result, expected) 97 | 98 | def test_with_nodes_to_skip(self) -> None: 99 | chunks = ['abc', 'def', 'ghi'] 100 | html = "afghi" 101 | result = html_processor.resolve(chunks, html) 102 | expected = 'af\u200bghi' 103 | self.assertEqual(result, expected) 104 | 105 | def test_with_break_before_skip(self) -> None: 106 | chunks = ['abc', 'def', 'ghi', 'jkl'] 107 | html = "abcjkl" 108 | result = html_processor.resolve(chunks, html) 109 | expected = 'abc\u200b\u200bjkl' 110 | self.assertEqual(result, expected) 111 | 112 | def test_with_nothing_to_split(self) -> None: 113 | chunks = ['abcdef'] 114 | html = 'abcdef' 115 | result = html_processor.resolve(chunks, html) 116 | expected = 'abcdef' 117 | self.assertEqual(result, expected) 118 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the BudouX CLI."""

import io
import sys
import unittest
from os.path import abspath, dirname, join

# module hack
LIB_PATH = join(dirname(__file__), '..')
sys.path.insert(0, abspath(LIB_PATH))

from budoux import main  # noqa (module hack)

# Pin the standard streams to UTF-8 so the Japanese fixtures round-trip
# identically regardless of the platform locale.
if isinstance(sys.stdin, io.TextIOWrapper):
  sys.stdin.reconfigure(encoding='utf-8')

if isinstance(sys.stdout, io.TextIOWrapper):
  sys.stdout.reconfigure(encoding='utf-8')


class TestCommonOption(unittest.TestCase):
  """Tests options that apply to every CLI invocation (-v, -h, -V)."""

  def _assert_parse_exits(self, cmdargs: list, expected_code: int) -> None:
    # argparse reports errors, help, and version through SystemExit.
    with self.assertRaises(SystemExit) as cm:
      main.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, expected_code)

  def test_cmdargs_invalid_option(self) -> None:
    self._assert_parse_exits(['-v'], 2)

  def test_cmdargs_help(self) -> None:
    self._assert_parse_exits(['-h'], 0)

  def test_cmdargs_version(self) -> None:
    self._assert_parse_exits(['-V'], 0)


class TestModelOption(unittest.TestCase):
  """Tests the model file (-m) and language (-l) selection options."""

  def _assert_parse_exits(self, cmdargs: list, expected_code: int) -> None:
    # Invalid model/language choices are argparse errors (exit code 2).
    with self.assertRaises(SystemExit) as cm:
      main.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, expected_code)

  def test_cmdargs_invalid_json(self) -> None:
    self._assert_parse_exits(['-m', '404.json'], 2)

  def test_cmdargs_invalid_lang_1(self) -> None:
    self._assert_parse_exits(['-l', 'aa'], 2)

  def test_cmdargs_invalid_lang_2(self) -> None:
    self._assert_parse_exits(['-l', 'ja-abc'], 2)

  def test_cmdargs_lang_ja(self) -> None:
    self.assertEqual(
        main._main(['-l', 'ja', '今日は良い天気ですね。']), '今日は\n良い\n天気ですね。')

  def test_cmdargs_lang_zh_hans(self) -> None:
    self.assertEqual(main._main(['-l', 'zh-hans', '今天天气晴朗。']), '今天\n天气\n晴朗。')


class TestTextArguments(unittest.TestCase):
  """Tests plain-text and HTML positional arguments."""

  def test_cmdargs_single_text(self) -> None:
    self.assertEqual(main._main(['これはテストです。']), "これは\nテストです。")

  def test_cmdargs_single_multiline_text(self) -> None:
    # Sentences are separated by '---' by default.
    self.assertEqual(
        main._main(["これはテストです。\n今日は晴天です。"]),
        "これは\nテストです。\n---\n今日は\n晴天です。")

  def test_cmdargs_single_multiline_text_with_delimiter(self) -> None:
    self.assertEqual(
        main._main(["これはテストです。\n今日は晴天です。", "-d", "@"]),
        "これは\nテストです。\n@\n今日は\n晴天です。")

  def test_cmdargs_single_multiline_text_with_empty_delimiter(self) -> None:
    self.assertEqual(
        main._main(["これはテストです。\n今日は晴天です。", "-d", ""]),
        "これは\nテストです。\n\n今日は\n晴天です。")

  def test_cmdargs_multi_text(self) -> None:
    # More than one positional text argument is rejected.
    with self.assertRaises(SystemExit) as cm:
      main.main(['これはテストです。', '今日は晴天です。'])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_single_html(self) -> None:
    # NOTE(review): the leading '' looks like markup lost in this copy of the
    # file -- confirm the expected wrapper element against upstream.
    self.assertEqual(
        main._main(['-H', '今日はとても天気です。']), '' '今日は\u200bとても\u200b天気です。')

  def test_cmdargs_multi_html(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      main._main(['-H', '今日はとても天気です。', 'これはテストです。'])
    self.assertEqual(cm.exception.code, 2)


class TestStdin(unittest.TestCase):
  """Tests reading the input text from standard input."""

  def _run_with_stdin(self, fixture: str, cmdargs: list) -> str:
    # Rebind sys.stdin to a fixture file, then run the CLI entry point.
    path = join(abspath(dirname(__file__)), fixture)
    with open(path, "r", encoding=sys.getdefaultencoding()) as f:
      sys.stdin = f
      return main._main(cmdargs)

  def test_cmdargs_blank_stdin(self) -> None:
    self.assertEqual(self._run_with_stdin("in/1.in", []), "")

  def test_cmdargs_text_stdin(self) -> None:
    self.assertEqual(self._run_with_stdin("in/2.in", []), "これは\nテストです。")

  def test_cmdargs_html_stdin(self) -> None:
    # NOTE(review): expected markup appears truncated in this copy -- verify
    # against the upstream test suite.
    self.assertEqual(
        self._run_with_stdin("in/3.in", ["-H"]),
        '' 'これは\u200bテストです。\u200b\n' '')


if __name__ == '__main__':
  unittest.main()

# ------------------------------------------------------------------------------
# /tests/test_parser.py:
# ------------------------------------------------------------------------------
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the BudouX parser."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from budoux import parser  # noqa (module hack)


class TestParser(unittest.TestCase):
  """Tests parse() and translate_html_string() on tiny synthetic models."""

  TEST_SENTENCE = 'abcdeabcd'

  def test_parse(self) -> None:
    # A large positive UW4 score means "break right before this character".
    strong_a = parser.Parser({'UW4': {'a': 10000}})
    self.assertListEqual(
        strong_a.parse(TestParser.TEST_SENTENCE), ['abcde', 'abcd'],
        'Should separate if a strong feature item supports.')

    strong_b = parser.Parser({'UW4': {'b': 10000}})
    self.assertListEqual(
        strong_b.parse(TestParser.TEST_SENTENCE), ['a', 'bcdea', 'bcd'],
        'Should separate even if it makes the first character a sole phrase.')

    empty_model = parser.Parser({})
    self.assertListEqual(
        empty_model.parse(''), [],
        'Should return a blank list when the input is blank.')

  def test_translate_html_string(self) -> None:
    p = parser.Parser({'UW4': {'a': 10000}})  # break right before 'a'.

    # NOTE(review): the markup in these fixtures looks stripped in this copy
    # of the file -- confirm the tags against the upstream test suite.
    # TODO: Because the content for skip elements are included, the second
    # case tries to break before "alert". We may want to distinguish "skip
    # from the content" and "skip breaking" in future.
    cases = [
        ('xyzabcd', 'xyz\u200babcd',
         'Should output a html string with a SPAN parent with proper style attributes.'
        ),
        ('xyzxyzabc', 'xyz\u200bxyz\u200babc',
         'Should pass script tags as is.'),
        ('xyzabcabc', 'xyz\u200babc\u200babc',
         'Should skip some specific tags.'),
        ('xyzaabc', 'xyz\u200ba\u200babc',
         'Should not ruin attributes of child elements.'),
        ('xyza🇯🇵🇵🇹abc', 'xyz\u200ba🇯🇵🇵🇹\u200babc',
         'Should work with emojis.'),
    ]
    for input_html, expected_html, message in cases:
      self.assertEqual(p.translate_html_string(input_html), expected_html,
                       message)


class TestDefaultParser(unittest.TestCase):
  """Tests the parsers built from the bundled default models."""

  def test_load_default_japanese_parser(self) -> None:
    p_ja = parser.load_default_japanese_parser()
    phrases = p_ja.parse('Google の使命は、世界中の情報を整理し、世界中の人がアクセスできて使えるようにすることです。')
    self.assertListEqual(phrases, [
        'Google の', '使命は、', '世界中の', '情報を', '整理し、', '世界中の', '人が',
        'アクセスできて', '使えるように', 'する', 'ことです。'
    ])

  def test_load_default_simplified_chinese_parser(self) -> None:
    p_hans = parser.load_default_simplified_chinese_parser()
    phrases = p_hans.parse('我们的使命是整合全球信息,供大众使用,让人人受益。')
    self.assertListEqual(phrases, [
        '我们', '的', '使命', '是', '整合', '全球', '信息,', '供', '大众', '使用,',
        '让', '人', '人', '受益。'
    ])

  def test_load_default_traditional_chinese_parser(self) -> None:
    p_hant = parser.load_default_traditional_chinese_parser()
    phrases = p_hant.parse('我們的使命是匯整全球資訊,供大眾使用,使人人受惠。')
    self.assertListEqual(phrases, [
        '我們', '的', '使命', '是', '匯整', '全球', '資訊,', '供', '大眾', '使用,',
        '使', '人', '人', '受惠。'
    ])


if __name__ == '__main__':
  unittest.main()

# ------------------------------------------------------------------------------
# /tests/test_quality.py:
# ------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Quality regression test."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from budoux import load_default_japanese_parser, utils  # noqa (module hack)


class TestQuality(unittest.TestCase):
  """Checks that known-good segmentations do not regress."""

  def test_ja(self) -> None:
    # Each non-comment line of ja.tsv is "<label>\t<sentence>", where the
    # sentence carries utils.SEP at every expected phrase boundary.
    segmenter = load_default_japanese_parser()
    tsv_path = os.path.join(os.path.dirname(__file__), 'quality', 'ja.tsv')
    failures = []
    with open(tsv_path, 'r', encoding='utf-8') as f:
      for raw in f:
        if raw.startswith('#'):
          continue
        fields = raw.split('\t')
        if len(fields) < 2:
          continue
        expected = fields[1].strip()
        # Re-segment the plain sentence and compare with the annotation.
        plain = expected.replace(utils.SEP, '')
        actual = utils.SEP.join(segmenter.parse(plain))
        if actual != expected:
          failures.append((expected, actual))
    self.assertEqual(
        len(failures), 0, 'Failing sentences:\n' + '\n'.join(
            f'expected:{want}\tactual:{got}' for want, got in failures))

# ------------------------------------------------------------------------------