├── .github ├── dependabot.yml └── workflows │ ├── build-demo.yml │ ├── codeql.yml │ ├── dependency-review.yml │ ├── java-unittest.yml │ ├── nodejs-unittest.yml │ ├── py-unittest.yml │ ├── scorecard.yml │ └── style-check.yml ├── .gitignore ├── .markdownlint.yaml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── budoux ├── __init__.py ├── html_processor.py ├── main.py ├── models │ ├── ja.json │ ├── ja_knbc.json │ ├── th.json │ ├── zh-hans.json │ └── zh-hant.json ├── parser.py ├── py.typed ├── skip_nodes.json └── utils.py ├── bump_version.py ├── data └── finetuning │ └── ja │ ├── train.txt │ └── val.txt ├── demo ├── package-lock.json ├── package.json ├── src │ ├── app.ts │ └── worker.ts ├── static │ └── index.html └── tsconfig.json ├── example.png ├── java ├── .gitignore ├── README.md ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── google │ │ └── budoux │ │ ├── HTMLProcessor.java │ │ └── Parser.java │ └── test │ └── java │ └── com │ └── google │ └── budoux │ ├── HTMLProcessorTest.java │ └── ParserTest.java ├── javascript ├── .npmignore ├── .prettierrc.json ├── README.md ├── bin │ └── budoux.js ├── eslint.config.mjs ├── karma.conf.js ├── package-lock.json ├── package.json ├── scripts │ ├── check-cli-version.js │ └── copy-data.js ├── src │ ├── cli.ts │ ├── dom-browser.ts │ ├── dom.ts │ ├── html_processor.ts │ ├── index.ts │ ├── parser.ts │ ├── tests │ │ ├── index.browser.ts │ │ ├── index.node.ts │ │ ├── models │ │ │ └── separate_right_before_a.json │ │ ├── test_cli.ts │ │ ├── test_html_processor.ts │ │ ├── test_parser.ts │ │ ├── test_webcomponents.ts │ │ ├── testutils-browser.ts │ │ └── testutils.ts │ └── webcomponents │ │ ├── budoux-base.ts │ │ ├── budoux-ja.ts │ │ ├── budoux-th.ts │ │ ├── budoux-zh-hans.ts │ │ └── budoux-zh-hant.ts └── tsconfig.json ├── pyproject.toml ├── scripts ├── README.md ├── __init__.py ├── build_model.py ├── encode_data.py ├── finetune.py ├── prepare_knbc.py ├── prepare_wisesight.py ├── tests │ ├── 
test_build_model.py │ ├── test_encode_data.py │ ├── test_finetune.py │ ├── test_prepare_knbc.py │ ├── test_train.py │ └── test_translate_model.py ├── train.py └── translate_model.py ├── setup.cfg ├── setup.py └── tests ├── in ├── 1.in ├── 2.in └── 3.in ├── quality └── ja.tsv ├── test_html_processor.py ├── test_main.py ├── test_parser.py └── test_quality.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: pip 9 | directory: / 10 | schedule: 11 | interval: daily 12 | 13 | - package-ecosystem: github-actions 14 | directory: / 15 | schedule: 16 | interval: daily 17 | 18 | - package-ecosystem: npm 19 | directory: /demo 20 | schedule: 21 | interval: daily 22 | 23 | - package-ecosystem: npm 24 | directory: /javascript 25 | schedule: 26 | interval: daily 27 | 28 | - package-ecosystem: maven 29 | directory: /java 30 | schedule: 31 | interval: daily 32 | -------------------------------------------------------------------------------- /.github/workflows/build-demo.yml: -------------------------------------------------------------------------------- 1 | name: Build Demo 2 | on: 3 | push: 4 | branches: [ "main" ] 5 | permissions: 6 | contents: read 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Harden Runner 13 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 14 | with: 15 | egress-policy: audit 16 | 17 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 18 | - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 19 | 
with: 20 | node-version: '16' 21 | - run: npm install 22 | working-directory: ./javascript 23 | - run: npm install 24 | working-directory: ./demo 25 | - run: npm run build 26 | working-directory: ./demo 27 | - name: Upload static files as artifact 28 | id: deployment 29 | uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3.0.1 30 | with: 31 | path: ./demo/static/ 32 | deploy: 33 | needs: build 34 | permissions: 35 | pages: write 36 | id-token: write 37 | environment: 38 | name: github-pages 39 | url: ${{ steps.deployment.outputs.page_url }} 40 | runs-on: ubuntu-latest 41 | steps: 42 | - name: Deploy to GitHub Pages 43 | id: deployment 44 | uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5 45 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '42 20 * * 3' 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | analyze: 28 | name: Analyze 29 | runs-on: ubuntu-latest 30 | permissions: 31 | actions: read 32 | contents: read 33 | security-events: write 34 | 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | language: [ 'java', 'javascript', 'python' ] 39 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 40 | # Use only 'java' to analyze code written in Java, Kotlin or both 41 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both 42 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 43 | 44 | steps: 45 | - name: Harden Runner 46 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 47 | with: 48 | egress-policy: audit 49 | 50 | - name: Checkout repository 51 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 52 | 53 | # Initializes the CodeQL tools for scanning. 54 | - name: Initialize CodeQL 55 | uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v2.20.1 56 | with: 57 | languages: ${{ matrix.language }} 58 | # If you wish to specify custom queries, you can do so here or in a config file. 59 | # By default, queries listed here will override any specified in a config file. 60 | # Prefix the list here with "+" to use these queries and those in the config file. 
61 | 62 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 63 | # queries: security-extended,security-and-quality 64 | 65 | 66 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 67 | # If this step fails, then you should remove it and run the build manually (see below) 68 | - name: Autobuild 69 | uses: github/codeql-action/autobuild@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v2.20.1 70 | 71 | # ℹ️ Command-line programs to run using the OS shell. 72 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 73 | 74 | # If the Autobuild fails above, remove it and uncomment the following three lines. 75 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 76 | 77 | # - run: | 78 | # echo "Run, Build Application using script" 79 | # ./location_of_script_within_repo/buildscript.sh 80 | 81 | - name: Perform CodeQL Analysis 82 | uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v2.20.1 83 | with: 84 | category: "/language:${{matrix.language}}" 85 | -------------------------------------------------------------------------------- /.github/workflows/dependency-review.yml: -------------------------------------------------------------------------------- 1 | # Dependency Review Action 2 | # 3 | # This Action will scan dependency manifest files that change as part of a Pull Request, 4 | # surfacing known-vulnerable versions of the packages declared or updated in the PR. 5 | # Once installed, if the workflow run is marked as required, 6 | # PRs introducing known-vulnerable packages will be blocked from merging. 
7 | # 8 | # Source repository: https://github.com/actions/dependency-review-action 9 | name: 'Dependency Review' 10 | on: [pull_request] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | dependency-review: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Harden Runner 20 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 21 | with: 22 | egress-policy: audit 23 | 24 | - name: 'Checkout Repository' 25 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 26 | - name: 'Dependency Review' 27 | uses: actions/dependency-review-action@3b139cfc5fae8b618d3eae3675e383bb1769c019 # v4.5.0 28 | -------------------------------------------------------------------------------- /.github/workflows/java-unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest for Java 2 | on: 3 | push: 4 | paths: 5 | - 'java/**' 6 | pull_request: 7 | paths: 8 | - 'java/**' 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | java-unittest: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | steps: 20 | - name: Harden Runner 21 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 22 | with: 23 | egress-policy: audit 24 | 25 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 26 | - name: Set up JDK 17 27 | uses: actions/setup-java@7a6d8a8234af8eb26422e24e3006232cccaa061b # v4.6.0 28 | with: 29 | java-version: '17' 30 | distribution: 'temurin' 31 | - name: Build with Maven 32 | run: mvn --batch-mode --update-snapshots -f ./java/pom.xml package 33 | -------------------------------------------------------------------------------- /.github/workflows/nodejs-unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest for NodeJS 2 | on: 3 | push: 4 | paths: 5 | - 
'javascript/**' 6 | pull_request: 7 | paths: 8 | - 'javascript/**' 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | nodejs-unittest: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | node-version: [18, 20] 20 | 21 | steps: 22 | - name: Harden Runner 23 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 24 | with: 25 | egress-policy: audit 26 | 27 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 28 | - name: Setup Node ${{ matrix.node-version }} 29 | uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 30 | with: 31 | node-version: ${{ matrix.node-version }} 32 | - name: Install Dependencies 33 | run: npm install 34 | working-directory: ./javascript 35 | - name: Create symlink 36 | run: npm link 37 | working-directory: ./javascript 38 | - name: Build package 39 | run: npm run build --if-present 40 | working-directory: ./javascript 41 | - name: Run testcases 42 | run: npm test 43 | working-directory: ./javascript 44 | -------------------------------------------------------------------------------- /.github/workflows/py-unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest for Python 2 | on: 3 | push: 4 | paths-ignore: 5 | - 'javascript/**' 6 | - 'java/**' 7 | pull_request: 8 | paths-ignore: 9 | - 'javascript/**' 10 | - 'java/**' 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | python-unittest: 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ubuntu-latest, macos-latest, windows-latest] 21 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 22 | steps: 23 | - name: Harden Runner 24 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 25 | with: 26 | egress-policy: audit 27 | 28 | - uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 29 | - name: Setup python ${{ matrix.python-version }} 30 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | - name: Install requirements 34 | run: | 35 | python -m pip install --upgrade pip 36 | python -m pip install ".[dev]" 37 | - name: Run unittest 38 | run: pytest ./tests 39 | - name: Install Jax 40 | if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.9' }} 41 | run: pip install ".[jaxcpu]" 42 | - name: Run unittest with Jax 43 | if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.9' }} 44 | run: pytest ./scripts/tests 45 | -------------------------------------------------------------------------------- /.github/workflows/scorecard.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. They are provided 2 | # by a third-party and are governed by separate terms of service, privacy 3 | # policy, and support documentation. 4 | 5 | name: Scorecard supply-chain security 6 | on: 7 | # For Branch-Protection check. Only the default branch is supported. See 8 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection 9 | branch_protection_rule: 10 | # To guarantee Maintained check is occasionally updated. See 11 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained 12 | schedule: 13 | - cron: '37 11 * * 2' 14 | push: 15 | branches: [ "main" ] 16 | 17 | # Declare default permissions as read only. 18 | permissions: read-all 19 | 20 | jobs: 21 | analysis: 22 | name: Scorecard analysis 23 | runs-on: ubuntu-latest 24 | permissions: 25 | # Needed to upload the results to code-scanning dashboard. 26 | security-events: write 27 | # Needed to publish results and get a badge (see publish_results below). 
28 | id-token: write 29 | # Uncomment the permissions below if installing in a private repository. 30 | # contents: read 31 | # actions: read 32 | 33 | steps: 34 | - name: Harden Runner 35 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 36 | with: 37 | egress-policy: audit 38 | 39 | - name: "Checkout code" 40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 41 | with: 42 | persist-credentials: false 43 | 44 | - name: "Run analysis" 45 | uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 46 | with: 47 | results_file: results.sarif 48 | results_format: sarif 49 | # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: 50 | # - you want to enable the Branch-Protection check on a *public* repository, or 51 | # - you are installing Scorecard on a *private* repository 52 | # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. 53 | # repo_token: ${{ secrets.SCORECARD_TOKEN }} 54 | 55 | # Public repositories: 56 | # - Publish results to OpenSSF REST API for easy access by consumers 57 | # - Allows the repository to include the Scorecard badge. 58 | # - See https://github.com/ossf/scorecard-action#publishing-results. 59 | # For private repositories: 60 | # - `publish_results` will always be set to `false`, regardless 61 | # of the value entered here. 62 | publish_results: true 63 | 64 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF 65 | # format to the repository Actions tab. 66 | - name: "Upload artifact" 67 | uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 68 | with: 69 | name: SARIF file 70 | path: results.sarif 71 | retention-days: 5 72 | 73 | # Upload the results to GitHub's code scanning dashboard. 
74 | - name: "Upload to code-scanning" 75 | uses: github/codeql-action/upload-sarif@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 76 | with: 77 | sarif_file: results.sarif 78 | -------------------------------------------------------------------------------- /.github/workflows/style-check.yml: -------------------------------------------------------------------------------- 1 | name: Style Check 2 | on: [push, pull_request] 3 | permissions: 4 | contents: read 5 | jobs: 6 | python-style-check: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Harden Runner 10 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 11 | with: 12 | egress-policy: audit 13 | 14 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 15 | - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 16 | with: 17 | python-version: '3.10' 18 | - name: Install dependencies 19 | run: | 20 | pip install --upgrade pip 21 | pip install ".[dev]" 22 | pip install ".[jaxcpu]" 23 | - name: Run isort 24 | run: | 25 | isort --diff --check . 
26 | - name: Run yapf 27 | run: | 28 | yapf --diff --recursive budoux tests scripts 29 | - name: Run mypy 30 | run: | 31 | mypy budoux tests scripts 32 | - name: Run flake8 33 | if: ${{ always() }} 34 | uses: suo/flake8-github-action@3e87882219642e01aa8a6bbd03b4b0adb8542c2a 35 | with: 36 | checkName: python-style-check 37 | env: 38 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 39 | typescript-style-check: 40 | runs-on: ubuntu-latest 41 | steps: 42 | - name: Harden Runner 43 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 44 | with: 45 | egress-policy: audit 46 | 47 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 48 | - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 49 | with: 50 | node-version: '20' 51 | - run: npm install 52 | working-directory: ./javascript 53 | - run: npm run lint 54 | working-directory: ./javascript 55 | java-style-check: 56 | runs-on: ubuntu-latest 57 | steps: 58 | - name: Harden Runner 59 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 60 | with: 61 | egress-policy: audit 62 | 63 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 64 | - uses: actions/setup-java@7a6d8a8234af8eb26422e24e3006232cccaa061b # v4.6.0 65 | with: 66 | java-version: '17' 67 | distribution: 'temurin' 68 | - name: Google Java Format 69 | uses: axel-op/googlejavaformat-action@dbff853fb823671ec5781365233bf86543b13215 70 | with: 71 | args: "--replace" 72 | skip-commit: true 73 | - name: Print diffs 74 | run: git --no-pager diff --exit-code 75 | markdown-style-check: 76 | runs-on: ubuntu-latest 77 | steps: 78 | - name: Harden Runner 79 | uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 80 | with: 81 | egress-policy: audit 82 | 83 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 84 | - name: markdownlint 85 | uses: 
nosborn/github-action-markdown-cli@9b5e871c11cc0649c5ac2526af22e23525fa344d 86 | with: 87 | files: '**/*.md' 88 | config_file: .markdownlint.yaml 89 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .venv 3 | /dist 4 | __pycache__ 5 | *.pyc 6 | *.log 7 | *.egg-info 8 | *.coverage 9 | cov.xml 10 | 11 | # Python related files 12 | build/ 13 | 14 | # JavaScript related files 15 | node_modules 16 | demo/static/app.js 17 | demo/static/worker.js 18 | javascript/bundle 19 | javascript/dist 20 | javascript/module 21 | javascript/src/data 22 | 23 | # Generated files by scripts 24 | source.txt 25 | encoded_data.txt 26 | weights.txt 27 | 28 | .vscode/ 29 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | default: true 15 | 16 | MD013: 17 | code_blocks: false 18 | MD010: false 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. 
There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement (CLA). You (or your employer) retain the copyright to your 10 | contribution; this simply gives us permission to use and redistribute your 11 | contributions as part of the project. Head over to 12 | to see your current agreements on file or 13 | to sign a new one. 14 | 15 | You generally only need to submit a CLA once, so if you've already submitted one 16 | (even if it was for a different project), you probably don't need to do it 17 | again. 18 | 19 | ## Code Reviews 20 | 21 | All submissions, including submissions by project members, require review. We 22 | use GitHub pull requests for this purpose. Consult 23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 24 | information on using pull requests. 25 | 26 | ## Community Guidelines 27 | 28 | This project follows 29 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include budoux/unicode_blocks.json 2 | include budoux/skip_nodes.json 3 | include budoux/py.typed 4 | recursive-include budoux/models *.json 5 | -------------------------------------------------------------------------------- /budoux/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """BudouX module.""" 15 | 16 | from . import parser 17 | 18 | __version__ = "0.7.0" 19 | 20 | Parser = parser.Parser 21 | load_default_japanese_parser = parser.load_default_japanese_parser 22 | load_default_simplified_chinese_parser = parser.load_default_simplified_chinese_parser 23 | load_default_traditional_chinese_parser = parser.load_default_traditional_chinese_parser 24 | load_default_thai_parser = parser.load_default_thai_parser 25 | -------------------------------------------------------------------------------- /budoux/html_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """HTML processor.""" 15 | 16 | import json 17 | import os 18 | import queue 19 | import typing 20 | from html.parser import HTMLParser 21 | 22 | from .utils import SEP 23 | 24 | HTMLAttr = typing.List[typing.Tuple[str, typing.Union[str, None]]] 25 | PARENT_CSS_STYLE = 'word-break: keep-all; overflow-wrap: anywhere;' 26 | with open( 27 | os.path.join(os.path.dirname(__file__), 'skip_nodes.json'), 28 | encoding='utf-8') as f: 29 | SKIP_NODES: typing.Set[str] = set(json.load(f)) 30 | 31 | 32 | class ElementState(object): 33 | """Represents the state for an element. 34 | 35 | Attributes: 36 | tag (str): The tag name. 37 | to_skip (bool): Whether the content should be skipped or not. 38 | """ 39 | 40 | def __init__(self, tag: str, to_skip: bool) -> None: 41 | self.tag = tag 42 | self.to_skip = to_skip 43 | 44 | 45 | class TextContentExtractor(HTMLParser): 46 | """An HTML parser to extract text content. 47 | 48 | Attributes: 49 | output (str): Accumulated text content. 50 | """ 51 | output = '' 52 | 53 | def handle_data(self, data: str) -> None: 54 | self.output += data 55 | 56 | 57 | class HTMLChunkResolver(HTMLParser): 58 | """An HTML parser to resolve the given HTML string and semantic chunks. 59 | 60 | Attributes: 61 | output (str): The HTML string to output. 62 | """ 63 | output = '' 64 | 65 | def __init__(self, chunks: typing.List[str], separator: str): 66 | """Initializes the parser. 67 | 68 | Args: 69 | chunks (List[str]): The chunks to resolve. 70 | separator (str): The separator string. 
71 | """ 72 | HTMLParser.__init__(self) 73 | self.chunks_joined = SEP.join(chunks) 74 | self.separator = separator 75 | self.to_skip = False 76 | self.scan_index = 0 77 | self.element_stack: queue.LifoQueue[ElementState] = queue.LifoQueue() 78 | 79 | def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None: 80 | attr_pairs = [] 81 | for attr in attrs: 82 | if attr[1] is None: 83 | attr_pairs.append(' ' + attr[0]) 84 | else: 85 | attr_pairs.append(' %s="%s"' % (attr[0], attr[1])) 86 | encoded_attrs = ''.join(attr_pairs) 87 | self.element_stack.put(ElementState(tag, self.to_skip)) 88 | if tag.upper() in SKIP_NODES: 89 | if not self.to_skip and self.chunks_joined[self.scan_index] == SEP: 90 | self.scan_index += 1 91 | self.output += self.separator 92 | self.to_skip = True 93 | self.output += '<%s%s>' % (tag, encoded_attrs) 94 | 95 | def handle_endtag(self, tag: str) -> None: 96 | self.output += '' % (tag) 97 | while not self.element_stack.empty(): 98 | state = self.element_stack.get_nowait() 99 | if state.tag == tag: 100 | self.to_skip = state.to_skip 101 | break 102 | # If the close tag doesn't match the open tag, remove it and keep looking. 103 | # This means that close tags close their corresponding open tags. 104 | # e.g., `abcdef` or `

abcdef

` are both valid 105 | # HTML as per the HTML spec. 106 | # Note the HTML "adoption agency algorithm" isn't fully supported. 107 | # See https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser 108 | 109 | def handle_data(self, data: str) -> None: 110 | for char in data: 111 | if not char == self.chunks_joined[self.scan_index]: 112 | if not self.to_skip: 113 | self.output += self.separator 114 | self.scan_index += 1 115 | self.output += char 116 | self.scan_index += 1 117 | 118 | 119 | def get_text(html: str) -> str: 120 | """Gets the text content from the input HTML string. 121 | 122 | Args: 123 | html (str): Input HTML string. 124 | 125 | Returns: 126 | The text content. 127 | """ 128 | text_content_extractor = TextContentExtractor() 129 | text_content_extractor.feed(html) 130 | return text_content_extractor.output 131 | 132 | 133 | def resolve(phrases: typing.List[str], 134 | html: str, 135 | separator: str = '\u200b') -> str: 136 | """Wraps phrases in the HTML string with non-breaking markup. 137 | 138 | Args: 139 | phrases (List[str]): The phrases included in the HTML string. 140 | html (str): The HTML string to resolve. 141 | separator (str, optional): The separator string. 142 | 143 | Returns: 144 | The HTML string with phrases wrapped in non-breaking markup. 145 | """ 146 | resolver = HTMLChunkResolver(phrases, separator) 147 | resolver.feed(html) 148 | result = '%s' % (PARENT_CSS_STYLE, resolver.output) 149 | return result 150 | -------------------------------------------------------------------------------- /budoux/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BudouX Script to provide CLI for user."""
import argparse
import json
import os
import shutil
import sys
import textwrap
import typing
from pathlib import Path

# TODO: replace with importlib.resources when py3.8 support is dropped.
import importlib_resources

import budoux

ArgList = typing.Optional[typing.List[str]]
# Map of language code (model file stem, e.g. 'ja') -> bundled model path.
models: Path = importlib_resources.files('budoux') / "models"
langs = dict((model.stem, model) for model in models.glob("*.json"))


class BudouxHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
  """Help formatter that shows defaults and keeps raw description text."""
  pass


def check_file(path: str) -> str:
  """Check if a given filepath exists or not.

  Args:
    path (str): Model path

  Raises:
    argparse.ArgumentTypeError: Raise if given path does not exist.

  Returns:
    str: A model path.
  """
  if os.path.isfile(path):
    return path
  else:
    raise argparse.ArgumentTypeError(f"'{path}' is not found.")


def check_lang(lang: str) -> Path:
  """Check if given language exists or not.

  Args:
    lang (str): language code (e.g.: 'ja')

  Raises:
    argparse.ArgumentTypeError: Raise if no model for given language exists.

  Returns:
    The model path.
  """
  if lang in langs:
    return langs[lang]
  else:
    raise argparse.ArgumentTypeError(
        f"'{lang}' does not exist in builtin models. (supported languages: {list(langs.keys())})"
    )


def parse_args(test: ArgList = None) -> argparse.Namespace:
  """Parse commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None.

  Returns:
    argparse.Namespace: Parsed data of args.
  """
  parser = argparse.ArgumentParser(
      prog="budoux",
      formatter_class=(lambda prog: BudouxHelpFormatter(
          prog,
          **{
              "width": shutil.get_terminal_size(fallback=(120, 50)).columns,
              "max_help_position": 30,
          },
      )),
      description=textwrap.dedent("""\
          BudouX is the successor to Budou,
          the machine learning powered line break organizer tool."""),
      epilog="\n- ".join(
          ["supported languages of `-l`, `--lang`:", *langs.keys()]))

  parser.add_argument("text", metavar="TXT", nargs="?", type=str, help="text")
  parser.add_argument(
      "-H",
      "--html",
      action="store_true",
      help="HTML mode",
  )
  # -m and -l are mutually exclusive ways to select the model.
  model_select_group = parser.add_mutually_exclusive_group()
  model_select_group.add_argument(
      "-m",
      "--model",
      metavar="JSON",
      type=check_file,
      default=check_lang('ja'),
      help="custom model file path",
  )
  model_select_group.add_argument(
      "-l",
      "--lang",
      metavar="LANG",
      type=check_lang,
      help="language of custom model",
  )
  parser.add_argument(
      "-s",
      "--sep",
      metavar="STR",
      type=str,
      default="\n",
      help="output phrase separator in TEXT mode",
  )
  parser.add_argument(
      "-d",
      "--delim",
      metavar="STR",
      type=str,
      default="---",
      help="output sentence delimiter in TEXT mode",
  )
  parser.add_argument(
      "-V",
      "--version",
      action="version",
      version="%(prog)s {}".format(budoux.__version__),
  )
  if test is not None:
    return parser.parse_args(test)
  else:
    return parser.parse_args()


def _main(test: ArgList = None) -> str:
  """Runs the CLI once and returns the formatted result string.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None.

  Returns:
    str: Parsed/annotated output for the requested mode.
  """
  args = parse_args(test=test)
  # --lang wins when given; otherwise --model (which defaults to 'ja').
  model_path = args.lang or args.model
  with open(model_path, 'r', encoding='utf-8') as f:
    model = json.load(f)

  parser = budoux.Parser(model)
  if args.html:
    if args.text is None:
      inputs_html = sys.stdin.read()
    else:
      inputs_html = args.text
    res = parser.translate_html_string(inputs_html)
  else:
    if args.text is None:
      inputs = [v.rstrip() for v in sys.stdin.readlines()]
    else:
      inputs = [v.rstrip() for v in args.text.splitlines()]
    outputs = [parser.parse(sentence) for sentence in inputs]
    combined_output = [args.sep.join(output) for output in outputs]
    ors = "\n" + args.delim + "\n"
    res = ors.join(combined_output)

  return res


def main(test: ArgList = None) -> None:
  """CLI entry point; prints the result and exits quietly on Ctrl-C."""
  try:
    print(_main(test))
  except KeyboardInterrupt:
    # FIX: use sys.exit instead of the site-module builtin exit(), which is
    # not guaranteed to exist (e.g. under `python -S`); sys is imported here.
    sys.exit(0)


if __name__ == "__main__":
  main()
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 | """BudouX parser to provide semantic chunks.""" 15 | 16 | import json 17 | import os 18 | import typing 19 | 20 | from .html_processor import get_text, resolve 21 | 22 | MODEL_DIR = os.path.join(os.path.dirname(__file__), 'models') 23 | 24 | 25 | class Parser: 26 | """BudouX's Parser. 27 | 28 | The main parser object with a variety of class methods to provide semantic 29 | chunks and markups from the given input string. 30 | 31 | Attributes: 32 | model: A dict mapping a feature (str) and its score (int). 33 | """ 34 | 35 | def __init__(self, model: typing.Dict[str, typing.Dict[str, int]]): 36 | """Initializes the parser. 37 | 38 | Args: 39 | model (Dict[str, Dict[str, int]]): A dict mapping a feature and its score. 40 | """ 41 | self.model = model 42 | 43 | def parse(self, sentence: str) -> typing.List[str]: 44 | """Parses the input sentence and returns a list of semantic chunks. 45 | 46 | Args: 47 | sentence (str): An input sentence. 48 | 49 | Returns: 50 | A list of semantic chunks (List[str]). 
51 | """ 52 | if sentence == '': 53 | return [] 54 | chunks = [sentence[0]] 55 | base_score = -sum(sum(g.values()) for g in self.model.values()) * 0.5 56 | for i in range(1, len(sentence)): 57 | score = base_score 58 | if i > 2: 59 | score += self.model.get('UW1', {}).get(sentence[i - 3], 0) 60 | if i > 1: 61 | score += self.model.get('UW2', {}).get(sentence[i - 2], 0) 62 | score += self.model.get('UW3', {}).get(sentence[i - 1], 0) 63 | score += self.model.get('UW4', {}).get(sentence[i], 0) 64 | if i + 1 < len(sentence): 65 | score += self.model.get('UW5', {}).get(sentence[i + 1], 0) 66 | if i + 2 < len(sentence): 67 | score += self.model.get('UW6', {}).get(sentence[i + 2], 0) 68 | 69 | if i > 1: 70 | score += self.model.get('BW1', {}).get(sentence[i - 2:i], 0) 71 | score += self.model.get('BW2', {}).get(sentence[i - 1:i + 1], 0) 72 | if i + 1 < len(sentence): 73 | score += self.model.get('BW3', {}).get(sentence[i:i + 2], 0) 74 | 75 | if i > 2: 76 | score += self.model.get('TW1', {}).get(sentence[i - 3:i], 0) 77 | if i > 1: 78 | score += self.model.get('TW2', {}).get(sentence[i - 2:i + 1], 0) 79 | if i + 1 < len(sentence): 80 | score += self.model.get('TW3', {}).get(sentence[i - 1:i + 2], 0) 81 | if i + 2 < len(sentence): 82 | score += self.model.get('TW4', {}).get(sentence[i:i + 3], 0) 83 | 84 | if score > 0: 85 | chunks.append(sentence[i]) 86 | else: 87 | chunks[-1] += sentence[i] 88 | return chunks 89 | 90 | def translate_html_string(self, html: str) -> str: 91 | """Translates the given HTML string with markups for semantic line breaks. 92 | 93 | Args: 94 | html (str): An input html string. 95 | 96 | Returns: 97 | The translated HTML string (str). 98 | """ 99 | # TODO: Align with the JavaScript API regarding the parent element addition. 
100 | text_content = get_text(html) 101 | chunks = self.parse(text_content) 102 | return resolve(chunks, html) 103 | 104 | 105 | def load_default_japanese_parser() -> Parser: 106 | """Loads a parser equipped with the default Japanese model. 107 | 108 | Returns: 109 | A parser (:obj:`budoux.Parser`). 110 | """ 111 | with open(os.path.join(MODEL_DIR, 'ja.json'), encoding='utf-8') as f: 112 | model = json.load(f) 113 | return Parser(model) 114 | 115 | 116 | def load_default_simplified_chinese_parser() -> Parser: 117 | """Loads a parser equipped with the default Simplified Chinese model. 118 | 119 | Returns: 120 | A parser (:obj:`budoux.Parser`). 121 | """ 122 | with open(os.path.join(MODEL_DIR, 'zh-hans.json'), encoding='utf-8') as f: 123 | model = json.load(f) 124 | return Parser(model) 125 | 126 | 127 | def load_default_traditional_chinese_parser() -> Parser: 128 | """Loads a parser equipped with the default Traditional Chinese model. 129 | 130 | Returns: 131 | A parser (:obj:`budoux.Parser`). 132 | """ 133 | with open(os.path.join(MODEL_DIR, 'zh-hant.json'), encoding='utf-8') as f: 134 | model = json.load(f) 135 | return Parser(model) 136 | 137 | 138 | def load_default_thai_parser() -> Parser: 139 | """Loads a parser equipped with the default Thai model. 140 | 141 | Returns: 142 | A parser (:obj:`budoux.Parser`). 
143 | """ 144 | with open(os.path.join(MODEL_DIR, 'th.json'), encoding='utf-8') as f: 145 | model = json.load(f) 146 | return Parser(model) 147 | -------------------------------------------------------------------------------- /budoux/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/budoux/py.typed -------------------------------------------------------------------------------- /budoux/skip_nodes.json: -------------------------------------------------------------------------------- 1 | [ 2 | "ABBR", 3 | "BUTTON", 4 | "CODE", 5 | "IFRAME", 6 | "INPUT", 7 | "META", 8 | "NOBR", 9 | "SCRIPT", 10 | "STYLE", 11 | "TEXTAREA", 12 | "TIME", 13 | "VAR" 14 | ] 15 | -------------------------------------------------------------------------------- /budoux/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for BudouX.""" 15 | 16 | SEP = '▁' 17 | """The separator string to specify breakpoints.""" 18 | -------------------------------------------------------------------------------- /bump_version.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import json 17 | import re 18 | import subprocess 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description='Bump the version number.') 23 | parser.add_argument( 24 | 'new_version', type=str, help='The new version number (e.g., 1.2.3)') 25 | args = parser.parse_args() 26 | new_version = args.new_version 27 | 28 | # Updates Python port version number 29 | init_file = 'budoux/__init__.py' 30 | with open(init_file, 'r') as f: 31 | content = f.read() 32 | new_content = re.sub(r'(__version__\s+=\s+[\'"])([\.\d]+)([\'"])', 33 | rf'\g<1>{new_version}\g<3>', content) 34 | with open(init_file, 'w') as f: 35 | f.write(new_content) 36 | 37 | # Updates JavaScript port version number 38 | package_json_path = 'javascript/package.json' 39 | with open(package_json_path, 'r') as f: 40 | package_data = json.load(f) 41 | current_version = package_data.get('version') 42 | 43 | if current_version != new_version: 44 | npm_command = ['npm', 'version', new_version, '--no-git-tag-version'] 45 | subprocess.run(npm_command, 
cwd='javascript', check=True) 46 | else: 47 | print(f"JavaScript version is already {new_version}, skipping npm version.") 48 | 49 | cli_file = 'javascript/src/cli.ts' 50 | with open(cli_file, 'r') as f: 51 | content = f.read() 52 | new_content = re.sub(r'(const\s+CLI_VERSION\s+=\s+[\'"])([\.\d]+)([\'"])', 53 | rf'\g<1>{new_version}\g<3>', content) 54 | with open(cli_file, 'w') as f: 55 | f.write(new_content) 56 | 57 | # Updates Java port version number 58 | mvn_command = [ 59 | 'mvn', 'versions:set', f'-DnewVersion={new_version}', 60 | '-DgenerateBackupPoms=false' 61 | ] 62 | subprocess.run(mvn_command, cwd='java', check=True) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /data/finetuning/ja/train.txt: -------------------------------------------------------------------------------- 1 | 指定された▁時間以上▁アプリケーションを▁利用する▁ことは▁できません。 2 | これ以上▁その機器を▁利用する▁場合は▁注意してください。 3 | それ以上▁コップを▁振ると▁こぼれます。 4 | ファイルは▁そのまま▁ご利用いただけます。 5 | 彼は▁そのまま▁行こうとした。 6 | ご利用▁いただき▁ありがとう▁ございます。 7 | フィードバック▁ありがとう▁ございます。 8 | 貴重な▁ご意見▁ありがとう▁ございます。 9 | この本は▁あらゆる▁トピックを▁カバーします。 10 | ドアを▁ありと▁あらゆる▁力を▁込めて▁開けます。 11 | 身の▁回りの▁あらゆる▁ものを▁化学式で▁表す。 12 | 当機は▁まもなく▁着陸態勢に▁入ります。 13 | まもなくして▁彼女が▁来た。 14 | まもなく▁電車が▁到着します。 15 | ようやく▁日が▁暮れた。 16 | やっと▁ようやく▁公開できそうです。 17 | あいつが▁ようやく▁来た。 18 | 夕方▁ようやく▁完成した。 19 | あれが▁入ったのは▁たまたまです。 20 | たまたま▁手に▁入れる▁ことが▁できた。 21 | 彼が▁たまたま▁持っていた。 22 | 全部▁まとめて▁提出します。 23 | 論点を▁まとめる。 24 | 思った▁とおりに▁書く。 25 | 言われた▁とおりに▁動きます。 26 | まるで▁水晶の▁ように▁すきとおって▁いた。 27 | 彼の▁すきとおる▁肌 28 | 冷たさを▁もつ▁青い▁空 29 | 当日券のみ▁有効です。 30 | 該当する方▁のみ▁入場できます。 31 | あの▁青い▁空と▁白い▁雲のみが▁見える。 32 | 白い▁つぶが▁ちりのように▁舞う 33 | つぶつぶの▁食感 34 | 煙が▁どんどん▁広がっていく 35 | さあ▁どんどん▁食べてくれ 36 | そこが▁ちがうと▁思う 37 | はじまりが▁ちがうから▁おわりも▁ちがう 38 | 日が▁しずむまでに▁終わらせよう 39 | うまく▁言葉に▁できない 40 | それは▁子どもの▁遊び場です。 41 | ふだん▁どおりに▁やれば▁大丈夫。 42 | この▁おもちゃを▁ください。 43 | 映画に▁感情移入する。 44 | 制度に▁甘えがちな▁場面 45 | 可能性が▁浮かび▁上がる -------------------------------------------------------------------------------- 
/data/finetuning/ja/val.txt: -------------------------------------------------------------------------------- 1 | それ以上▁モニターは▁増やせません 2 | 今回の▁発表は▁以上に▁なります。 3 | そのままに▁しておけば▁良い。 4 | そのまま▁お送りください。 5 | たくさんの▁お便り▁ありがとう▁ございます。 6 | 彼は▁あらゆる▁服を▁持っています。 7 | 係の▁者が▁まもなく▁来ます。 8 | 山の▁頂が▁ようやく▁見えた。 9 | たまたま▁聞こえてきた▁歌声。 10 | -------------------------------------------------------------------------------- /demo/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "budoux-demo", 3 | "version": "0.1.2", 4 | "description": "A demo app for BudouX", 5 | "main": "static/app.js", 6 | "scripts": { 7 | "build:app": "esbuild src/app.ts --bundle --minify --outfile=static/app.js", 8 | "build:worker": "esbuild src/worker.ts --bundle --minify --outfile=static/worker.js", 9 | "build": "npm run build:app && npm run build:worker", 10 | "watch:app": "esbuild src/app.ts --watch --bundle --minify --outfile=static/app.js", 11 | "watch:worker": "esbuild src/worker.ts --watch --bundle --minify --outfile=static/worker.js", 12 | "watch": "concurrently \"npm run watch:app\" \"npm run watch:worker\"", 13 | "serve": "http-server static", 14 | "dev": "concurrently \"npm run serve\" \"npm run watch\"", 15 | "start": "npm run dev" 16 | }, 17 | "keywords": [], 18 | "author": "Shuhei Iitsuka", 19 | "license": "Apache-2.0", 20 | "dependencies": { 21 | "budoux": "file:../javascript", 22 | "dompurify": "^3.2.5" 23 | }, 24 | "devDependencies": { 25 | "@types/dompurify": "^3.2.0", 26 | "concurrently": "^9.1.2", 27 | "esbuild": "^0.19.5", 28 | "http-server": "^14.1.1", 29 | "typescript": "^5.2.2" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /demo/src/app.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in 
compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import DOMPurify from 'dompurify'; 18 | import { loadDefaultJapaneseParser, loadDefaultSimplifiedChineseParser, loadDefaultTraditionalChineseParser, loadDefaultThaiParser } from 'budoux'; 19 | 20 | const parsers = new Map([ 21 | ['ja', loadDefaultJapaneseParser()], 22 | ['zh-hans', loadDefaultSimplifiedChineseParser()], 23 | ['zh-hant', loadDefaultTraditionalChineseParser()], 24 | ['th', loadDefaultThaiParser()] 25 | ]); 26 | const defaultInputs = new Map([ 27 | ['ja', 'Google の使命は、世界中の情報を整理し、世界中の人がアクセスできて使えるようにすることです。'], 28 | ['zh-hans', '我们的使命是整合全球信息,供大众使用,让人人受益。'], 29 | ['zh-hant', '我們的使命是匯整全球資訊,供大眾使用,使人人受惠。'], 30 | ['th', 'พันธกิจของเราคือการจัดระเบียบข้อมูลในโลกนี้และทำให้เข้าถึงได้ง่ายในทุกที่และมีประโยชน์'] 31 | ]) 32 | const inputTextElement = document.getElementById('input') as HTMLTextAreaElement; 33 | const outputContainerElement = document.getElementById('output') as HTMLElement; 34 | const fontSizeElement = document.getElementById('fontsize') as HTMLInputElement; 35 | const brCheckElement = document.getElementById('wbr2br') as HTMLInputElement; 36 | const modelSelectElement = document.getElementById('model') as HTMLSelectElement; 37 | const url = new URL(document.location.href); 38 | const worker = new Worker('./worker.js'); 39 | worker.onmessage = (e: MessageEvent) => { 40 | console.log('response from worker:', e); 41 | }; 42 | 43 | 44 | /** 45 | * Runs the BudouX model to process the input text and render the processed HTML. 
46 | */ 47 | const run = () => { 48 | outputContainerElement.innerHTML = DOMPurify.sanitize(inputTextElement.value); 49 | const model = modelSelectElement.value; 50 | worker.postMessage({'sentence': outputContainerElement.textContent, 'model': model}); 51 | const parser = parsers.get(model); 52 | if (!parser) return; 53 | parser.applyToElement(outputContainerElement); 54 | const renderWithBR = brCheckElement.checked; 55 | if (renderWithBR) { 56 | outputContainerElement.innerHTML = DOMPurify.sanitize( 57 | outputContainerElement.innerHTML.replace(/\u200b/g, '
')); 58 | } 59 | url.searchParams.set('q', inputTextElement.value); 60 | window.history.replaceState('', '', url.toString()); 61 | }; 62 | 63 | /** 64 | * Initializes the app. 65 | */ 66 | const init = () => { 67 | const lang = url.searchParams.get('lang'); 68 | if (lang) modelSelectElement.value = lang; 69 | const input = url.searchParams.get('q') || defaultInputs.get(modelSelectElement.value); 70 | if (input) inputTextElement.value = input; 71 | run(); 72 | } 73 | 74 | fontSizeElement.addEventListener('input', () => { 75 | outputContainerElement.style.fontSize = `${fontSizeElement.value}rem`; 76 | }) 77 | 78 | inputTextElement.addEventListener('input', () => { 79 | run(); 80 | }); 81 | 82 | brCheckElement.addEventListener('input', () => { 83 | run(); 84 | }); 85 | 86 | modelSelectElement.addEventListener('change', () => { 87 | url.searchParams.set('lang', modelSelectElement.value); 88 | window.history.pushState('', '', url.toString()); 89 | const input = defaultInputs.get(modelSelectElement.value); 90 | if (input) inputTextElement.value = input; 91 | run(); 92 | }) 93 | 94 | init(); -------------------------------------------------------------------------------- /demo/src/worker.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | import { Parser, jaModel, zhHansModel, zhHantModel } from 'budoux'; 18 | 19 | const parsers: { [key: string]: Parser } = { 20 | 'ja': new Parser(jaModel), 21 | 'zh-hans': new Parser(zhHansModel), 22 | 'zh-hant': new Parser(zhHantModel), 23 | }; 24 | 25 | onmessage = (e: MessageEvent) => { 26 | const model: string = e.data['model']; 27 | if (!Object.keys(parsers).includes(model)) return; 28 | const parser = parsers[model]; 29 | const result = parser.parse(e.data['sentence']); 30 | console.log('It works in Web Worker, too!', result); 31 | postMessage(result); 32 | }; 33 | -------------------------------------------------------------------------------- /demo/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | BudouX demo 7 | 57 | 58 | 59 |
60 |
61 |

BudouX 🍇

62 |

A small, standalone, and language-neutral line break organizer.

63 |

64 | [GitHub] 65 | [PyPI] 66 | [NPM] 67 |

68 |
69 |
70 |
71 |

72 | 73 | 79 |

80 | 81 |
82 | 83 | 84 |
85 |
86 | 87 | 88 |
89 |

90 |
91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /demo/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2017", 4 | "module": "commonjs", 5 | "esModuleInterop": true, 6 | "forceConsistentCasingInFileNames": true, 7 | "strict": true, 8 | "skipLibCheck": true, 9 | "resolveJsonModule": true 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/example.png -------------------------------------------------------------------------------- /java/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | src/main/resources 3 | -------------------------------------------------------------------------------- /java/README.md: -------------------------------------------------------------------------------- 1 | # BudouX Java Module 2 | 3 | BudouX is a standalone, small, and language-neutral phrase segmenter tool that 4 | provides beautiful and legible line breaks. 5 | 6 | For more details about the project, please refer to the [project README](https://github.com/google/budoux/). 7 | 8 | ## Demo 9 | 10 | 11 | 12 | ## Usage 13 | 14 | ### Simple usage 15 | 16 | You can get a list of phrases by feeding a sentence to the parser. 17 | The easiest way is to get a parser is loading the default parser for each language. 
18 | 19 | ```java 20 | import com.google.budoux.Parser; 21 | 22 | public class App 23 | { 24 | public static void main( String[] args ) 25 | { 26 | Parser parser = Parser.loadDefaultJapaneseParser(); 27 | System.out.println(parser.parse("今日は良い天気ですね。")); 28 | // [今日は, 良い, 天気ですね。] 29 | } 30 | } 31 | ``` 32 | 33 | #### Supported languages and their default parsers 34 | 35 | - Japanese: `Parser.loadDefaultJapaneseParser()` 36 | - Simplified Chinese: `Parser.loadDefaultSimplifiedChineseParser()` 37 | - Traditional Chinese: `Parser.loadDefaultTraditionalChineseParser()` 38 | - Thai: `Parser.loadDefaultThaiParser()` 39 | 40 | ### Working with HTML 41 | 42 | If you want to use the result in a website, you can use the `translateHTMLString` 43 | method to get an HTML string that wraps phrases with non-breaking markup, 44 | speicifcally, zero-width space (U+200B). 45 | 46 | ```java 47 | System.out.println(parser.translateHTMLString("今日は良い天気ですね。")); 48 | //今日は\u200b良い\u200b天気ですね。 49 | ``` 50 | 51 | Please note that separators are denoted as `\u200b` in the example above for 52 | illustrative purposes, but the actual output is an invisible string as it's a 53 | zero-width space. 54 | 55 | ## Caveat 56 | 57 | BudouX supports HTML inputs and outputs HTML strings with markup applied to wrap 58 | phrases, but it's not meant to be used as an HTML sanitizer. 59 | **BudouX doesn't sanitize any inputs.** 60 | Malicious HTML inputs yield malicious HTML outputs. 61 | Please use it with an appropriate sanitizer library if you don't trust the input. 62 | 63 | ## Author 64 | 65 | [Shuhei Iitsuka](https://tushuhei.com) 66 | 67 | ## Disclaimer 68 | 69 | This is not an officially supported Google product. 
70 | -------------------------------------------------------------------------------- /java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 4.0.0 20 | 21 | 22 | org.sonatype.oss 23 | oss-parent 24 | 9 25 | 26 | 27 | com.google.budoux 28 | budoux 29 | 0.7.0 30 | 31 | BudouX 32 | https://google.github.io/budoux/ 33 | 34 | 35 | UTF-8 36 | 1.8 37 | 1.8 38 | 39 | 40 | 41 | junit 42 | junit 43 | 4.13.2 44 | test 45 | 46 | 47 | com.google.code.gson 48 | gson 49 | 2.13.0 50 | 51 | 52 | org.jsoup 53 | jsoup 54 | 1.19.1 55 | 56 | 57 | 58 | 59 | 60 | 61 | org.apache.maven.plugins 62 | maven-javadoc-plugin 63 | 3.11.2 64 | 65 | 66 | 67 | 68 | 69 | 70 | maven-clean-plugin 71 | 3.4.1 72 | 73 | 74 | 75 | maven-resources-plugin 76 | 3.3.1 77 | 78 | 79 | copy-data 80 | generate-resources 81 | 82 | copy-resources 83 | 84 | 85 | ${basedir}/src/main/resources 86 | 87 | 88 | ../budoux 89 | 90 | models/*.json 91 | skip_nodes.json 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | maven-compiler-plugin 101 | 3.14.0 102 | 103 | 104 | maven-surefire-plugin 105 | 3.5.3 106 | 107 | 108 | maven-jar-plugin 109 | 3.4.2 110 | 111 | 112 | maven-install-plugin 113 | 3.1.4 114 | 115 | 116 | maven-deploy-plugin 117 | 3.1.4 118 | 119 | 120 | 121 | maven-site-plugin 122 | 3.21.0 123 | 124 | 125 | maven-project-info-reports-plugin 126 | 3.9.0 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/budoux/HTMLProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.budoux; 18 | 19 | import com.google.gson.Gson; 20 | import com.google.gson.JsonIOException; 21 | import com.google.gson.JsonSyntaxException; 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.io.InputStreamReader; 25 | import java.io.Reader; 26 | import java.nio.charset.StandardCharsets; 27 | import java.util.ArrayDeque; 28 | import java.util.Arrays; 29 | import java.util.HashSet; 30 | import java.util.List; 31 | import java.util.Locale; 32 | import java.util.Set; 33 | import java.util.stream.Collectors; 34 | import org.jsoup.Jsoup; 35 | import org.jsoup.nodes.Comment; 36 | import org.jsoup.nodes.Document; 37 | import org.jsoup.nodes.Element; 38 | import org.jsoup.nodes.Node; 39 | import org.jsoup.nodes.TextNode; 40 | import org.jsoup.select.NodeVisitor; 41 | 42 | /** Processes phrases into an HTML string wrapping them in no-breaking markup. 
*/ 43 | final class HTMLProcessor { 44 | private static final Set skipNodes; 45 | private static final String STYLE = "word-break: keep-all; overflow-wrap: anywhere;"; 46 | 47 | private HTMLProcessor() {} 48 | 49 | static { 50 | Gson gson = new Gson(); 51 | InputStream inputStream = HTMLProcessor.class.getResourceAsStream("/skip_nodes.json"); 52 | try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) { 53 | String[] skipNodesStrings = gson.fromJson(reader, String[].class); 54 | skipNodes = new HashSet<>(Arrays.asList(skipNodesStrings)); 55 | } catch (JsonSyntaxException | JsonIOException | IOException e) { 56 | throw new AssertionError(e); 57 | } 58 | } 59 | 60 | /** 61 | * A `NodeVisitor` subclass that concatenates all `TextNode`s to a string. 62 | * 63 | *

It also converts `<br>` to `\n`. 64 | */ 65 | private static class TextizeNodeVisitor implements NodeVisitor { 66 | private StringBuilder output = new StringBuilder(); 67 | 68 | public String getString() { 69 | return output.toString(); 70 | } 71 | 72 | @Override 73 | public void head(Node node, int depth) { 74 | if (node instanceof Element) { 75 | final String nodeName = node.nodeName(); 76 | if (nodeName.equals("br")) { 77 | output.append('\n'); 78 | } 79 | } else if (node instanceof TextNode) { 80 | output.append(((TextNode) node).getWholeText()); 81 | } 82 | } 83 | 84 | @Override 85 | public void tail(Node node, int depth) {} 86 | } 87 | 88 | private static class PhraseResolvingNodeVisitor implements NodeVisitor { 89 | private static final char SEP = '\uFFFF'; 90 | private final String phrasesJoined; 91 | private final String separator; 92 | private final StringBuilder output = new StringBuilder(); 93 | private Integer scanIndex = 0; 94 | private boolean toSkip = false; 95 | private final ArrayDeque elementStack = new ArrayDeque<>(); 96 | 97 | /** 98 | * Constructs a PhraseResolvingNodeVisitor. 99 | * 100 | * @param phrases a list of phrase strings. 101 | * @param separator the separator string. 102 | */ 103 | PhraseResolvingNodeVisitor(List phrases, String separator) { 104 | this.separator = separator; 105 | this.phrasesJoined = String.join(Character.toString(SEP), phrases); 106 | } 107 | 108 | /** 109 | * Returns the resolved output string. 110 | * 111 | * @return the output string. 
112 | */ 113 | public StringBuilder getOutput() { 114 | return output; 115 | } 116 | 117 | @Override 118 | public void head(Node node, int depth) { 119 | if (node.nodeName().equals("body")) { 120 | return; 121 | } 122 | if (node instanceof Element) { 123 | elementStack.push(toSkip); 124 | String attributesEncoded = 125 | node.attributes().asList().stream() 126 | .map(attribute -> " " + attribute) 127 | .collect(Collectors.joining("")); 128 | final String nodeName = node.nodeName(); 129 | if (nodeName.equals("br")) { 130 | // `
` is converted to `\n`, see `TextizeNodeVisitor.head`. 131 | // Assume phrasesJoined.charAt(scanIndex) == '\n'. 132 | scanIndex++; 133 | } else if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) { 134 | if (!toSkip 135 | && scanIndex < phrasesJoined.length() 136 | && phrasesJoined.charAt(scanIndex) == SEP) { 137 | output.append(separator); 138 | scanIndex++; 139 | } 140 | toSkip = true; 141 | } 142 | output.append(String.format("<%s%s>", nodeName, attributesEncoded)); 143 | } else if (node instanceof TextNode) { 144 | String data = ((TextNode) node).getWholeText(); 145 | for (int i = 0; i < data.length(); i++) { 146 | char c = data.charAt(i); 147 | if (c != phrasesJoined.charAt(scanIndex)) { 148 | // Assume phrasesJoined.charAt(scanIndex) == SEP. 149 | if (!toSkip) { 150 | output.append(separator); 151 | } 152 | scanIndex++; 153 | } 154 | scanIndex++; 155 | output.append(c); 156 | } 157 | } 158 | } 159 | 160 | @Override 161 | public void tail(Node node, int depth) { 162 | if (node.nodeName().equals("body") || node instanceof TextNode || node instanceof Comment) { 163 | return; 164 | } 165 | // assume node instanceof Element; 166 | toSkip = elementStack.pop(); 167 | Element element = (Element) node; 168 | if (element.tag().isSelfClosing()) { 169 | return; 170 | } 171 | output.append(String.format("", node.nodeName())); 172 | } 173 | } 174 | 175 | /** 176 | * Wraps phrases in the HTML string with non-breaking markup. 177 | * 178 | * @param phrases the phrases included in the HTML string. 179 | * @param html the HTML string to resolve. 180 | * @return the HTML string of phrases wrapped in non-breaking markup. 181 | */ 182 | public static String resolve(List phrases, String html) { 183 | return resolve(phrases, html, "\u200b"); 184 | } 185 | 186 | /** 187 | * Wraps phrases in the HTML string with non-breaking markup. 188 | * 189 | * @param phrases the phrases included in the HTML string. 190 | * @param html the HTML string to resolve. 
191 | * @param separator the separator string. 192 | * @return the HTML string of phrases wrapped in non-breaking markup. 193 | */ 194 | public static String resolve(List phrases, String html, String separator) { 195 | Document doc = Jsoup.parseBodyFragment(html); 196 | PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases, separator); 197 | doc.body().traverse(nodeVisitor); 198 | return String.format("%s", STYLE, nodeVisitor.getOutput()); 199 | } 200 | 201 | /** 202 | * Gets the text content from the input HTML string. 203 | * 204 | * @param html an HTML string. 205 | * @return the text content. 206 | */ 207 | public static String getText(String html) { 208 | Document doc = Jsoup.parseBodyFragment(html); 209 | TextizeNodeVisitor nodeVisitor = new TextizeNodeVisitor(); 210 | doc.body().traverse(nodeVisitor); 211 | return nodeVisitor.getString(); 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/budoux/Parser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.budoux; 18 | 19 | import com.google.gson.Gson; 20 | import com.google.gson.JsonIOException; 21 | import com.google.gson.JsonSyntaxException; 22 | import com.google.gson.reflect.TypeToken; 23 | import java.io.IOException; 24 | import java.io.InputStream; 25 | import java.io.InputStreamReader; 26 | import java.io.Reader; 27 | import java.lang.reflect.Type; 28 | import java.nio.charset.StandardCharsets; 29 | import java.util.ArrayList; 30 | import java.util.List; 31 | import java.util.Map; 32 | import java.util.Optional; 33 | 34 | /** 35 | * The BudouX parser that translates the input sentence into phrases. 36 | * 37 | *

You can create a parser instance by invoking {@code new Parser(model)} with the model data you 38 | * want to use. You can also create a parser by specifying the model file path with {@code 39 | * Parser.loadByFileName(modelFileName)}. 40 | * 41 | *

In most cases, it's sufficient to use the default parser for the language. For example, you 42 | * can create a default Japanese parser as follows. 43 | * 44 | *

 45 |  * Parser parser = Parser.loadDefaultJapaneseParser();
 46 |  * 
47 | */ 48 | public class Parser { 49 | private final Map> model; 50 | 51 | /** 52 | * Constructs a BudouX parser. 53 | * 54 | * @param model the model data. 55 | */ 56 | public Parser(Map> model) { 57 | this.model = model; 58 | } 59 | 60 | /** 61 | * Loads the default Japanese parser. 62 | * 63 | * @return a BudouX parser with the default Japanese model. 64 | */ 65 | public static Parser loadDefaultJapaneseParser() { 66 | return loadByFileName("/models/ja.json"); 67 | } 68 | 69 | /** 70 | * Loads the default Simplified Chinese parser. 71 | * 72 | * @return a BudouX parser with the default Simplified Chinese model. 73 | */ 74 | public static Parser loadDefaultSimplifiedChineseParser() { 75 | return loadByFileName("/models/zh-hans.json"); 76 | } 77 | 78 | /** 79 | * Loads the default Traditional Chinese parser. 80 | * 81 | * @return a BudouX parser with the default Traditional Chinese model. 82 | */ 83 | public static Parser loadDefaultTraditionalChineseParser() { 84 | return loadByFileName("/models/zh-hant.json"); 85 | } 86 | 87 | /** 88 | * Loads the default Thai parser. 89 | * 90 | * @return a BudouX parser with the default Thai model. 91 | */ 92 | public static Parser loadDefaultThaiParser() { 93 | return loadByFileName("/models/th.json"); 94 | } 95 | 96 | /** 97 | * Loads a parser by specifying the model file path. 98 | * 99 | * @param modelFileName the model file path. 100 | * @return a BudouX parser. 
101 | */ 102 | public static Parser loadByFileName(String modelFileName) { 103 | Gson gson = new Gson(); 104 | Type type = new TypeToken>>() {}.getType(); 105 | InputStream inputStream = Parser.class.getResourceAsStream(modelFileName); 106 | try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) { 107 | Map> model = gson.fromJson(reader, type); 108 | return new Parser(model); 109 | } catch (JsonIOException | JsonSyntaxException | IOException e) { 110 | throw new AssertionError(e); 111 | } 112 | } 113 | 114 | /** 115 | * Gets the score for the specified feature of the given sequence. 116 | * 117 | * @param featureKey the feature key to examine. 118 | * @param sequence the sequence to look up the score. 119 | * @return the contribution score to support a phrase break. 120 | */ 121 | private int getScore(String featureKey, String sequence) { 122 | return Optional.ofNullable(this.model.get(featureKey)) 123 | .map(group -> group.get(sequence)) 124 | .orElse(0); 125 | } 126 | 127 | /** 128 | * Parses a sentence into phrases. 129 | * 130 | * @param sentence the sentence to break by phrase. 131 | * @return a list of phrases. 
132 | */ 133 | public List parse(String sentence) { 134 | if (sentence.isEmpty()) { 135 | return new ArrayList<>(); 136 | } 137 | List result = new ArrayList<>(); 138 | result.add(String.valueOf(sentence.charAt(0))); 139 | int totalScore = 140 | this.model.values().stream() 141 | .mapToInt(group -> group.values().stream().mapToInt(Integer::intValue).sum()) 142 | .sum(); 143 | for (int i = 1; i < sentence.length(); i++) { 144 | int score = -totalScore; 145 | if (i - 2 > 0) { 146 | score += 2 * this.getScore("UW1", sentence.substring(i - 3, i - 2)); 147 | } 148 | if (i - 1 > 0) { 149 | score += 2 * this.getScore("UW2", sentence.substring(i - 2, i - 1)); 150 | } 151 | score += 2 * this.getScore("UW3", sentence.substring(i - 1, i)); 152 | score += 2 * this.getScore("UW4", sentence.substring(i, i + 1)); 153 | if (i + 1 < sentence.length()) { 154 | score += 2 * this.getScore("UW5", sentence.substring(i + 1, i + 2)); 155 | } 156 | if (i + 2 < sentence.length()) { 157 | score += 2 * this.getScore("UW6", sentence.substring(i + 2, i + 3)); 158 | } 159 | if (i > 1) { 160 | score += 2 * this.getScore("BW1", sentence.substring(i - 2, i)); 161 | } 162 | score += 2 * this.getScore("BW2", sentence.substring(i - 1, i + 1)); 163 | if (i + 1 < sentence.length()) { 164 | score += 2 * this.getScore("BW3", sentence.substring(i, i + 2)); 165 | } 166 | if (i - 2 > 0) { 167 | score += 2 * this.getScore("TW1", sentence.substring(i - 3, i)); 168 | } 169 | if (i - 1 > 0) { 170 | score += 2 * this.getScore("TW2", sentence.substring(i - 2, i + 1)); 171 | } 172 | if (i + 1 < sentence.length()) { 173 | score += 2 * this.getScore("TW3", sentence.substring(i - 1, i + 2)); 174 | } 175 | if (i + 2 < sentence.length()) { 176 | score += 2 * this.getScore("TW4", sentence.substring(i, i + 3)); 177 | } 178 | if (score > 0) { 179 | result.add(""); 180 | } 181 | result.set(result.size() - 1, result.get(result.size() - 1) + sentence.charAt(i)); 182 | } 183 | return result; 184 | } 185 | 186 | /** 187 | * 
Translates an HTML string with phrases wrapped in no-breaking markup. 188 | * 189 | * @param html an HTML string. 190 | * @return the translated HTML string with no-breaking markup. 191 | */ 192 | public String translateHTMLString(String html) { 193 | String sentence = HTMLProcessor.getText(html); 194 | List phrases = parse(sentence); 195 | return HTMLProcessor.resolve(phrases, html, "\u200b"); 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/budoux/HTMLProcessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.budoux; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | 21 | import java.util.Arrays; 22 | import java.util.List; 23 | import org.junit.Test; 24 | import org.junit.runner.RunWith; 25 | import org.junit.runners.JUnit4; 26 | 27 | /** Unit tests for {@link HTMLProcessor}. 
*/ 28 | @RunWith(JUnit4.class) 29 | public class HTMLProcessorTest { 30 | String pre = ""; 31 | String post = ""; 32 | 33 | private String wrap(String input) { 34 | return this.pre + input + this.post; 35 | } 36 | 37 | @Test 38 | public void testResolveWithSimpleTextInput() { 39 | List phrases = Arrays.asList("abc", "def"); 40 | String html = "abcdef"; 41 | String result = HTMLProcessor.resolve(phrases, html, ""); 42 | assertEquals(this.wrap("abcdef"), result); 43 | } 44 | 45 | @Test 46 | public void testResolveWithStandardHTMLInput() { 47 | List phrases = Arrays.asList("abc", "def"); 48 | String html = "abcdef"; 49 | String result = HTMLProcessor.resolve(phrases, html, ""); 50 | assertEquals(this.wrap("abcdef"), result); 51 | } 52 | 53 | @Test 54 | public void testResolveWithImg() { 55 | List phrases = Arrays.asList("abc", "def"); 56 | String html = "abcdef"; 57 | String result = HTMLProcessor.resolve(phrases, html, ""); 58 | assertEquals(this.wrap("abcdef"), result); 59 | } 60 | 61 | @Test 62 | public void testResolveWithUnpairedClose() { 63 | List phrases = Arrays.asList("abc", "def"); 64 | String html = "abcdef

"; 65 | String result = HTMLProcessor.resolve(phrases, html, ""); 66 | assertEquals(this.wrap("abcdef

"), result); 67 | } 68 | 69 | @Test 70 | public void testResolveWithNodesToSkip() { 71 | List phrases = Arrays.asList("abc", "def", "ghi"); 72 | String html = "afghi"; 73 | String result = HTMLProcessor.resolve(phrases, html, ""); 74 | assertEquals(this.wrap("afghi"), result); 75 | } 76 | 77 | @Test 78 | public void testResolveWithNodesBreakBeforeSkip() { 79 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 80 | String html = "abcdefghijkl"; 81 | String result = HTMLProcessor.resolve(phrases, html, ""); 82 | assertEquals(this.wrap("abcdefghijkl"), result); 83 | } 84 | 85 | @Test 86 | public void testResolveWithAfterSkip() { 87 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 88 | String html = "abcdefghijkl"; 89 | String result = HTMLProcessor.resolve(phrases, html, ""); 90 | assertEquals(this.wrap("abcdefghijkl"), result); 91 | } 92 | 93 | @Test 94 | public void testResolveWithAfterSkipWithImg() { 95 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 96 | String html = "abcdefghijkl"; 97 | String result = HTMLProcessor.resolve(phrases, html, ""); 98 | assertEquals(this.wrap("abcdefghijkl"), result); 99 | } 100 | 101 | @Test 102 | public void testResolveWithNothingToSplit() { 103 | List phrases = Arrays.asList("abcdef"); 104 | String html = "abcdef"; 105 | String result = HTMLProcessor.resolve(phrases, html, ""); 106 | assertEquals(this.wrap("abcdef"), result); 107 | } 108 | 109 | @Test 110 | public void testResolveBR() { 111 | String html = " 1
2 "; 112 | String text = HTMLProcessor.getText(html); 113 | assertEquals(" 1 \n 2 ", text); 114 | List phrases = Arrays.asList(" 1 \n 2 "); 115 | String result = HTMLProcessor.resolve(phrases, html, ""); 116 | assertEquals(this.wrap(" 1
2 "), result); 117 | } 118 | 119 | @Test 120 | public void testGetText() { 121 | String html = "Hello !"; 122 | String result = HTMLProcessor.getText(html); 123 | assertEquals("Hello World!", result); 124 | } 125 | 126 | @Test 127 | public void testGetTextWhiteSpace() { 128 | String html = " H e "; 129 | String result = HTMLProcessor.getText(html); 130 | assertEquals(" H e ", result); 131 | } 132 | 133 | @Test 134 | public void testGetTextWhiteSpaceAcrossElements() { 135 | String html = "
1
2
"; 136 | String result = HTMLProcessor.getText(html); 137 | assertEquals(" 1 2 ", result); 138 | } 139 | 140 | @Test 141 | public void testResolveSkipNodeAtTheEnd() { 142 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 143 | String html = "abcdefghijkl"; 144 | String result = HTMLProcessor.resolve(phrases, html, ""); 145 | assertEquals(this.wrap("abcdefghijkl"), result); 146 | } 147 | 148 | @Test 149 | public void testResolveWithComments() { 150 | List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); 151 | String html = "abcdefghijkl"; 152 | String result = HTMLProcessor.resolve(phrases, html, ""); 153 | assertEquals(this.wrap("abcdefghijkl"), result); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/budoux/ParserTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.budoux; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | 21 | import java.util.Arrays; 22 | import java.util.HashMap; 23 | import java.util.List; 24 | import java.util.Map; 25 | import org.junit.Test; 26 | import org.junit.runner.RunWith; 27 | import org.junit.runners.JUnit4; 28 | 29 | /** Unit tests for {@link Parser}. 
*/ 30 | @RunWith(JUnit4.class) 31 | public class ParserTest { 32 | 33 | @Test 34 | public void testParse() { 35 | Map> model = new HashMap<>(); 36 | Map uw4 = new HashMap<>(); 37 | uw4.put("a", 100); 38 | model.put("UW4", uw4); 39 | Parser parser = new Parser(model); 40 | List result = parser.parse("xyzabc"); 41 | List expected = Arrays.asList("xyz", "abc"); 42 | assertEquals(expected, result); 43 | } 44 | 45 | @Test 46 | public void testLoadDefaultJapaneseParser() { 47 | Parser parser = Parser.loadDefaultJapaneseParser(); 48 | List result = parser.parse("今日は天気です。"); 49 | List expected = Arrays.asList("今日は", "天気です。"); 50 | assertEquals(expected, result); 51 | } 52 | 53 | @Test 54 | public void testTranslateHTMLString() { 55 | Map> model = new HashMap<>(); 56 | Map uw4 = new HashMap<>(); 57 | uw4.put("a", 100); 58 | model.put("UW4", uw4); 59 | Parser parser = new Parser(model); 60 | String html = "xyzabc"; 61 | String result = parser.translateHTMLString(html); 62 | assertEquals( 63 | "xyz\u200babc", 65 | result); 66 | } 67 | 68 | @Test 69 | public void testNewline() { 70 | Parser parser = Parser.loadDefaultJapaneseParser(); 71 | List result = parser.parse(" 1 \n 2 "); 72 | List expected = Arrays.asList(" 1 \n 2 "); 73 | assertEquals(expected, result); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /javascript/.npmignore: -------------------------------------------------------------------------------- 1 | scripts 2 | -------------------------------------------------------------------------------- /javascript/.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "bracketSpacing": false, 3 | "singleQuote": true, 4 | "trailingComma": "es5", 5 | "arrowParens": "avoid" 6 | } 7 | -------------------------------------------------------------------------------- /javascript/README.md: -------------------------------------------------------------------------------- 1 | 2 
You can get a list of phrases by feeding a sentence to the parser.
The easiest way to get a parser is to load the default parser for each language.
66 | 67 | ```javascript 68 | console.log(parser.translateHTMLString('今日はとても天気です。')); 69 | // 今日は\u200bとても\u200b天気です。 70 | ``` 71 | 72 | Please note that separators are denoted as `\u200b` in the example above for 73 | illustrative purposes, but the actual output is an invisible string as it's a 74 | zero-width space. 75 | 76 | ### Applying to an HTML element 77 | 78 | You can also feed an HTML element to the parser to apply the process. 79 | 80 | ```javascript 81 | const ele = document.querySelector('p.budou-this'); 82 | console.log(ele.outerHTML); 83 | //

今日はとても天気です。

84 | parser.applyToElement(ele); 85 | console.log(ele.outerHTML); 86 | //

今日は\u200bとても\u200b天気です。

If you would like to use BudouX inside a Web Worker script, construct a parser without
`HTMLProcessor`, i.e. use the pure `Parser` instance.
You can also format inputs on your terminal with the `budoux` command.
197 | 198 | ```shellsession 199 | $ budoux 本日は晴天です。 200 | 本日は 201 | 晴天です。 202 | ``` 203 | 204 | ```shellsession 205 | $ echo $'本日は晴天です。\n明日は曇りでしょう。' | budoux 206 | 本日は 207 | 晴天です。 208 | --- 209 | 明日は 210 | 曇りでしょう。 211 | ``` 212 | 213 | ```shellsession 214 | $ budoux 本日は晴天です。 -H 215 | 本日は\u200b晴天です。 216 | ``` 217 | 218 | Please note that separators are denoted as `\u200b` in the example above for 219 | illustrative purposes, but the actual output is an invisible string as it's a 220 | zero-width space. 221 | 222 | If you want to see help, run `budoux -h`. 223 | 224 | ```shellsession 225 | $ budoux -h 226 | Usage: budoux [-h] [-H] [-d STR] [-m JSON] [-V] [TXT] 227 | 228 | BudouX is the successor to Budou, the machine learning powered line break organizer tool. 229 | 230 | Arguments: 231 | txt text 232 | 233 | Options: 234 | -H, --html HTML mode (default: false) 235 | -d, --delim output delimiter in TEXT mode (default: "---") 236 | -m, --model custom model file path 237 | -V, --version output the version number 238 | -h, --help display help for command 239 | ``` 240 | 241 | ## Caveat 242 | 243 | BudouX supports HTML inputs and outputs HTML strings with markup applied to wrap 244 | phrases, but it's not meant to be used as an HTML sanitizer. 245 | **BudouX doesn't sanitize any inputs.** 246 | Malicious HTML inputs yield malicious HTML outputs. 247 | Please use it with an appropriate sanitizer library if you don't trust the input. 248 | 249 | ## Author 250 | 251 | [Shuhei Iitsuka](https://tushuhei.com) 252 | 253 | ## Disclaimer 254 | 255 | This is not an officially supported Google product. 
256 | -------------------------------------------------------------------------------- /javascript/bin/budoux.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * @license 4 | * Copyright 2021 Google LLC 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | const {cli} = require('../dist/cli'); 19 | cli(process.argv); 20 | -------------------------------------------------------------------------------- /javascript/eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import prettier from "eslint-plugin-prettier"; 2 | import tsParser from "@typescript-eslint/parser"; 3 | import path from "node:path"; 4 | import { fileURLToPath } from "node:url"; 5 | import js from "@eslint/js"; 6 | import { FlatCompat } from "@eslint/eslintrc"; 7 | 8 | const __filename = fileURLToPath(import.meta.url); 9 | const __dirname = path.dirname(__filename); 10 | const compat = new FlatCompat({ 11 | baseDirectory: __dirname, 12 | recommendedConfig: js.configs.recommended, 13 | allConfig: js.configs.all 14 | }); 15 | 16 | export default [{ 17 | ignores: ["**/bundle", "**/dist", "**/module", "src/data"], 18 | }, ...compat.extends("eslint:recommended", "prettier"), { 19 | plugins: { 20 | prettier, 21 | }, 22 | 23 | rules: { 24 | "prettier/prettier": "error", 25 | "block-scoped-var": "error", 26 | eqeqeq: "error", 27 | "no-var": 
"error", 28 | "prefer-const": "error", 29 | "eol-last": "error", 30 | "prefer-arrow-callback": "error", 31 | "no-trailing-spaces": "error", 32 | 33 | quotes: ["warn", "single", { 34 | avoidEscape: true, 35 | }], 36 | 37 | "no-restricted-properties": ["error", { 38 | object: "describe", 39 | property: "only", 40 | }, { 41 | object: "it", 42 | property: "only", 43 | }], 44 | }, 45 | }, ...compat.extends("plugin:@typescript-eslint/recommended").map(config => ({ 46 | ...config, 47 | files: ["**/*.ts", "**/*.tsx"], 48 | })), { 49 | files: ["**/*.ts", "**/*.tsx"], 50 | 51 | languageOptions: { 52 | parser: tsParser, 53 | ecmaVersion: 2018, 54 | sourceType: "module", 55 | }, 56 | 57 | rules: { 58 | "@typescript-eslint/no-non-null-assertion": "off", 59 | "@typescript-eslint/no-use-before-define": "off", 60 | "@typescript-eslint/no-warning-comments": "off", 61 | "@typescript-eslint/no-empty-function": "off", 62 | "@typescript-eslint/no-var-requires": "off", 63 | "@typescript-eslint/explicit-function-return-type": "off", 64 | "@typescript-eslint/explicit-module-boundary-types": "off", 65 | "@typescript-eslint/ban-types": "off", 66 | "@typescript-eslint/camelcase": "off", 67 | "node/no-empty-function": "off", 68 | "node/no-missing-import": "off", 69 | "node/no-unsupported-features/es-syntax": "off", 70 | "node/no-missing-require": "off", 71 | "node/shebang": "off", 72 | "no-dupe-class-members": "off", 73 | "require-atomic-updates": "off", 74 | }, 75 | }]; -------------------------------------------------------------------------------- /javascript/karma.conf.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | module.exports = function (config) { 18 | config.set({ 19 | basePath: '', 20 | frameworks: ['jasmine'], 21 | files: ['bundle/tests/*.js'], 22 | reporters: ['progress'], 23 | port: 9876, 24 | colors: true, 25 | logLevel: config.LOG_INFO, 26 | autoWatch: false, 27 | browsers: ['ChromeHeadless'], 28 | singleRun: true, 29 | concurrency: Infinity, 30 | }); 31 | }; 32 | -------------------------------------------------------------------------------- /javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "budoux", 3 | "version": "0.7.0", 4 | "description": "A small chunk segmenter.", 5 | "author": "Shuhei Iitsuka", 6 | "license": "Apache-2.0", 7 | "repository": { 8 | "type": "git", 9 | "url": "https://github.com/google/budoux.git", 10 | "directory": "javascript" 11 | }, 12 | "main": "./dist/index.js", 13 | "module": "./module/index.js", 14 | "exports": { 15 | ".": { 16 | "import": "./module/index.js", 17 | "require": "./dist/index.js" 18 | } 19 | }, 20 | "browser": { 21 | "./dist/dom.js": "./dist/dom-browser.js", 22 | "./module/dom.js": "./module/dom-browser.js", 23 | "./dist/tests/testutils.js": "./dist/tests/testutils-browser.js", 24 | "./module/tests/testutils.js": "./module/tests/testutils-browser.js" 25 | }, 26 | "bin": { 27 | "budoux": "./bin/budoux.js" 28 | }, 29 | "sideEffects": [ 30 | "./module/webcomponents/*", 31 | "./module/tests/*" 32 | ], 33 | "scripts": { 34 | "build": "npm run build:esm && npm run build:cjs", 35 | "build:cjs": 
"tsc && cp -r src/tests/models/ dist/tests/models/", 36 | "build:esm": "tsc --outDir module --module ES2020 && cp -r src/tests/models/ module/tests/models/", 37 | "bundle": "npm run bundle:webcomponents && npm run bundle:test", 38 | "bundle:test": "esbuild module/tests/index.browser.js --bundle --sourcemap --outfile=bundle/tests/index.browser.js", 39 | "bundle:webcomponents": "npm run bundle:webcomponents:ja && npm run bundle:webcomponents:zh-hans && npm run bundle:webcomponents:zh-hant && npm run bundle:webcomponents:th", 40 | "bundle:webcomponents:ja": "esbuild module/webcomponents/budoux-ja.js --bundle --minify --sourcemap --outfile=bundle/budoux-ja.min.js", 41 | "bundle:webcomponents:zh-hans": "esbuild module/webcomponents/budoux-zh-hans.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hans.min.js", 42 | "bundle:webcomponents:zh-hant": "esbuild module/webcomponents/budoux-zh-hant.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hant.min.js", 43 | "bundle:webcomponents:th": "esbuild module/webcomponents/budoux-th.js --bundle --minify --sourcemap --outfile=bundle/budoux-th.min.js", 44 | "clean": "rm -rf dist module src/data", 45 | "copy": "node ./scripts/copy-data.js", 46 | "prebuild": "npm run clean && npm run copy", 47 | "prepare": "npm run clean && npm run copy && npm run build && npm run bundle", 48 | "pretest": "npm run build && npm run bundle:test", 49 | "test": "npm run test:jasmine && npm run test:karma && npm run test:cli-version", 50 | "test:cli-version": "node ./scripts/check-cli-version.js", 51 | "test:jasmine": "jasmine dist/tests/index.node.js", 52 | "test:karma": "karma start", 53 | "lint": "eslint src/** --no-error-on-unmatched-pattern", 54 | "fix": "eslint src/** --no-error-on-unmatched-pattern --fix" 55 | }, 56 | "devDependencies": { 57 | "@eslint/eslintrc": "^3.1.0", 58 | "@eslint/js": "^9.9.0", 59 | "@types/jasmine": "^5.1.0", 60 | "@types/node": "^22.0.0", 61 | "@typescript-eslint/eslint-plugin": "^8.0.1", 62 | 
"esbuild": "^0.25.0", 63 | "eslint": "^9.9.0", 64 | "eslint-config-prettier": "^10.0.1", 65 | "eslint-plugin-prettier": "^5.0.0", 66 | "jasmine": "^5.0.2", 67 | "jasmine-core": "^5.0.1", 68 | "karma": "^6.4.2", 69 | "karma-chrome-launcher": "^3.2.0", 70 | "karma-jasmine": "^5.1.0", 71 | "typescript": "^5.1.6" 72 | }, 73 | "dependencies": { 74 | "commander": "^13.0.0", 75 | "linkedom": "^0.18.7" 76 | }, 77 | "overrides": { 78 | "tough-cookie": "^5.0.0-rc.4" 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /javascript/scripts/check-cli-version.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | const assert = require('assert'); 18 | const path = require('path'); 19 | const childProcess = require('child_process'); 20 | const package = require('../package.json'); 21 | 22 | const packageVersion = package.version; 23 | const runCli = args => 24 | new Promise(resolve => { 25 | childProcess.execFile( 26 | 'node', 27 | [path.resolve(__dirname, '..', 'bin', 'budoux.js'), ...args], 28 | (error, stdout, stderr) => { 29 | resolve({ 30 | error, 31 | stdout, 32 | stderr, 33 | }); 34 | } 35 | ); 36 | }); 37 | 38 | runCli(['-V']).then(({stdout}) => { 39 | assert.equal( 40 | stdout.replace('\n', ''), 41 | packageVersion, 42 | 'Package version and CLI version output (-V) should match.' 43 | ); 44 | }); 45 | 46 | runCli(['--version']).then(({stdout}) => { 47 | assert.equal( 48 | stdout.replace('\n', ''), 49 | packageVersion, 50 | 'Package version and CLI version output (--version) should match.' 51 | ); 52 | }); 53 | -------------------------------------------------------------------------------- /javascript/scripts/copy-data.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
const path = require('path');
const fs = require('fs');

// Repository root: two levels up from javascript/scripts/.
const PROJECT_ROOT = path.join(__dirname, '..', '..');
// Generated TypeScript model modules are written under this directory.
const DATA_DIR = path.join(PROJECT_ROOT, 'javascript', 'src', 'data');
fs.mkdirSync(path.join(DATA_DIR, 'models'), {recursive: true});

/**
 * Converts every JSON model under budoux/models into a typed TypeScript
 * module in DATA_DIR/models so the models can be imported directly from
 * TypeScript sources.
 */
const copyModels = () => {
  const sourceDir = path.join(PROJECT_ROOT, 'budoux', 'models');
  for (const filename of fs.readdirSync(sourceDir)) {
    // Only JSON model files are converted; anything else is ignored.
    if (path.extname(filename) !== '.json') continue;
    const stem = path.basename(filename, '.json');
    const json = fs.readFileSync(path.join(sourceDir, filename));
    // Embed the raw JSON as the initializer of a typed export.
    fs.writeFileSync(
      path.join(DATA_DIR, 'models', `${stem}.ts`),
      `export const model: {[key:string]: {[key:string]: number}} = ${json}`
    );
  }
};

const main = () => {
  copyModels();
};

main();
import {readFileSync} from 'fs';
import * as path from 'path';
import * as readline from 'readline';
import {Command} from 'commander';
import {
  HTMLProcessingParser,
  loadDefaultParsers,
  loadDefaultJapaneseParser,
} from './index.js';

// Kept in sync with package.json; scripts/check-cli-version.js asserts that
// `budoux -V` prints the same version as the package manifest.
const CLI_VERSION = '0.7.0';
const defaultParsers = loadDefaultParsers();

/**
 * Run the command line interface program.
 * @param argv process.argv.
 */
export const cli = (argv: string[]) => {
  const program = new Command('budoux');

  program.usage('[-h] [-H] [-d STR] [-s STR] [-m JSON] [-l LANG] [-V] [TXT]');
  program.description(
    'BudouX is the successor to Budou, the machine learning powered line break organizer tool.'
  );
  // NOTE(review): the value placeholders in the option flag strings below
  // (e.g. '<str>', '<json>', '<lang>' — cf. the usage string's STR/JSON/LANG)
  // appear to have been stripped during text extraction; confirm against the
  // repository, since commander needs them for value-taking options.
  program
    .option('-H, --html', 'HTML mode', false)
    .option(
      '-d, --delim ',
      'output sentence delimiter in TEXT mode',
      '---'
    )
    .option('-s, --sep ', 'output phrase separator in TEXT mode', '\n')
    .option('-m, --model ', 'model file path')
    .option(
      '-l, --lang ',
      `language model to use. -m and --model will be prioritized if any.\navailable languages: ${[
        ...defaultParsers.keys(),
      ].join(', ')}`
    )
    .argument('[txt]', 'text')
    // Accept extra positionals here so the switch below can raise its own,
    // clearer "Too many arguments" error instead of commander's.
    .allowExcessArguments();

  program.version(CLI_VERSION);

  program.parse(argv);

  const options = program.opts();
  const {lang, model, delim, sep, html} = options as {
    html: boolean;
    delim: string;
    sep: string;
    model?: string;
    lang?: string;
  };
  const {args} = program;

  // Parser precedence: explicit model file (-m/--model), then a recognized
  // --lang, then the default Japanese parser.
  const parser = model
    ? loadCustomParser(model)
    : lang && defaultParsers.has(lang)
      ? defaultParsers.get(lang)!
      : loadDefaultJapaneseParser();

  switch (args.length) {
    case 0: {
      // No positional text: accumulate all of stdin line by line and process
      // it as a single input once the stream ends.
      const rl = readline.createInterface({
        input: process.stdin,
      });

      let stdin = '';
      rl.on('line', line => {
        stdin += line + '\n';
      });
      process.stdin.on('end', () => {
        outputParsedTexts(parser, html, delim, sep, [stdin]);
      });
      break;
    }
    case 1: {
      outputParsedTexts(parser, html, delim, sep, args);
      break;
    }
    default: {
      throw new Error(
        'Too many arguments. Please, pass the only one argument.'
      );
    }
  }
};

/**
 * Prints the parsed texts to stdout.
 * @param parser A parser.
 * @param html A flag of html output mode.
 * @param delim A delimiter to separate output sentence.
 * @param sep A separator to separate output phrases.
 * @param args string array to parse. Array should have only one element.
 */
const outputParsedTexts = (
  parser: HTMLProcessingParser,
  html: boolean,
  delim: string,
  sep: string,
  args: string[]
) => {
  if (html) {
    const text = args[0];
    const output = parser.translateHTMLString(text);
    console.log(output);
  } else {
    // TEXT mode: each non-empty input line is parsed independently; phrases
    // are joined with `sep` and lines are separated by a `delim` line.
    const splitedTextsByNewLine = args[0]
      .split(/\r?\n/)
      .filter(text => text !== '');
    splitedTextsByNewLine.forEach((text, index) => {
      const parsedTexts = parser.parse(text);
      console.log(parsedTexts.join(sep));
      // No trailing delimiter after the last line.
      if (index + 1 !== splitedTextsByNewLine.length) console.log(delim);
    });
  }
};
/**
 * Loads a parser equipped with custom model.
 * @return A parser with the loaded model.
 */
const loadCustomParser = (modelPath: string) => {
  // Resolve relative paths against the current working directory, then parse
  // the JSON model file into the structure HTMLProcessingParser expects.
  const file = readFileSync(path.resolve(modelPath)).toString();
  const model = JSON.parse(file);
  return new HTMLProcessingParser(model);
};

// --- javascript/src/dom-browser.ts ---

/**
 * Parses an html string and returns a parsed html document.
 * Browser variant: uses the native window.DOMParser (dom.ts is the Node
 * counterpart backed by linkedom).
 * @param html An HTML string.
 * @return A Document.
 */
export const parseFromString = (html: string) => {
  return new window.DOMParser().parseFromString(html, 'text/html');
};
import {DOMParser} from 'linkedom';

/**
 * Parses an html string and returns a parsed html document.
 * Node variant: backed by linkedom's DOMParser (dom-browser.ts is the
 * browser counterpart using window.DOMParser).
 * @param html An HTML string.
 * @return A Document.
 */
export const parseFromString = (html: string) => {
  // NOTE(review): the template literal wrapping `html` looks like it lost
  // surrounding document markup (e.g. a `<!DOCTYPE html>…` shell) during
  // text extraction — confirm against the repository before relying on it.
  return new DOMParser().parseFromString(
    `${html}`,
    'text/html'
  );
};
15 | */ 16 | 17 | import {model as jaModel} from './data/models/ja.js'; 18 | import {model as zhHansModel} from './data/models/zh-hans.js'; 19 | import {model as zhHantModel} from './data/models/zh-hant.js'; 20 | import {model as thModel} from './data/models/th.js'; 21 | import {HTMLProcessingParser} from './html_processor.js'; 22 | 23 | export {Parser} from './parser.js'; 24 | export {HTMLProcessor, HTMLProcessingParser} from './html_processor.js'; 25 | export {jaModel, zhHansModel, zhHantModel}; 26 | 27 | /** 28 | * Loads a parser equipped with the default Japanese model. 29 | * @return A parser with the default Japanese model. 30 | */ 31 | export const loadDefaultJapaneseParser = () => { 32 | return new HTMLProcessingParser(jaModel); 33 | }; 34 | 35 | /** 36 | * Loads a parser equipped with the default Simplified Chinese model. 37 | * @return A parser with the default Simplified Chinese model. 38 | */ 39 | export const loadDefaultSimplifiedChineseParser = () => { 40 | return new HTMLProcessingParser(zhHansModel); 41 | }; 42 | 43 | /** 44 | * Loads a parser equipped with the default Traditional Chinese model. 45 | * @return A parser with the default Traditional Chinese model. 46 | */ 47 | export const loadDefaultTraditionalChineseParser = () => { 48 | return new HTMLProcessingParser(zhHantModel); 49 | }; 50 | 51 | /** 52 | * Loads a parser equipped with the default Thai model. 53 | * @returns A parser with the default Thai model. 54 | */ 55 | export const loadDefaultThaiParser = () => { 56 | return new HTMLProcessingParser(thModel); 57 | }; 58 | /** 59 | * Loads available default parsers. 60 | * @return A map between available lang codes and their default parsers. 
61 | */ 62 | export const loadDefaultParsers = () => { 63 | return new Map([ 64 | ['ja', loadDefaultJapaneseParser()], 65 | ['zh-hans', loadDefaultSimplifiedChineseParser()], 66 | ['zh-hant', loadDefaultTraditionalChineseParser()], 67 | ['th', loadDefaultThaiParser()], 68 | ]); 69 | }; 70 | -------------------------------------------------------------------------------- /javascript/src/parser.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /** 18 | * Base BudouX parser. 19 | */ 20 | export class Parser { 21 | /** BudouX model data */ 22 | private readonly model: Map>; 23 | private readonly baseScore: number; 24 | 25 | /** 26 | * Constructs a BudouX parser. 27 | * @param model A model data. 28 | */ 29 | constructor(model: {[key: string]: {[key: string]: number}}) { 30 | this.model = new Map( 31 | Object.entries(model).map(([k, v]) => [k, new Map(Object.entries(v))]) 32 | ); 33 | this.baseScore = 34 | -0.5 * 35 | [...this.model.values()] 36 | .map(group => [...group.values()]) 37 | .flat() 38 | .reduce((prev, curr) => prev + curr, 0); 39 | } 40 | 41 | /** 42 | * Parses the input sentence and returns a list of semantic chunks. 43 | * 44 | * @param sentence An input sentence. 45 | * @return The retrieved chunks. 
46 | */ 47 | parse(sentence: string): string[] { 48 | if (sentence === '') return []; 49 | const boundaries = this.parseBoundaries(sentence); 50 | const result = []; 51 | let start = 0; 52 | for (const boundary of boundaries) { 53 | result.push(sentence.slice(start, boundary)); 54 | start = boundary; 55 | } 56 | result.push(sentence.slice(start)); 57 | return result; 58 | } 59 | 60 | /** 61 | * Parses the input sentence and returns a list of boundaries. 62 | * 63 | * @param sentence An input sentence. 64 | * @return The list of boundaries. 65 | */ 66 | parseBoundaries(sentence: string): number[] { 67 | const result = []; 68 | 69 | for (let i = 1; i < sentence.length; i++) { 70 | let score = this.baseScore; 71 | // NOTE: Score values in models may be negative. 72 | /* eslint-disable */ 73 | score += this.model.get('UW1')?.get(sentence.substring(i - 3, i - 2)) || 0; 74 | score += this.model.get('UW2')?.get(sentence.substring(i - 2, i - 1)) || 0; 75 | score += this.model.get('UW3')?.get(sentence.substring(i - 1, i)) || 0; 76 | score += this.model.get('UW4')?.get(sentence.substring(i, i + 1)) || 0; 77 | score += this.model.get('UW5')?.get(sentence.substring(i + 1, i + 2)) || 0; 78 | score += this.model.get('UW6')?.get(sentence.substring(i + 2, i + 3)) || 0; 79 | score += this.model.get('BW1')?.get(sentence.substring(i - 2, i)) || 0; 80 | score += this.model.get('BW2')?.get(sentence.substring(i - 1, i + 1)) || 0; 81 | score += this.model.get('BW3')?.get(sentence.substring(i, i + 2)) || 0; 82 | score += this.model.get('TW1')?.get(sentence.substring(i - 3, i)) || 0; 83 | score += this.model.get('TW2')?.get(sentence.substring(i - 2, i + 1)) || 0; 84 | score += this.model.get('TW3')?.get(sentence.substring(i - 1, i + 2)) || 0; 85 | score += this.model.get('TW4')?.get(sentence.substring(i, i + 3)) || 0; 86 | /* eslint-enable */ 87 | if (score > 0) result.push(i); 88 | } 89 | return result; 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /javascript/src/tests/index.browser.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import './test_html_processor.js'; 18 | import './test_parser.js'; 19 | import './test_webcomponents.js'; 20 | -------------------------------------------------------------------------------- /javascript/src/tests/index.node.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | import './test_cli.js'; 18 | import './test_html_processor.js'; 19 | import './test_parser.js'; 20 | -------------------------------------------------------------------------------- /javascript/src/tests/models/separate_right_before_a.json: -------------------------------------------------------------------------------- 1 | {"UW4": {"a": 1001}} 2 | -------------------------------------------------------------------------------- /javascript/src/tests/test_cli.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
import {cli} from '../cli.js';
import {execFile, ExecFileException} from 'child_process';
import * as path from 'path';
import stream from 'stream';
import {loadDefaultParsers} from '../index.js';

// NOTE(review): several expected-output string literals in this file appear
// to have had HTML tags stripped during text extraction (e.g. the --html
// expectations, which should contain the wrapping element emitted by
// translateHTMLString). Verify the literals against the repository.

/** Result of one CLI invocation via execFile. */
type execFileCallBack = {
  error: ExecFileException | null;
  stdout: string;
  stderr: string;
};

/**
 * Runs bin/budoux.js in a child Node process.
 * @param args CLI arguments.
 * @param stdin Optional text piped to the child's stdin.
 * @return The spawn error (null on success) and captured output streams.
 */
const runCli = (args: string[], stdin?: string): Promise<execFileCallBack> => {
  return new Promise(resolve => {
    const binPath = path.resolve('./bin/budoux.js');
    const child = execFile(
      'node',
      [binPath, ...args],
      (error, stdout, stderr) => {
        resolve({
          error,
          stdout,
          stderr,
        });
      }
    );

    if (stdin) {
      // Feed the requested text through a readable stream so the CLI's
      // stdin-reading branch (no positional argument) is exercised.
      const stdinStream = new stream.Readable();
      stdinStream.push(stdin);
      stdinStream.push(null);
      if (child.stdin) {
        stdinStream.pipe(child.stdin);
      }
    }
  });
};

describe('cli', () => {
  // --- HTML mode (-H/--html) ---
  it('should output the wrapped HTML sentence when execute budoux command with --html option.', async () => {
    const inputText = '今日は天気です。';
    const argv = ['--html', inputText];
    const expectedStdOut =
      '今日は\u200B天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the wrapped HTML sentence when execute budoux command with -H option alias.', async () => {
    const inputText = '今日は天気です。';
    const argv = ['-H', inputText];
    const expectedStdOut =
      '今日は\u200B天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Custom model (-m/--model) ---
  it('should output the separated sentence with custom model when execute budoux command with --model option.', async () => {
    const inputText = 'abcdeabcd';
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = ['--model', customModelPath, inputText];
    const expectedStdOut = 'abcde\nabcd';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with custom model when execute budoux command with -m option alias.', async () => {
    const inputText = 'abcdeabcd';
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = ['-m', customModelPath, inputText];
    const expectedStdOut = 'abcde\nabcd';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Language selection (-l/--lang); expectations are derived from the
  // corresponding default parser so they track the bundled models. ---
  it('should use the corresponding language model when the -l parameter is given.', async () => {
    const inputTextHans = '我们的使命是整合全球信息,供大众使用,让人人受益。';
    const expectedStdOut = loadDefaultParsers()
      .get('zh-hans')!
      .parse(inputTextHans)
      .join('\n');
    const argv = ['-l', 'zh-hans', inputTextHans];
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should use the corresponding language model when the --lang parameter is given.', async () => {
    const inputTextHans = '我們的使命是匯整全球資訊,供大眾使用,使人人受惠。';
    const expectedStdOut = loadDefaultParsers()
      .get('zh-hant')!
      .parse(inputTextHans)
      .join('\n');
    const argv = ['--lang', 'zh-hant', inputTextHans];
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should prioritize -m and --model over -l and --lang', async () => {
    const inputTextHans = '我們的使a命';
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = [
      '--model',
      customModelPath,
      '--lang',
      'zh-hant',
      inputTextHans,
    ];
    const expectedStdOut = '我們的使\na命';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Sentence delimiter (-d/--delim) between input lines ---
  it('should output the separated sentence with separater when execute budoux command with --delim option.', async () => {
    const inputText = '今日は天気です。\n明日は雨かな?';
    const argv = ['--delim', '###', inputText];
    const expectedStdOut = '今日は\n天気です。\n###\n明日は\n雨かな?';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output the separated sentence with separater when execute budoux command with -d option alias.', async () => {
    const inputText = '今日は天気です。\n明日は雨かな?';
    const argv = ['-d', '###', inputText];
    const expectedStdOut = '今日は\n天気です。\n###\n明日は\n雨かな?';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Reading from stdin (no positional argument) ---
  it('should output the separated sentence with separater when execute budoux with stdin inputed by pipe', async () => {
    const {stdout} = await runCli([], '今日は天気です。\n明日は雨かな?');
    const expectedStdOut = '今日は\n天気です。\n---\n明日は\n雨かな?';
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Phrase separator (-s/--sep) ---
  it('should output phrases with the separator specified by -s option', async () => {
    const inputText = '今日は天気です。';
    const argv = ['-s', '/', inputText];
    const expectedStdOut = '今日は/天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  it('should output phrases with the separator specified by --sep option', async () => {
    const inputText = '今日は天気です。';
    const argv = ['--sep', '/', inputText];
    const expectedStdOut = '今日は/天気です。';
    const {stdout} = await runCli(argv);
    expect(stdout.trim()).toBe(expectedStdOut);
  });

  // --- Error handling: these call cli() in-process to assert the thrown
  // message rather than spawning a child. ---
  it('should output the error message when get more than one text argument.', () => {
    const argv = [
      'node',
      'budoux',
      '今日は天気です。',
      '明日は晴れるでしょう。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  it('should output the error message when get extra option argument.', () => {
    const argv = [
      'node',
      'budoux',
      '--delim',
      '---',
      '',
      '今日は天気です。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  it('should output the error message when get extra option argument.', () => {
    const customModelPath = path.resolve(
      __dirname,
      'models',
      'separate_right_before_a.json'
    );
    const argv = [
      'node',
      'budoux',
      '--model',
      customModelPath,
      '',
      '今日は天気です。',
    ];
    const stab = () => cli(argv);

    expect(stab).toThrowError(
      'Too many arguments. Please, pass the only one argument.'
    );
  });

  it('should output the unknown option error when execute budoux command with -v option.', async () => {
    // -v is not an alias for -V; commander should reject it.
    const {stderr} = await runCli(['-v']);

    expect(stderr).toBe("error: unknown option '-v'\n");
  });
});
import {Parser} from '../parser.js';

describe('Parser.parse', () => {
  // 'a' occurs at indices 0 and 5; 'b' at indices 1 and 6.
  const TEST_SENTENCE = 'abcdeabcd';

  it('should separate if a strong feature item supports.', () => {
    const model = {
      UW4: {a: 10000}, // means "should separate right before 'a'".
    };
    const parser = new Parser(model);
    const result = parser.parse(TEST_SENTENCE);
    expect(result).toEqual(['abcde', 'abcd']);
  });

  it('should separate even if it makes a phrase of one character.', () => {
    const model = {
      UW4: {b: 10000}, // means "should separate right before 'b'".
    };
    const parser = new Parser(model);
    const result = parser.parse(TEST_SENTENCE);
    // Breaking before every 'b' yields a leading single-character phrase.
    expect(result).toEqual(['a', 'bcdea', 'bcd']);
  });

  it('should return an empty list when the input is a blank string.', () => {
    const parser = new Parser({});
    const result = parser.parse('');
    expect(result).toEqual([]);
  });
});
import '../webcomponents/budoux-ja.js';

// NOTE(review): some expected innerHTML literals below appear to have had
// HTML tags stripped during text extraction (e.g. the "HTML inputs" case,
// which builds a <b> child element); verify against the repository.
describe('Web Components', () => {
  beforeAll(async () => {
    // Ensure the custom element registered by the import above is ready.
    await window.customElements.whenDefined('budoux-ja');
  });

  beforeEach(() => {
    // Isolate each spec from elements appended by previous ones.
    window.document.body.innerText = '';
  });

  it('should process the provided text.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = '今日は良い天気です。';
    window.document.body.appendChild(budouxElement);

    // Zero-width spaces (\u200B) mark the break opportunities.
    expect(budouxElement.innerHTML).toBe('今日は\u200B良い\u200B天気です。');
  });

  it('should react to text content changes after attached.', resolve => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = '今日は良い天気です。';
    window.document.body.appendChild(budouxElement);

    // The element re-parses on mutation; observe its DOM to detect that.
    const observer = new window.MutationObserver(() => {
      expect(budouxElement.innerHTML).toBe('明日は\u200B晴れるかな?');
      resolve();
    });
    observer.observe(budouxElement, {
      childList: true,
    });
    budouxElement.textContent = '明日は晴れるかな?';
  });

  it('should work with HTML inputs.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.appendChild(window.document.createTextNode('昨日は'));
    const b = window.document.createElement('b');
    b.textContent = '雨';
    budouxElement.appendChild(b);
    budouxElement.appendChild(window.document.createTextNode('でした。'));
    window.document.body.appendChild(budouxElement);
    expect(budouxElement.innerHTML).toBe('昨日は\u200B雨でした。');
  });

  it('should have wrapping styles to control line breaks.', () => {
    const budouxElement = window.document.createElement('budoux-ja');
    budouxElement.textContent = 'Hello world';
    window.document.body.appendChild(budouxElement);
    const styles = budouxElement.computedStyleMap();
    expect(styles.get('word-break')?.toString()).toBe('keep-all');
    expect(styles.get('overflow-wrap')?.toString()).toBe('anywhere');
  });
});
import {parseHTML} from 'linkedom';

/**
 * Sets an innerHTML on a given Element.
 * @param element An Element.
 * @param html An HTML string to set.
 */
export const setInnerHtml = (element: Element, html: string) => {
  element.innerHTML = html;
};

/**
 * Creates an HTML document.
 * Node variant: backed by linkedom (testutils-browser.ts returns the real
 * window.document instead).
 * @returns Document
 */
export const createDocument = () => {
  // NOTE(review): the parseHTML('') argument may have lost document markup
  // during text extraction — confirm against the repository.
  const {document} = parseHTML('');
  return document;
};

/**
 * Whether the running environment is a Web browser.
 * This is the Node test-utils module, so always false.
 */
export const isBrowser = false;
15 | */ 16 | 17 | import {applyWrapStyle, type HTMLProcessingParser} from '../html_processor.js'; 18 | 19 | const MUTATION_OBSERVER_OPTIONS = { 20 | attributes: false, 21 | characterData: true, 22 | childList: true, 23 | subtree: true, 24 | }; 25 | 26 | /** 27 | * Base BudouX Web component. 28 | */ 29 | export abstract class BudouXBaseElement extends HTMLElement { 30 | abstract parser: HTMLProcessingParser; 31 | observer: MutationObserver; 32 | 33 | /** 34 | * Base BudouX Web component constructor. 35 | */ 36 | constructor() { 37 | super(); 38 | 39 | this.observer = new MutationObserver(this.sync.bind(this)); 40 | this.observer.observe(this, MUTATION_OBSERVER_OPTIONS); 41 | } 42 | 43 | connectedCallback() { 44 | applyWrapStyle(this); 45 | this.sync(); 46 | } 47 | 48 | attributeChangedCallback() { 49 | this.sync(); 50 | } 51 | 52 | sync() { 53 | this.observer.disconnect(); 54 | this.parser.applyToElement(this); 55 | this.observer.observe(this, MUTATION_OBSERVER_OPTIONS); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /javascript/src/webcomponents/budoux-ja.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | import { 18 | type HTMLProcessingParser, 19 | loadDefaultJapaneseParser, 20 | } from '../index.js'; 21 | import {BudouXBaseElement} from './budoux-base.js'; 22 | 23 | /** 24 | * BudouX Japanese Web component. 25 | */ 26 | export class BudouXJapaneseElement extends BudouXBaseElement { 27 | parser: HTMLProcessingParser; 28 | 29 | /** 30 | * BudouX Japanese Web component constructor. 31 | */ 32 | constructor() { 33 | super(); 34 | this.parser = loadDefaultJapaneseParser(); 35 | } 36 | } 37 | 38 | customElements.define('budoux-ja', BudouXJapaneseElement); 39 | -------------------------------------------------------------------------------- /javascript/src/webcomponents/budoux-th.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2023 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import {type HTMLProcessingParser, loadDefaultThaiParser} from '../index.js'; 18 | import {BudouXBaseElement} from './budoux-base.js'; 19 | 20 | /** 21 | * BudouX Thai Web component. 22 | */ 23 | export class BudouXThaiElement extends BudouXBaseElement { 24 | parser: HTMLProcessingParser; 25 | 26 | /** 27 | * BudouX Thai Web component constructor. 
28 | */ 29 | constructor() { 30 | super(); 31 | this.parser = loadDefaultThaiParser(); 32 | } 33 | } 34 | 35 | customElements.define('budoux-th', BudouXThaiElement); 36 | -------------------------------------------------------------------------------- /javascript/src/webcomponents/budoux-zh-hans.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2021 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import { 18 | type HTMLProcessingParser, 19 | loadDefaultSimplifiedChineseParser, 20 | } from '../index.js'; 21 | import {BudouXBaseElement} from './budoux-base.js'; 22 | 23 | /** 24 | * BudouX Simplified Chinese Web component. 25 | */ 26 | export class BudouXSimplifiedChineseElement extends BudouXBaseElement { 27 | parser: HTMLProcessingParser; 28 | 29 | /** 30 | * BudouX Simplified Chinese Web component constructor. 
31 | */ 32 | constructor() { 33 | super(); 34 | this.parser = loadDefaultSimplifiedChineseParser(); 35 | } 36 | } 37 | 38 | customElements.define('budoux-zh-hans', BudouXSimplifiedChineseElement); 39 | -------------------------------------------------------------------------------- /javascript/src/webcomponents/budoux-zh-hant.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2022 Google LLC 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import { 18 | type HTMLProcessingParser, 19 | loadDefaultTraditionalChineseParser, 20 | } from '../index.js'; 21 | import {BudouXBaseElement} from './budoux-base.js'; 22 | 23 | /** 24 | * BudouX Traditional Chinese Web component. 25 | */ 26 | export class BudouXTraditionalChineseElement extends BudouXBaseElement { 27 | parser: HTMLProcessingParser; 28 | 29 | /** 30 | * BudouX Traditional Chinese Web component constructor. 
31 | */ 32 | constructor() { 33 | super(); 34 | this.parser = loadDefaultTraditionalChineseParser(); 35 | } 36 | } 37 | 38 | customElements.define('budoux-zh-hant', BudouXTraditionalChineseElement); 39 | -------------------------------------------------------------------------------- /javascript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "allowUnreachableCode": false, 4 | "allowUnusedLabels": false, 5 | "lib": ["es6", "dom", "dom.iterable"], 6 | "target": "es2017", 7 | "module": "commonjs", 8 | "moduleResolution": "node", 9 | "noEmitOnError": true, 10 | "noFallthroughCasesInSwitch": true, 11 | "noImplicitReturns": true, 12 | "pretty": true, 13 | "resolveJsonModule": true, 14 | "declaration": true, 15 | "sourceMap": true, 16 | "esModuleInterop": true, 17 | "forceConsistentCasingInFileNames": true, 18 | "strict": true, 19 | "skipLibCheck": true, 20 | "outDir": "./dist" 21 | }, 22 | "exclude": [ 23 | "node_modules" 24 | ], 25 | "include": ["src/**/*.ts"] 26 | } 27 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["wheel", "setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/budoux/1f232ee0b7b11f9a7b7b3e02a0eecc2aaafe26ca/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/build_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with 
# the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Builds a model from the learned weights.

This script outputs a model file in JSON format from the learned weights file
output by the `train.py` script.
"""

import argparse
import json
import typing


def aggregate_scores(
    weights: typing.List[str]) -> typing.Dict[str, typing.Dict[str, float]]:
  """Exports the model by aggregating the weight scores.

  Args:
    weights (List[str]): The lines of the exported weight score file, each of
      the form `GROUP:CONTENT<TAB>SCORE`. Blank lines are skipped.

  Returns:
    model (Dict[str, Dict[str, float]]): The aggregated model, keyed first by
    feature group and then by feature content.
  """
  decision_trees: typing.Dict[str, typing.Dict[str, float]] = {}
  for row in weights:
    row = row.strip()
    if not row:
      continue
    # Split the row once (the previous version split the same row twice).
    columns = row.split('\t')
    # Split on the first colon only: the feature content itself may contain
    # colons (e.g. bigram features).
    feature_group, feature_content = columns[0].split(':', 1)
    score = float(columns[1])
    group = decision_trees.setdefault(feature_group, {})
    group[feature_content] = group.get(feature_content, 0) + score
  return decision_trees


def round_model(model: typing.Dict[str, typing.Dict[str, float]],
                scale: int) -> typing.Dict[str, typing.Dict[str, int]]:
  """Rounds the scores in the model to integers after scaling.

  Args:
    model (Dict[str, Dict[str, float]]): The model to round scores.
    scale (int): A scale factor to multiply scores.

  Returns:
    model_rounded (Dict[str, Dict[str, int]]): The rounded model. Features
    whose scaled score truncates to zero are dropped since they cannot affect
    the parser's output.
  """
  model_rounded: typing.Dict[str, typing.Dict[str, int]] = {}
  for feature_group, features in model.items():
    for feature_content, score in features.items():
      scaled_score = int(score * scale)
      if scaled_score != 0:
        model_rounded.setdefault(feature_group,
                                 {})[feature_content] = scaled_score
  return model_rounded


def parse_args(
    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'weight_file', help='A file path for the learned weights.')
  parser.add_argument(
      '-o',
      '--outfile',
      help='A file path to export a model file. (default: model.json)',
      default='model.json',
      type=str)
  parser.add_argument(
      '--scale',
      help='A scale factor for the output scores',
      default=1000,
      type=int)
  if test is None:
    return parser.parse_args()
  return parser.parse_args(test)


def main() -> None:
  """Reads the weights file, aggregates and rounds it, and writes JSON."""
  args = parse_args()
  weights_filename = args.weight_file
  model_filename = args.outfile
  scale = args.scale
  with open(weights_filename) as f:
    weights = f.readlines()
  model = aggregate_scores(weights)
  model_rounded = round_model(model, scale)
  with open(model_filename, 'w', encoding='utf-8') as f:
    # Compact separators keep the shipped model file small.
    json.dump(model_rounded, f, ensure_ascii=False, separators=(',', ':'))
  print('Model file is exported as', model_filename)


if __name__ == '__main__':
  main()
14 | """Encodes the training data with extracted features.""" 15 | 16 | import argparse 17 | import functools 18 | import itertools 19 | import multiprocessing 20 | import os 21 | import sys 22 | import typing 23 | 24 | # module hack 25 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..') 26 | sys.path.insert(0, os.path.abspath(LIB_PATH)) 27 | 28 | from budoux import utils # noqa (module hack) 29 | 30 | ArgList = typing.Optional[typing.List[str]] 31 | DEFAULT_OUTPUT_FILENAME = 'encoded_data.txt' 32 | 33 | INVALID = '▔' 34 | """The invalid feature string.""" 35 | 36 | 37 | def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, 38 | w6: str) -> typing.List[str]: 39 | """Generates a feature from characters around (w1-6). 40 | 41 | Args: 42 | w1 (str): The character 3 characters before the break point. 43 | w2 (str): The character 2 characters before the break point. 44 | w3 (str): The character right before the break point. 45 | w4 (str): The character right after the break point. 46 | w5 (str): The character 2 characters after the break point. 47 | w6 (str): The character 3 characters after the break point. 48 | 49 | Returns: 50 | The feature (list[str]). 51 | 52 | """ 53 | raw_feature = { 54 | 'UW1': w1, 55 | 'UW2': w2, 56 | 'UW3': w3, 57 | 'UW4': w4, 58 | 'UW5': w5, 59 | 'UW6': w6, 60 | 'BW1': w2 + w3, 61 | 'BW2': w3 + w4, 62 | 'BW3': w4 + w5, 63 | 'TW1': w1 + w2 + w3, 64 | 'TW2': w2 + w3 + w4, 65 | 'TW3': w3 + w4 + w5, 66 | 'TW4': w4 + w5 + w6, 67 | } 68 | for key, value in list(raw_feature.items()): 69 | if INVALID in value: 70 | del raw_feature[key] 71 | return [f'{item[0]}:{item[1]}' for item in raw_feature.items()] 72 | 73 | 74 | def parse_args(test: ArgList = None) -> argparse.Namespace: 75 | """Parses commandline arguments. 76 | 77 | Args: 78 | test (typing.Optional[typing.List[str]], optional): Commandline args for testing. Defaults to None. 79 | 80 | Returns: 81 | argparse.Namespace: Parsed data of args. 
82 | """ 83 | parser = argparse.ArgumentParser(description=__doc__) 84 | parser.add_argument( 85 | 'source_data', 86 | help='''File path of the source training data to extract features.''') 87 | parser.add_argument( 88 | '-o', 89 | '--outfile', 90 | help='''Output file path for the encoded training data. 91 | (default: encoded_data.txt)''', 92 | default=DEFAULT_OUTPUT_FILENAME) 93 | parser.add_argument( 94 | '--processes', 95 | type=int, 96 | help='''Number of processes to use. 97 | (default: the number of CPUs in the system)''', 98 | default=None) 99 | parser.add_argument( 100 | '--scale', 101 | type=int, 102 | help='''Weight scale for the entries. The value should be a unsigned 103 | integer. (default: 1)''', 104 | default=1) 105 | if test is None: 106 | return parser.parse_args() 107 | else: 108 | return parser.parse_args(test) 109 | 110 | 111 | def process(i: int, sentence: str, sep_indices: typing.Set[int], 112 | scale: int) -> str: 113 | """Outputs an encoded line of features from the given index. 114 | 115 | Args: 116 | i (int): index 117 | sentence (str): A sentence 118 | sep_indices (typing.Set[int]): A set of separator indices. 119 | scale (int): A weight scale for the entries. 120 | """ 121 | feature = get_feature(sentence[i - 3] if i > 2 else INVALID, 122 | sentence[i - 2] if i > 1 else INVALID, sentence[i - 1], 123 | sentence[i] if i < len(sentence) else INVALID, 124 | sentence[i + 1] if i + 1 < len(sentence) else INVALID, 125 | sentence[i + 2] if i + 2 < len(sentence) else INVALID) 126 | positive = i in sep_indices 127 | line = '\t'.join(['%d' % (scale) if positive else '%d' % (-scale)] + feature) 128 | return line 129 | 130 | 131 | def normalize_input(data: str) -> typing.Tuple[str, typing.Set[int]]: 132 | """Normalizes the input to one line with separators. 133 | 134 | Args: 135 | data(str): Source input 136 | 137 | Returns: 138 | typing.Tuple[str, typing.Set[int]]: A tuple of the sentence and the 139 | separator indices. 
140 | """ 141 | chunks = data.replace('\n', utils.SEP).strip().split(utils.SEP) 142 | chunk_lengths = [len(chunk) for chunk in chunks] 143 | sep_indices = set(itertools.accumulate(chunk_lengths, lambda x, y: x + y)) 144 | sentence = ''.join(chunks) 145 | return (sentence, sep_indices) 146 | 147 | 148 | def main(test: ArgList = None) -> None: 149 | args = parse_args(test) 150 | source_filename: str = args.source_data 151 | entries_filename: str = args.outfile 152 | processes = None if args.processes is None else int(args.processes) 153 | scale: int = args.scale 154 | with open(source_filename, encoding=sys.getdefaultencoding()) as f: 155 | data = f.read() 156 | sentence, sep_indices = normalize_input(data) 157 | with multiprocessing.Pool(processes) as p: 158 | func = functools.partial( 159 | process, sentence=sentence, sep_indices=sep_indices, scale=scale) 160 | lines = p.map(func, range(1, len(sentence) + 1)) 161 | 162 | with open(entries_filename, 'w', encoding=sys.getdefaultencoding()) as f: 163 | for line in lines: 164 | f.write(line + '\n') 165 | 166 | print('\033[92mEncoded training data is out at: %s\033[0m' % entries_filename) 167 | 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /scripts/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetunes a BudouX model with the given training dataset.

Example usage:

$ python finetune.py train_data.txt base_model.json -o weights.txt --val_data=val_data.txt
"""

import argparse
import array
import json
import typing
from collections import OrderedDict

from jax import Array, grad, jit
from jax import numpy as jnp

# Smallest representable float increment; used to guard divisions by zero.
EPSILON = float(jnp.finfo(float).eps)
DEFAULT_OUTPUT_NAME = 'finetuned-weights.txt'
DEFAULT_NUM_ITERS = 1000
DEFAULT_LOG_SPAN = 100
DEFAULT_LEARNING_RATE = 0.01


class NormalizedModel(typing.NamedTuple):
  # Feature identifiers ("GROUP:CONTENT"), aligned index-wise with `weights`.
  features: typing.List[str]
  # Normalized (zero-mean, unit-variance) weight vector.
  weights: Array


class Dataset(typing.NamedTuple):
  # Input matrix with entries in {-1, +1}.
  X: Array
  # Binary target vector.
  Y: Array


class Metrics(typing.NamedTuple):
  tp: int
  tn: int
  fp: int
  fn: int
  accuracy: float
  precision: float
  recall: float
  fscore: float
  loss: float


def load_model(file_path: str) -> NormalizedModel:
  """Loads a model as a pair of a features list and a normalized weight vector.

  Args:
    file_path: A file path for the model JSON file.

  Returns:
    A normalized model, which is a pair of a list of feature identifiers and a
    normalized weight vector.
  """
  with open(file_path) as f:
    model = json.load(f)
  model_flat = OrderedDict()
  for category in model:
    for item in model[category]:
      model_flat['%s:%s' % (category, item)] = model[category][item]
  weights = jnp.array(list(model_flat.values()))
  # Standardize so finetuning starts from a well-scaled point regardless of
  # the base model's score scale.
  weights = weights / weights.std()
  weights = weights - weights.mean()
  keys = list(model_flat.keys())
  return NormalizedModel(keys, weights)


def load_dataset(file_path: str, model: NormalizedModel) -> Dataset:
  """Loads a dataset from the given file path.

  Args:
    file_path: A file path for the encoded data file.
    model: A normalized model.

  Returns:
    A dataset of inputs (X) and outputs (Y).
  """
  xs = []
  ys = array.array('B')
  with open(file_path) as f:
    for row in f:
      cols = row.strip().split('\t')
      if len(cols) < 2:
        continue
      ys.append(cols[0] == '1')
      # Build the entry set once per row. Evaluating `set(cols[1:])` inside
      # the generator (as before) rebuilt it for every feature, making this
      # loop O(len(features) * len(cols)) per row.
      entries = set(cols[1:])
      xs.append(tuple(k in entries for k in model.features))
  # Map {absent, present} to {-1, +1}.
  X = jnp.array(xs) * 2 - 1
  Y = jnp.array(ys)
  return Dataset(X, Y)


def cross_entropy_loss(weights: Array, x: Array, y: Array) -> Array:
  """Calculates a cross entropy loss with a prediction by a sigmoid function.

  Args:
    weights: A weight vector.
    x: An input array.
    y: A target output array.

  Returns:
    A cross entropy loss.
  """
  pred = 1 / (1 + jnp.exp(-x.dot(weights)))
  return -jnp.mean(y * jnp.log(pred) + (1 - y) * jnp.log(1 - pred))


def get_metrics(weights: Array, dataset: Dataset) -> Metrics:
  """Gets evaluation metrics from the learned weight vector and the dataset.

  Args:
    weights: A weight vector.
    dataset: A dataset.

  Returns:
    result (Metrics): The metrics over the given weights and the dataset.
  """
  pred = dataset.X.dot(weights) > 0
  actual = dataset.Y
  tp: int = jnp.sum(jnp.logical_and(pred == 1, actual == 1))  # type: ignore
  tn: int = jnp.sum(jnp.logical_and(pred == 0, actual == 0))  # type: ignore
  fp: int = jnp.sum(jnp.logical_and(pred == 1, actual == 0))  # type: ignore
  fn: int = jnp.sum(jnp.logical_and(pred == 0, actual == 1))  # type: ignore
  loss: float = cross_entropy_loss(weights, dataset.X,
                                   dataset.Y)  # type: ignore
  accuracy = (tp + tn) / (tp + tn + fp + fn)
  # EPSILON avoids division by zero when a class is empty.
  precision = tp / (tp + fp + EPSILON)
  recall = tp / (tp + fn + EPSILON)
  fscore = 2 * precision * recall / (precision + recall + EPSILON)
  return Metrics(
      tp=tp,
      tn=tn,
      fp=fp,
      fn=fn,
      accuracy=accuracy,
      precision=precision,
      recall=recall,
      fscore=fscore,
      loss=loss,
  )


def fit(weights: Array,
        train_dataset: Dataset,
        iters: int,
        learning_rate: float,
        log_span: int,
        val_dataset: typing.Optional[Dataset] = None) -> Array:
  """Updates the weights with the given dataset.

  Args:
    weights: A weight vector.
    train_dataset: A train dataset.
    iters: A number of iterations.
    learning_rate: A learning rate.
    log_span: A span to log metrics.
    val_dataset: A validation dataset (optional).

  Returns:
    An updated weight vector.
  """
  grad_loss = jit(grad(cross_entropy_loss, argnums=0))
  for t in range(iters):
    # Plain (full-batch) gradient descent step.
    weights = weights - learning_rate * grad_loss(weights, train_dataset.X,
                                                  train_dataset.Y)
    if (t + 1) % log_span != 0:
      continue
    metrics_train = jit(get_metrics)(weights, train_dataset)
    print()
    print('iter:\t%d' % (t + 1))
    print()
    print('train accuracy:\t%.5f' % metrics_train.accuracy)
    print('train prec.:\t%.5f' % metrics_train.precision)
    print('train recall:\t%.5f' % metrics_train.recall)
    print('train fscore:\t%.5f' % metrics_train.fscore)
    print('train loss:\t%.5f' % metrics_train.loss)
    print()

    if val_dataset is None:
      continue
    metrics_val = jit(get_metrics)(weights, val_dataset)
    print('val accuracy:\t%.5f' % metrics_val.accuracy)
    print('val prec.:\t%.5f' % metrics_val.precision)
    print('val recall:\t%.5f' % metrics_val.recall)
    print('val fscore:\t%.5f' % metrics_val.fscore)
    print('val loss:\t%.5f' % metrics_val.loss)
    print()
  return weights


def write_weights(file_path: str, weights: Array,
                  features: typing.List[str]) -> None:
  """Writes learned weights and corresponding features to a file.

  Args:
    file_path: A file path for the weights file.
    weights: A weight vector.
    features: A list of feature identifiers.
  """
  with open(file_path, 'w') as f:
    f.write('\n'.join([
        '%s\t%.6f' % (feature, weights[i]) for i, feature in enumerate(features)
    ]))


def parse_args(
    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
  """Parses commandline arguments.

  Args:
    test (typing.Optional[typing.List[str]], optional): Commandline args for
      testing. Defaults to None.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'train_data', help='File path for the encoded training data.')
  parser.add_argument('base_model', help='File path for the base model file.')
  parser.add_argument(
      '-o',
      '--output',
      help=f'File path for the output weights. (default: {DEFAULT_OUTPUT_NAME})',
      type=str,
      default=DEFAULT_OUTPUT_NAME)
  parser.add_argument(
      '--val-data', help='File path for the encoded validation data.', type=str)
  parser.add_argument(
      '--iters',
      help=f'Number of iterations for training. (default: {DEFAULT_NUM_ITERS})',
      type=int,
      default=DEFAULT_NUM_ITERS)
  parser.add_argument(
      '--log-span',
      help=f'Iteration span to print metrics. (default: {DEFAULT_LOG_SPAN})',
      type=int,
      default=DEFAULT_LOG_SPAN)
  parser.add_argument(
      '--learning-rate',
      help=f'Learning rate. (default: {DEFAULT_LEARNING_RATE})',
      type=float,
      default=DEFAULT_LEARNING_RATE)
  if test is None:
    return parser.parse_args()
  return parser.parse_args(test)


def main() -> None:
  args = parse_args()
  train_data_path: str = args.train_data
  base_model_path: str = args.base_model
  weights_path: str = args.output
  iters: int = args.iters
  log_span: int = args.log_span
  learning_rate: float = args.learning_rate
  val_data_path: typing.Optional[str] = args.val_data

  model = load_model(base_model_path)
  train_dataset = load_dataset(train_data_path, model)
  val_dataset = load_dataset(val_data_path, model) if val_data_path else None
  weights = fit(
      model.weights,
      train_dataset,
      iters=iters,
      log_span=log_span,
      learning_rate=learning_rate,
      val_dataset=val_dataset)
  write_weights(weights_path, weights, model.features)


if __name__ == '__main__':
  main()
"""Prepares a dataset from the KNBC corpus.

Before running this script, you need to download the KNBC corpus by running:

$ curl -o knbc.tar.bz2 https://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2
$ tar -xf knbc.tar.bz2

Now you should have a directory named `KNBC_v1.0_090925_utf8`.
Run the following to generate a dataset named `source_knbc.txt`.

$ python scripts/prepare_knbc.py KNBC_v1.0_090925_utf8 -o source_knbc.txt
"""

import argparse
import os
import sys
import typing
from html.parser import HTMLParser

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from budoux import utils  # noqa (module hack)

GRANULARITY_OPTIONS = {'phrase', 'tag', 'word'}
Granularity = typing.Literal['phrase', 'tag', 'word']


class KNBCHTMLParser(HTMLParser):
  """Parses the HTML files in the KNBC corpus to collect chunks.

  Attributes:
    chunks: The collected chunks.
    row: The current row index.
    col: The current column index.
    current_word: The current word to process.
    on_split_row: Whether the scan is on the splitting row.
    granularity: Granularity of the output chunks.
  """

  # Cell IDs the corpus HTML uses to mark segmentation boundary rows.
  BUNSETSU_SPLIT_ID = 'bnst-kugiri'
  TAG_SPLIT_ID = 'tag-kugiri'

  def __init__(self, granularity: Granularity) -> None:
    """Initializes the HTML parser for the KNBC corpus.

    Args:
      granularity: Granularity of the output chunks.
    """
    super().__init__()
    self.chunks = ['']
    self.row = 0
    self.col = 0
    self.current_word = ''
    self.on_split_row = False
    self.granularity = granularity

  def handle_starttag(
      self, tag: str,
      attributes: typing.List[typing.Tuple[str, typing.Optional[str]]]) -> None:
    # A new table row resets the per-row scanning state.
    if tag == 'tr':
      self.row += 1
      self.col = 0
      self.current_word = ''
      self.on_split_row = False

    if tag == 'td':
      self.col += 1
      # A cell carrying one of the split IDs marks this row as a boundary.
      # Tag boundaries only count when the requested granularity is 'tag'.
      for name, value in attributes:
        bunsetsu_row = name == 'id' and value == self.BUNSETSU_SPLIT_ID
        tag_row = name == 'id' and value == self.TAG_SPLIT_ID
        if bunsetsu_row or (self.granularity == 'tag' and tag_row):
          self.on_split_row = True

  def handle_endtag(self, tag: str) -> None:
    if tag != 'tr':  # Skip all tags but TR.
      return None
    if self.row < 3:  # Skip the first two rows.
      return None
    if self.on_split_row:
      # Boundary row: start accumulating a new chunk.
      return self.chunks.append('')
    if self.col == 5:
      # NOTE(review): a 5-column row appears to be a morpheme data row in
      # this corpus format — confirm against the KNBC HTML layout.
      if self.granularity == 'word' and self.chunks[-1]:
        self.chunks.append('')
      self.chunks[-1] += self.current_word

  def handle_data(self, data: str) -> None:
    # The first cell of a row holds the surface string of the morpheme.
    if self.col == 1:
      self.current_word = data


def break_before_sequence(chunks: typing.List[str],
                          sequence: str) -> typing.List[str]:
  """Breaks chunks before a specified character sequence appears.

  Args:
    chunks (List[str]): Chunks to break.
    sequence (str): A character sequence to break chunks before.

  Returns:
    Processed chunks.
  """
  chunks = utils.SEP.join(chunks).replace(sequence,
                                          utils.SEP + sequence).split(utils.SEP)
  chunks = [chunk for chunk in chunks if len(chunk) > 0]
  return chunks


def postprocess(chunks: typing.List[str]) -> typing.List[str]:
  """Applies some processes to modify the extracted chunks.

  Args:
    chunks (List[str]): Source chunks.

  Returns:
    Processed chunks.
  """
  chunks = break_before_sequence(chunks, '(')
  chunks = break_before_sequence(chunks, 'もら')
  return chunks


def parse_args() -> argparse.Namespace:
  """Parses commandline arguments.

  Returns:
    Parsed arguments (argparse.Namespace).
  """
  DEFAULT_OUT_PATH = 'source.txt'
  DEFAULT_GRANULARITY = 'phrase'
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument('source_dir', help='Path to the KNBC corpus directory.')
  parser.add_argument(
      '-o',
      '--outfile',
      help=f'File path to the output dataset. (default: {DEFAULT_OUT_PATH})',
      default=DEFAULT_OUT_PATH)
  parser.add_argument(
      '--granularity',
      help=f'''Granularity of the output chunks. (default: {DEFAULT_GRANULARITY})
The value should be one of "phrase", "tag", or "word".
"phrase" is equivalent to Bunsetu-based segmentation.
"tag" provides more granular segmentation than "phrase".
"word" is equivalent to word-based segmentation.

e.g. 携帯ユーザーの仲間入りをするかです。
phrase: 携帯ユーザーの / 仲間入りを / するかです。
tag: 携帯 / ユーザーの / 仲間 / 入りを / するかです。
word: 携帯 / ユーザー / の / 仲間 / 入り / を / する / か / です / 。
''',
      choices=GRANULARITY_OPTIONS,
      default=DEFAULT_GRANULARITY)
  return parser.parse_args()


def main() -> None:
  """Extracts chunks from every morph HTML file and writes the dataset."""
  args = parse_args()
  source_dir = args.source_dir
  outfile = args.outfile
  granularity = args.granularity
  html_dir = os.path.join(source_dir, 'html')
  with open(outfile, 'w') as f:
    for file in sorted(os.listdir(html_dir)):
      if not file.endswith('-morph.html'):
        continue
      parser = KNBCHTMLParser(granularity)
      # Close the corpus file promptly; the previous version leaked the
      # file handle by never closing it.
      with open(os.path.join(html_dir, file)) as html_file:
        data = html_file.read()
      parser.feed(data)
      chunks = postprocess(parser.chunks)
      # Files that yield fewer than two chunks carry no break information.
      if len(chunks) < 2:
        continue
      f.write(utils.SEP.join(chunks) + '\n')
  print('\033[92mTraining data is output to: %s\033[0m' % (outfile))


if __name__ == '__main__':
  main()
15 | 16 | Before running this script, you need to download the Wisesight corpus by running: 17 | 18 | $ curl -o wisesight-1000-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-1000-samples-tokenised.label 19 | $ curl -o wisesight-160-samples-tokenised.label https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/word-tokenization/wisesight-160-samples-tokenised.label 20 | 21 | Then run this command as follows over each file. 22 | 23 | $ python scripts/prepare_wisesight.py wisesight-1000-samples-tokenised.label -o source_train.txt 24 | $ python scripts/prepare_wisesight.py wisesight-160-samples-tokenised.label -o source_val.txt 25 | """ 26 | import argparse 27 | import re 28 | 29 | import regex 30 | 31 | 32 | def parse_args() -> argparse.Namespace: 33 | DEFAULT_OUT_PATH = 'source.txt' 34 | parser = argparse.ArgumentParser( 35 | description=__doc__, formatter_class=argparse.RawTextHelpFormatter) 36 | parser.add_argument( 37 | 'source_filepath', help='Path to a Wisesight corpus label file.') 38 | parser.add_argument( 39 | '-o', 40 | '--outfile', 41 | help=f'File path to the output dataset. 
(default: {DEFAULT_OUT_PATH})', 42 | default=DEFAULT_OUT_PATH) 43 | return parser.parse_args() 44 | 45 | 46 | def main() -> None: 47 | args = parse_args() 48 | source_filepath = args.source_filepath 49 | target_filepath = args.outfile 50 | 51 | with open(target_filepath, 'w') as outfile: 52 | with open(source_filepath) as infile: 53 | for line in infile: 54 | line = line.strip() 55 | line = re.sub(r'https?://[^ ]+', '', line) # Remove URLs 56 | line = re.sub(r'#[^ ]+', '', line) # Remove hashtags 57 | line = regex.compile(r'\p{Emoji_Presentation=Yes}+').sub( 58 | '', line) # Remove emojis 59 | line = re.sub(r'\|+', '|', line) # Remove consecutive separators 60 | line = re.sub(r'(\|\s)*\|$', '', line) # Remove redundant spaces 61 | outfile.write(line.replace('|', '▁') + '\n') # Replace the separators. 62 | print('\033[92mTraining data is output to: %s\033[0m' % (target_filepath)) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /scripts/tests/test_build_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
"""Tests the model build script."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from scripts import build_model  # noqa (module hack)


class TestAggregateScores(unittest.TestCase):
  """Tests build_model.aggregate_scores over raw weight lines."""

  def test_standard(self) -> None:
    weights = [
        'AB:x\t2.893\n', 'BC:y\t0.123\n', 'AB:y\t2.123\n', 'BC:y\t1.234\n'
    ]
    model = build_model.aggregate_scores(weights)
    self.assertDictEqual(model, {
        'AB': {
            'x': 2.893,
            'y': 2.123
        },
        'BC': {
            'y': 1.357
        }
    }, 'should group scores by feature type.')

  def test_blank_line(self) -> None:
    weights = [
        '\n', 'AB:x\t2.893\n', 'BC:y\t0.123\n', '\n', 'AB:y\t2.123\n',
        'BC:y\t1.234\n'
    ]
    model = build_model.aggregate_scores(weights)
    self.assertDictEqual(model, {
        'AB': {
            'x': 2.893,
            'y': 2.123
        },
        'BC': {
            'y': 1.357
        }
    }, 'should skip blank lines.')

  def test_colon(self) -> None:
    # Feature values may themselves contain a colon.
    weights = ['AB::\t8.123']
    model = build_model.aggregate_scores(weights)
    self.assertDictEqual(
        model, {'AB': {
            ':': 8.123
        }}, 'should consider the first colon only as a delimiter.')


class TestRoundModel(unittest.TestCase):
  """Tests build_model.round_model, which scales scores to integers."""

  def test_standard(self) -> None:
    model = {
        'AB': {
            'x': 1.0002,
            'y': 4.1237,
        },
        'BC': {
            'z': 2.1111,
        }
    }
    model_rounded = build_model.round_model(model, 1000)
    self.assertDictEqual(model_rounded, {
        'AB': {
            'x': 1000,
            'y': 4123
        },
        'BC': {
            'z': 2111
        }
    }, 'should scale and round scores to integer.')

  def test_insignificant_score(self) -> None:
    # 0.0009 * 1000 rounds below 1, so the entry should be dropped.
    model = {
        'AB': {
            'x': 0.0009,
            'y': 4.1237,
        },
        'BC': {
            'z': 2.1111,
        }
    }
    model_rounded = build_model.round_model(model, 1000)
    self.assertDictEqual(model_rounded, {
        'AB': {
            'y': 4123
        },
        'BC': {
            'z': 2111
        }
    }, 'should remove insignificant scores lower than 1.')


class TestArgParse(unittest.TestCase):
  """Tests command line argument handling of build_model."""

  def test_cmdargs_invalid_option(self) -> None:
    cmdargs = ['-v']
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    cmdargs = ['-h']
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_input(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      build_model.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    output = build_model.parse_args(['weight.txt'])
    self.assertEqual(output.weight_file, 'weight.txt')
    self.assertEqual(output.outfile, 'model.json')
    self.assertEqual(output.scale, 1000)

  def test_cmdargs_with_scale(self) -> None:
    output = build_model.parse_args(
        ['weight.txt', '-o', 'foo.json', '--scale', '200'])
    self.assertEqual(output.weight_file, 'weight.txt')
    self.assertEqual(output.outfile, 'foo.json')
    self.assertEqual(output.scale, 200)
--------------------------------------------------------------------------------
/scripts/tests/test_encode_data.py:
--------------------------------------------------------------------------------
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the data encoder script."""

import os
import sys
import typing
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from budoux import utils  # noqa (module hack)
from scripts import encode_data  # noqa (module hack)


class TestGetFeature(unittest.TestCase):
  """Tests encode_data.get_feature, the n-gram feature extractor."""

  def test_standard(self) -> None:
    feature = encode_data.get_feature('a', 'b', 'c', 'd', 'e', 'f')
    self.assertSetEqual(
        set(feature),
        {
            # Unigram of Words (UW)
            'UW1:a',
            'UW2:b',
            'UW3:c',
            'UW4:d',
            'UW5:e',
            'UW6:f',

            # Bigram of Words (BW)
            'BW1:bc',
            'BW2:cd',
            'BW3:de',

            # Trigram of Words (TW)
            'TW1:abc',
            'TW2:bcd',
            'TW3:cde',
            'TW4:def',
        },
        'Features should be extracted.')

  def test_with_invalid(self) -> None:

    def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool:
      # Returns True when any extracted feature starts with the prefix.
      for item in feature:
        if item.startswith(prefix):
          return True
      return False

    feature = encode_data.get_feature('a', 'a', encode_data.INVALID, 'a', 'a',
                                      'a')
    self.assertFalse(
        find_by_prefix('UW3:', feature),
        'Should omit the Unigram feature when the character is invalid.')
    self.assertFalse(
        find_by_prefix('BW2:', feature),
        'Should omit the Bigram feature that covers an invalid character.')


class TestArgParse(unittest.TestCase):
  """Tests command line argument handling of encode_data."""

  def test_cmdargs_invalid_option(self) -> None:
    cmdargs = ['-v']
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    cmdargs = ['-h']
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_source(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      encode_data.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    cmdargs = ['source.txt']
    output = encode_data.parse_args(cmdargs)
    self.assertEqual(output.source_data, 'source.txt')
    self.assertEqual(output.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertIsNone(output.processes)
    self.assertEqual(output.scale, 1)

  def test_cmdargs_with_outfile(self) -> None:
    cmdargs = ['source.txt', '-o', 'out.txt']
    output = encode_data.parse_args(cmdargs)
    self.assertEqual(output.source_data, 'source.txt')
    self.assertEqual(output.outfile, 'out.txt')
    self.assertIsNone(output.processes)
    self.assertEqual(output.scale, 1)

  def test_cmdargs_with_processes(self) -> None:
    cmdargs = ['source.txt', '--processes', '8']
    output = encode_data.parse_args(cmdargs)
    self.assertEqual(output.source_data, 'source.txt')
    self.assertEqual(output.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertEqual(output.processes, 8)
    self.assertEqual(output.scale, 1)

  def test_cmdargs_with_scale(self) -> None:
    cmdargs = ['source.txt', '--scale', '20']
    output = encode_data.parse_args(cmdargs)
    self.assertEqual(output.source_data, 'source.txt')
    self.assertEqual(output.outfile, encode_data.DEFAULT_OUTPUT_FILENAME)
    self.assertIsNone(output.processes)
    self.assertEqual(output.scale, 20)


class TestProcess(unittest.TestCase):
  """Tests encode_data.process over a sample sentence."""

  sentence = '六本木ヒルズでお昼を食べる。'
  sep_indices = {7, 10, 13}

  def test_on_negative_point_with_scale(self) -> None:
    # Index 8 is not a separator index, hence the negative weight.
    line = encode_data.process(8, self.sentence, self.sep_indices, 16)
    items = line.split('\t')
    weight = items[0]
    features = set(items[1:])
    self.assertEqual(weight, '-16')
    self.assertIn('UW2:で', features)

  def test_on_positive_point_with_scale(self) -> None:
    # Index 7 is a separator index, hence the positive weight.
    line = encode_data.process(7, self.sentence, self.sep_indices, 13)
    items = line.split('\t')
    weight = items[0]
    features = set(items[1:])
    self.assertEqual(weight, '13')
    self.assertIn('UW3:で', features)


class TestNormalizeInput(unittest.TestCase):
  """Tests encode_data.normalize_input separator extraction."""

  def test_standard_input(self) -> None:
    source = f'ABC{utils.SEP}DE{utils.SEP}FGHI'
    sentence, sep_indices = encode_data.normalize_input(source)
    self.assertEqual(sentence, 'ABCDEFGHI')
    self.assertEqual(sep_indices, {3, 5, 9})

  def test_with_linebreaks(self) -> None:
    source = f'AB\nCDE{utils.SEP}FG'
    sentence, sep_indices = encode_data.normalize_input(source)
    self.assertEqual(sentence, 'ABCDEFG')
    self.assertEqual(sep_indices, {2, 5, 7})

  def test_doubled_seps(self) -> None:
    source = f'ABC{utils.SEP}{utils.SEP}DE\n\nFG'
    sentence, sep_indices = encode_data.normalize_input(source)
    self.assertEqual(sentence, 'ABCDEFG')
    self.assertEqual(sep_indices, {3, 5, 7})


if __name__ == '__main__':
  unittest.main()
--------------------------------------------------------------------------------
/scripts/tests/test_finetune.py:
--------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the finetune script."""

import os
import sys
import tempfile
import unittest

from jax import numpy as jnp

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from scripts import finetune  # noqa (module hack)


class TestArgParse(unittest.TestCase):
  """Tests command line argument handling of finetune."""

  def test_cmdargs_invalid_option(self) -> None:
    cmdargs = ['-v']
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_help(self) -> None:
    cmdargs = ['-h']
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, 0)

  def test_cmdargs_no_data(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args([])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_no_base_model(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      finetune.parse_args(['encoded.txt'])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_default(self) -> None:
    cmdargs = ['encoded.txt', 'model.json']
    output = finetune.parse_args(cmdargs)
    self.assertEqual(output.train_data, 'encoded.txt')
    self.assertEqual(output.base_model, 'model.json')
    self.assertEqual(output.iters, finetune.DEFAULT_NUM_ITERS)
    self.assertEqual(output.log_span, finetune.DEFAULT_LOG_SPAN)
    self.assertEqual(output.learning_rate, finetune.DEFAULT_LEARNING_RATE)
    self.assertEqual(output.val_data, None)

  def test_cmdargs_with_values(self) -> None:
    cmdargs = [
        'encoded.txt', 'model.json', '--iters', '50', '--log-span', '10',
        '--learning-rate', '0.1', '--val-data', 'val.txt'
    ]
    output = finetune.parse_args(cmdargs)
    self.assertEqual(output.train_data, 'encoded.txt')
    self.assertEqual(output.base_model, 'model.json')
    self.assertEqual(output.iters, 50)
    self.assertEqual(output.log_span, 10)
    self.assertEqual(output.learning_rate, 0.1)
    self.assertEqual(output.val_data, 'val.txt')


class TestLoadModel(unittest.TestCase):
  """Tests finetune.load_model's feature extraction and normalization."""

  def setUp(self) -> None:
    # NOTE(review): taking `.name` from a discarded NamedTemporaryFile is
    # fragile on some platforms (the file is deleted on close); kept as-is.
    self.model_file_path = tempfile.NamedTemporaryFile().name
    with open(self.model_file_path, 'w') as f:
      f.write('{"UW1": {"a": 12, "b": 23}, "TW3": {"xyz": 47}}')

  def test_extracted_keys(self) -> None:
    result = finetune.load_model(self.model_file_path).features
    self.assertListEqual(result, ['UW1:a', 'UW1:b', 'TW3:xyz'])

  def test_value_variance(self) -> None:
    # Weights are expected to be standardized to unit variance.
    result = finetune.load_model(self.model_file_path).weights.var()
    self.assertAlmostEqual(float(result), 1, places=5)

  def test_value_mean(self) -> None:
    # ... and to zero mean.
    result = finetune.load_model(self.model_file_path).weights.sum()
    self.assertAlmostEqual(float(result), 0, places=5)

  def test_value_order(self) -> None:
    # Normalization should preserve the relative order of the raw scores.
    result = finetune.load_model(self.model_file_path).weights.tolist()
    self.assertGreater(result[1], result[0])
    self.assertGreater(result[2], result[1])


class TestLoadDataset(unittest.TestCase):
  """Tests finetune.load_dataset's encoding of labeled feature lines."""

  def setUp(self) -> None:
    self.entries_file_path = tempfile.NamedTemporaryFile().name
    with open(self.entries_file_path, 'w') as f:
      f.write(('1\tfoo\tbar\n'
               '-1\tfoo\n'
               '1\tfoo\tbar\tbaz\n'
               '1\tbar\tfoo\n'
               '-1\tbaz\tqux\n'))
    self.model = finetune.NormalizedModel(['foo', 'bar'], jnp.array([23, -37]))

  def test_y(self) -> None:
    # Y encodes the sign of the leading weight column.
    result = finetune.load_dataset(self.entries_file_path, self.model)
    expected = [True, False, True, True, False]
    self.assertListEqual(result.Y.tolist(), expected)

  def test_x(self) -> None:
    # X is +1 where a model feature is present on the line, -1 otherwise.
    result = finetune.load_dataset(self.entries_file_path, self.model)
    expected = [[1, 1], [1, -1], [1, 1], [1, 1], [-1, -1]]
    self.assertListEqual(result.X.tolist(), expected)


class TestFit(unittest.TestCase):
  """Smoke-tests finetune.fit's ability to learn."""

  def test_health(self) -> None:
    w = jnp.array([.9, .5, -.3])
    X = jnp.array([[-1, 1, 1], [1, -1, 1], [1, 1, -1]])
    # The current result is x.dot(w) = [-0.7, 0.1, 1.1] => [False, True, True]
    # It tests if the method can learn a new weight that inverses the result.
    Y = jnp.array([True, False, False])
    dataset = finetune.Dataset(X, Y)
    w = finetune.fit(w, dataset, iters=1000, learning_rate=.01, log_span=100)
    self.assertGreater(X.dot(w).tolist()[0], 0)  # x.dot(w) > 0 => True.


class TestWriteWeights(unittest.TestCase):
  """Tests finetune.write_weights's TSV output format."""

  def test_write_weights(self) -> None:
    weights = jnp.array([0.012, 0.238, -0.1237])
    features = ['foo', 'bar', 'baz']
    weights_path = tempfile.NamedTemporaryFile().name
    finetune.write_weights(weights_path, weights, features)
    with open(weights_path) as f:
      result = f.read()
    self.assertEqual(result, 'foo\t0.012000\nbar\t0.238000\nbaz\t-0.123700')
--------------------------------------------------------------------------------
/scripts/tests/test_prepare_knbc.py:
--------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 | """Tests the prepare KNBC script.""" 15 | 16 | import os 17 | import sys 18 | import unittest 19 | 20 | # module hack 21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..') 22 | sys.path.insert(0, os.path.abspath(LIB_PATH)) 23 | 24 | from scripts import prepare_knbc # noqa (module hack) 25 | 26 | 27 | class TestBreakBeforeSequence(unittest.TestCase): 28 | 29 | def test_standard(self) -> None: 30 | chunks = ['abcdef', 'ghi'] 31 | result = prepare_knbc.break_before_sequence(chunks, 'de') 32 | self.assertListEqual(result, ['abc', 'def', 'ghi']) 33 | 34 | def test_sequence_on_top(self) -> None: 35 | chunks = ['abcdef', 'ghi'] 36 | result = prepare_knbc.break_before_sequence(chunks, 'gh') 37 | self.assertListEqual(result, ['abcdef', 'ghi']) 38 | 39 | def test_multiple_hit(self) -> None: 40 | chunks = ['abcabc', 'def'] 41 | result = prepare_knbc.break_before_sequence(chunks, 'bc') 42 | self.assertListEqual(result, ['a', 'bca', 'bc', 'def']) 43 | 44 | 45 | class TestKNBCHTMLParser(unittest.TestCase): 46 | example_html = ''' 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
HAHBHCHDHE
文節区切り
abc
de
タグ区切り
fgh
ijkl
文節区切り
mn
60 | 61 | 62 | ''' 63 | 64 | def test_parse_phrase(self) -> None: 65 | parser = prepare_knbc.KNBCHTMLParser('phrase') 66 | parser.feed(self.example_html) 67 | self.assertListEqual(parser.chunks, ['abcdefghijkl', 'mn']) 68 | 69 | def test_parse_tag(self) -> None: 70 | parser = prepare_knbc.KNBCHTMLParser('tag') 71 | parser.feed(self.example_html) 72 | self.assertListEqual(parser.chunks, ['abcde', 'fghijkl', 'mn']) 73 | 74 | def test_parse_word(self) -> None: 75 | parser = prepare_knbc.KNBCHTMLParser('word') 76 | parser.feed(self.example_html) 77 | self.assertListEqual(parser.chunks, ['abc', 'de', 'fgh', 'ijkl', 'mn']) 78 | -------------------------------------------------------------------------------- /scripts/tests/test_translate_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
"""Tests the model translator script."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from scripts import translate_model  # noqa (module hack)


class TestNormalize(unittest.TestCase):
  """Tests translate_model.normalize over old, new, and broken formats."""

  def test_old_format_input(self) -> None:
    model = {'a:x': 48, 'a:y': 21, 'b:x': 2, 'b:z': 89}
    expect = {'a': {'x': 48, 'y': 21}, 'b': {'x': 2, 'z': 89}}
    result = translate_model.normalize(model)
    self.assertDictEqual(result, expect)

  def test_new_format_input(self) -> None:
    # An already-nested model should pass through unchanged.
    model = {'a': {'x': 48, 'y': 21}, 'b': {'x': 2, 'z': 89}}
    result = translate_model.normalize(model)
    self.assertDictEqual(result, model)

  def test_broken_input1(self) -> None:
    # A mixture of the old flat format and the new nested format.
    model = {'a:x': 23, 'b': {'x': 37, 'y': 18}}
    with self.assertRaises(Exception) as cm:
      translate_model.normalize(model)
    self.assertTrue('Unsupported model format' in str(cm.exception))

  def test_broken_input2(self) -> None:
    # Nesting deeper than the supported two levels.
    model = {'b': {'x': 37, 'y': {'z': 123}}}
    with self.assertRaises(Exception) as cm:
      translate_model.normalize(model)
    self.assertTrue('Unsupported model format' in str(cm.exception))


class TestTranslateICU(unittest.TestCase):
  """Tests translate_model.translate_icu output formatting."""

  def test_standard(self) -> None:
    model = {}
    model['b'] = {'x': 47, 'z': 13}
    model['a'] = {'x': 12, 'y': 88}
    # Groups must be emitted in sorted order regardless of insertion order.
    expect = '''
jaml {
  aKeys {
    "x",
    "y",
  }
  aValues:intvector {
    12,
    88,
  }
  bKeys {
    "x",
    "z",
  }
  bValues:intvector {
    47,
    13,
  }
}
'''.strip()
    result = translate_model.translate_icu(model)
    self.assertEqual(result, expect)
--------------------------------------------------------------------------------
/scripts/translate_model.py:
# ----- /scripts/translate_model.py -----
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Translates a model JSON file to another format, such as ICU Resource Bundle.

Example usage:

$ python translate_model.py --format=icu model.json > icurb.txt

You can also use this script to update the model files older than v0.5.0 to make
it work with the latest version.

$ python translate_model.py --format=json old-model.json > new-model.json
"""

import argparse
import itertools
import json
import typing

ArgList = typing.Optional[typing.List[str]]


def translate_icu(model: typing.Dict[str, typing.Dict[str, int]]) -> str:
  """Translates a model to the ICU Resource Bundle format.

  The output is intended to update the data in:
  https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/adaboost/jaml.txt

  Args:
    model: A model.
  Returns:
    A model string formatted in the ICU Resource Bundle format.
  """
  indent = '  '
  output = 'jaml {\n'
  # Emit groups in sorted order so the output is deterministic.
  for group_name, members in sorted(model.items()):
    output += f'{indent}{group_name}Keys {{\n'
    for key in members.keys():
      output += f'{indent}{indent}"{key}",\n'
    output += f'{indent}}}\n'
    output += f'{indent}{group_name}Values:intvector {{\n'
    for val in members.values():
      output += f'{indent}{indent}{val},\n'
    output += f'{indent}}}\n'
  output += '}'
  return output


def normalize(
    model: typing.Dict[str,
                       typing.Any]) -> typing.Dict[str, typing.Dict[str, int]]:
  """Updates a model to the latest format. Does nothing if it's updated already.

  Args:
    model: A model.
  Returns:
    An updated model.
  Raises:
    Exception: If the model is neither in the old flat format nor in the new
      nested format with integer scores.
  """
  is_old_format = all(isinstance(v, int) for v in model.values())
  if is_old_format:
    # Old flat format {'GROUP:key': score}: regroup entries by the feature
    # group name, i.e. the part before the first colon.
    output = {}
    sorted_items = sorted(model.items(), key=lambda x: x[0])
    groups = itertools.groupby(sorted_items, key=lambda x: x[0].split(':')[0])
    for group_key, group_items in groups:
      output[group_key] = dict(
          (item[0].split(':')[-1], item[1]) for item in group_items)
    return output
  # Fix: validate explicitly instead of `assert` so the check also runs
  # under `python -O` (asserts are stripped by the optimizer).
  try:
    scores_are_int = all(
        isinstance(v, int)
        for groups in model.values()
        for v in groups.values())
  except AttributeError as e:
    # A mix of flat and nested entries: some value has no `.values()`.
    raise Exception('Unsupported model format:', e)
  if not scores_are_int:
    raise Exception('Unsupported model format:', 'Scores should be integers')
  return model


def main() -> None:
  """Entry point: reads a model file and prints it in the target format."""
  DEFAULT_FORMAT = 'json'
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument(
      'model', help='File path for the JSON format model file.', type=str)
  parser.add_argument(
      '--format',
      help=f'Target format (default: {DEFAULT_FORMAT})',
      type=str,
      default=DEFAULT_FORMAT,
      choices={DEFAULT_FORMAT, 'icu'})
  args = parser.parse_args()
  model_path: str = args.model
  format: str = args.format
  # Load the model, normalize it to the latest schema, and emit it.
  with open(model_path) as f:
    model = json.load(f)
  model = normalize(model)
  if format == 'json':
    # Compact JSON with non-ASCII characters preserved.
    print(json.dumps(model, ensure_ascii=False, separators=(',', ':')))
  elif format == 'icu':
    print(translate_icu(model))
  else:
    pass  # Unreachable: argparse restricts --format to {'json', 'icu'}.


if __name__ == '__main__':
  main()
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = budoux
version = attr: budoux.__init__.__version__
description = BudouX is the successor of Budou
long_description = file: README.md
long_description_content_type = text/markdown
license = Apache-2.0
author = Shuhei Iitsuka
author_email = tushuhei@google.com
classifiers =
    Development Status :: 3 - Alpha
    Operating System :: OS Independent
    License :: OSI Approved :: Apache Software License
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11
    Programming Language :: Python :: 3.12
    Programming Language :: Python :: 3.13

[options]
python_requires= >= 3.9
packages = find:
include_package_data = True
test_suite = tests
install_requires =
    importlib-resources

[options.extras_require]
dev =
    build
    flake8
    isort
    mypy==1.15.0
    pytest
    regex
    toml
    twine
    types-regex
    types-setuptools
    yapf

jaxcpu =
    jax==0.5.2

[options.entry_points]
console_scripts =
    budoux = budoux.main:main

[yapf]
based_on_style = yapf

[flake8]
# E124: closing bracket does not match visual indentation
# E126: over-indentation
# E501: line too long
# BLK100: black formattable
ignore = E124,E126,E501,BLK100
indent-size = 2

[mypy]
python_version = 
3.10 62 | pretty = True 63 | strict = True 64 | allow_untyped_calls = True 65 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from setuptools import setup 16 | 17 | setup() 18 | -------------------------------------------------------------------------------- /tests/in/1.in: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/in/2.in: -------------------------------------------------------------------------------- 1 | これはテストです。 2 | -------------------------------------------------------------------------------- /tests/in/3.in: -------------------------------------------------------------------------------- 1 | これはテストです。 2 | -------------------------------------------------------------------------------- /tests/quality/ja.tsv: -------------------------------------------------------------------------------- 1 | # label sentence 2 | init 今日は▁とても▁良い▁天気です。 3 | init これ以上▁利用する▁場合は▁教えてください。 4 | init 食器は▁そのまま▁入れて▁大丈夫です。 5 | gh152 ダウンロード▁ありがとう▁ございます。 6 | gh152 ご利用▁ありがとう▁ございました。 7 | gh157 要点を▁まとめる▁必要が▁ある。 8 | gh160 目指すのは▁あらゆる▁人に▁便利な▁ソフトウェア 9 | gh160 商品が▁まもなく▁到着します。 10 | gh160 プロジェクトが▁ようやく▁日の▁目を▁見る。 11 | 
gh160 明け方に▁ようやく▁目覚めると、 12 | gh160 明け方▁ようやく▁目覚めると、 13 | gh160 これは▁たまたま▁見つけた▁宝物 14 | gh160 歩いていて▁たまたま▁目に▁入った▁光景 15 | gh216 あなたの▁意図した▁とおりに▁情報を▁伝える。 16 | gh220 あの▁イーハトーヴォの▁すきとおった▁風、▁夏でも▁底に▁冷たさを▁もつ▁青い▁そら、▁うつくしい▁森で▁飾られた▁モリーオ市、▁郊外の▁ぎらぎら▁ひかる▁草の▁波。 17 | gh387 購入された▁お客様のみ▁入れます。 18 | gh387 購入された▁お客様のみ▁入場できます。 19 | gh387 パワーのみ▁有効だ 20 | b320113958 小さな▁つぶや▁空気中の▁ちり 21 | b320113958 光が▁どんどん▁空▁いっぱいに▁広がる 22 | b320113958 太陽の▁位置が▁ちがうから 23 | b320113958 太陽が▁しずむころに▁帰る 24 | b320113958 多すぎると▁うまく▁いかない 25 | b320113958 世界の▁子どもの▁命や▁権利 26 | b320113958 「ふだん▁どおり」を▁保つ 27 | b320113958 おもちゃや▁遊びに▁使える 28 | b320113958 コントロールできない▁ほど▁感情移入してしまう 29 | b320113958 いつも▁甘えがちに▁なる 30 | b320113958 存在が▁浮かび▁上がった。 31 | -------------------------------------------------------------------------------- /tests/test_html_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | "Tests the HTML Processor." 15 | 16 | import os 17 | import sys 18 | import unittest 19 | 20 | # module hack 21 | LIB_PATH = os.path.join(os.path.dirname(__file__), '..') 22 | sys.path.insert(0, os.path.abspath(LIB_PATH)) 23 | 24 | from budoux import html_processor # noqa (module hack) 25 | 26 | 27 | class TestTextContentExtractor(unittest.TestCase): 28 | 29 | def test_output(self) -> None: 30 | input = '

Hello, World

' 31 | expected = 'Hello, World' 32 | extractor = html_processor.TextContentExtractor() 33 | extractor.feed(input) 34 | self.assertEqual( 35 | extractor.output, expected, 36 | 'Text content should be extacted from the given HTML string.') 37 | 38 | 39 | class TestHTMLChunkResolver(unittest.TestCase): 40 | 41 | def test_output(self) -> None: 42 | input = '

abcdef

' 43 | expected = '

abcdef

' 44 | resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '') 45 | resolver.feed(input) 46 | self.assertEqual(resolver.output, expected, 47 | 'WBR tags should be inserted as specified by chunks.') 48 | 49 | def test_unpaired(self) -> None: 50 | input = '

abcdef

' 51 | expected = '

abcdef

' 52 | resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '') 53 | resolver.feed(input) 54 | self.assertEqual(resolver.output, expected, 55 | 'Unpaired close tag should not cause errors.') 56 | 57 | def test_nobr(self) -> None: 58 | input = '

abcdef

' 59 | expected = '

abcdef

' 60 | resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '') 61 | resolver.feed(input) 62 | self.assertEqual(resolver.output, expected, 63 | 'WBR tags should not be inserted if in NOBR.') 64 | 65 | def test_after_nobr(self) -> None: 66 | input = '

abxyabcdef

' 67 | expected = '

abxyabcdef

' 68 | resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '') 69 | resolver.feed(input) 70 | self.assertEqual(resolver.output, expected, 71 | 'WBR tags should be inserted if after NOBR.') 72 | 73 | def test_img_in_nobr(self) -> None: 74 | input = '

abxyabcdef

' 75 | expected = '

abxyabcdef

' 76 | resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '') 77 | resolver.feed(input) 78 | self.assertEqual(resolver.output, expected, 79 | 'IMG should not affect surrounding NOBR.') 80 | 81 | 82 | class TestResolve(unittest.TestCase): 83 | 84 | def test_with_simple_text_input(self) -> None: 85 | chunks = ['abc', 'def'] 86 | html = 'abcdef' 87 | result = html_processor.resolve(chunks, html) 88 | expected = 'abc\u200bdef' 89 | self.assertEqual(result, expected) 90 | 91 | def test_with_standard_html_input(self) -> None: 92 | chunks = ['abc', 'def'] 93 | html = 'abcdef' 94 | result = html_processor.resolve(chunks, html) 95 | expected = 'abc\u200bdef' 96 | self.assertEqual(result, expected) 97 | 98 | def test_with_nodes_to_skip(self) -> None: 99 | chunks = ['abc', 'def', 'ghi'] 100 | html = "afghi" 101 | result = html_processor.resolve(chunks, html) 102 | expected = 'af\u200bghi' 103 | self.assertEqual(result, expected) 104 | 105 | def test_with_break_before_skip(self) -> None: 106 | chunks = ['abc', 'def', 'ghi', 'jkl'] 107 | html = "abcjkl" 108 | result = html_processor.resolve(chunks, html) 109 | expected = 'abc\u200b\u200bjkl' 110 | self.assertEqual(result, expected) 111 | 112 | def test_with_nothing_to_split(self) -> None: 113 | chunks = ['abcdef'] 114 | html = 'abcdef' 115 | result = html_processor.resolve(chunks, html) 116 | expected = 'abcdef' 117 | self.assertEqual(result, expected) 118 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the BudouX CLI."""

import io
import sys
import unittest
from os.path import abspath, dirname, join

# module hack
LIB_PATH = join(dirname(__file__), '..')
sys.path.insert(0, abspath(LIB_PATH))

from budoux import main  # noqa (module hack)

# Pin the standard streams to UTF-8 so the Japanese fixtures round-trip
# identically regardless of the platform locale.
if isinstance(sys.stdin, io.TextIOWrapper):
  sys.stdin.reconfigure(encoding='utf-8')

if isinstance(sys.stdout, io.TextIOWrapper):
  sys.stdout.reconfigure(encoding='utf-8')


class TestCommonOption(unittest.TestCase):
  """Tests options that apply to every CLI invocation (-v, -h, -V)."""

  def _assert_parse_exits(self, cmdargs: list, expected_code: int) -> None:
    # argparse reports errors, help, and version through SystemExit.
    with self.assertRaises(SystemExit) as cm:
      main.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, expected_code)

  def test_cmdargs_invalid_option(self) -> None:
    self._assert_parse_exits(['-v'], 2)

  def test_cmdargs_help(self) -> None:
    self._assert_parse_exits(['-h'], 0)

  def test_cmdargs_version(self) -> None:
    self._assert_parse_exits(['-V'], 0)


class TestModelOption(unittest.TestCase):
  """Tests the model file (-m) and language (-l) selection options."""

  def _assert_parse_exits(self, cmdargs: list, expected_code: int) -> None:
    # Invalid model/language choices are argparse errors (exit code 2).
    with self.assertRaises(SystemExit) as cm:
      main.parse_args(cmdargs)
    self.assertEqual(cm.exception.code, expected_code)

  def test_cmdargs_invalid_json(self) -> None:
    self._assert_parse_exits(['-m', '404.json'], 2)

  def test_cmdargs_invalid_lang_1(self) -> None:
    self._assert_parse_exits(['-l', 'aa'], 2)

  def test_cmdargs_invalid_lang_2(self) -> None:
    self._assert_parse_exits(['-l', 'ja-abc'], 2)

  def test_cmdargs_lang_ja(self) -> None:
    self.assertEqual(
        main._main(['-l', 'ja', '今日は良い天気ですね。']), '今日は\n良い\n天気ですね。')

  def test_cmdargs_lang_zh_hans(self) -> None:
    self.assertEqual(main._main(['-l', 'zh-hans', '今天天气晴朗。']), '今天\n天气\n晴朗。')


class TestTextArguments(unittest.TestCase):
  """Tests plain-text and HTML positional arguments."""

  def test_cmdargs_single_text(self) -> None:
    self.assertEqual(main._main(['これはテストです。']), "これは\nテストです。")

  def test_cmdargs_single_multiline_text(self) -> None:
    # Sentences are separated by '---' by default.
    self.assertEqual(
        main._main(["これはテストです。\n今日は晴天です。"]),
        "これは\nテストです。\n---\n今日は\n晴天です。")

  def test_cmdargs_single_multiline_text_with_delimiter(self) -> None:
    self.assertEqual(
        main._main(["これはテストです。\n今日は晴天です。", "-d", "@"]),
        "これは\nテストです。\n@\n今日は\n晴天です。")

  def test_cmdargs_single_multiline_text_with_empty_delimiter(self) -> None:
    self.assertEqual(
        main._main(["これはテストです。\n今日は晴天です。", "-d", ""]),
        "これは\nテストです。\n\n今日は\n晴天です。")

  def test_cmdargs_multi_text(self) -> None:
    # More than one positional text argument is rejected.
    with self.assertRaises(SystemExit) as cm:
      main.main(['これはテストです。', '今日は晴天です。'])
    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_single_html(self) -> None:
    # NOTE(review): the leading '' looks like markup lost in this copy of the
    # file -- confirm the expected wrapper element against upstream.
    self.assertEqual(
        main._main(['-H', '今日はとても天気です。']), '' '今日は\u200bとても\u200b天気です。')

  def test_cmdargs_multi_html(self) -> None:
    with self.assertRaises(SystemExit) as cm:
      main._main(['-H', '今日はとても天気です。', 'これはテストです。'])
    self.assertEqual(cm.exception.code, 2)


class TestStdin(unittest.TestCase):
  """Tests reading the input text from standard input."""

  def _run_with_stdin(self, fixture: str, cmdargs: list) -> str:
    # Rebind sys.stdin to a fixture file, then run the CLI entry point.
    path = join(abspath(dirname(__file__)), fixture)
    with open(path, "r", encoding=sys.getdefaultencoding()) as f:
      sys.stdin = f
      return main._main(cmdargs)

  def test_cmdargs_blank_stdin(self) -> None:
    self.assertEqual(self._run_with_stdin("in/1.in", []), "")

  def test_cmdargs_text_stdin(self) -> None:
    self.assertEqual(self._run_with_stdin("in/2.in", []), "これは\nテストです。")

  def test_cmdargs_html_stdin(self) -> None:
    # NOTE(review): expected markup appears truncated in this copy -- verify
    # against the upstream test suite.
    self.assertEqual(
        self._run_with_stdin("in/3.in", ["-H"]),
        '' 'これは\u200bテストです。\u200b\n' '')


if __name__ == '__main__':
  unittest.main()

# ------------------------------------------------------------------------------
# /tests/test_parser.py:
# ------------------------------------------------------------------------------
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the BudouX parser."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from budoux import parser  # noqa (module hack)


class TestParser(unittest.TestCase):
  """Tests parse() and translate_html_string() on tiny synthetic models."""

  TEST_SENTENCE = 'abcdeabcd'

  def test_parse(self) -> None:
    # A large positive UW4 score means "break right before this character".
    strong_a = parser.Parser({'UW4': {'a': 10000}})
    self.assertListEqual(
        strong_a.parse(TestParser.TEST_SENTENCE), ['abcde', 'abcd'],
        'Should separate if a strong feature item supports.')

    strong_b = parser.Parser({'UW4': {'b': 10000}})
    self.assertListEqual(
        strong_b.parse(TestParser.TEST_SENTENCE), ['a', 'bcdea', 'bcd'],
        'Should separate even if it makes the first character a sole phrase.')

    empty_model = parser.Parser({})
    self.assertListEqual(
        empty_model.parse(''), [],
        'Should return a blank list when the input is blank.')

  def test_translate_html_string(self) -> None:
    p = parser.Parser({'UW4': {'a': 10000}})  # break right before 'a'.

    # NOTE(review): the markup in these fixtures looks stripped in this copy
    # of the file -- confirm the tags against the upstream test suite.
    # TODO: Because the content for skip elements are included, the second
    # case tries to break before "alert". We may want to distinguish "skip
    # from the content" and "skip breaking" in future.
    cases = [
        ('xyzabcd', 'xyz\u200babcd',
         'Should output a html string with a SPAN parent with proper style attributes.'
        ),
        ('xyzxyzabc', 'xyz\u200bxyz\u200babc',
         'Should pass script tags as is.'),
        ('xyzabcabc', 'xyz\u200babc\u200babc',
         'Should skip some specific tags.'),
        ('xyzaabc', 'xyz\u200ba\u200babc',
         'Should not ruin attributes of child elements.'),
        ('xyza🇯🇵🇵🇹abc', 'xyz\u200ba🇯🇵🇵🇹\u200babc',
         'Should work with emojis.'),
    ]
    for input_html, expected_html, message in cases:
      self.assertEqual(p.translate_html_string(input_html), expected_html,
                       message)


class TestDefaultParser(unittest.TestCase):
  """Tests the parsers built from the bundled default models."""

  def test_load_default_japanese_parser(self) -> None:
    p_ja = parser.load_default_japanese_parser()
    phrases = p_ja.parse('Google の使命は、世界中の情報を整理し、世界中の人がアクセスできて使えるようにすることです。')
    self.assertListEqual(phrases, [
        'Google の', '使命は、', '世界中の', '情報を', '整理し、', '世界中の', '人が',
        'アクセスできて', '使えるように', 'する', 'ことです。'
    ])

  def test_load_default_simplified_chinese_parser(self) -> None:
    p_hans = parser.load_default_simplified_chinese_parser()
    phrases = p_hans.parse('我们的使命是整合全球信息,供大众使用,让人人受益。')
    self.assertListEqual(phrases, [
        '我们', '的', '使命', '是', '整合', '全球', '信息,', '供', '大众', '使用,',
        '让', '人', '人', '受益。'
    ])

  def test_load_default_traditional_chinese_parser(self) -> None:
    p_hant = parser.load_default_traditional_chinese_parser()
    phrases = p_hant.parse('我們的使命是匯整全球資訊,供大眾使用,使人人受惠。')
    self.assertListEqual(phrases, [
        '我們', '的', '使命', '是', '匯整', '全球', '資訊,', '供', '大眾', '使用,',
        '使', '人', '人', '受惠。'
    ])


if __name__ == '__main__':
  unittest.main()

# ------------------------------------------------------------------------------
# /tests/test_quality.py:
# ------------------------------------------------------------------------------
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Quality regression test."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from budoux import load_default_japanese_parser, utils  # noqa (module hack)


class TestQuality(unittest.TestCase):
  """Checks that known-good segmentations do not regress."""

  def test_ja(self) -> None:
    # Each non-comment line of ja.tsv is "<label>\t<sentence>", where the
    # sentence carries utils.SEP at every expected phrase boundary.
    segmenter = load_default_japanese_parser()
    tsv_path = os.path.join(os.path.dirname(__file__), 'quality', 'ja.tsv')
    failures = []
    with open(tsv_path, 'r', encoding='utf-8') as f:
      for raw in f:
        if raw.startswith('#'):
          continue
        fields = raw.split('\t')
        if len(fields) < 2:
          continue
        expected = fields[1].strip()
        # Re-segment the plain sentence and compare with the annotation.
        plain = expected.replace(utils.SEP, '')
        actual = utils.SEP.join(segmenter.parse(plain))
        if actual != expected:
          failures.append((expected, actual))
    self.assertEqual(
        len(failures), 0, 'Failing sentences:\n' + '\n'.join(
            f'expected:{want}\tactual:{got}' for want, got in failures))

# ------------------------------------------------------------------------------