├── tests ├── data │ └── dict_with_empty_line.txt └── test_tokenizer.rs ├── nlpo3-python ├── MANIFEST.in ├── tests │ ├── data │ │ └── test_dict.txt │ ├── __init__.py │ └── test_tokenize.py ├── .cargo │ └── config.toml ├── setup.py ├── build_wheels_local_manylinux.sh ├── build_wheels_local_macos.sh ├── Cargo.toml ├── CITATION.cff ├── pyproject.toml ├── nlpo3 │ └── __init__.py ├── setup.cfg ├── src │ └── lib.rs ├── README.md ├── LICENSE ├── Cargo.lock └── notebooks │ └── nlpo3_segment_benchmarks.ipynb ├── src ├── lib.rs ├── four_bytes_str.rs ├── tokenizer │ ├── tcc.rs │ ├── tokenizer_trait.rs │ ├── dict_reader.rs │ ├── tcc │ │ ├── tcc_tokenizer.rs │ │ └── tcc_rules.rs │ └── trie_char.rs ├── tokenizer.rs └── NOTE_ON_STRING.md ├── .cargo └── config.toml ├── nlpo3-nodejs ├── nlpo3 │ ├── rust_mod.d.ts │ └── index.ts ├── Cargo.toml ├── package.json ├── package-lock.json ├── .gitignore ├── README.md ├── src │ └── lib.rs ├── tsconfig.json ├── LICENSE └── Cargo.lock ├── nlpo3-cli ├── Cargo.toml ├── README.md ├── src │ └── main.rs ├── LICENSE └── Cargo.lock ├── .github └── workflows │ ├── test-main-lib.yml │ ├── test-nlpo3-cli.yml │ ├── test-nlpo3-python.yml │ ├── codeql-analysis.yml │ └── build-python-wheels.yml ├── CITATION.cff ├── Cargo.toml ├── .gitignore ├── README.md └── LICENSE /tests/data/dict_with_empty_line.txt: -------------------------------------------------------------------------------- 1 | ปฏิวัติ 2 | ปฏิรูป 3 | ปฏิเสธ 4 | 5 | ปฏิบัติ 6 | ปฏิรูป 7 | -------------------------------------------------------------------------------- /nlpo3-python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.md setup.py pyproject.toml Cargo.toml 2 | recursive-include src * 3 | -------------------------------------------------------------------------------- /nlpo3-python/tests/data/test_dict.txt: -------------------------------------------------------------------------------- 1 | ค่า 2 | ค่าจ้าง 3 | 
ค่าจ้างเพื่อชีวิต 4 | ค่าแรง 5 | ค่ายทหาร 6 | คน 7 | ไข่ 8 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | mod four_bytes_str; 5 | pub mod tokenizer; 6 | -------------------------------------------------------------------------------- /src/four_bytes_str.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | pub mod custom_regex; 5 | pub mod custom_string; 6 | -------------------------------------------------------------------------------- /src/tokenizer/tcc.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | pub(crate) mod tcc_rules; 5 | pub(crate) mod tcc_tokenizer; 6 | -------------------------------------------------------------------------------- /src/tokenizer.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | mod dict_reader; 5 | pub mod newmm; 6 | pub(crate) mod tcc; 7 | pub mod tokenizer_trait; 8 | mod trie_char; 9 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-apple-darwin] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | ] 6 | 7 | [target.aarch64-apple-darwin] 8 | rustflags = [ 9 | "-C", "link-arg=-undefined", 10 | "-C", "link-arg=dynamic_lookup", 11 | ] 12 | 
-------------------------------------------------------------------------------- /nlpo3-python/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-apple-darwin] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | ] 6 | 7 | [target.aarch64-apple-darwin] 8 | rustflags = [ 9 | "-C", "link-arg=-undefined", 10 | "-C", "link-arg=dynamic_lookup", 11 | ] 12 | -------------------------------------------------------------------------------- /nlpo3-nodejs/nlpo3/rust_mod.d.ts: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | export function segment(text: string, dict_name: string, safe: boolean, parallel: boolean): string[]; 5 | /** file_path is an absolute path */ 6 | export function loadDict(file_path: string, dict_name: string): string; 7 | -------------------------------------------------------------------------------- /src/tokenizer/tokenizer_trait.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | use anyhow::Result as AnyResult; 5 | 6 | pub trait Tokenizer { 7 | fn segment(&self, text: &str, safe: bool, parallel: bool) -> AnyResult>; 8 | 9 | fn segment_to_string(&self, text: &str, safe: bool, parallel: bool) -> Vec; 10 | } 11 | -------------------------------------------------------------------------------- /nlpo3-python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | """ 5 | Unit test 6 | """ 7 | 8 | import sys 9 | import unittest 10 | 11 | sys.path.append("../nlpo3") 12 | 13 | loader = unittest.TestLoader() 14 | testSuite = 
#!/bin/bash

# The main wheel-building workflow runs on GitHub Actions;
# see .github/workflows/build-python-wheels.yml at the root of the main repo.
# This script is meant to be run locally, for testing purposes only.

# This script has to run inside a manylinux Docker image:
# docker run --rm -v `pwd`:/io quay.io/pypa/manylinux2014_x86_64 bash /io/build_wheels_local_manylinux.sh

set -ex

# Install a stable Rust toolchain inside the container.
curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y
export PATH="$HOME/.cargo/bin:$PATH"

cd /io

# Build one wheel per CPython version declared in pyproject.toml
# (requires-python >= 3.7, classifiers up to 3.13).
# NOTE(review): "cp313-*" also matches a free-threaded build (cp313t) if the
# image ships one — confirm that is wanted, or narrow the pattern.
for PYBIN in /opt/python/cp{37,38,39,310,311,312,313}-*/bin; do
    # A pattern without a match is left as a literal string; skip it instead
    # of letting `set -e` abort the whole run on a missing interpreter.
    [ -x "${PYBIN}/python" ] || continue
    "${PYBIN}/pip" install -U build setuptools setuptools-rust wheel
    "${PYBIN}/python" -m build --wheel
done

# Vendor external shared libraries into the wheels and retag them.
for whl in dist/*linux*.whl; do
    auditwheel repair "$whl" -w dist/
done
/**
 * Read the dictionary file at `filePath` and register it under `dictName`,
 * so that {@link segment} can refer to it later.
 *
 * `filePath` must be an absolute path to the dictionary file.
 */
export const loadDict = (filePath: string, dictName: string): string =>
    nativeModule.loadDict(filePath, dictName)

/**
 * Split `text` into words using the dictionary registered as `dictName`.
 *
 * The dictionary must have been loaded with {@link loadDict} beforehand.
 * `safe` and `parallel` are forwarded unchanged to the native module and
 * both default to `false`.
 */
export const segment = (
    text: string,
    dictName: string,
    safe = false,
    parallel = false,
): string[] => nativeModule.segment(text, dictName, safe, parallel)
#!/bin/bash

# The main wheel-building workflow runs on GitHub Actions;
# see .github/workflows/build-python-wheels.yml at the root of the main repo.
# This script is meant to be run locally, for testing purposes only.

# Use pyenv to build on different Python versions.

# If "error: implicit declaration of function 'sendfile' is invalid in C99"
# occurs when installing a Python version, see fix at:
# https://github.com/pyenv/pyenv/issues/1740#issuecomment-738749988

set -ex

# Remember the pyenv global version(s) in effect before this script runs.
SAVE_PYVER=$(pyenv global)

# Restore them on every exit path: with `set -e`, a failed build would
# otherwise leave the global Python switched to the version being built.
restore_pyenv() {
    # Intentionally unquoted: `pyenv global` may report several versions.
    # shellcheck disable=SC2086
    pyenv global ${SAVE_PYVER}
}
trap restore_pyenv EXIT

# Iterate over the installed versions with a glob rather than parsing `ls`.
for PYVER_DIR in ~/.pyenv/versions/*; do
    # No installed versions: the pattern stays literal, so skip it.
    [ -e "${PYVER_DIR}" ] || continue
    PYVER=$(basename "${PYVER_DIR}")
    # Reduce e.g. "3.11.4" to "3.11" for the log line. -E is the
    # extended-regex flag understood by both BSD (macOS) and GNU sed.
    PYVER_MINOR=$(echo "${PYVER}" | sed -nE -e 's/^(pypy)?(([0-9]+\.)?[0-9]+).*/\1\2/p')
    echo "Build for Python ${PYVER_MINOR}"
    pyenv global "${PYVER}"
    "$(pyenv which pip)" install -U pip
    "$(pyenv which pip)" install -U build setuptools setuptools-rust wheel
    "$(pyenv which python)" -m build --wheel
done
20 | keywords: 21 | - "tokenizer" 22 | - "tokenization" 23 | - "Thai" 24 | - "natural language processing" 25 | - "NLP" 26 | - "Rust" 27 | - "Node.js" 28 | - "Node" 29 | - "Python" 30 | - "text processing" 31 | - "word segmentation" 32 | - "Thai language" 33 | - "Thai NLP" 34 | license: Apache-2.0 35 | version: v1.4.0 36 | date-released: "2024-11-09" 37 | -------------------------------------------------------------------------------- /nlpo3-python/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nlpo3-python" 3 | version = "1.3.2-dev" 4 | edition = "2018" 5 | license = "Apache-2.0" 6 | authors = [ 7 | "Thanathip Suntorntip Gorlph", 8 | "Arthit Suriyawongkul", 9 | "Wannaphong Phatthiyaphaibun ", 10 | ] 11 | description = "Python binding for nlpO3 Thai language processing library" 12 | categories = ["text-processing"] 13 | keywords = ["thai", "tokenizer", "nlp", "word-segmentation", "python"] 14 | homepage = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-python" 15 | repository = "https://github.com/PyThaiNLP/nlpo3/" 16 | documentation = "https://github.com/PyThaiNLP/nlpo3/blob/main/nlpo3-python/README.md" 17 | readme = "README.md" 18 | exclude = [ 19 | ".gitignore", 20 | ".github/", 21 | "build/", 22 | "dist/", 23 | "notebooks/", 24 | "target", 25 | "tests/", 26 | "*.sh", 27 | ] 28 | 29 | [lib] 30 | name = "_nlpo3_python_backend" 31 | path = "src/lib.rs" 32 | crate-type = ["cdylib", "rlib"] 33 | 34 | [dependencies] 35 | ahash = "0.8.6" 36 | lazy_static = "1.5.0" 37 | nlpo3 = "1.4.0" 38 | 39 | [dependencies.pyo3] 40 | version = "0.22.6" 41 | features = ["extension-module"] 42 | -------------------------------------------------------------------------------- /src/NOTE_ON_STRING.md: -------------------------------------------------------------------------------- 1 | --- 2 | SPDX-FileCopyrightText: 2024 PyThaiNLP Project 3 | SPDX-License-Identifier: Apache-2.0 4 | --- 5 | 6 | # Why Use Handroll 
Bytes Slice As "CustomString" Instead of Rust String?

Rust `String` (and `&str`) is a slice of valid UTF-8 bytes, and UTF-8 is a
variable-length encoding. There is therefore no way to access the "character"
at an arbitrary index in O(1) time.

This means any algorithm whose operations are based on "character" index
positions will be horribly slow on a Rust `String`.

Hence `four_bytes_str`, which transforms a slice of valid UTF-8 bytes into a
slice in which every character occupies exactly 4 bytes, left-padded with 0.

Consequently, regular expressions must also be padded with `\x00` so that
each Unicode character takes 4 bytes.

Thai characters are 3 bytes long in UTF-8, so every Thai character in a regex
is padded with one `\x00`.

A "space" in a regex is padded with `\x00\x00\x00`.
6 | type: software 7 | authors: 8 | - family-names: Suntorntip 9 | given-names: Thanathip 10 | - family-names: Suriyawongkul 11 | given-names: Arthit 12 | orcid: "https://orcid.org/0000-0002-9698-1899" 13 | - family-names: Phatthiyaphaibun 14 | given-names: Wannaphong 15 | orcid: "https://orcid.org/0000-0002-4153-4354" 16 | repository-code: "https://github.com/PyThaiNLP/nlpo3/" 17 | repository: "https://github.com/PyThaiNLP/nlpo3/" 18 | url: "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-python" 19 | abstract: "Python binding for nlpO3, a Thai natural language processing library in Rust." 20 | keywords: 21 | - "tokenizer" 22 | - "tokenization" 23 | - "Thai" 24 | - "natural language processing" 25 | - "NLP" 26 | - "Python" 27 | - "Python binding" 28 | - "Rust" 29 | - "text processing" 30 | - "word segmentation" 31 | - "Thai language" 32 | - "Thai NLP" 33 | license: Apache-2.0 34 | identifiers: 35 | - description: This is the collection of archived snapshots of all versions of the software. 
  "contributors": [
    {
      "name": "Arthit Suriyawongkul",
      "url": "https://github.com/bact"
    }
  ],
processing library, with Python and Node bindings" 8 | categories = ["text-processing"] 9 | keywords = ["thai", "tokenizer", "nlp", "word-segmentation"] 10 | homepage = "https://github.com/PyThaiNLP/nlpo3/" 11 | repository = "https://github.com/PyThaiNLP/nlpo3/" 12 | documentation = "https://github.com/PyThaiNLP/nlpo3/blob/main/README.md" 13 | readme = "README.md" 14 | exclude = [ 15 | ".gitignore", 16 | ".github/*", 17 | "build_tools/*", 18 | "tests/*", 19 | "nlpo3-cli/*", 20 | "nlpo3-nodejs/*", 21 | "nlpo3-python/*", 22 | "words_th.txt", 23 | ] 24 | 25 | [profile.release] 26 | lto = true 27 | codegen-units = 1 28 | 29 | [lib] 30 | path = "src/lib.rs" 31 | # "cdylib" is necessary to produce a shared library for Python to import from. 32 | # Downstream Rust code (including code in `bin/`, `examples/`, and `tests/`) will not be able 33 | # to `use string_sum;` unless the "rlib" or "lib" crate type is also included. 34 | crate-type = ["cdylib", "rlib"] 35 | 36 | [dependencies] 37 | anyhow = "1.0.93" 38 | binary-heap-plus = "0.5.0" 39 | bytecount = "0.6.8" 40 | lazy_static = "1.5.0" 41 | rayon = "1.10.0" 42 | regex = "1.11.1" 43 | rustc-hash = "1.1.0" 44 | regex-syntax = "0.6.29" 45 | 46 | [[test]] 47 | name = "basic" 48 | path = "tests/test_tokenizer.rs" 49 | test = true 50 | -------------------------------------------------------------------------------- /.github/workflows/test-nlpo3-cli.yml: -------------------------------------------------------------------------------- 1 | name: Test nlpo3-cli 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - 'nlpo3-cli/**' 8 | pull_request: 9 | branches: 10 | - main 11 | paths: 12 | - 'nlpo3-cli/**' 13 | 14 | defaults: 15 | run: 16 | working-directory: nlpo3-cli 17 | 18 | jobs: 19 | test: 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [macos-latest, ubuntu-latest, windows-latest] 24 | bitness: [64] # 32, 64 25 | include: 26 | - os: windows-latest 27 | bitness: 32 28 | 29 | runs-on: ${{ matrix.os }} 30 | 31 
| steps: 32 | - name: Checkout source code 33 | uses: actions/checkout@v4 34 | 35 | - name: Setup Rust toolchain - non-win32 36 | uses: actions-rs/toolchain@v1 37 | with: 38 | override: true 39 | profile: minimal 40 | toolchain: stable 41 | if: ${{ !startsWith(matrix.os, 'windows') || matrix.bitness != '32' }} 42 | 43 | - name: Setup Rust toolchain - win32 44 | uses: actions-rs/toolchain@v1 45 | with: 46 | override: true 47 | profile: minimal 48 | toolchain: stable 49 | target: i686-pc-windows-msvc 50 | if: startsWith(matrix.os, 'windows') && matrix.bitness == '32' 51 | 52 | - name: Setup Rust dependencies 53 | uses: actions-rs/cargo@v1 54 | with: 55 | command: check 56 | 57 | - name: Test 58 | uses: actions-rs/cargo@v1 59 | with: 60 | command: test 61 | -------------------------------------------------------------------------------- /nlpo3-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | use clap::Clap; 5 | use nlpo3::tokenizer::newmm_custom::Newmm; 6 | use nlpo3::tokenizer::tokenizer_trait::Tokenizer; 7 | use std::io; 8 | use std::io::BufRead; 9 | 10 | #[derive(Clap, Debug)] 11 | #[clap(name = "nlpo3")] 12 | struct App { 13 | #[clap(subcommand)] 14 | subcommand: SubCommand, 15 | } 16 | 17 | #[derive(Clap, Debug)] 18 | enum SubCommand { 19 | /// Tokenize a string into words. 
20 | #[clap()] 21 | Segment(SegmentOpts), 22 | } 23 | 24 | #[derive(Clap, Debug)] 25 | struct SegmentOpts { 26 | #[clap(short = 'd', long, default_value = "default")] 27 | dict_path: String, 28 | 29 | #[clap(short = 's', long, default_value = "|")] 30 | word_delimiter: String, 31 | 32 | /// Run in safe mode to avoid long running edge cases 33 | #[clap(short = 'z', long)] 34 | safe: bool, 35 | 36 | /// Run in multithread mode 37 | #[clap(short = 'p', long)] 38 | parallel: bool, 39 | } 40 | 41 | fn main() { 42 | let opt = App::parse(); 43 | 44 | let SubCommand::Segment(segment_opts) = opt.subcommand; 45 | let dict_path = match segment_opts.dict_path.as_str() { 46 | "default" => None, 47 | dict_name => Some(dict_name), 48 | }; 49 | 50 | let newmm = Newmm::new(dict_path); 51 | for line_opt in io::stdin().lock().lines() { 52 | let cleaned_line = match line_opt { 53 | Ok(line) => line.trim_end_matches('\n').to_string(), 54 | Err(e) => panic!("Cannot read line {}", e), 55 | }; 56 | let toks = newmm.segment( 57 | &cleaned_line, 58 | Some(segment_opts.safe), 59 | Some(segment_opts.parallel), 60 | ); 61 | println!("{}", toks.join(segment_opts.word_delimiter.as_str())); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /.github/workflows/test-nlpo3-python.yml: -------------------------------------------------------------------------------- 1 | name: Test nlpo3-python 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - 'nlpo3-python/**' 8 | - '!notebooks/' 9 | - '!LICENSE' 10 | - '!*.md' 11 | pull_request: 12 | branches: 13 | - main 14 | paths: 15 | - 'nlpo3-python/**' 16 | - '!notebooks/' 17 | - '!LICENSE' 18 | - '!*.md' 19 | 20 | defaults: 21 | run: 22 | working-directory: nlpo3-python 23 | 24 | jobs: 25 | test: 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | os: [macos-latest, ubuntu-latest, windows-latest] 30 | python-version: ["3.13", "3.12", "3.11", "3.10", "3.9", "3.8", "3.7"] 31 | bitness: [64] # 32, 64 
32 | include: 33 | - os: windows-latest 34 | python-version: "3.9" 35 | bitness: 32 36 | exclude: 37 | - os: macos-latest 38 | python-version: "3.7" 39 | 40 | runs-on: ${{ matrix.os }} 41 | 42 | steps: 43 | - name: Checkout source code 44 | uses: actions/checkout@v4 45 | 46 | - name: Setup Rust toolchain 47 | uses: actions-rust-lang/setup-rust-toolchain@v1 48 | 49 | - name: Set up Python ${{ matrix.python-version }} 50 | uses: actions/setup-python@v5 51 | with: 52 | python-version: ${{ matrix.python-version }} 53 | cache: "pip" 54 | 55 | - name: Build wheel 56 | run: | 57 | pip install -U pip 58 | pip install -U build setuptools setuptools-rust wheel 59 | python -m build --wheel 60 | 61 | - name: Install wheel 62 | run: pip install --no-index --find-links=dist nlpo3 63 | # Since we don't know the exact name of the wheel from previous step, 64 | # use --find-links instead. 65 | 66 | - name: Test 67 | run: | 68 | cd tests 69 | python -m unittest 70 | -------------------------------------------------------------------------------- /nlpo3-python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-rust", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "nlpo3" 7 | version = "1.3.2-dev" 8 | description = "Python binding for nlpO3 Thai language processing library in Rust" 9 | readme = "README.md" 10 | requires-python = ">=3.7" 11 | license = { text = "Apache-2.0" } 12 | keywords = ["thai", "tokenizer", "nlp", "word-segmentation", "pythainlp"] 13 | authors = [ 14 | { name = "Thanathip Suntorntip" }, 15 | { name = "Arthit Suriyawongkul" }, 16 | { name = "Wannaphong Phatthiyaphaibun" }, 17 | ] 18 | classifiers = [ 19 | "Development Status :: 5 - Production/Stable", 20 | "Programming Language :: Python :: 3 :: Only", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: 
Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | "Programming Language :: Python :: 3.13", 28 | "Programming Language :: Python :: Implementation :: CPython", 29 | "Programming Language :: Python :: Implementation :: PyPy", 30 | "Intended Audience :: Developers", 31 | "License :: OSI Approved :: Apache Software License", 32 | "Natural Language :: Thai", 33 | "Topic :: Text Processing :: Linguistic", 34 | "Topic :: Software Development :: Libraries :: Python Modules", 35 | ] 36 | 37 | [project.urls] 38 | homepage = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-python" 39 | repository = "https://github.com/PyThaiNLP/nlpo3/" 40 | 41 | [tool.poetry.dependencies] 42 | python = "^3.7" 43 | 44 | [tool.poetry.dev-dependencies] 45 | pytest = "*" 46 | pytest-runner = "*" 47 | wheel = "*" 48 | 49 | [tool.black] 50 | line-length = 79 51 | exclude = ''' 52 | /( 53 | \.eggs 54 | | \.git 55 | | \.mypy_cache 56 | | \.vscode 57 | | build 58 | | dist 59 | )/ 60 | ''' 61 | -------------------------------------------------------------------------------- /nlpo3-nodejs/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nlpo3-nodejs", 3 | "version": "0.2.1", 4 | "lockfileVersion": 2, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "version": "0.2.1", 9 | "hasInstallScript": true, 10 | "license": "Apache-2.0", 11 | "devDependencies": { 12 | "cargo-cp-artifact": "^0.1", 13 | "typescript": "^4.3.5" 14 | }, 15 | "engines": { 16 | "node": ">= 12.0.0" 17 | } 18 | }, 19 | "node_modules/cargo-cp-artifact": { 20 | "version": "0.1.4", 21 | "resolved": "https://registry.npmjs.org/cargo-cp-artifact/-/cargo-cp-artifact-0.1.4.tgz", 22 | "integrity": "sha512-34yUas8aUENHGdk6JhLkV4ol0GLtP78YgqpsRDmmnpADy9JoTg/DgKM3CRHAeozTRNhKoPaRFhV+BxoqkmoKUA==", 23 | "dev": true, 24 | "bin": { 25 | "cargo-cp-artifact": 
"bin/cargo-cp-artifact.js" 26 | } 27 | }, 28 | "node_modules/typescript": { 29 | "version": "4.3.5", 30 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.3.5.tgz", 31 | "integrity": "sha512-DqQgihaQ9cUrskJo9kIyW/+g0Vxsk8cDtZ52a3NGh0YNTfpUSArXSohyUGnvbPazEPLu398C0UxmKSOrPumUzA==", 32 | "dev": true, 33 | "bin": { 34 | "tsc": "bin/tsc", 35 | "tsserver": "bin/tsserver" 36 | }, 37 | "engines": { 38 | "node": ">=4.2.0" 39 | } 40 | } 41 | }, 42 | "dependencies": { 43 | "cargo-cp-artifact": { 44 | "version": "0.1.4", 45 | "resolved": "https://registry.npmjs.org/cargo-cp-artifact/-/cargo-cp-artifact-0.1.4.tgz", 46 | "integrity": "sha512-34yUas8aUENHGdk6JhLkV4ol0GLtP78YgqpsRDmmnpADy9JoTg/DgKM3CRHAeozTRNhKoPaRFhV+BxoqkmoKUA==", 47 | "dev": true 48 | }, 49 | "typescript": { 50 | "version": "4.3.5", 51 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.3.5.tgz", 52 | "integrity": "sha512-DqQgihaQ9cUrskJo9kIyW/+g0Vxsk8cDtZ52a3NGh0YNTfpUSArXSohyUGnvbPazEPLu398C0UxmKSOrPumUzA==", 53 | "dev": true 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/tokenizer/dict_reader.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | /** 5 | * Dictionary reader. 
6 | */ 7 | use crate::four_bytes_str::custom_string::CustomString; 8 | 9 | use super::trie_char::TrieChar as Trie; 10 | use std::io::BufReader; 11 | use std::{error::Error, io::prelude::*}; 12 | use std::{fs::File, path::PathBuf}; 13 | 14 | pub enum DictSource { 15 | FilePath(PathBuf), 16 | WordList(Vec), 17 | } 18 | 19 | pub fn create_dict_trie(source: DictSource) -> Result> { 20 | match source { 21 | DictSource::FilePath(file_path) => { 22 | let file_reader = File::open(file_path.as_path()); 23 | match file_reader { 24 | Ok(file) => { 25 | let mut reader = BufReader::new(file); 26 | let mut line = String::with_capacity(50); 27 | let mut dict: Vec = Vec::with_capacity(600); 28 | while reader.read_line(&mut line).unwrap() != 0 { 29 | dict.push(CustomString::new(&line)); 30 | line.clear(); 31 | } 32 | dict.shrink_to_fit(); 33 | Ok(Trie::new(&dict)) 34 | } 35 | Err(error) => Err(Box::from(error)), 36 | } 37 | } 38 | DictSource::WordList(word_list) => { 39 | let custom_word_list: Vec = word_list 40 | .into_iter() 41 | .map(|word| CustomString::new(&word)) 42 | .collect(); 43 | Ok(Trie::new(&custom_word_list)) 44 | } 45 | } 46 | } 47 | 48 | #[test] 49 | fn test_trie() { 50 | let test_word_list = vec![ 51 | "กากบาท".to_string(), 52 | "กาแฟ".to_string(), 53 | "กรรม".to_string(), 54 | "42".to_string(), 55 | "aง|.%".to_string(), 56 | ]; 57 | let trie = create_dict_trie(DictSource::WordList(test_word_list)).unwrap(); 58 | assert!(trie.contain(&CustomString::new("กาแฟ"))); 59 | assert_eq!(trie.amount_of_words(), 5); 60 | } 61 | -------------------------------------------------------------------------------- /nlpo3-python/nlpo3/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Python-binding for nlpO3, an natural language process library. 5 | # 6 | # Provides a tokenizer. 
7 | # 8 | # Authors: 9 | # Thanathip Suntorntip 10 | # Arthit Suriyawongkul 11 | 12 | from pathlib import Path 13 | from typing import List, Tuple 14 | 15 | # import from .so (Rust) 16 | from ._nlpo3_python_backend import load_dict as rust_load_dict 17 | from ._nlpo3_python_backend import segment as rust_segment 18 | 19 | # TODO: load_dict from in-memory list of words 20 | 21 | 22 | def load_dict(file_path: str, dict_name: str) -> Tuple[str, bool]: 23 | """Load dictionary from a file. 24 | 25 | Load a dictionary file into an in-memory dictionary collection, 26 | and assigned dict_name to it. 27 | *** This function does not override an existing dict name. *** 28 | 29 | :param file_path: Path to a dictionary file 30 | :type file_path: str 31 | :param dict_name: A unique dictionary name, use for reference. 32 | :type dict_name: str 33 | :return tuple[human_readable_result_str, bool] 34 | """ 35 | path = Path(file_path).resolve() 36 | 37 | return rust_load_dict(str(path), dict_name) 38 | 39 | 40 | def segment( 41 | text: str, 42 | dict_name: str, 43 | safe: bool = False, 44 | parallel: bool = False, 45 | ) -> List[str]: 46 | """Break text into tokens. 47 | 48 | This method is an implementation of newmm segmentaion. 49 | Support multithread mode - set by parallel flag. 
50 | 51 | :param text: Input text 52 | :type text: str 53 | :param dict_name: Dictionary name, as assigned in load_dict() 54 | :type dict_name: str 55 | :param safe: Use safe mode to avoid long waiting time in 56 | a text with lots of ambiguous word boundaries, 57 | defaults to False 58 | :type safe: bool, optional 59 | :param parallel: Use multithread mode, defaults to False 60 | :type parallel: bool, optional 61 | :return: List of tokens 62 | :rtype: List[str] 63 | """ 64 | if not text or not isinstance(text, str): 65 | return [] 66 | 67 | result = rust_segment(text, dict_name, safe, parallel) 68 | 69 | return result 70 | -------------------------------------------------------------------------------- /nlpo3-nodejs/.gitignore: -------------------------------------------------------------------------------- 1 | nlpo3/*.node 2 | nlpo3/index.js 3 | 4 | # Logs 5 | logs 6 | *.log 7 | npm-debug.log* 8 | yarn-debug.log* 9 | yarn-error.log* 10 | lerna-debug.log* 11 | .pnpm-debug.log* 12 | 13 | # Diagnostic reports (https://nodejs.org/api/report.html) 14 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 15 | 16 | # Runtime data 17 | pids 18 | *.pid 19 | *.seed 20 | *.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | lib-cov 24 | 25 | # Coverage directory used by tools like istanbul 26 | coverage 27 | *.lcov 28 | 29 | # nyc test coverage 30 | .nyc_output 31 | 32 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 33 | .grunt 34 | 35 | # Bower dependency directory (https://bower.io/) 36 | bower_components 37 | 38 | # node-waf configuration 39 | .lock-wscript 40 | 41 | # Compiled binary addons (https://nodejs.org/api/addons.html) 42 | build/Release 43 | 44 | # Dependency directories 45 | node_modules/ 46 | jspm_packages/ 47 | 48 | # Snowpack dependency directory (https://snowpack.dev/) 49 | web_modules/ 50 | 51 | # TypeScript cache 52 | *.tsbuildinfo 53 | 54 | # Optional npm cache directory 55 | .npm 56 | 
57 | # Optional eslint cache 58 | .eslintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variables file 76 | .env 77 | .env.test 78 | .env.production 79 | 80 | # parcel-bundler cache (https://parceljs.org/) 81 | .cache 82 | .parcel-cache 83 | 84 | # Next.js build output 85 | .next 86 | out 87 | 88 | # Nuxt.js build / generate output 89 | .nuxt 90 | dist 91 | 92 | # Gatsby files 93 | .cache/ 94 | # Comment in the public line in if your project uses Gatsby and not Next.js 95 | # https://nextjs.org/blog/next-9-1#public-directory-support 96 | # public 97 | 98 | # vuepress build output 99 | .vuepress/dist 100 | 101 | # Serverless directories 102 | .serverless/ 103 | 104 | # FuseBox cache 105 | .fusebox/ 106 | 107 | # DynamoDB Local files 108 | .dynamodb/ 109 | 110 | # TernJS port file 111 | .tern-port 112 | 113 | # Stores VSCode versions used for testing VSCode extensions 114 | .vscode-test 115 | 116 | # yarn v2 117 | .yarn/cache 118 | .yarn/unplugged 119 | .yarn/build-state.yml 120 | .yarn/install-state.gz 121 | .pnp.* 122 | -------------------------------------------------------------------------------- /nlpo3-nodejs/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | SPDX-FileCopyrightText: 2024 PyThaiNLP Project 3 | SPDX-License-Identifier: Apache-2.0 4 | --- 5 | 6 | # nlpO3 Node.js binding 7 | 8 | [![Apache-2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg "Apache-2.0")](https://opensource.org/licenses/Apache-2.0) 9 | 10 | Node.js binding for nlpO3, a Thai natural language processing library in Rust. 
11 | 12 | ## Features 13 | 14 | - Thai word tokenizer 15 | - Use maximal-matching dictionary-based tokenization algorithm 16 | and honor [Thai Character Cluster][tcc] boundaries 17 | - Fast backend in Rust 18 | - Support custom dictionary 19 | 20 | [tcc]: https://dl.acm.org/doi/10.1145/355214.355225 21 | 22 | ## Build 23 | 24 | ### Requirements 25 | 26 | - [Rust 2018 Edition](https://www.rust-lang.org/tools/install) 27 | - Node.js v12 or newer 28 | 29 | ### Steps 30 | 31 | ```bash 32 | # In this directory 33 | npm run release 34 | ``` 35 | 36 | Before build, your `nlpo3/` directory should look like this: 37 | 38 | ```text 39 | - nlpo3/ 40 | - index.ts 41 | - rust_mod.d.ts 42 | ``` 43 | 44 | After build: 45 | 46 | ```text 47 | - nlpo3/ 48 | - index.js 49 | - index.ts 50 | - rust_mod.d.ts 51 | - rust_mode.node 52 | ``` 53 | 54 | ## Install 55 | 56 | For now, copy the whole `nlpo3/` directory after build to your project. 57 | 58 | ### npm (experimental) 59 | 60 | npm is still experimental and may not work on all platforms. Please report issues at <https://github.com/PyThaiNLP/nlpo3/issues> 61 | 62 | ```shell 63 | npm i nlpo3 64 | ``` 65 | 66 | ## Usage 67 | 68 | In JavaScript: 69 | 70 | ```javascript 71 | const nlpO3 = require(`${path_to_nlpo3}`) 72 | 73 | // load dictionary and tokenize a text with it 74 | nlpO3.loadDict("path/to/dict.file", "dict_name") 75 | nlpO3.segment("สวัสดีครับ", "dict_name") 76 | ``` 77 | 78 | In TypeScript: 79 | 80 | ```typescript 81 | import {segment, loadDict} from `${path_to_nlpo3}/index` 82 | 83 | // load custom dictionary and tokenize a text with it 84 | loadDict("path/to/dict.file", "dict_name") 85 | segment("สวัสดีครับ", "dict_name") 86 | ``` 87 | 88 | ## Issues 89 | 90 | Please report issues at <https://github.com/PyThaiNLP/nlpo3/issues> 91 | 92 | ## TODO 93 | 94 | - Find a way to build binaries and publish on npm. 95 | 96 | ## License 97 | 98 | nlpO3 Node binding is copyrighted by its authors 99 | and licensed under terms of the Apache Software License 2.0 (Apache-2.0). 100 | See file [LICENSE](./LICENSE) for details.
101 | -------------------------------------------------------------------------------- /nlpo3-python/setup.cfg: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | [metadata] 5 | name = nlpo3 6 | version = 1.3.2-dev 7 | description = Python binding for nlpO3 Thai language processing library 8 | long_description = 9 | Python binding for nlpO3, a Thai natural language processing library in Rust. 10 | 11 | ## Features 12 | 13 | - Thai word tokenizer 14 | - use maximal-matching dictionary-based tokenization algorithm and honor Thai Character Cluster boundaries 15 | - use user-supplied dictionary 16 | - 2.5x faster than similar pure Python implementation 17 | 18 | ## Install 19 | 20 | ```bash 21 | pip install nlpo3 22 | ``` 23 | 24 | ## Usage 25 | 26 | Load file `path/to/dict.file` to memory 27 | and assigned it with name `custom_dict`. 28 | 29 | Then tokenize a text with `custom_dict` dictionary: 30 | ```python 31 | from nlpo3 import load_dict, segment 32 | 33 | load_dict("path/to/dict.file", "custom_dict") 34 | segment("สวัสดีครับ", "custom_dict") 35 | ``` 36 | 37 | it will return a list of strings: 38 | ```python 39 | ['สวัสดี', 'ครับ'] 40 | ``` 41 | (result depends on words included in the dictionary) 42 | 43 | For more documentation, go 44 | [https://github.com/PyThaiNLP/nlpo3](https://github.com/PyThaiNLP/nlpo3) 45 | 46 | long_description_content_type = text/markdown 47 | license = Apache-2.0 48 | keywords = 49 | thai 50 | tokenizer 51 | nlp 52 | word-segmentation 53 | pythainlp 54 | author = Thanathip Suntorntip, Arthit Suriyawongkul, Wannaphong Phatthiyaphaibun 55 | author_email = wannaphong@yahoo.com 56 | url = https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-python 57 | classifiers = 58 | Development Status :: 5 - Production/Stable 59 | Programming Language :: Python :: 3 :: Only 60 | Programming Language :: Python :: 3.7 61 | 
Programming Language :: Python :: 3.8 62 | Programming Language :: Python :: 3.9 63 | Programming Language :: Python :: 3.10 64 | Programming Language :: Python :: 3.11 65 | Programming Language :: Python :: 3.12 66 | Programming Language :: Python :: 3.13 67 | Intended Audience :: Developers 68 | License :: OSI Approved :: Apache Software License 69 | Natural Language :: Thai 70 | Topic :: Text Processing :: Linguistic 71 | Topic :: Software Development :: Libraries :: Python Modules 72 | #obsoletes = pythainlp-rust-modules 73 | 74 | [options] 75 | python_requires = >=3.7 76 | include_package_data = True 77 | packages = nlpo3 78 | zip_safe = False 79 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | paths: 18 | - 'nlpo3-python/**' 19 | pull_request: 20 | # The branches below must be a subset of the branches above 21 | branches: [ main ] 22 | paths: 23 | - 'nlpo3-python/**' 24 | schedule: 25 | - cron: '26 8 * * 4' 26 | 27 | jobs: 28 | analyze: 29 | name: Analyze 30 | runs-on: ubuntu-latest 31 | permissions: 32 | actions: read 33 | contents: read 34 | security-events: write 35 | 36 | strategy: 37 | fail-fast: false 38 | matrix: 39 | language: [ 'python' ] 40 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 41 | # Learn more: 42 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 43 | 44 | steps: 45 | - name: Checkout repository 46 | uses: actions/checkout@v2 47 | 48 | # Initializes the CodeQL tools for scanning. 49 | - name: Initialize CodeQL 50 | uses: github/codeql-action/init@v1 51 | with: 52 | languages: ${{ matrix.language }} 53 | # If you wish to specify custom queries, you can do so here or in a config file. 54 | # By default, queries listed here will override any specified in a config file. 55 | # Prefix the list here with "+" to use these queries and those in the config file. 56 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 57 | 58 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 59 | # If this step fails, then you should remove it and run the build manually (see below) 60 | - name: Autobuild 61 | uses: github/codeql-action/autobuild@v1 62 | 63 | # ℹ️ Command-line programs to run using the OS shell. 
64 | # 📚 https://git.io/JvXDl 65 | 66 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 67 | # and modify them (or add more) to build your code if your project 68 | # uses a compiled language 69 | 70 | #- run: | 71 | # make bootstrap 72 | # make release 73 | 74 | - name: Perform CodeQL Analysis 75 | uses: github/codeql-action/analyze@v1 76 | -------------------------------------------------------------------------------- /nlpo3-nodejs/src/lib.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | use std::sync::Mutex; 5 | 6 | use ahash::AHashMap as HashMap; 7 | use lazy_static::lazy_static; 8 | use neon::prelude::*; 9 | use nlpo3::tokenizer::{newmm::NewmmTokenizer, tokenizer_trait::Tokenizer}; 10 | 11 | lazy_static! { 12 | static ref TOKENIZER_COLLECTION: Mutex>> = 13 | Mutex::new(HashMap::new()); 14 | } 15 | 16 | // Load a dictionary file to a tokenizer, 17 | // and add that tokenizer to the tokenizer collection. 18 | // 19 | // Dictionary file must one word per line. 20 | // If successful, will insert a NewmmTokenizer to TOKENIZER_COLLECTION. 
21 | // returns a tuple of string of loading result and a boolean 22 | fn load_dict(mut cx: FunctionContext) -> JsResult { 23 | let mut tokenizer_col_lock = TOKENIZER_COLLECTION.lock().unwrap(); 24 | let file_path = cx.argument::(0)?.value(&mut cx); 25 | let dict_name = cx.argument::(1)?.value(&mut cx); 26 | if let Some(_) = tokenizer_col_lock.get(&dict_name) { 27 | Ok(cx.string(format!( 28 | "Failed: dictionary {} exists, please use another name.", 29 | dict_name 30 | ))) 31 | } else { 32 | let tokenizer = NewmmTokenizer::new(&file_path); 33 | tokenizer_col_lock.insert(dict_name.to_owned(), Box::new(tokenizer)); 34 | 35 | Ok(cx.string(format!( 36 | "Successful: dictionary name {} from file {} has been successfully loaded", 37 | dict_name, file_path 38 | ))) 39 | } 40 | } 41 | 42 | // Break text into tokens. 43 | // Use newmm algorithm. 44 | /// Can use multithreading, but takes a lot of memory. 45 | /// returns an array of string 46 | fn segment(mut cx: FunctionContext) -> JsResult { 47 | let text = cx.argument::(0)?.value(&mut cx); 48 | let dict_name = cx.argument::(1)?.value(&mut cx); 49 | let safe = cx.argument::(2)?.value(&mut cx); 50 | let parallel = cx.argument::(3)?.value(&mut cx); 51 | if let Some(loaded_tokenizer) = TOKENIZER_COLLECTION.lock().unwrap().get(&dict_name) { 52 | let result = loaded_tokenizer.segment_to_string(&text, safe, parallel); 53 | let js_result_array = JsArray::new(&mut cx, result.len() as u32); 54 | for (i, obj) in result.iter().enumerate() { 55 | let js_string = cx.string(obj); 56 | js_result_array.set(&mut cx, i as u32, js_string).unwrap(); 57 | } 58 | Ok(js_result_array) 59 | } else { 60 | panic!("Dictionary {} does not exist.", dict_name) 61 | } 62 | } 63 | 64 | #[neon::main] 65 | fn main(mut cx: ModuleContext) -> NeonResult<()> { 66 | cx.export_function("loadDict", load_dict)?; 67 | cx.export_function("segment", segment)?; 68 | Ok(()) 69 | } 70 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | **/debug/ 4 | **/target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | /Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | *.py,cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | cover/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | .pybuilder/ 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | # For a library or package, you might want to ignore these files since the code is 102 | # intended to run in multiple environments; otherwise, check them in: 103 | # .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | logs/ 156 | -------------------------------------------------------------------------------- /src/tokenizer/tcc/tcc_tokenizer.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | /** 5 | * TCC (Thai Character Cluster) tokenizer. 6 | */ 7 | use super::tcc_rules::{LOOKAHEAD_TCC, NON_LOOKAHEAD_TCC}; 8 | 9 | use crate::four_bytes_str::custom_string::{ 10 | CustomStringBytesSlice, FixedCharsLengthByteSlice, BYTES_PER_CHAR, 11 | }; 12 | use rustc_hash::FxHashSet as HashSet; 13 | /** 14 | The implementation of tokenizer according to Thai Character Clusters (TCCs) 15 | rules purposed by `Theeramunkong et al. 2000. 
\ 16 | `_ 17 | 18 | Credits: 19 | * TCC: Jakkrit TeCho 20 | * Grammar: Wittawat Jitkrittum (`link to the source file \ 21 | `_) 22 | * Python code: Korakot Chaovavanich 23 | * Rust Code Translation: Thanathip Suntorntip 24 | */ 25 | 26 | /// Returns a set of "character" indice at the end of each token 27 | pub fn tcc_pos(custom_text_type: &CustomStringBytesSlice) -> HashSet { 28 | let mut set: HashSet = HashSet::default(); 29 | set.reserve(custom_text_type.chars_len() / 10); 30 | let mut txt = custom_text_type; 31 | let mut position: usize = 0; 32 | while !txt.is_empty() { 33 | if let Some(result) = NON_LOOKAHEAD_TCC.find(txt) { 34 | let mut matched = &txt[result.start()..result.end()]; 35 | let match_length = matched.len(); 36 | if LOOKAHEAD_TCC.is_match(matched) { 37 | // trim one more char to the right. 38 | let end_bytes_index = match_length - BYTES_PER_CHAR; 39 | let end_char_index = end_bytes_index / BYTES_PER_CHAR; 40 | matched = matched.slice_by_char_indice(0, end_char_index); 41 | let segment_size = matched.chars_len(); 42 | position += segment_size; 43 | set.insert(position); 44 | txt = txt.slice_by_char_indice(end_char_index, txt.chars_len()); 45 | } else { 46 | let segment_size = matched.chars_len(); 47 | position += segment_size; 48 | set.insert(position); 49 | let end_bytes_index = match_length; 50 | let end_char_index = end_bytes_index / BYTES_PER_CHAR; 51 | txt = txt.slice_by_char_indice(end_char_index, txt.chars_len()); 52 | } 53 | } else { 54 | // not thai 55 | let first_char = txt.slice_by_char_indice(0, 1); 56 | let segment_size = first_char.chars_len(); 57 | position += segment_size; 58 | set.insert(position); 59 | txt = txt.slice_by_char_indice(1, txt.chars_len()); 60 | } 61 | } 62 | set 63 | } 64 | #[test] 65 | fn test_cluster_karan() { 66 | use crate::four_bytes_str::custom_string::CustomString; 67 | let kr_result = tcc_pos(CustomString::new("พิสูจน์ได้ค่ะ").raw_content()); 68 | // ends at พิ 69 | assert!(kr_result.contains(&2)); 70 | 
//สูจน์ 71 | assert!(kr_result.contains(&7)); 72 | //ได้ 73 | assert!(kr_result.contains(&10)); 74 | //ค่ะ 75 | assert!(kr_result.contains(&13)); 76 | } 77 | // เรือน้อยลอยอยู่ 78 | #[test] 79 | /// 80 | fn test_cluster_general_case() { 81 | use crate::four_bytes_str::custom_string::CustomString; 82 | let gen_result = tcc_pos(CustomString::new("เรือน้อยลอยอยู่").raw_content()); 83 | //expected cluster ['เรือ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่'] 84 | assert!(gen_result.contains(&4)); 85 | assert!(gen_result.contains(&6)); 86 | assert!(gen_result.contains(&7)); 87 | assert!(gen_result.contains(&8)); 88 | assert!(gen_result.contains(&9)); 89 | assert!(gen_result.contains(&10)); 90 | assert!(gen_result.contains(&11)); 91 | assert!(gen_result.contains(&12)); 92 | assert!(gen_result.contains(&15)); 93 | } 94 | -------------------------------------------------------------------------------- /nlpo3-python/src/lib.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | /** 5 | * Python-binding for nlpO3, an natural language process library. 6 | * 7 | * Provides a tokenizer. 8 | * 9 | * Authors: 10 | * Thanathip Suntorntip 11 | * Arthit Suriyawongkul 12 | */ 13 | use std::sync::Mutex; 14 | 15 | use ahash::AHashMap as HashMap; 16 | use lazy_static::lazy_static; 17 | use nlpo3::tokenizer::newmm::NewmmTokenizer; 18 | use nlpo3::tokenizer::tokenizer_trait::Tokenizer; 19 | use pyo3::prelude::*; 20 | use pyo3::types::PyString; 21 | use pyo3::{exceptions, wrap_pyfunction}; 22 | 23 | lazy_static! { 24 | static ref TOKENIZER_COLLECTION: Mutex>> = 25 | Mutex::new(HashMap::new()); 26 | } 27 | 28 | /// Load a dictionary file to a tokenizer, 29 | /// and add that tokenizer to the tokenizer collection. 30 | /// 31 | /// Dictionary file must one word per line. 32 | /// If successful, will insert a NewmmTokenizer to TOKENIZER_COLLECTION. 
33 | /// returns a tuple of string of loading result and a boolean 34 | /// 35 | /// signature: (file_path: str, dict_name: str) -> (str, boolean) 36 | #[pyfunction] 37 | #[pyo3(signature = (file_path, dict_name))] 38 | fn load_dict(file_path: &str, dict_name: &str) -> PyResult<(String, bool)> { 39 | let mut tokenizer_col_lock = TOKENIZER_COLLECTION.lock().unwrap(); 40 | if tokenizer_col_lock.get(dict_name).is_some() { 41 | Ok(( 42 | format!( 43 | "Failed: dictionary name {} already exists, please use another name.", 44 | dict_name 45 | ), 46 | false, 47 | )) 48 | } else { 49 | let tokenizer = NewmmTokenizer::new(file_path); 50 | tokenizer_col_lock.insert(dict_name.to_owned(), Box::new(tokenizer)); 51 | 52 | Ok(( 53 | format!( 54 | "Successful: file {} has been successfully loaded to dictionary name {}.", 55 | file_path, dict_name 56 | ), 57 | true, 58 | )) 59 | } 60 | } 61 | 62 | /// Break text into tokens. 63 | /// Use newmm algorithm. 64 | /// Can use multithreading, but takes a lot of memory. 
65 | /// returns list of valid utf-8 bytes list 66 | /// 67 | /// signature: (text: str, dict_name: str, safe: boolean = false, parallel: boolean = false) -> List[List[u8]] 68 | /// 69 | #[pyfunction] 70 | #[pyo3(signature = (text, dict_name, safe=false, parallel=false))] 71 | fn segment( 72 | text: &Bound<'_, PyString>, 73 | dict_name: &str, 74 | safe: bool, 75 | parallel: bool, 76 | ) -> PyResult> { 77 | if let Some(loaded_tokenizer) = TOKENIZER_COLLECTION.lock().unwrap().get(dict_name) { 78 | let result = loaded_tokenizer.segment_to_string(text.to_str()?, safe, parallel); 79 | Ok(result) 80 | } else { 81 | Err(exceptions::PyRuntimeError::new_err(format!( 82 | "Dictionary name {} does not exist.", 83 | dict_name 84 | ))) 85 | } 86 | } 87 | 88 | /* 89 | /// Add words to existing dictionary 90 | #[pyfunction] 91 | fn add_word(dict_name: &str, words: Vec<&str>) -> PyResult<(String, bool)> { 92 | let mut tokenizer_col_lock = TOKENIZER_COLLECTION.lock().unwrap(); 93 | if let Some(newmm_dict) = tokenizer_col_lock.get(dict_name) { 94 | newmm_dict.add_word(&words); 95 | Ok((format!("Add new word(s) successfully."), true)) 96 | } else { 97 | Ok(( 98 | format!( 99 | "Cannot add new word(s) - dictionary instance named '{}' does not exist.", 100 | dict_name 101 | ), 102 | false, 103 | )) 104 | } 105 | } 106 | 107 | /// Remove words from existing dictionary 108 | #[pyfunction] 109 | fn remove_word(dict_name: &str, words: Vec<&str>) -> PyResult<(String, bool)> { 110 | let mut tokenizer_col_lock = TOKENIZER_COLLECTION.lock().unwrap(); 111 | if let Some(newmm_dict) = tokenizer_col_lock.get(dict_name) { 112 | newmm_dict.remove_word(&words); 113 | Ok((format!("Remove word(s) successfully."), true)) 114 | } else { 115 | Ok(( 116 | format!( 117 | "Cannot remove word(s) - dictionary instance named '{}' does not exist.", 118 | dict_name 119 | ), 120 | false, 121 | )) 122 | } 123 | } 124 | */ 125 | 126 | #[pymodule] 127 | fn _nlpo3_python_backend(m: &Bound<'_, PyModule>) -> 
PyResult<()> { 128 | m.add_function(wrap_pyfunction!(load_dict, m)?)?; 129 | m.add_function(wrap_pyfunction!(segment, m)?)?; 130 | Ok(()) 131 | } 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | SPDX-FileCopyrightText: 2024 PyThaiNLP Project 3 | SPDX-License-Identifier: Apache-2.0 4 | --- 5 | 6 | # nlpO3 7 | 8 | [![crates.io](https://img.shields.io/crates/v/nlpo3.svg "crates.io")](https://crates.io/crates/nlpo3/) 9 | [![Apache-2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg "Apache-2.0")](https://opensource.org/license/apache-2-0) 10 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14082448.svg)](https://doi.org/10.5281/zenodo.14082448) 11 | 12 | Thai natural language processing library in Rust, 13 | with Python and Node bindings. Formerly oxidized-thainlp. 14 | 15 | To use as a library in a Rust project: 16 | 17 | ```shell 18 | cargo add nlpo3 19 | ``` 20 | 21 | To use as a library in a Python project: 22 | 23 | ```shell 24 | pip install nlpo3 25 | ``` 26 | 27 | ## Table of contents 28 | 29 | - [Features](#features) 30 | - [Use](#use) 31 | - [Node.js binding](#nodejs-binding) 32 | - [Python binding](#python-binding) 33 | - [Rust library](#rust-library) 34 | - [Command-line interface](#command-line-interface) 35 | - [Dictionary](#dictionary) 36 | - [Build](#build) 37 | - [Develop](#develop) 38 | - [License](#license) 39 | 40 | ## Features 41 | 42 | - Thai word tokenizer 43 | - Use maximal-matching dictionary-based tokenization algorithm 44 | and honor [Thai Character Cluster][tcc] boundaries 45 | - [2.5x faster][benchmark] 46 | than similar pure Python implementation (PyThaiNLP's newmm) 47 | - Load a dictionary from a plain text file (one word per line) 48 | or from `Vec` 49 | 50 | [tcc]: https://dl.acm.org/doi/10.1145/355214.355225 51 | [benchmark]: 
./nlpo3-python/notebooks/nlpo3_segment_benchmarks.ipynb 52 | 53 | ## Use 54 | 55 | ### Node.js binding 56 | 57 | See [nlpo3-nodejs](./nlpo3-nodejs/). 58 | 59 | ### Python binding 60 | 61 | [![PyPI](https://img.shields.io/pypi/v/nlpo3.svg "PyPI")](https://pypi.python.org/pypi/nlpo3) 62 | 63 | Example: 64 | 65 | ```python 66 | from nlpo3 import load_dict, segment 67 | 68 | load_dict("path/to/dict.file", "dict_name") 69 | segment("สวัสดีครับ", "dict_name") 70 | ``` 71 | 72 | See more at [nlpo3-python](./nlpo3-python/). 73 | 74 | ### Rust library 75 | 76 | [![crates.io](https://img.shields.io/crates/v/nlpo3.svg "crates.io")](https://crates.io/crates/nlpo3/) 77 | 78 | #### Add to dependency 79 | 80 | To use as a library in a Rust project: 81 | 82 | ```shell 83 | cargo add nlpo3 84 | ``` 85 | 86 | It will add "nlpo3" to `Cargo.toml`: 87 | 88 | ```toml 89 | [dependencies] 90 | # ... 91 | nlpo3 = "1.4.0" 92 | ``` 93 | 94 | #### Example 95 | 96 | Create a tokenizer using a dictionary from file, 97 | then use it to tokenize a string (safe mode = true, and parallel mode = false): 98 | 99 | ```rust 100 | use nlpo3::tokenizer::newmm::NewmmTokenizer; 101 | use nlpo3::tokenizer::tokenizer_trait::Tokenizer; 102 | 103 | let tokenizer = NewmmTokenizer::new("path/to/dict.file"); 104 | let tokens = tokenizer.segment("ห้องสมุดประชาชน", true, false).unwrap(); 105 | ``` 106 | 107 | Create a tokenizer using a dictionary from a vector of Strings: 108 | 109 | ```rust 110 | let words = vec!["ปาลิเมนต์".to_string(), "คอนสติติวชั่น".to_string()]; 111 | let tokenizer = NewmmTokenizer::from_word_list(words); 112 | ``` 113 | 114 | Add words to an existing tokenizer: 115 | 116 | ```rust 117 | tokenizer.add_word(&["มิวเซียม"]); 118 | ``` 119 | 120 | Remove words from an existing tokenizer: 121 | 122 | ```rust 123 | tokenizer.remove_word(&["กระเพรา", "ชานชลา"]); 124 | ``` 125 | 126 | ### Command-line interface 127 | 128 | [![crates.io](https://img.shields.io/crates/v/nlpo3-cli.svg 
"crates.io")](https://crates.io/crates/nlpo3-cli/) 129 | 130 | Example: 131 | 132 | ```bash 133 | echo "ฉันกินข้าว" | nlpo3 segment 134 | ``` 135 | 136 | See more at [nlpo3-cli](./nlpo3-cli/). 137 | 138 | ### Dictionary 139 | 140 | - For the interest of library size, nlpO3 does not assume what dictionary the 141 | user would like to use, and it does not come with a dictionary. 142 | - A dictionary is needed for the dictionary-based word tokenizer. 143 | - For tokenization dictionary, try 144 | - [words_th.txt][dict-pythainlp] from [PyThaiNLP][pythainlp] 145 | - ~62,000 words 146 | - CC0-1.0 147 | - [word break dictionary][dict-libthai] from [libthai][libthai] 148 | - consists of dictionaries in different categories, with a make script 149 | - LGPL-2.1 150 | 151 | [pythainlp]: https://github.com/PyThaiNLP/pythainlp 152 | [libthai]: https://github.com/tlwg/libthai/ 153 | [dict-pythainlp]: https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt 154 | [dict-libthai]: https://github.com/tlwg/libthai/tree/master/data 155 | 156 | ## Build 157 | 158 | ### Requirements 159 | 160 | - [Rust 2018 Edition](https://www.rust-lang.org/tools/install) 161 | 162 | ### Steps 163 | 164 | Generic test: 165 | 166 | ```bash 167 | cargo test 168 | ``` 169 | 170 | Build API document and open it to check: 171 | 172 | ```bash 173 | cargo doc --open 174 | ``` 175 | 176 | Build (remove `--release` to keep debug information): 177 | 178 | ```bash 179 | cargo build --release 180 | ``` 181 | 182 | Check `target/` for build artifacts. 183 | 184 | ## Develop 185 | 186 | ### Development document 187 | 188 | - [Notes on custom string](src/NOTE_ON_STRING.md) 189 | 190 | ### Issues 191 | 192 | - Please report issues at <https://github.com/PyThaiNLP/nlpo3/issues> 193 | 194 | ## License 195 | 196 | nlpO3 is copyrighted by its authors 197 | and licensed under terms of the Apache Software License 2.0 (Apache-2.0). 198 | See file [LICENSE](./LICENSE) for details.
199 | -------------------------------------------------------------------------------- /.github/workflows/build-python-wheels.yml: -------------------------------------------------------------------------------- 1 | # Build wheels for many platforms, use cibuildwheel 2 | # see: https://github.com/pypa/cibuildwheel 3 | 4 | name: Build wheels 5 | 6 | on: 7 | push: 8 | branches: 9 | - main 10 | paths: 11 | - 'nlpo3-python/**' 12 | - '.github/workflows/build-python-wheels.yml' 13 | pull_request: 14 | branches: 15 | - main 16 | paths: 17 | - 'nlpo3-python/**' 18 | - '.github/workflows/build-python-wheels.yml' 19 | release: 20 | types: [published] 21 | workflow_dispatch: {} # manual run 22 | 23 | jobs: 24 | echo_github_env: 25 | name: Echo env variables 26 | runs-on: ubuntu-latest 27 | steps: 28 | - run: | 29 | echo "github.event.action : ${{ github.event.action }}" 30 | echo "github.event_name : ${{ github.event_name }}" 31 | echo "github.ref : ${{ github.ref }}" 32 | echo "github.ref_type : ${{ github.ref_type }}" 33 | echo "github.event.ref : ${{ github.event.ref }}" 34 | 35 | # Check whether to build the wheels and the source tarball 36 | check_build_trigger: 37 | name: Check build trigger 38 | runs-on: ubuntu-latest 39 | # Not for forks 40 | if: github.repository == 'pythainlp/nlpo3' 41 | outputs: 42 | build: ${{ steps.check_build_trigger.outputs.build }} 43 | steps: 44 | - name: Checkout source code 45 | uses: actions/checkout@v4 46 | with: 47 | ref: ${{ github.event.pull_request.head.sha }} 48 | - id: check_build_trigger 49 | name: Check build trigger 50 | run: bash build_tools/github/check_build_trigger.sh 51 | # To trigger the build steps, add "[cd build]" to commit message 52 | 53 | build_wheels: 54 | name: Build ${{ matrix.os }} 55 | runs-on: ${{ matrix.os }} 56 | needs: check_build_trigger 57 | if: needs.check_build_trigger.outputs.build 58 | strategy: 59 | # Ensure that a wheel builder finishes even if another fails 60 | fail-fast: false 61 | matrix: 62 | 
os: [macos-latest, ubuntu-latest, windows-latest] 63 | python-version: ["3.13"] 64 | env: 65 | CIBW_BUILD: "" # blank, let cibuildwheel build all supported platforms 66 | 67 | steps: 68 | - name: Checkout source code 69 | uses: actions/checkout@v4 70 | 71 | - name: Setup Python 72 | uses: actions/setup-python@v5 73 | with: 74 | python-version: ${{ matrix.python-version }} 75 | cache: "pip" 76 | - name: Install Python dependencies 77 | run: python -m pip install --upgrade pip 78 | 79 | - name: Setup Rust toolchain 80 | if: startsWith(matrix.os, 'ubuntu-') == false 81 | uses: actions-rust-lang/setup-rust-toolchain@v1 82 | # For Linux, Rust will be installed inside a cibuildwheel container later 83 | 84 | - name: Setup rustup target 85 | if: startsWith(matrix.os, 'macos-') 86 | run: rustup target add x86_64-apple-darwin 87 | # For cross-compile x86 on GitHub arm64 runner 88 | 89 | - name: Build Python wheels 90 | uses: pypa/cibuildwheel@v2.21.3 91 | with: 92 | package-dir: nlpo3-python 93 | output-dir: wheelhouse 94 | env: 95 | CIBW_BUILD_VERBOSITY: 1 96 | # See CIBW_BUILD, CIBW_SKIP, CIBW_ARCHS and other build selectors at: 97 | # https://cibuildwheel.readthedocs.io/en/stable/options/#build-skip 98 | CIBW_SKIP: "*-musllinux_i686" 99 | CIBW_ARCHS_MACOS: "x86_64 arm64" 100 | CIBW_ENVIRONMENT_MACOS: | 101 | MACOSX_DEPLOYMENT_TARGET=10.9 102 | PATH="$HOME/.cargo/bin:$PATH" 103 | CC=/usr/bin/clang 104 | CXX=/usr/bin/clang++ 105 | CIBW_ARCHS_LINUX: "auto" 106 | CIBW_ENVIRONMENT_LINUX: PATH="$HOME/.cargo/bin:$PATH" 107 | CIBW_BEFORE_BUILD_LINUX: | 108 | pip install --upgrade setuptools-rust 109 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 110 | # It is needed to install Rust for Linux, 111 | # because cibuildwheel on Linux runs inside a container 112 | # and the container does not have Rust. 
113 | CIBW_ARCHS_WINDOWS: "AMD64 x86" 114 | 115 | - name: Store artifacts 116 | uses: actions/upload-artifact@v3 117 | with: 118 | path: ./wheelhouse/*.whl 119 | 120 | build_sdist: 121 | name: Build source distribution 122 | runs-on: ubuntu-latest 123 | needs: check_build_trigger 124 | if: needs.check_build_trigger.outputs.build 125 | steps: 126 | - name: Checkout source code 127 | uses: actions/checkout@v4 128 | - name: Setup Python 129 | uses: actions/setup-python@v5 130 | with: 131 | python-version: "3.13" 132 | cache: "pip" 133 | - name: Build source distribution 134 | run: | 135 | cd nlpo3-python 136 | bash ../build_tools/github/build_source.sh 137 | - name: Store artifacts 138 | uses: actions/upload-artifact@v3 139 | with: 140 | path: nlpo3-python/dist/*.tar.gz 141 | 142 | publish_pypi: 143 | name: Publish to PyPI 144 | runs-on: ubuntu-latest 145 | needs: [build_wheels, build_sdist] 146 | # Publish when a GitHub Release is created: 147 | if: github.event_name == 'release' && github.event.action == 'published' 148 | # Alternatively, upload to PyPI on every tag starting with 'v': 149 | #if: github.event_name == 'push' && startsWith(github.event.ref, 'v') 150 | steps: 151 | - name: Retrieve artifacts 152 | uses: actions/download-artifact@v3 153 | with: 154 | name: artifact 155 | path: dist 156 | - name: Publish package 157 | uses: pypa/gh-action-pypi-publish@v1.12.2 158 | with: 159 | skip-existing: true 160 | user: __token__ 161 | password: ${{ secrets.PYPI_API_TOKEN }} 162 | -------------------------------------------------------------------------------- /nlpo3-nodejs/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Basic Options */ 4 | // "incremental": true, /* Enable incremental compilation */ 5 | "target": "es6", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019' or 'ESNEXT'. 
*/ 6 | "module": "commonjs", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', or 'ESNext'. */ 7 | // "lib": [], /* Specify library files to be included in the compilation. */ 8 | // "allowJs": true, /* Allow javascript files to be compiled. */ 9 | // "checkJs": true, /* Report errors in .js files. */ 10 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ 11 | // "declaration": true, /* Generates corresponding '.d.ts' file. */ 12 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 13 | // "sourceMap": true, /* Generates corresponding '.map' file. */ 14 | // "outFile": "./", /* Concatenate and emit output to single file. */ 15 | // "outDir": "./build", /* Redirect output structure to the directory. */ 16 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ 17 | // "composite": true, /* Enable project compilation */ 18 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 19 | // "removeComments": true, /* Do not emit comments to output. */ 20 | // "noEmit": true, /* Do not emit outputs. */ 21 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */ 22 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 23 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 24 | 25 | /* Strict Type-Checking Options */ 26 | "strict": true, /* Enable all strict type-checking options. */ 27 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 28 | // "strictNullChecks": true, /* Enable strict null checks. */ 29 | // "strictFunctionTypes": true, /* Enable strict checking of function types. 
*/ 30 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 31 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 32 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 33 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 34 | 35 | /* Additional Checks */ 36 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 37 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 38 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ 39 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 40 | 41 | /* Module Resolution Options */ 42 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 43 | "baseUrl": "nlpo3", /* Base directory to resolve non-absolute module names. */ 44 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ 45 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 46 | // "typeRoots": [], /* List of folders to include type definitions from. */ 47 | // "types": [], /* Type declaration files to be included in compilation. */ 48 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ 49 | "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ 50 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 51 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. 
*/ 52 | 53 | /* Source Map Options */ 54 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 55 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 56 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 57 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 58 | 59 | /* Experimental Options */ 60 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ 61 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ 62 | 63 | /* Advanced Options */ 64 | "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */ 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /nlpo3-python/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | SPDX-FileCopyrightText: 2024 PyThaiNLP Project 3 | SPDX-License-Identifier: Apache-2.0 4 | --- 5 | 6 | # nlpO3 Python binding 7 | 8 | [![PyPI](https://img.shields.io/pypi/v/nlpo3.svg "PyPI")](https://pypi.python.org/pypi/nlpo3) 9 | [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg "Python 3.7")](https://www.python.org/downloads/) 10 | [![Apache-2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg "Apache-2.0")](https://opensource.org/license/apache-2-0) 11 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14082448.svg)](https://doi.org/10.5281/zenodo.14082448) 12 | 13 | Python binding for nlpO3, a Thai natural language processing library in Rust. 
14 | 15 | To install: 16 | 17 | ```bash 18 | pip install nlpo3 19 | ``` 20 | 21 | ## Table of Contents 22 | 23 | - [Features](#features) 24 | - [Use](#use) 25 | - [Dictionary](#dictionary) 26 | - [Build](#build) 27 | - [Issues](#issues) 28 | - [License](#license) 29 | - [Binary wheels](#binary-wheels) 30 | 31 | ## Features 32 | 33 | - Thai word tokenizer 34 | - `segment()` - use maximal-matching dictionary-based tokenization algorithm 35 | and honor [Thai Character Cluster][tcc] boundaries 36 | - [2.5x faster][benchmark] 37 | than similar pure Python implementation (PyThaiNLP's newmm) 38 | - `load_dict()` - load a dictionary from a plain text file 39 | (one word per line) 40 | 41 | [tcc]: https://dl.acm.org/doi/10.1145/355214.355225 42 | [benchmark]: ./notebooks/nlpo3_segment_benchmarks.ipynb 43 | 44 | ## Use 45 | 46 | Load file `path/to/dict.file` to memory 47 | and assign a name `dict_name` to it. 48 | 49 | Then tokenize a text with the `dict_name` dictionary: 50 | 51 | ```python 52 | from nlpo3 import load_dict, segment 53 | 54 | load_dict("path/to/dict.file", "dict_name") 55 | segment("สวัสดีครับ", "dict_name") 56 | ``` 57 | 58 | It will return a list of strings: 59 | 60 | ```python 61 | ['สวัสดี', 'ครับ'] 62 | ``` 63 | 64 | (result depends on words included in the dictionary) 65 | 66 | Use multithread mode, also use the `dict_name` dictionary: 67 | 68 | ```python 69 | segment("สวัสดีครับ", dict_name="dict_name", parallel=True) 70 | ``` 71 | 72 | Use safe mode to avoid long waiting time in some edge cases 73 | for text with lots of ambiguous word boundaries: 74 | 75 | ```python 76 | segment("สวัสดีครับ", dict_name="dict_name", safe=True) 77 | ``` 78 | 79 | ### Dictionary 80 | 81 | - For the interest of library size, nlpO3 does not assume what dictionary the 82 | user would like to use, and it does not come with a dictionary. 83 | - A dictionary is needed for the dictionary-based word tokenizer.
84 | - For tokenization dictionary, try 85 | - [words_th.txt][dict-pythainlp] from [PyThaiNLP][pythainlp] 86 | - ~62,000 words 87 | - CC0-1.0 88 | - [word break dictionary][dict-libthai] from [libthai][libthai] 89 | - consists of dictionaries in different categories, with a make script 90 | - LGPL-2.1 91 | 92 | [pythainlp]: https://github.com/PyThaiNLP/pythainlp 93 | [libthai]: https://github.com/tlwg/libthai/ 94 | [dict-pythainlp]: https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt 95 | [dict-libthai]: https://github.com/tlwg/libthai/tree/master/data 96 | 97 | ## Build 98 | 99 | ### Requirements 100 | 101 | - [Rust 2018 Edition](https://www.rust-lang.org/tools/install) 102 | - Python 3.7 or newer (PyO3's minimum supported version) 103 | - Python Development Headers 104 | - Ubuntu: `sudo apt-get install python3-dev` 105 | - macOS: No action needed 106 | - [PyO3](https://github.com/PyO3/pyo3) - already included in `Cargo.toml` 107 | - [setuptools-rust](https://github.com/PyO3/setuptools-rust) 108 | 109 | ### Steps 110 | 111 | ```bash 112 | python -m pip install --upgrade build 113 | python -m build 114 | ``` 115 | 116 | This should generate a wheel file, in `dist/` directory, 117 | which can be installed by pip. 118 | 119 | To install a wheel from a local directory: 120 | 121 | ```bash 122 | pip install dist/nlpo3-1.3.1-cp311-cp311-macosx_12_0_x86_64.whl 123 | ``` 124 | 125 | ### Test 126 | 127 | To run a Python unit test: 128 | 129 | ```bash 130 | cd tests 131 | python -m unittest 132 | ``` 133 | 134 | ## Issues 135 | 136 | Please report issues at <https://github.com/PyThaiNLP/nlpo3/issues> 137 | 138 | ## License 139 | 140 | nlpO3 Python binding is copyrighted by its authors 141 | and licensed under terms of the Apache Software License 2.0 (Apache-2.0). 142 | See file [LICENSE](./LICENSE) for details.
143 | 144 | ## Binary wheels 145 | 146 | A pre-built binary package is available from [PyPI][pypi] for these platforms: 147 | 148 | [pypi]: https://pypi.org/project/nlpo3/ 149 | 150 | |Python|OS|Architecture|Has binary wheel?| 151 | |-|-|-|-| 152 | |3.13|Windows|x86|✅| 153 | ||Windows|AMD64|✅| 154 | ||macOS|x86_64|✅| 155 | ||macOS|arm64|✅| 156 | ||manylinux|x86_64|✅| 157 | ||manylinux|i686|✅| 158 | ||musllinux|x86_64|✅| 159 | |3.12|Windows|x86|✅| 160 | ||Windows|AMD64|✅| 161 | ||macOS|x86_64|✅| 162 | ||macOS|arm64|✅| 163 | ||manylinux|x86_64|✅| 164 | ||manylinux|i686|✅| 165 | ||musllinux|x86_64|✅| 166 | |3.11|Windows|x86|✅| 167 | ||Windows|AMD64|✅| 168 | ||macOS|x86_64|✅| 169 | ||macOS|arm64|✅| 170 | ||manylinux|x86_64|✅| 171 | ||manylinux|i686|✅| 172 | ||musllinux|x86_64|✅| 173 | |3.10|Windows|x86|✅| 174 | ||Windows|AMD64|✅| 175 | ||macOS|x86_64|✅| 176 | ||macOS|arm64|✅| 177 | ||manylinux|x86_64|✅| 178 | ||manylinux|i686|✅| 179 | ||musllinux|x86_64|✅| 180 | |3.9|Windows|x86|✅| 181 | ||Windows|AMD64|✅| 182 | ||macOS|x86_64|✅| 183 | ||macOS|arm64|✅| 184 | ||manylinux|x86_64|✅| 185 | ||manylinux|i686|✅| 186 | ||musllinux|x86_64|✅| 187 | |3.8|Windows|x86|✅| 188 | ||Windows|AMD64|✅| 189 | ||macOS|x86_64|✅| 190 | ||macOS|arm64|✅| 191 | ||manylinux|x86_64|✅| 192 | ||manylinux|i686|✅| 193 | ||musllinux|x86_64|✅| 194 | |3.7|Windows|x86|✅| 195 | ||Windows|AMD64|✅| 196 | ||macOS|x86_64|✅| 197 | ||macOS|arm64|❌| 198 | ||manylinux|x86_64|✅| 199 | ||manylinux|i686|✅| 200 | ||musllinux|x86_64|✅| 201 | |PyPy 3.10|Windows|x86|❌| 202 | ||Windows|AMD64|✅| 203 | ||macOS|x86_64|✅| 204 | ||macOS|arm64|✅| 205 | ||manylinux|x86_64|✅| 206 | ||manylinux|i686|✅| 207 | |PyPy 3.9|Windows|x86|❌| 208 | ||Windows|AMD64|✅| 209 | ||macOS|x86_64|✅| 210 | ||macOS|arm64|✅| 211 | ||manylinux|x86_64|✅| 212 | ||manylinux|i686|✅| 213 | |PyPy 3.8|Windows|x86|❌| 214 | ||Windows|AMD64|✅| 215 | ||macOS|x86_64|✅| 216 | ||macOS|arm64|✅| 217 | ||manylinux|x86_64|✅| 218 | ||manylinux|i686|✅| 219 | |PyPy 
3.7|Windows|x86|❌| 220 | ||Windows|AMD64|✅| 221 | ||macOS|x86_64|✅| 222 | ||macOS|arm64|❌| 223 | ||manylinux|x86_64|✅| 224 | ||manylinux|i686|✅| 225 | -------------------------------------------------------------------------------- /src/tokenizer/trie_char.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | /** 5 | * This module is meant to be a direct implementation of Dict Trie in PyThaiNLP. 6 | * 7 | * Many functions are implemented as a recursive function 8 | * because of the limits imposed by Rust Borrow Checker and 9 | * this author's (Thanathip) little experience. 10 | * 11 | * Rust Code: Thanathip Suntorntip (Gorlph) 12 | * 13 | * For basic information of trie, visit this wikipedia page 14 | * https://en.wikipedia.org/wiki/Trie 15 | */ 16 | use crate::four_bytes_str::custom_string::{ 17 | CustomString, CustomStringBytesSlice, CustomStringBytesVec, FixedCharsLengthByteSlice, 18 | }; 19 | 20 | use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; 21 | use std::borrow::BorrowMut; 22 | 23 | #[derive(Debug)] 24 | struct TrieNode { 25 | /// 26 | children: HashMap, 27 | end: bool, 28 | } 29 | 30 | impl Default for TrieNode { 31 | fn default() -> Self { 32 | Self::new() 33 | } 34 | } 35 | 36 | impl TrieNode { 37 | pub fn new() -> Self { 38 | Self { 39 | children: HashMap::default(), 40 | end: false, 41 | } 42 | } 43 | 44 | fn find_child(&self, word: &char) -> Option<&Self> { 45 | self.children.get(word) 46 | } 47 | 48 | #[allow(dead_code)] 49 | fn remove_child(&mut self, word: &char) { 50 | self.children.remove(word); 51 | } 52 | 53 | #[allow(dead_code)] 54 | fn find_mut_child(&mut self, word: &char) -> Option<&mut Self> { 55 | self.children.get_mut(word) 56 | } 57 | 58 | #[allow(dead_code)] 59 | fn set_not_end(&mut self) { 60 | self.end = false; 61 | } 62 | 63 | fn add_word(&mut self, input_word: &CustomString) { 64 | // 
thanks to https://stackoverflow.com/questions/36957286/how-do-you-implement-this-simple-trie-node-in-rust 65 | if input_word.is_empty() { 66 | self.end = true; 67 | return; 68 | } 69 | self.children 70 | .entry(*input_word.get_chars_content().first().unwrap()) 71 | .or_insert_with(TrieNode::new) 72 | .add_word(&input_word.substring(1, input_word.chars_len())); 73 | } 74 | 75 | fn remove_word(&mut self, input_word: &CustomString) { 76 | let mut word = input_word; 77 | let char_count = word.chars_len(); 78 | // if has at least 1 char 79 | if char_count >= 1 { 80 | let character = word.get_chars_content().first().unwrap(); 81 | if let Some(child) = self.find_mut_child(character) { 82 | // move 1 character 83 | let substring_of_word = word.substring(1, word.chars_len()); 84 | if char_count == 1 { 85 | child.set_not_end(); 86 | } 87 | word = &substring_of_word; 88 | child.remove_word(word); 89 | if !child.end && child.children.is_empty() { 90 | self.remove_child(character); 91 | } 92 | }; 93 | } 94 | } 95 | } 96 | 97 | #[derive(Debug)] 98 | /// This version of Trie still stores custom bytes vector as words, 99 | /// but prefix operation and its node uses char instead. 
100 | pub struct TrieChar { 101 | words: HashSet, 102 | root: TrieNode, 103 | } 104 | 105 | impl TrieChar { 106 | pub fn new(words: &[CustomString]) -> Self { 107 | let mut instance = Self { 108 | words: HashSet::default(), 109 | root: TrieNode::new(), 110 | }; 111 | for word in words.iter() { 112 | instance.add(word); 113 | } 114 | instance 115 | } 116 | 117 | #[allow(dead_code)] 118 | fn remove_word_from_set(&mut self, word: &CustomString) { 119 | self.words.remove(word.raw_content()); 120 | } 121 | 122 | pub fn add(&mut self, word: &CustomString) { 123 | let stripped_word = word.trim(); 124 | if !stripped_word.is_empty() { 125 | self.words.insert(stripped_word.raw_content().into()); 126 | let current_cursor = self.root.borrow_mut(); 127 | current_cursor.add_word(&stripped_word); 128 | } 129 | } 130 | 131 | pub fn remove(&mut self, word: &CustomString) { 132 | let stripped_word = word.trim(); 133 | if !stripped_word.is_empty() && self.words.contains(stripped_word.raw_content()) { 134 | self.remove_word_from_set(&stripped_word); 135 | self.root.remove_word(&stripped_word); // remove from node 136 | } 137 | } 138 | #[allow(dead_code)] 139 | pub fn contain(&self, word: &CustomString) -> bool { 140 | self.words.contains(word.raw_content()) 141 | } 142 | #[allow(dead_code)] 143 | pub fn iterate(&self) -> std::collections::hash_set::Iter<'_, Vec> { 144 | self.words.iter() 145 | } 146 | #[allow(dead_code)] 147 | pub fn amount_of_words(&self) -> usize { 148 | self.words.len() 149 | } 150 | /// Returns a vec of substring (as reference) as produced by words stored in dict_trie. 
151 | pub fn prefix_ref<'p>( 152 | prefix: &'p CustomString, 153 | dict_trie: &Self, 154 | ) -> Vec<&'p CustomStringBytesSlice> { 155 | let mut result: Vec<&[u8]> = vec![]; 156 | let prefix_cpy = prefix; 157 | let mut current_index = 0; 158 | let mut current_node_wrap = Some(&dict_trie.root); 159 | while current_index < prefix_cpy.chars_len() { 160 | let character = prefix_cpy.get_char_at(current_index); 161 | if let Some(current_node) = current_node_wrap { 162 | if let Some(child) = current_node.find_child(&character) { 163 | if child.end { 164 | let substring_of_prefix = prefix_cpy 165 | .raw_content() 166 | .slice_by_char_indice(0, current_index + 1); 167 | result.push(substring_of_prefix); 168 | } 169 | current_node_wrap = Some(child); 170 | } else { 171 | break; 172 | } 173 | } 174 | current_index = current_index + 1; 175 | } 176 | result 177 | } 178 | } 179 | 180 | #[test] 181 | fn test_add_and_remove_word() { 182 | let mut trie = TrieChar::new(&[CustomString::new("ศาล")]); 183 | assert_eq!(trie.amount_of_words(), 1); 184 | trie.add(&CustomString::new("ศาล")); 185 | assert_eq!(trie.amount_of_words(), 1); 186 | trie.add(&CustomString::new(" ศาล ")); 187 | assert_eq!(trie.amount_of_words(), 1); 188 | trie.add(&CustomString::new("ศาลา")); 189 | assert_eq!(trie.amount_of_words(), 2); 190 | trie.remove(&CustomString::new("ศาลา")); 191 | assert_eq!(trie.amount_of_words(), 1); 192 | trie.remove(&CustomString::new("ลา")); 193 | assert_eq!(trie.amount_of_words(), 1); 194 | trie.remove(&CustomString::new("ศาล")); 195 | assert_eq!(trie.amount_of_words(), 0); 196 | trie.remove(&CustomString::new("")); 197 | assert_eq!(trie.amount_of_words(), 0); 198 | } 199 | -------------------------------------------------------------------------------- /src/tokenizer/tcc/tcc_rules.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | /** 5 | * Rules 
for TCC (Thai Character Cluster) tokenization. 6 | */ 7 | use crate::four_bytes_str::custom_regex::regex_pattern_to_custom_pattern; 8 | use lazy_static::lazy_static; 9 | use regex::bytes::Regex; 10 | // เc็ck 1 11 | // เcctาะk 2 12 | // เccีtยะk 3 13 | // เccีtย(?=[เ-ไก-ฮ]|$)k look ahead 1 14 | // เcc็ck 4 15 | // เcิc์ck 5 16 | // เcิtck 6 17 | // เcีtยะ?k 7 18 | // เcืtอะ?k 8 19 | // เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)k look ahead 2 20 | // เctา?ะ?k 9 21 | // cัtวะk 10 22 | // c[ัื]tc[ุิะ]?k 11 23 | // c[ิุู]์ 12 24 | // c[ะ-ู]tk 13 25 | // cรรc์ 14 26 | // c็ 15 27 | // ct[ะาำ]?k 16 28 | // ck 17 29 | // แc็c 18 30 | // แcc์ 19 31 | // แctะ 20 32 | // แcc็c 21 33 | // แccc์ 22 34 | // โctะ 23 35 | // [เ-ไ]ct 24 36 | // ก็ 37 | // อึ 38 | // หึ 39 | pub fn replace_tcc_symbol(tcc_pattern: &str) -> String { 40 | tcc_pattern 41 | .replace('k', "(cc?[dิ]?[์])?") 42 | .replace('c', "[ก-ฮ]") 43 | .replace('t', "[่-๋]?") 44 | .replace('d', &"อูอุ".replace('อ', "")) 45 | } 46 | lazy_static! { 47 | pub static ref NON_LOOKAHEAD_TCC: Regex = Regex::new( 48 | &[ 49 | r"^เc็ck", // 1 50 | r"^เcctาะk", // 2 51 | r"^เccีtยะk", // 3 52 | r"^เcc็ck", // 4 53 | r"^เcิc์ck", // 5 54 | r"^เcิtck", // 6 55 | r"^เcีtยะ?k", // 7 56 | r"^เcืtอะ?k", // 8 57 | r"^เctา?ะ?k", // 9 58 | r"^cัtวะk", // 10 59 | r"^c[ัื]tc[ุิะ]?k", // 11 60 | r"^c[ิุู]์k", // 12 61 | r"^c[ะ-ู]tk", // 13 62 | r"^cรรc์ ็", // 14 63 | r"^c็", // 15 64 | r"^ct[ะาำ]?k", // 16 65 | r"^ck", // 17 66 | r"^แc็c", // 18 67 | r"^แcc์", // 19 68 | r"^แctะ", // 20 69 | r"^แcc็c", // 21 70 | r"^แccc์", // 22 71 | r"^โctะ", // 23 72 | r"^[เ-ไ]ct", // 24 73 | r"^ก็", 74 | r"^อึ", 75 | r"^หึ", 76 | r"^(เccีtย)[เ-ไก-ฮ]k", // look ahead 1 77 | r"^(เc[ิีุู]tย)[เ-ไก-ฮ]k", // look ahead 2 78 | ].map(|pattern| { 79 | regex_pattern_to_custom_pattern(&replace_tcc_symbol(pattern)).unwrap() 80 | }).join("|") 81 | ).unwrap(); 82 | pub static ref LOOKAHEAD_TCC: Regex = Regex::new( 83 | &[ 84 | r"^(เccีtย)[เ-ไก-ฮ]k", //เccีtย(?=[เ-ไก-ฮ]|$) 85 | 
r"^(เc[ิีุู]tย)[เ-ไก-ฮ]k" //เc[ิีุู]tย(?=[เ-ไก-ฮ]|$) 86 | ].map(|pattern| { 87 | regex_pattern_to_custom_pattern(&replace_tcc_symbol(pattern)).unwrap() 88 | }).join("|") 89 | ) 90 | .unwrap(); 91 | } 92 | 93 | #[test] 94 | fn tcc_regex_test_cases() { 95 | // เc็c 1 1 96 | // เcctาะ 2 2 97 | // เccีtยะ 3 3 98 | // เcc็c 4 4 99 | // เcิc์c 5 5 100 | // เcิtc 6 6 101 | // เcีtยะ? 7 7 102 | // เcืtอะ? 8 8 103 | // เctา?ะ? 9 9 104 | // cัtวะ 10 105 | // c[ัื]tc[ุิะ]? 11 106 | // c[ิุู]์ 12 107 | // c[ะ-ู]t 13 108 | // c็ 14 109 | // ct[ะาำ]? 15 110 | // แc็c 16 111 | // แcc์ 17 112 | // แctะ 18 113 | // แcc็c 19 114 | // แccc์ 20 115 | // โctะ 21 116 | // [เ-ไ]ct 22 117 | 118 | let case_1 = replace_tcc_symbol("^เc็ck"); 119 | let case_2 = replace_tcc_symbol("^เcctาะ"); 120 | let case_3 = replace_tcc_symbol("^เccีtยะ"); 121 | let case_4 = replace_tcc_symbol("^เcc็c"); 122 | let case_5 = replace_tcc_symbol("^เcิc์c"); 123 | let case_6 = replace_tcc_symbol("^เcิtc"); 124 | let case_7 = replace_tcc_symbol("^เcีtยะ?"); 125 | let case_8 = replace_tcc_symbol("^เcืtอะ?"); 126 | let case_9 = replace_tcc_symbol("^เctา?ะ?"); 127 | let case_10 = replace_tcc_symbol("^cัtวะ"); 128 | let case_11 = replace_tcc_symbol("^c[ัื]tc[ุิะ]?"); 129 | let case_12 = replace_tcc_symbol("^c[ิุู]์"); 130 | let case_13 = replace_tcc_symbol("^c[ะ-ู]t"); 131 | let case_14 = replace_tcc_symbol("^c็"); 132 | let case_15 = replace_tcc_symbol("^ct[ะาำ]?"); 133 | let case_16 = replace_tcc_symbol("^แc็c"); 134 | let case_17 = replace_tcc_symbol("^แcc์"); 135 | let case_18 = replace_tcc_symbol("^แctะ"); 136 | let case_19 = replace_tcc_symbol("^แcc็c"); 137 | let case_20 = replace_tcc_symbol("^แccc์"); 138 | let case_21 = replace_tcc_symbol("^โctะ"); 139 | let case_22 = replace_tcc_symbol("^[เ-ไ]ct"); 140 | 141 | // This is the only Karan case. 142 | assert_eq!( 143 | regex_pattern_to_custom_pattern(&case_1).unwrap(), 144 | r"^\x00เ\x00[ก-ฮ]\x00็\x00[ก-ฮ](\x00[ก-ฮ](\x00[ก-ฮ])?(\x00[ิุ-ู])?\x00[์])?" 
145 | ); 146 | assert_eq!( 147 | regex_pattern_to_custom_pattern(&case_2).unwrap(), 148 | r"^\x00เ\x00[ก-ฮ]\x00[ก-ฮ](\x00[่-๋])?\x00า\x00ะ" 149 | ); 150 | 151 | assert_eq!( 152 | regex_pattern_to_custom_pattern(&case_3).unwrap(), 153 | r"^\x00เ\x00[ก-ฮ]\x00[ก-ฮ]\x00ี(\x00[่-๋])?\x00ย\x00ะ" 154 | ); 155 | 156 | assert_eq!( 157 | regex_pattern_to_custom_pattern(&case_4).unwrap(), 158 | r"^\x00เ\x00[ก-ฮ]\x00[ก-ฮ]\x00็\x00[ก-ฮ]" 159 | ); 160 | assert_eq!( 161 | regex_pattern_to_custom_pattern(&case_5).unwrap(), 162 | r"^\x00เ\x00[ก-ฮ]\x00ิ\x00[ก-ฮ]\x00์\x00[ก-ฮ]" 163 | ); 164 | assert_eq!( 165 | regex_pattern_to_custom_pattern(&case_6).unwrap(), 166 | r"^\x00เ\x00[ก-ฮ]\x00ิ(\x00[่-๋])?\x00[ก-ฮ]" 167 | ); 168 | assert_eq!( 169 | regex_pattern_to_custom_pattern(&case_7).unwrap(), 170 | r"^\x00เ\x00[ก-ฮ]\x00ี(\x00[่-๋])?\x00ย(\x00ะ)?" 171 | ); 172 | assert_eq!( 173 | regex_pattern_to_custom_pattern(&case_8).unwrap(), 174 | r"^\x00เ\x00[ก-ฮ]\x00ื(\x00[่-๋])?\x00อ(\x00ะ)?" 175 | ); 176 | assert_eq!( 177 | regex_pattern_to_custom_pattern(&case_9).unwrap(), 178 | r"^\x00เ\x00[ก-ฮ](\x00[่-๋])?(\x00า)?(\x00ะ)?" 179 | ); 180 | 181 | assert_eq!( 182 | regex_pattern_to_custom_pattern(&case_10).unwrap(), 183 | r"^\x00[ก-ฮ]\x00ั(\x00[่-๋])?\x00ว\x00ะ" 184 | ); 185 | assert_eq!( 186 | regex_pattern_to_custom_pattern(&case_11).unwrap(), 187 | r"^\x00[ก-ฮ]\x00[ัื](\x00[่-๋])?\x00[ก-ฮ](\x00[ะิุ])?" 188 | ); 189 | assert_eq!( 190 | regex_pattern_to_custom_pattern(&case_12).unwrap(), 191 | r"^\x00[ก-ฮ]\x00[ิุ-ู]\x00์" 192 | ); 193 | assert_eq!( 194 | regex_pattern_to_custom_pattern(&case_13).unwrap(), 195 | r"^\x00[ก-ฮ]\x00[ะ-ู](\x00[่-๋])?" 196 | ); 197 | assert_eq!( 198 | regex_pattern_to_custom_pattern(&case_14).unwrap(), 199 | r"^\x00[ก-ฮ]\x00็" 200 | ); 201 | assert_eq!( 202 | regex_pattern_to_custom_pattern(&case_15).unwrap(), 203 | r"^\x00[ก-ฮ](\x00[่-๋])?(\x00[ะา-ำ])?" 
204 | ); 205 | assert_eq!( 206 | regex_pattern_to_custom_pattern(&case_16).unwrap(), 207 | r"^\x00แ\x00[ก-ฮ]\x00็\x00[ก-ฮ]" 208 | ); 209 | assert_eq!( 210 | regex_pattern_to_custom_pattern(&case_17).unwrap(), 211 | r"^\x00แ\x00[ก-ฮ]\x00[ก-ฮ]\x00์" 212 | ); 213 | assert_eq!( 214 | regex_pattern_to_custom_pattern(&case_18).unwrap(), 215 | r"^\x00แ\x00[ก-ฮ](\x00[่-๋])?\x00ะ" 216 | ); 217 | assert_eq!( 218 | regex_pattern_to_custom_pattern(&case_19).unwrap(), 219 | r"^\x00แ\x00[ก-ฮ]\x00[ก-ฮ]\x00็\x00[ก-ฮ]" 220 | ); 221 | assert_eq!( 222 | regex_pattern_to_custom_pattern(&case_20).unwrap(), 223 | r"^\x00แ\x00[ก-ฮ]\x00[ก-ฮ]\x00[ก-ฮ]\x00์" 224 | ); 225 | assert_eq!( 226 | regex_pattern_to_custom_pattern(&case_21).unwrap(), 227 | r"^\x00โ\x00[ก-ฮ](\x00[่-๋])?\x00ะ" 228 | ); 229 | assert_eq!( 230 | regex_pattern_to_custom_pattern(&case_22).unwrap(), 231 | r"^\x00[เ-ไ]\x00[ก-ฮ](\x00[่-๋])?" 232 | ); 233 | 234 | let look_ahead_case_1 = replace_tcc_symbol(r"^(เccีtย)[เ-ไก-ฮ]"); 235 | let look_ahead_1_regex = regex_pattern_to_custom_pattern(&look_ahead_case_1).unwrap(); 236 | let look_ahead_case_2 = replace_tcc_symbol(r"^(เc[ิีุู]tย)[เ-ไก-ฮ]"); 237 | let look_ahead_2_regex = regex_pattern_to_custom_pattern(&look_ahead_case_2).unwrap(); 238 | assert!( 239 | (look_ahead_1_regex == r"^(\x00เ\x00[ก-ฮ]\x00[ก-ฮ]\x00ี(\x00[่-๋])?\x00ย)\x00[เ-ไก-ฮ]" 240 | || look_ahead_1_regex == r"^(\x00เ\x00[ก-ฮ]\x00[ก-ฮ]\x00ี(\x00[่-๋])?\x00ย)\x00[ก-ฮเ-ไ]") 241 | ); 242 | assert_eq!( 243 | look_ahead_2_regex, 244 | r"^(\x00เ\x00[ก-ฮ]\x00[ิ-ีุ-ู](\x00[่-๋])?\x00ย)\x00[ก-ฮเ-ไ]" 245 | ); 246 | } 247 | 248 | #[test] 249 | fn newmm_exception_match_cases() { 250 | assert_eq!( 251 | r"^(\x00\x00\x00\r)?\x00\x00\x00\n", 252 | regex_pattern_to_custom_pattern(r"(?x)^\r?\n").unwrap() 253 | ); 254 | 255 | assert_eq!( 256 | r"^(\x00\x00\x00[\t ])+", 257 | regex_pattern_to_custom_pattern(r"^[ \t]+").unwrap() 258 | ); 259 | assert_eq!( 260 | r"^(\x00\x00\x00[\-A-Za-z])+", 261 | 
regex_pattern_to_custom_pattern(r"(?x)^[-a-zA-Z]+").unwrap() 262 | ); 263 | assert_eq!( 264 | r"^(\x00[๐-๙])+(\x00\x00\x00[,\.](\x00[๐-๙])+)*", 265 | regex_pattern_to_custom_pattern(r"(?x)^[๐-๙]+([,\.][๐-๙]+)*").unwrap() 266 | ); 267 | assert_eq!( 268 | r"^(\x00\x00\x00[0-9])+(\x00\x00\x00[,\.](\x00\x00\x00[0-9])+)*", 269 | regex_pattern_to_custom_pattern(r"(?x)^[0-9]+([,\.][0-9]+)*").unwrap() 270 | ); 271 | assert_eq!( 272 | r"^(\x00[ก-ฮ]){0,2}$", 273 | regex_pattern_to_custom_pattern(r"^[ก-ฮ]{0,2}$").unwrap() 274 | ) 275 | } 276 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /nlpo3-cli/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /nlpo3-nodejs/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /nlpo3-python/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /nlpo3-python/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "ahash" 7 | version = "0.8.11" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" 10 | dependencies = [ 11 | "cfg-if", 12 | "getrandom", 13 | "once_cell", 14 | "version_check", 15 | "zerocopy", 16 | ] 17 | 18 | [[package]] 19 | name = "aho-corasick" 20 | version = "0.7.18" 21 | source = "registry+https://github.com/rust-lang/crates.io-index" 22 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" 23 | dependencies = [ 24 | "memchr", 25 | ] 26 | 27 | [[package]] 28 | name = "anyhow" 29 | version = "1.0.45" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "ee10e43ae4a853c0a3591d4e2ada1719e553be18199d9da9d4a83f5927c2f5c7" 32 | 33 | [[package]] 34 | name = "autocfg" 35 | version = "1.0.1" 36 | source = "registry+https://github.com/rust-lang/crates.io-index" 37 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 38 | 39 | [[package]] 40 | name = "binary-heap-plus" 41 | version = "0.4.1" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "4f068638f8ff9e118a9361e66a411eff410e7fb3ecaa23bf9272324f8fc606d7" 44 | dependencies = [ 45 | "compare", 46 | ] 47 | 48 | [[package]] 49 | name = "bytecount" 50 | version = "0.6.2" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" 53 | 54 | [[package]] 55 | name = "cfg-if" 56 | version = "1.0.0" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 59 | 60 | [[package]] 61 | name = "compare" 62 | version = "0.1.0" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "120133d4db2ec47efe2e26502ee984747630c67f51974fca0b6c1340cf2368d3" 65 | 
66 | [[package]] 67 | name = "crossbeam-channel" 68 | version = "0.5.1" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" 71 | dependencies = [ 72 | "cfg-if", 73 | "crossbeam-utils", 74 | ] 75 | 76 | [[package]] 77 | name = "crossbeam-deque" 78 | version = "0.8.1" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" 81 | dependencies = [ 82 | "cfg-if", 83 | "crossbeam-epoch", 84 | "crossbeam-utils", 85 | ] 86 | 87 | [[package]] 88 | name = "crossbeam-epoch" 89 | version = "0.9.5" 90 | source = "registry+https://github.com/rust-lang/crates.io-index" 91 | checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" 92 | dependencies = [ 93 | "cfg-if", 94 | "crossbeam-utils", 95 | "lazy_static", 96 | "memoffset 0.6.4", 97 | "scopeguard", 98 | ] 99 | 100 | [[package]] 101 | name = "crossbeam-utils" 102 | version = "0.8.5" 103 | source = "registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" 105 | dependencies = [ 106 | "cfg-if", 107 | "lazy_static", 108 | ] 109 | 110 | [[package]] 111 | name = "either" 112 | version = "1.6.1" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 115 | 116 | [[package]] 117 | name = "getrandom" 118 | version = "0.2.15" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 121 | dependencies = [ 122 | "cfg-if", 123 | "libc", 124 | "wasi", 125 | ] 126 | 127 | [[package]] 128 | name = "heck" 129 | version = "0.5.0" 130 | source = "registry+https://github.com/rust-lang/crates.io-index" 131 | checksum = 
"2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 132 | 133 | [[package]] 134 | name = "hermit-abi" 135 | version = "0.1.19" 136 | source = "registry+https://github.com/rust-lang/crates.io-index" 137 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 138 | dependencies = [ 139 | "libc", 140 | ] 141 | 142 | [[package]] 143 | name = "indoc" 144 | version = "2.0.5" 145 | source = "registry+https://github.com/rust-lang/crates.io-index" 146 | checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" 147 | 148 | [[package]] 149 | name = "lazy_static" 150 | version = "1.5.0" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 153 | 154 | [[package]] 155 | name = "libc" 156 | version = "0.2.162" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" 159 | 160 | [[package]] 161 | name = "memchr" 162 | version = "2.4.1" 163 | source = "registry+https://github.com/rust-lang/crates.io-index" 164 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" 165 | 166 | [[package]] 167 | name = "memoffset" 168 | version = "0.6.4" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" 171 | dependencies = [ 172 | "autocfg", 173 | ] 174 | 175 | [[package]] 176 | name = "memoffset" 177 | version = "0.9.1" 178 | source = "registry+https://github.com/rust-lang/crates.io-index" 179 | checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" 180 | dependencies = [ 181 | "autocfg", 182 | ] 183 | 184 | [[package]] 185 | name = "nlpo3" 186 | version = "1.4.0" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = 
"eb843198757c6b5082c2811352cc159dda13275c1bf41e24bdcffb0554772608" 189 | dependencies = [ 190 | "anyhow", 191 | "binary-heap-plus", 192 | "bytecount", 193 | "lazy_static", 194 | "rayon", 195 | "regex", 196 | "regex-syntax", 197 | "rustc-hash", 198 | ] 199 | 200 | [[package]] 201 | name = "nlpo3-python" 202 | version = "1.3.1" 203 | dependencies = [ 204 | "ahash", 205 | "lazy_static", 206 | "nlpo3", 207 | "pyo3", 208 | ] 209 | 210 | [[package]] 211 | name = "num_cpus" 212 | version = "1.13.0" 213 | source = "registry+https://github.com/rust-lang/crates.io-index" 214 | checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" 215 | dependencies = [ 216 | "hermit-abi", 217 | "libc", 218 | ] 219 | 220 | [[package]] 221 | name = "once_cell" 222 | version = "1.20.2" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" 225 | 226 | [[package]] 227 | name = "portable-atomic" 228 | version = "1.9.0" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" 231 | 232 | [[package]] 233 | name = "proc-macro2" 234 | version = "1.0.89" 235 | source = "registry+https://github.com/rust-lang/crates.io-index" 236 | checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" 237 | dependencies = [ 238 | "unicode-ident", 239 | ] 240 | 241 | [[package]] 242 | name = "pyo3" 243 | version = "0.22.6" 244 | source = "registry+https://github.com/rust-lang/crates.io-index" 245 | checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" 246 | dependencies = [ 247 | "cfg-if", 248 | "indoc", 249 | "libc", 250 | "memoffset 0.9.1", 251 | "once_cell", 252 | "portable-atomic", 253 | "pyo3-build-config", 254 | "pyo3-ffi", 255 | "pyo3-macros", 256 | "unindent", 257 | ] 258 | 259 | [[package]] 260 | name = "pyo3-build-config" 261 | version = "0.22.6" 
262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" 264 | dependencies = [ 265 | "once_cell", 266 | "target-lexicon", 267 | ] 268 | 269 | [[package]] 270 | name = "pyo3-ffi" 271 | version = "0.22.6" 272 | source = "registry+https://github.com/rust-lang/crates.io-index" 273 | checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" 274 | dependencies = [ 275 | "libc", 276 | "pyo3-build-config", 277 | ] 278 | 279 | [[package]] 280 | name = "pyo3-macros" 281 | version = "0.22.6" 282 | source = "registry+https://github.com/rust-lang/crates.io-index" 283 | checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" 284 | dependencies = [ 285 | "proc-macro2", 286 | "pyo3-macros-backend", 287 | "quote", 288 | "syn", 289 | ] 290 | 291 | [[package]] 292 | name = "pyo3-macros-backend" 293 | version = "0.22.6" 294 | source = "registry+https://github.com/rust-lang/crates.io-index" 295 | checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" 296 | dependencies = [ 297 | "heck", 298 | "proc-macro2", 299 | "pyo3-build-config", 300 | "quote", 301 | "syn", 302 | ] 303 | 304 | [[package]] 305 | name = "quote" 306 | version = "1.0.37" 307 | source = "registry+https://github.com/rust-lang/crates.io-index" 308 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 309 | dependencies = [ 310 | "proc-macro2", 311 | ] 312 | 313 | [[package]] 314 | name = "rayon" 315 | version = "1.5.1" 316 | source = "registry+https://github.com/rust-lang/crates.io-index" 317 | checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" 318 | dependencies = [ 319 | "autocfg", 320 | "crossbeam-deque", 321 | "either", 322 | "rayon-core", 323 | ] 324 | 325 | [[package]] 326 | name = "rayon-core" 327 | version = "1.9.1" 328 | source = "registry+https://github.com/rust-lang/crates.io-index" 329 | checksum 
= "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" 330 | dependencies = [ 331 | "crossbeam-channel", 332 | "crossbeam-deque", 333 | "crossbeam-utils", 334 | "lazy_static", 335 | "num_cpus", 336 | ] 337 | 338 | [[package]] 339 | name = "regex" 340 | version = "1.5.4" 341 | source = "registry+https://github.com/rust-lang/crates.io-index" 342 | checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" 343 | dependencies = [ 344 | "aho-corasick", 345 | "memchr", 346 | "regex-syntax", 347 | ] 348 | 349 | [[package]] 350 | name = "regex-syntax" 351 | version = "0.6.25" 352 | source = "registry+https://github.com/rust-lang/crates.io-index" 353 | checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" 354 | 355 | [[package]] 356 | name = "rustc-hash" 357 | version = "1.1.0" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 360 | 361 | [[package]] 362 | name = "scopeguard" 363 | version = "1.1.0" 364 | source = "registry+https://github.com/rust-lang/crates.io-index" 365 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 366 | 367 | [[package]] 368 | name = "syn" 369 | version = "2.0.87" 370 | source = "registry+https://github.com/rust-lang/crates.io-index" 371 | checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" 372 | dependencies = [ 373 | "proc-macro2", 374 | "quote", 375 | "unicode-ident", 376 | ] 377 | 378 | [[package]] 379 | name = "target-lexicon" 380 | version = "0.12.16" 381 | source = "registry+https://github.com/rust-lang/crates.io-index" 382 | checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" 383 | 384 | [[package]] 385 | name = "unicode-ident" 386 | version = "1.0.13" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = 
"e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" 389 | 390 | [[package]] 391 | name = "unindent" 392 | version = "0.2.3" 393 | source = "registry+https://github.com/rust-lang/crates.io-index" 394 | checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" 395 | 396 | [[package]] 397 | name = "version_check" 398 | version = "0.9.5" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 401 | 402 | [[package]] 403 | name = "wasi" 404 | version = "0.11.0+wasi-snapshot-preview1" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 407 | 408 | [[package]] 409 | name = "zerocopy" 410 | version = "0.7.35" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 413 | dependencies = [ 414 | "zerocopy-derive", 415 | ] 416 | 417 | [[package]] 418 | name = "zerocopy-derive" 419 | version = "0.7.35" 420 | source = "registry+https://github.com/rust-lang/crates.io-index" 421 | checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 422 | dependencies = [ 423 | "proc-macro2", 424 | "quote", 425 | "syn", 426 | ] 427 | -------------------------------------------------------------------------------- /nlpo3-python/tests/test_tokenize.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | from typing import List 6 | 7 | from nlpo3 import load_dict, segment 8 | 9 | 10 | class TestTokenizePackage(unittest.TestCase): 11 | def setUp(self): 12 | self.TEXT_1 = "หมอนทองตากลมหูว์MBK39 :.ฉฺ๐๐๓-#™±" 13 | self.TEXT_2 = "ทดสอบ" 14 | 15 | self.LONG_TEXT = ( 16 | "ไต้หวัน (แป่ะเอ๋ยี้: Tâi-oân; 
ไต่อวัน) หรือ ไถวาน " 17 | "(อักษรโรมัน: Taiwan; จีนตัวย่อ: 台湾; จีนตัวเต็ม: 臺灣/台灣; พินอิน: " 18 | "Táiwān; ไถวาน) หรือชื่อทางการว่า สาธารณรัฐจีน (จีนตัวย่อ: 中华民国; " 19 | "จีนตัวเต็ม: 中華民國; พินอิน: Zhōnghuá " 20 | "Mínguó) เป็นรัฐในทวีปเอเชียตะวันออก[7][8][9] ปัจจุบันประกอบด้วย" 21 | "เกาะใหญ่ 5 แห่ง คือ จินเหมิน (金門), ไต้หวัน, เผิงหู (澎湖), หมาจู่ " 22 | "(馬祖), และอูชิว (烏坵) กับทั้งเกาะเล็กเกาะน้อยอีกจำนวนหนึ่ง " 23 | 'ท้องที่ดังกล่าวเรียกรวมกันว่า "พื้นที่ไต้หวัน" (臺灣地區)\n' 24 | "ไต้หวันด้านตะวันตกติดกับจีนแผ่นดินใหญ่ ด้านตะวันออกและตะวันออก" 25 | "เฉียงเหนือติดกับญี่ปุ่น และด้านใต้ติดกับฟิลิปปินส์ กรุงไทเปเป็น" 26 | "เมืองหลวง ส่วนไทเปใหม่เป็นเขตปกครองที่จัดตั้งขึ้นใหม่ กินพื้นที่" 27 | "กรุงไทเปและเป็นเขตซึ่งประชากรหนาแน่นที่สุดในเวลานี้\n" 28 | "เกาะไต้หวันเดิมเป็นที่อยู่ของชนพื้นเมือง และมีชาวจีนจากแผ่นดิน" 29 | "ใหญ่เข้ามาอาศัยร่วมด้วย จนกระทั่งชาววิลันดาและสเปนเดินทางเข้า" 30 | "มาในยุคสำรวจเมื่อศตวรรษที่ 17 และมาตั้งบ้านเรือนกลายเป็นนิคม" 31 | "ใหญ่โต ต่อมาปี 1662 ราชวงศ์หมิงในแผ่นดินใหญ่ถูกราชวงศ์ชิงแทนที่ " 32 | "เจิ้ง เฉิงกง (鄭成功) ขุนศึกหมิง รวมกำลังหนีมาถึงเกาะไต้หวัน " 33 | "และรุกไล่ฝรั่งออกไปได้อย่างราบคาบ เขาจึงตั้งราชอาณาจักรตงหนิง " 34 | '(東寧) ขึ้นบนเกาะเพื่อ "โค่นชิงฟื้นหมิง" แต่ในปี 1683 ราชวงศ์' 35 | "ชิงปราบปรามอาณาจักรตงหนิงและเข้าครอบครองไต้หวันเป็นผลสำเร็จ " 36 | "ไต้หวันจึงกลายเป็นมณฑลหนึ่งของจีน อย่างไรก็ดี ความบาดหมางระหว่าง" 37 | "จีนกับญี่ปุ่นเป็นเหตุให้ญี่ปุ่นได้ไต้หวันไปในปี 1895\n" 38 | "ก่อนเสียไต้หวันคืนแก่จีนหลังสงครามโลกครั้งที่สอง ช่วงนั้น มีการ" 39 | "เปลี่ยนแปลงการปกครองในจีน พรรคก๊กมินตั๋ง ได้เป็นใหญ่ " 40 | "แต่ไม่นานก็เสียทีให้แก่พรรคคอมมิวนิสต์จีน พรรคก๊กมินตั๋งจึงหนี" 41 | "มายังเกาะไต้หวันและสถาปนาสาธารณรัฐจีนขึ้นบนเกาะแยกต่างหาก " 42 | "ส่วนฝ่ายคอมมิวนิสต์จีนที่เป็นฝ่ายได้รับชัยชนะได้สถาปนาสาธารณรัฐ" 43 | "ประชาชนจีนบนแผ่นดินใหญ่ อย่างไรก็ดี จีนยังคงถือว่า ไต้หวันเป็น" 44 | "มณฑลหนึ่งของตน และไต้หวันเองก็ยังมิได้รับการยอมรับจากนานาชาติ" 45 | "ว่าเป็นประเทศเอกราชมาจนบัดนี้\n" 46 | "ในช่วงทศวรรษ 1980 ถึงต้นทศวรรษ 1990 
การเมืองการปกครอง" 47 | "สาธารณรัฐจีน (ไต้หวัน) เจริญรุ่งเรืองจนเป็นประชาธิปไตยที่มีพรรค" 48 | "การเมืองหลายพรรคและมีการเลือกตั้งทั่วหน้า ในช่วงกลางศตวรรษที่ " 49 | "20 เศรษฐกิจไต้หวันงอกงามอย่างรวดเร็ว ไต้หวันจึงกลายเป็นประเทศ" 50 | "พัฒนาแล้ว ได้ชื่อว่าเป็นหนึ่งในสี่เสือแห่งเอเชีย มีอุตสาหกรรม" 51 | "ล้ำหน้า และมีเศรษฐกิจใหญ่โตเป็นอันดับที่ 19 ของโลก[11][12] " 52 | "อุตสาหกรรมที่ใช้เทคโนโลยีชั้นสูงของไต้หวันยังมีบทบาทสำคัญมากใน" 53 | "เศรษฐกิจโลก เป็นเหตุให้ไต้หวันได้เป็นสมาชิกองค์การการค้าโลกและ" 54 | "ความร่วมมือทางเศรษฐกิจเอเชีย-แปซิฟิก เสรีภาพของสื่อมวลชน เสรี" 55 | "ภาพทางเศรษฐกิจ การสาธารณสุข[13]การศึกษา และดัชนีการพัฒนามนุษย์ใน" 56 | "ไต้หวันยังได้รับการจัดอยู่ในอันดับสูงด้วย[14][4][15]\n" 57 | "สาธารณรัฐจีน มีลักษณะเป็นกลุ่มเกาะ ภูมิประเทศติดกับทะเล ไม่ติด" 58 | "กับประเทศใดเลย ห่างจากเกาะทางทิศเหนือและทิศตะวันตกเป็นสาธารณรัฐ" 59 | "ประชาชนจีน ทิศใต้เป็นประเทศฟิลิปปินส์และทะเลจีนใต้ ส่วนทิศ" 60 | "ตะวันออกเป็นมหาสมุทรแปซิฟิก\n" 61 | "ในปี ค.ศ. 1638 หลังการพ่ายแพ้ของหลานชายของเจิ้ง เฉิงกง " 62 | "จากการบุกโจมตีทางทัพเรือของราชวงศ์ชิงแมนจูที่นำทัพโดยชื่อ หลาง" 63 | "จากทางใต้ของมณฑลฝูเจี้ยน ทำให้ราชวงศ์ชิงผนวกยึดเกาะไต้หวันเป็น" 64 | "ส่วนหนึ่งสำเร็จ และวางไว้ภายใต้เขตอำนาจของมณฑลฝูเจี้ยน ราชสำนัก" 65 | "ราชวงศ์ชิงพยายามลดการละเมิดสิทธิ์และความไม่ลงรอยกันในพื้นที่โดย" 66 | "ออกกฎหมายเพื่อจัดการตรวจคนเข้าเมืองและเคารพสิทธิในที่ดินของชน" 67 | "พื้นเมืองไต้หวัน ผู้อพยพจากฝูเจี้ยนทางใต้ส่วนใหญ่ยังคงเดินทางไป" 68 | "ไต้หวัน เขตแดนระหว่างดินแดนที่เสียภาษีและสิ่งที่ถูกพิจารณาว่า" 69 | 'เป็นดินแดน "เขตอันตราย" เปลี่ยนไปทางทิศตะวันออกโดยชาวพื้นเมือง' 70 | "บางคนเข้ารีตรับวัฒนธรรมแบบจีน ในขณะที่คนอื่นถอยกลับเข้าในภูเขา " 71 | "ในช่วงเวลานี้มีความขัดแย้งจำนวนมากระหว่างกลุ่มชาวฮั่นด้วยกันเอง" 72 | "จากภูมิภาคต่าง ๆ ของฝูเจี้ยนทางใต้โดยเฉพาะอย่างยิ่งระหว่างเฉวียน" 73 | "โจวกับฉางโจว และระหว่างฝูเจี้ยนตอนใต้และชาวพื้นเมืองไต้หวัน\n" 74 | "พ.ศ. 2454 (ค.ศ. 
1911) การจลาจลอู่ฮั่นในประเทศจีน เป็นจุดเริ่มต้น" 75 | "การล่มสลายของราชวงศ์ชิง เมื่อพรรคคอมมิวนิสต์จีนเข้ามีอำนาจในจีน" 76 | "แผ่นดินใหญ่เมื่อ พ.ศ. 2492 (1949) พรรคก๊กมินตั๋ง พรรคการเมือง" 77 | "ชาตินิยมของจีนที่เป็นฝ่ายแพ้ก็พาผู้คนอพยพหนีออกจากแผ่นดินใหญ่มา" 78 | "ตั้งหลักที่ไต้หวัน เพื่อวางแผนกลับไปครองอำนาจในจีนต่อไป\n" 79 | "ชาวจีนมากกว่า 1 ล้าน 5 แสนคน อพยพตามมาอยู่ที่เกาะไต้หวันในยุคที่" 80 | "เหมา เจ๋อตง มีอำนาจเต็มที่ในจีนแผ่นดินใหญ่ ผู้นำของประเทศทั้งสอง" 81 | "จีนคือผู้นำพรรคคอมมิวนิสต์กับผู้นำสาธารณรัฐจีนบนเกาะไต้หวัน แย่ง" 82 | "กันเป็นกระบอกเสียงของประชาชนจีนในเวทีโลก แต่เสียงของนานาประเทศ" 83 | "ส่วนใหญ่เกรงอิทธิพลของจีนแผ่นดินใหญ่ จึงให้การยอมรับจีนแผ่นดิน" 84 | "ใหญ่มากกว่า\n" 85 | "ในปี พ.ศ. 2514 (ค.ศ. 1971) ก่อนที่นายพล เจียง ไคเช็ก" 86 | "(ภาษาจีน: 蔣中正) จะถึงอสัญกรรมไม่กี่ปี สาธารณรัฐจีนซึ่งเป็น" 87 | "ประเทศที่ร่วมก่อตั้งองค์การสหประชาชาติได้สูญเสียสมาชิกภาพใน" 88 | "ฐานะตัวแทนชาวจีนให้กับสาธารณรัฐประชาชนจีน ในปี พ.ศ. 2521 (1978)" 89 | "สหประชาชาติประกาศรับรองจีนเดียวคือจีนแผ่นดินใหญ่และตัดสัมพันธ์" 90 | "ทางการเมืองกับสาธารณรัฐจีน ทั้งสหรัฐอเมริกาก็ได้ถอนการรับรองว่า" 91 | "สาธารณรัฐจีนมีฐานะเป็นรัฐ ไต้หวันจึงกลายเป็นเพียงดินแดนที่จีน" 92 | "อ้างว่าเป็นส่วนหนึ่งของสาธารณรัฐประชาชนจีนตั้งแต่นั้นเป็นต้นมา\n" 93 | "เมื่อเจียง ไคเช็ก ถึงแก่อสัญกรรมในปี พ.ศ. 
2518 (1975) ลูกชาย" 94 | "ที่ชื่อ เจี่ยง จิงกั๋ว ได้เป็นผู้สืบทอดการปกครอง" 95 | "ไต้หวันต่อและเริ่มกระบวนการ วางรากฐานไปสู่ประชาธิปไตย\n" 96 | "หลังจากที่ประธานาธิบดี เจียง จิงกั๋ว เสียชีวิต ไต้หวันจึงได้เข้า" 97 | "สู่ระบอบประชาธิปไตยเต็มรูปแบบ ประธานาธิบดีคนใหม่ ซึ่งเกิดใน" 98 | "ไต้หวัน ชื่อ หลี่ เติงฮุย ขึ้นบริหารประเทศ โดยการสนับสนุนของ" 99 | "เจี่ยง จิงกั๋ว ทั้งที่ หลี่ เติงฮุย นั้นเคลื่อนไหว" 100 | "สนับสนุนเอกราชไต้หวัน นาย รัฐบาลจีนที่ปักกิ่งได้ตั้ง" 101 | 'ฉายาประธานาธิบดีไต้หวันคนใหม่ว่า "จิ้งจกปากหวาน" ' 102 | "ช่วงเวลาที่นายหลี่ เติงฮุย เป็นประธานาธิบดี การเมืองของไต้หวัน" 103 | "เกิดการแตกแยกออกเป็น 3 ฝ่ายคือ 1) พวกก๊กมินตั๋ง ที่ต้องการกลับ" 104 | "ไปรวมประเทศกับจีนแผ่นดินใหญ่ (รวมจีนแผ่นดินใหญ่ภายใต้การปกครอง" 105 | "ของสาธารณรัฐจีน) 2) พวกที่ต้องการให้ไต้หวันเป็นประเทศอิสระไม่" 106 | "เกี่ยวข้องกับจีนแผ่นดินใหญ่ และ 3) พวกที่ต้องการดำรงฐานะของ" 107 | "ประเทศไว้ดังเดิมต่อไป\n" 108 | "ไต้หวันกับจีนแผ่นดินใหญ่นัดเจรจาหาทางออกของข้อขัดแย้งทางการเมือง" 109 | "ครั้งแรกที่สิงคโปร์เมื่อปี พ.ศ. 2536 (ค.ศ. 1993) แต่ปรากฏว่าจีน" 110 | "แผ่นดินใหญ่ประวิงเวลาลงนามในสัญญาหลายฉบับที่เป็นข้อตกลงร่วมกัน " 111 | "ทำให้ผลการเจรจาคราวนั้นไม่ก้าวหน้าไปถึงไหน ความสัมพันธ์ระหว่าง" 112 | "สองจีนเลวร้ายลงทุกที เมื่อประธานาธิบดี หลี่ เติงฮุย เดินทางไป" 113 | "เยือนสหรัฐอเมริกาและได้รับการยอมรับอย่างเอิกเกริก ทำให้จีนแผ่น" 114 | "ดินใหญ่ไม่พอใจอย่างมาก จึงข่มขวัญไต้หวันกับประเทศที่ให้การสนับ" 115 | "สนุนไต้หวัน ด้วยการทำการซ้อมรบขึ้นใกล้ ๆ เกาะไต้หวัน สหรัฐ" 116 | "อเมริกาออกมาแสดงอาการปกป้องคุ้มครองไต้หวันด้วยการส่งกำลังกอง" 117 | "เรือรบของสหรัฐฯ มาป้วนเปี้ยนอยู่ในน่านน้ำที่จีนซ้อมรบ\n" 118 | "ขณะที่โลกกำลังล่อแหลมกับสถานการณ์ที่ตึงเครียดในน่านน้ำจีนมาก" 119 | "ขึ้นทุกทีนั้น ไต้หวันก็จัดให้มีการเลือกตั้งครั้งใหม่ และในการ" 120 | "เลือกตั้งครั้งใหม่นั้นเอง ไต้หวันก็ได้นายหลี่ เติงฮุย เป็น" 121 | "ประธานาธิบดีอีกครั้ง\n" 122 | "ไต้หวันเข้าสู่สภาวะวิกฤต เมื่อเกิดแผ่นดินไหวครั้งร้ายแรงที่สุดใน" 123 | "ประวัติศาสตร์ในเดือนกันยายน พ.ศ. 2542 (ค.ศ. 
1999) ทำให้ประชากร" 124 | "ส่วนมากที่เป็นชาวพื้นเมืองเสียชีวิตไป 2,000 คน ทั้งเมืองมีแต่" 125 | "เศษซากปรักหักพังจากภัยธรรมชาติ และช่วงนี้ไต้หวันต้องเผชิญความ" 126 | "ยากลำบาก จีนแผ่นดินใหญ่ก็เพิ่มความกดดันไม่ให้นานาชาติ" 127 | "เข้ามายุ่งเกี่ยวกับไต้หวันแม้ในยามคับขันเช่นนี้ โดยประกาศว่า " 128 | "หากมีประเทศใดจะเข้าไปให้ความช่วยเหลือไต้หวัน จะต้องได้รับอนุญาต" 129 | "จากจีนก่อน ซึ่งคำประกาศของจีนแผ่นดินใหญ่สวนทางกับเมตตาธรรมของ" 130 | "ประเทศทั่วโลกที่ต้องการให้ความช่วยเหลือไต้หวัน\n" 131 | "เดือนมีนาคม พ.ศ. 2543 (ค.ศ. 2000) มีการเลือกตั้งใหม่ในไต้หวัน " 132 | "ชาวไต้หวันเลือกผู้แทนจากพรรคประชาธิปไตยก้าวหน้า คือ นายเฉิน สุย" 133 | "เปี่ยน เป็นประธานาธิบดีคนใหม่ของไต้หวัน ผู้ประกาศนโยบายการเมือง" 134 | "แข็งกร้าวว่าไต้หวันต้องการแยกตัวเป็นอิสระจากจีนแผ่นดินใหญ่ ยุติ" 135 | "ยุคของพรรคชาตินิยมที่ยังฝักใฝ่แผ่นดินใหญ่อยู่ จีนแผ่นดินใหญ่จึง" 136 | "ถือว่าเป็นกบฏต่อการปกครองของจีน เพราะแต่ไหนแต่ไร ไต้หวันไม่เคย" 137 | "ประกาศอย่างเป็นทางการว่าเป็นประเทศอิสระแยกจากจีน และจีนพูดอยู่" 138 | "เสมอว่าไต้หวันเป็นเด็กในปกครองที่ค่อนข้างจะหัวดื้อและเกเร หาก" 139 | "ไต้หวันประกาศว่าเป็นอิสระจากจีนเมื่อใด จีนก็จะยกกำลังจัดการ" 140 | "กับไต้หวันทันที\n" 141 | "ในขณะที่ความสัมพันธ์ทางการเมืองระหว่างสองจีนในสายตาชาวโลก" 142 | "เลวร้ายลง จีนทั้งสองกลับมีการติดต่อทางการค้ากันมากขึ้น มีการ" 143 | "ผ่อนปรนอนุญาตให้ชาวไต้หวันเดินทางไปจีนแผ่นดินใหญ่เพื่อเยี่ยม" 144 | "ญาติได้ เกิดปรากฏการณ์สำคัญคือนักธุรกิจไต้หวันหอบเงินทุนกว่า " 145 | "20,000 ล้านดอลลาร์สหรัฐ ไปลงทุนดำเนินธุรกิจทางตอนใต้ของจีน" 146 | "แผ่นดินใหญ่ จนกระทั่งขณะนี้ชาวไต้หวันกลายเป็นนักลงทุนรายใหญ่" 147 | "เป็นลำดับ 2 ของจีน\n" 148 | "วันที่ 24 พฤษภาคม 2560 ศาลรัฐธรรมนูญวินิจฉัยว่ากฎหมายสมรส" 149 | "ปัจจุบันในเวลานั้น ละเมิดรัฐธรรมนูญ โดยปฏิเสธสิทธิสมรสของคู่รัก" 150 | "เพศเดียวกันชาวไต้หวัน ศาลวินิจฉัยว่าหากสภานิติบัญญัติไม่ผ่าน" 151 | "การแก้ไขกฎหมายที่เพียงพอต่อกฎหมายสมรสของไต้หวันภายในสองปี " 152 | "การสมรสเพศเดียวกันจะชอบด้วยกฎหมายโดยอัตโนมัติในไต้หวัน[17] " 153 | "วันที่ 17 พฤษภาคม 2562 สภานิติบัญญัติไต้หวันอนุมัติ" 
154 | "ร่างกฎหมายทำให้การสมรสเพศเดียวกันชอบด้วยกฎหมาย" 155 | " ทำให้เป็นประเทศแรกในทวีปเอเชียที่ผ่านกฎหมายดังกล่าว[18][19]" 156 | ) 157 | 158 | self.DANGER_TEXT_1 = ( 159 | "ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ" 160 | "ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ" 161 | "ชิชิชิชิชิชิชิชิชิ" 162 | ) 163 | 164 | self.DANGER_TEXT_2 = ( 165 | "ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้าน" 166 | "หน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้าน" 167 | ) 168 | 169 | self.DANGER_TEXT_3 = ( 170 | "ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้า" 171 | "ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้า" 172 | "ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้า" 173 | "ด้านหน้าด้านหน้าด้านกกกกกก" 174 | "กกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก" 175 | ) 176 | 177 | def test_segment(self): 178 | DICT_FILENAME = "data/test_dict.txt" 179 | DICT_NAME = "test_dict" 180 | load_dict(DICT_FILENAME, DICT_NAME) 181 | 182 | self.assertEqual(segment(None, DICT_NAME), []) 183 | self.assertEqual(segment("", DICT_NAME), []) 184 | self.assertEqual(segment(" ", DICT_NAME), [" "]) 185 | self.assertEqual( 186 | segment("ไข่คน2021", DICT_NAME), 187 | ["ไข่", "คน", "2021"], 188 | ) 189 | self.assertIn( 190 | "ค่าจ้าง", # in dict there is "ค่า" and "ค่าจ้าง" 191 | segment( 192 | "ค่าจ้างที่ได้รับต้องทำให้แรงงาน" 193 | "สามารถเลี้ยงดูตัวเองและครอบครัว" 194 | "อย่างสมศักดิ์ศรีความเป็นมนุษย์", 195 | DICT_NAME, 196 | ), 197 | ) 198 | self.assertIsInstance(segment(self.TEXT_1, DICT_NAME), List) 199 | self.assertIsInstance(segment(self.TEXT_2, DICT_NAME), List) 200 | self.assertIsInstance(segment(self.LONG_TEXT, DICT_NAME), List) 201 | self.assertIsInstance(segment(self.DANGER_TEXT_1, DICT_NAME), List) 202 | self.assertIsInstance(segment(self.DANGER_TEXT_2, DICT_NAME), List) 203 | self.assertIsInstance(segment(self.DANGER_TEXT_3, DICT_NAME), List) 204 | 
-------------------------------------------------------------------------------- /nlpo3-cli/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "0.7.18" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anyhow" 16 | version = "1.0.93" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" 19 | 20 | [[package]] 21 | name = "atty" 22 | version = "0.2.14" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 25 | dependencies = [ 26 | "hermit-abi", 27 | "libc", 28 | "winapi", 29 | ] 30 | 31 | [[package]] 32 | name = "autocfg" 33 | version = "1.0.1" 34 | source = "registry+https://github.com/rust-lang/crates.io-index" 35 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 36 | 37 | [[package]] 38 | name = "binary-heap-plus" 39 | version = "0.4.1" 40 | source = "registry+https://github.com/rust-lang/crates.io-index" 41 | checksum = "4f068638f8ff9e118a9361e66a411eff410e7fb3ecaa23bf9272324f8fc606d7" 42 | dependencies = [ 43 | "compare", 44 | ] 45 | 46 | [[package]] 47 | name = "bitflags" 48 | version = "1.2.1" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" 51 | 52 | [[package]] 53 | name = "bytecount" 54 | version = "0.6.2" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = 
"72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" 57 | 58 | [[package]] 59 | name = "cfg-if" 60 | version = "1.0.0" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 63 | 64 | [[package]] 65 | name = "clap" 66 | version = "3.0.0-beta.2" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "4bd1061998a501ee7d4b6d449020df3266ca3124b941ec56cf2005c3779ca142" 69 | dependencies = [ 70 | "atty", 71 | "bitflags", 72 | "clap_derive", 73 | "indexmap", 74 | "lazy_static", 75 | "os_str_bytes", 76 | "strsim", 77 | "termcolor", 78 | "textwrap", 79 | "unicode-width", 80 | "vec_map", 81 | ] 82 | 83 | [[package]] 84 | name = "clap_derive" 85 | version = "3.0.0-beta.2" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "370f715b81112975b1b69db93e0b56ea4cd4e5002ac43b2da8474106a54096a1" 88 | dependencies = [ 89 | "heck", 90 | "proc-macro-error", 91 | "proc-macro2", 92 | "quote", 93 | "syn", 94 | ] 95 | 96 | [[package]] 97 | name = "compare" 98 | version = "0.1.0" 99 | source = "registry+https://github.com/rust-lang/crates.io-index" 100 | checksum = "120133d4db2ec47efe2e26502ee984747630c67f51974fca0b6c1340cf2368d3" 101 | 102 | [[package]] 103 | name = "crossbeam-channel" 104 | version = "0.5.1" 105 | source = "registry+https://github.com/rust-lang/crates.io-index" 106 | checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" 107 | dependencies = [ 108 | "cfg-if", 109 | "crossbeam-utils", 110 | ] 111 | 112 | [[package]] 113 | name = "crossbeam-deque" 114 | version = "0.8.1" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" 117 | dependencies = [ 118 | "cfg-if", 119 | "crossbeam-epoch", 120 | "crossbeam-utils", 121 | ] 122 | 123 | [[package]] 124 | name = "crossbeam-epoch" 125 | 
version = "0.9.5" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" 128 | dependencies = [ 129 | "cfg-if", 130 | "crossbeam-utils", 131 | "lazy_static", 132 | "memoffset", 133 | "scopeguard", 134 | ] 135 | 136 | [[package]] 137 | name = "crossbeam-utils" 138 | version = "0.8.5" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" 141 | dependencies = [ 142 | "cfg-if", 143 | "lazy_static", 144 | ] 145 | 146 | [[package]] 147 | name = "either" 148 | version = "1.6.1" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 151 | 152 | [[package]] 153 | name = "hashbrown" 154 | version = "0.11.2" 155 | source = "registry+https://github.com/rust-lang/crates.io-index" 156 | checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" 157 | 158 | [[package]] 159 | name = "heck" 160 | version = "0.3.3" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 163 | dependencies = [ 164 | "unicode-segmentation", 165 | ] 166 | 167 | [[package]] 168 | name = "hermit-abi" 169 | version = "0.1.19" 170 | source = "registry+https://github.com/rust-lang/crates.io-index" 171 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 172 | dependencies = [ 173 | "libc", 174 | ] 175 | 176 | [[package]] 177 | name = "indexmap" 178 | version = "1.7.0" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" 181 | dependencies = [ 182 | "autocfg", 183 | "hashbrown", 184 | ] 185 | 186 | [[package]] 187 | name = "lazy_static" 188 | version = "1.4.0" 189 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 190 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 191 | 192 | [[package]] 193 | name = "libc" 194 | version = "0.2.98" 195 | source = "registry+https://github.com/rust-lang/crates.io-index" 196 | checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" 197 | 198 | [[package]] 199 | name = "memchr" 200 | version = "2.4.0" 201 | source = "registry+https://github.com/rust-lang/crates.io-index" 202 | checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" 203 | 204 | [[package]] 205 | name = "memoffset" 206 | version = "0.6.4" 207 | source = "registry+https://github.com/rust-lang/crates.io-index" 208 | checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" 209 | dependencies = [ 210 | "autocfg", 211 | ] 212 | 213 | [[package]] 214 | name = "nlpo3" 215 | version = "1.4.0" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "eb843198757c6b5082c2811352cc159dda13275c1bf41e24bdcffb0554772608" 218 | dependencies = [ 219 | "anyhow", 220 | "binary-heap-plus", 221 | "bytecount", 222 | "lazy_static", 223 | "rayon", 224 | "regex", 225 | "regex-syntax", 226 | "rustc-hash", 227 | ] 228 | 229 | [[package]] 230 | name = "nlpo3-cli" 231 | version = "0.2.1-dev" 232 | dependencies = [ 233 | "clap", 234 | "nlpo3", 235 | ] 236 | 237 | [[package]] 238 | name = "num_cpus" 239 | version = "1.13.0" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" 242 | dependencies = [ 243 | "hermit-abi", 244 | "libc", 245 | ] 246 | 247 | [[package]] 248 | name = "os_str_bytes" 249 | version = "2.4.0" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "afb2e1c3ee07430c2cf76151675e583e0f19985fa6efae47d6848a3e2c824f85" 252 | 253 | [[package]] 254 | name = "proc-macro-error" 
255 | version = "1.0.4" 256 | source = "registry+https://github.com/rust-lang/crates.io-index" 257 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 258 | dependencies = [ 259 | "proc-macro-error-attr", 260 | "proc-macro2", 261 | "quote", 262 | "syn", 263 | "version_check", 264 | ] 265 | 266 | [[package]] 267 | name = "proc-macro-error-attr" 268 | version = "1.0.4" 269 | source = "registry+https://github.com/rust-lang/crates.io-index" 270 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 271 | dependencies = [ 272 | "proc-macro2", 273 | "quote", 274 | "version_check", 275 | ] 276 | 277 | [[package]] 278 | name = "proc-macro2" 279 | version = "1.0.28" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612" 282 | dependencies = [ 283 | "unicode-xid", 284 | ] 285 | 286 | [[package]] 287 | name = "quote" 288 | version = "1.0.9" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" 291 | dependencies = [ 292 | "proc-macro2", 293 | ] 294 | 295 | [[package]] 296 | name = "rayon" 297 | version = "1.5.1" 298 | source = "registry+https://github.com/rust-lang/crates.io-index" 299 | checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" 300 | dependencies = [ 301 | "autocfg", 302 | "crossbeam-deque", 303 | "either", 304 | "rayon-core", 305 | ] 306 | 307 | [[package]] 308 | name = "rayon-core" 309 | version = "1.9.1" 310 | source = "registry+https://github.com/rust-lang/crates.io-index" 311 | checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" 312 | dependencies = [ 313 | "crossbeam-channel", 314 | "crossbeam-deque", 315 | "crossbeam-utils", 316 | "lazy_static", 317 | "num_cpus", 318 | ] 319 | 320 | [[package]] 321 | name = "regex" 322 | version = "1.5.4" 323 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 324 | checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" 325 | dependencies = [ 326 | "aho-corasick", 327 | "memchr", 328 | "regex-syntax", 329 | ] 330 | 331 | [[package]] 332 | name = "regex-syntax" 333 | version = "0.6.25" 334 | source = "registry+https://github.com/rust-lang/crates.io-index" 335 | checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" 336 | 337 | [[package]] 338 | name = "rustc-hash" 339 | version = "1.1.0" 340 | source = "registry+https://github.com/rust-lang/crates.io-index" 341 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 342 | 343 | [[package]] 344 | name = "scopeguard" 345 | version = "1.1.0" 346 | source = "registry+https://github.com/rust-lang/crates.io-index" 347 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 348 | 349 | [[package]] 350 | name = "strsim" 351 | version = "0.10.0" 352 | source = "registry+https://github.com/rust-lang/crates.io-index" 353 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 354 | 355 | [[package]] 356 | name = "syn" 357 | version = "1.0.74" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c" 360 | dependencies = [ 361 | "proc-macro2", 362 | "quote", 363 | "unicode-xid", 364 | ] 365 | 366 | [[package]] 367 | name = "termcolor" 368 | version = "1.1.2" 369 | source = "registry+https://github.com/rust-lang/crates.io-index" 370 | checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" 371 | dependencies = [ 372 | "winapi-util", 373 | ] 374 | 375 | [[package]] 376 | name = "textwrap" 377 | version = "0.12.1" 378 | source = "registry+https://github.com/rust-lang/crates.io-index" 379 | checksum = "203008d98caf094106cfaba70acfed15e18ed3ddb7d94e49baec153a2b462789" 380 | dependencies = [ 381 | 
"unicode-width", 382 | ] 383 | 384 | [[package]] 385 | name = "unicode-segmentation" 386 | version = "1.8.0" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" 389 | 390 | [[package]] 391 | name = "unicode-width" 392 | version = "0.1.8" 393 | source = "registry+https://github.com/rust-lang/crates.io-index" 394 | checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" 395 | 396 | [[package]] 397 | name = "unicode-xid" 398 | version = "0.2.2" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 401 | 402 | [[package]] 403 | name = "vec_map" 404 | version = "0.8.2" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" 407 | 408 | [[package]] 409 | name = "version_check" 410 | version = "0.9.3" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" 413 | 414 | [[package]] 415 | name = "winapi" 416 | version = "0.3.9" 417 | source = "registry+https://github.com/rust-lang/crates.io-index" 418 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 419 | dependencies = [ 420 | "winapi-i686-pc-windows-gnu", 421 | "winapi-x86_64-pc-windows-gnu", 422 | ] 423 | 424 | [[package]] 425 | name = "winapi-i686-pc-windows-gnu" 426 | version = "0.4.0" 427 | source = "registry+https://github.com/rust-lang/crates.io-index" 428 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 429 | 430 | [[package]] 431 | name = "winapi-util" 432 | version = "0.1.5" 433 | source = "registry+https://github.com/rust-lang/crates.io-index" 434 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 435 | 
dependencies = [ 436 | "winapi", 437 | ] 438 | 439 | [[package]] 440 | name = "winapi-x86_64-pc-windows-gnu" 441 | version = "0.4.0" 442 | source = "registry+https://github.com/rust-lang/crates.io-index" 443 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 444 | -------------------------------------------------------------------------------- /nlpo3-nodejs/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "ahash" 7 | version = "0.8.11" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" 10 | dependencies = [ 11 | "cfg-if", 12 | "getrandom", 13 | "once_cell", 14 | "version_check", 15 | "zerocopy", 16 | ] 17 | 18 | [[package]] 19 | name = "aho-corasick" 20 | version = "0.7.18" 21 | source = "registry+https://github.com/rust-lang/crates.io-index" 22 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" 23 | dependencies = [ 24 | "memchr", 25 | ] 26 | 27 | [[package]] 28 | name = "anyhow" 29 | version = "1.0.45" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "ee10e43ae4a853c0a3591d4e2ada1719e553be18199d9da9d4a83f5927c2f5c7" 32 | 33 | [[package]] 34 | name = "autocfg" 35 | version = "1.0.1" 36 | source = "registry+https://github.com/rust-lang/crates.io-index" 37 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 38 | 39 | [[package]] 40 | name = "binary-heap-plus" 41 | version = "0.4.1" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "4f068638f8ff9e118a9361e66a411eff410e7fb3ecaa23bf9272324f8fc606d7" 44 | dependencies = [ 45 | "compare", 46 | ] 47 | 48 | [[package]] 49 | name = "bytecount" 50 | version = "0.6.2" 51 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 52 | checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" 53 | 54 | [[package]] 55 | name = "cfg-if" 56 | version = "1.0.0" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 59 | 60 | [[package]] 61 | name = "compare" 62 | version = "0.1.0" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "120133d4db2ec47efe2e26502ee984747630c67f51974fca0b6c1340cf2368d3" 65 | 66 | [[package]] 67 | name = "crossbeam-channel" 68 | version = "0.5.1" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" 71 | dependencies = [ 72 | "cfg-if", 73 | "crossbeam-utils", 74 | ] 75 | 76 | [[package]] 77 | name = "crossbeam-deque" 78 | version = "0.8.1" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" 81 | dependencies = [ 82 | "cfg-if", 83 | "crossbeam-epoch", 84 | "crossbeam-utils", 85 | ] 86 | 87 | [[package]] 88 | name = "crossbeam-epoch" 89 | version = "0.9.5" 90 | source = "registry+https://github.com/rust-lang/crates.io-index" 91 | checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" 92 | dependencies = [ 93 | "cfg-if", 94 | "crossbeam-utils", 95 | "lazy_static", 96 | "memoffset", 97 | "scopeguard", 98 | ] 99 | 100 | [[package]] 101 | name = "crossbeam-utils" 102 | version = "0.8.5" 103 | source = "registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" 105 | dependencies = [ 106 | "cfg-if", 107 | "lazy_static", 108 | ] 109 | 110 | [[package]] 111 | name = "either" 112 | version = "1.6.1" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | 
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 115 | 116 | [[package]] 117 | name = "getrandom" 118 | version = "0.2.15" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 121 | dependencies = [ 122 | "cfg-if", 123 | "libc", 124 | "wasi", 125 | ] 126 | 127 | [[package]] 128 | name = "hermit-abi" 129 | version = "0.1.19" 130 | source = "registry+https://github.com/rust-lang/crates.io-index" 131 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 132 | dependencies = [ 133 | "libc", 134 | ] 135 | 136 | [[package]] 137 | name = "lazy_static" 138 | version = "1.5.0" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 141 | 142 | [[package]] 143 | name = "libc" 144 | version = "0.2.162" 145 | source = "registry+https://github.com/rust-lang/crates.io-index" 146 | checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" 147 | 148 | [[package]] 149 | name = "libloading" 150 | version = "0.8.5" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" 153 | dependencies = [ 154 | "cfg-if", 155 | "windows-targets", 156 | ] 157 | 158 | [[package]] 159 | name = "memchr" 160 | version = "2.4.1" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" 163 | 164 | [[package]] 165 | name = "memoffset" 166 | version = "0.6.4" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" 169 | dependencies = [ 170 | "autocfg", 171 | ] 172 | 173 | [[package]] 174 | name = "neon" 175 | version = "1.0.0" 176 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 177 | checksum = "7d75440242411c87dc39847b0e33e961ec1f10326a9d8ecf9c1ea64a3b3c13dc" 178 | dependencies = [ 179 | "libloading", 180 | "neon-macros", 181 | "once_cell", 182 | "semver", 183 | "send_wrapper", 184 | "smallvec", 185 | ] 186 | 187 | [[package]] 188 | name = "neon-macros" 189 | version = "1.0.0" 190 | source = "registry+https://github.com/rust-lang/crates.io-index" 191 | checksum = "c6813fde79b646e47e7ad75f480aa80ef76a5d9599e2717407961531169ee38b" 192 | dependencies = [ 193 | "quote", 194 | "syn", 195 | "syn-mid", 196 | ] 197 | 198 | [[package]] 199 | name = "nlpo3" 200 | version = "1.4.0" 201 | source = "registry+https://github.com/rust-lang/crates.io-index" 202 | checksum = "eb843198757c6b5082c2811352cc159dda13275c1bf41e24bdcffb0554772608" 203 | dependencies = [ 204 | "anyhow", 205 | "binary-heap-plus", 206 | "bytecount", 207 | "lazy_static", 208 | "rayon", 209 | "regex", 210 | "regex-syntax", 211 | "rustc-hash", 212 | ] 213 | 214 | [[package]] 215 | name = "nlpo3-nodejs" 216 | version = "1.0.0" 217 | dependencies = [ 218 | "ahash", 219 | "lazy_static", 220 | "neon", 221 | "nlpo3", 222 | ] 223 | 224 | [[package]] 225 | name = "num_cpus" 226 | version = "1.13.0" 227 | source = "registry+https://github.com/rust-lang/crates.io-index" 228 | checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" 229 | dependencies = [ 230 | "hermit-abi", 231 | "libc", 232 | ] 233 | 234 | [[package]] 235 | name = "once_cell" 236 | version = "1.20.2" 237 | source = "registry+https://github.com/rust-lang/crates.io-index" 238 | checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" 239 | 240 | [[package]] 241 | name = "proc-macro2" 242 | version = "1.0.89" 243 | source = "registry+https://github.com/rust-lang/crates.io-index" 244 | checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" 245 | dependencies = [ 246 | "unicode-ident", 247 | ] 248 | 249 | 
[[package]] 250 | name = "quote" 251 | version = "1.0.37" 252 | source = "registry+https://github.com/rust-lang/crates.io-index" 253 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 254 | dependencies = [ 255 | "proc-macro2", 256 | ] 257 | 258 | [[package]] 259 | name = "rayon" 260 | version = "1.5.1" 261 | source = "registry+https://github.com/rust-lang/crates.io-index" 262 | checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" 263 | dependencies = [ 264 | "autocfg", 265 | "crossbeam-deque", 266 | "either", 267 | "rayon-core", 268 | ] 269 | 270 | [[package]] 271 | name = "rayon-core" 272 | version = "1.9.1" 273 | source = "registry+https://github.com/rust-lang/crates.io-index" 274 | checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" 275 | dependencies = [ 276 | "crossbeam-channel", 277 | "crossbeam-deque", 278 | "crossbeam-utils", 279 | "lazy_static", 280 | "num_cpus", 281 | ] 282 | 283 | [[package]] 284 | name = "regex" 285 | version = "1.5.4" 286 | source = "registry+https://github.com/rust-lang/crates.io-index" 287 | checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" 288 | dependencies = [ 289 | "aho-corasick", 290 | "memchr", 291 | "regex-syntax", 292 | ] 293 | 294 | [[package]] 295 | name = "regex-syntax" 296 | version = "0.6.25" 297 | source = "registry+https://github.com/rust-lang/crates.io-index" 298 | checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" 299 | 300 | [[package]] 301 | name = "rustc-hash" 302 | version = "1.1.0" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 305 | 306 | [[package]] 307 | name = "scopeguard" 308 | version = "1.1.0" 309 | source = "registry+https://github.com/rust-lang/crates.io-index" 310 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 311 | 312 | 
[[package]] 313 | name = "semver" 314 | version = "1.0.23" 315 | source = "registry+https://github.com/rust-lang/crates.io-index" 316 | checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" 317 | 318 | [[package]] 319 | name = "send_wrapper" 320 | version = "0.6.0" 321 | source = "registry+https://github.com/rust-lang/crates.io-index" 322 | checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73" 323 | 324 | [[package]] 325 | name = "smallvec" 326 | version = "1.13.2" 327 | source = "registry+https://github.com/rust-lang/crates.io-index" 328 | checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" 329 | 330 | [[package]] 331 | name = "syn" 332 | version = "2.0.87" 333 | source = "registry+https://github.com/rust-lang/crates.io-index" 334 | checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" 335 | dependencies = [ 336 | "proc-macro2", 337 | "quote", 338 | "unicode-ident", 339 | ] 340 | 341 | [[package]] 342 | name = "syn-mid" 343 | version = "0.6.0" 344 | source = "registry+https://github.com/rust-lang/crates.io-index" 345 | checksum = "b5dc35bb08dd1ca3dfb09dce91fd2d13294d6711c88897d9a9d60acf39bce049" 346 | dependencies = [ 347 | "proc-macro2", 348 | "quote", 349 | "syn", 350 | ] 351 | 352 | [[package]] 353 | name = "unicode-ident" 354 | version = "1.0.13" 355 | source = "registry+https://github.com/rust-lang/crates.io-index" 356 | checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" 357 | 358 | [[package]] 359 | name = "version_check" 360 | version = "0.9.5" 361 | source = "registry+https://github.com/rust-lang/crates.io-index" 362 | checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 363 | 364 | [[package]] 365 | name = "wasi" 366 | version = "0.11.0+wasi-snapshot-preview1" 367 | source = "registry+https://github.com/rust-lang/crates.io-index" 368 | checksum = 
"9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 369 | 370 | [[package]] 371 | name = "windows-targets" 372 | version = "0.52.6" 373 | source = "registry+https://github.com/rust-lang/crates.io-index" 374 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 375 | dependencies = [ 376 | "windows_aarch64_gnullvm", 377 | "windows_aarch64_msvc", 378 | "windows_i686_gnu", 379 | "windows_i686_gnullvm", 380 | "windows_i686_msvc", 381 | "windows_x86_64_gnu", 382 | "windows_x86_64_gnullvm", 383 | "windows_x86_64_msvc", 384 | ] 385 | 386 | [[package]] 387 | name = "windows_aarch64_gnullvm" 388 | version = "0.52.6" 389 | source = "registry+https://github.com/rust-lang/crates.io-index" 390 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 391 | 392 | [[package]] 393 | name = "windows_aarch64_msvc" 394 | version = "0.52.6" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 397 | 398 | [[package]] 399 | name = "windows_i686_gnu" 400 | version = "0.52.6" 401 | source = "registry+https://github.com/rust-lang/crates.io-index" 402 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 403 | 404 | [[package]] 405 | name = "windows_i686_gnullvm" 406 | version = "0.52.6" 407 | source = "registry+https://github.com/rust-lang/crates.io-index" 408 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 409 | 410 | [[package]] 411 | name = "windows_i686_msvc" 412 | version = "0.52.6" 413 | source = "registry+https://github.com/rust-lang/crates.io-index" 414 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 415 | 416 | [[package]] 417 | name = "windows_x86_64_gnu" 418 | version = "0.52.6" 419 | source = "registry+https://github.com/rust-lang/crates.io-index" 420 | checksum = 
"147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 421 | 422 | [[package]] 423 | name = "windows_x86_64_gnullvm" 424 | version = "0.52.6" 425 | source = "registry+https://github.com/rust-lang/crates.io-index" 426 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 427 | 428 | [[package]] 429 | name = "windows_x86_64_msvc" 430 | version = "0.52.6" 431 | source = "registry+https://github.com/rust-lang/crates.io-index" 432 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 433 | 434 | [[package]] 435 | name = "zerocopy" 436 | version = "0.7.35" 437 | source = "registry+https://github.com/rust-lang/crates.io-index" 438 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 439 | dependencies = [ 440 | "zerocopy-derive", 441 | ] 442 | 443 | [[package]] 444 | name = "zerocopy-derive" 445 | version = "0.7.35" 446 | source = "registry+https://github.com/rust-lang/crates.io-index" 447 | checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 448 | dependencies = [ 449 | "proc-macro2", 450 | "quote", 451 | "syn", 452 | ] 453 | -------------------------------------------------------------------------------- /nlpo3-python/notebooks/nlpo3_segment_benchmarks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Benchmark `nlpo3.segment`" 7 | ], 8 | "metadata": { 9 | "id": "6PCaAU1y66FE" 10 | } 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "source": [ 15 | "Benchmarks nlpO3 (Rust) `nlpo3.segment` (unsafe) against PyThaiNLP (Python) `newmm.segment` (unsafe) using the same dictionary.\n", 16 | "\n", 17 | "https://github.com/PyThaiNLP/nlpo3/" 18 | ], 19 | "metadata": {} 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "source": [ 25 | "import time\n", 26 | "\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import numpy as 
np\n", 29 | "from tqdm.auto import tqdm\n", 30 | "\n", 31 | "\n", 32 | "def time_func(func, arg):\n", 33 | " start_time = time.perf_counter_ns()\n", 34 | " func(arg)\n", 35 | " return time.perf_counter_ns() - start_time" 36 | ], 37 | "outputs": [], 38 | "metadata": { 39 | "id": "iAlScT9d66FX", 40 | "outputId": "fe7a569c-f384-4e95-cbc3-3d412a99fc74" 41 | } 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "source": [ 46 | "## Load Custom Dictionary to the Tokenizers\n", 47 | "\n", 48 | "Both `o3_newmm()` and `py_newmm()` will use the same word list (`words_th.txt`)" 49 | ], 50 | "metadata": { 51 | "id": "Ssp84MKA66Fb" 52 | } 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "source": [ 58 | "DICT_FILE = \"../../words_th.txt\"" 59 | ], 60 | "outputs": [], 61 | "metadata": { 62 | "id": "XHh2LIdG66Fd", 63 | "outputId": "f1f8c12d-fd61-40f2-d31f-98c99cd120ae" 64 | } 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "source": [ 70 | "from pythainlp.tokenize.newmm import segment as py_segment\n", 71 | "from pythainlp.util import dict_trie\n", 72 | "\n", 73 | "trie = dict_trie(dict_source=DICT_FILE)\n", 74 | "\n", 75 | "def py_newmm(txt, safe_mode=False):\n", 76 | " return py_segment(txt, safe_mode=safe_mode, custom_dict=trie)" 77 | ], 78 | "outputs": [], 79 | "metadata": {} 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "source": [ 85 | "from nlpo3 import load_dict\n", 86 | "from nlpo3 import segment as o3_segment\n", 87 | "\n", 88 | "load_dict(DICT_FILE, \"test_dict\") # create \"test_dict\" dictionary\n", 89 | "\n", 90 | "def o3_newmm(txt, safe=False, parallel=False):\n", 91 | " return o3_segment(txt, dict_name=\"test_dict\", safe=safe, parallel=parallel)" 92 | ], 93 | "outputs": [ 94 | { 95 | "output_type": "stream", 96 | "name": "stdout", 97 | "text": [ 98 | "Successful: dictionary name test_dict from file ../../words_th.txt has been successfully loaded\n" 99 | ] 100 | } 101 | ], 102 | "metadata": 
{} 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "source": [ 107 | "## Load Test data" 108 | ], 109 | "metadata": {} 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "source": [ 115 | "from datasets import load_dataset\n", 116 | "\n", 117 | "datasets = load_dataset('wisesight_sentiment')\n", 118 | "datasets" 119 | ], 120 | "outputs": [ 121 | { 122 | "output_type": "stream", 123 | "name": "stderr", 124 | "text": [ 125 | "Reusing dataset wisesight_sentiment (/Users/test/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/fc2b1bdfe79571b2e281e4afdb5aac069cf9270bf0f85694239be672a4191969)\n" 126 | ] 127 | }, 128 | { 129 | "output_type": "execute_result", 130 | "data": { 131 | "text/plain": [ 132 | "DatasetDict({\n", 133 | " train: Dataset({\n", 134 | " features: ['texts', 'category'],\n", 135 | " num_rows: 21628\n", 136 | " })\n", 137 | " validation: Dataset({\n", 138 | " features: ['texts', 'category'],\n", 139 | " num_rows: 2404\n", 140 | " })\n", 141 | " test: Dataset({\n", 142 | " features: ['texts', 'category'],\n", 143 | " num_rows: 2671\n", 144 | " })\n", 145 | "})" 146 | ] 147 | }, 148 | "metadata": {}, 149 | "execution_count": 5 150 | } 151 | ], 152 | "metadata": {} 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 6, 157 | "source": [ 158 | "txt = datasets['train']['texts'][0]\n", 159 | "txt" 160 | ], 161 | "outputs": [ 162 | { 163 | "output_type": "execute_result", 164 | "data": { 165 | "text/plain": [ 166 | "'ไปจองมาแล้วนาจา Mitsubishi Attrage ได้หลังสงกรานต์เลย รอขับอยู่นาจา กระทัดรัด เหมาะกับสาวๆขับรถคนเดียวแบบเรา ราคาสบายกระเป๋า ประหยัดน้ำมัน วิ่งไกลแค่ไหนหายห่วงค่ะ'" 167 | ] 168 | }, 169 | "metadata": {}, 170 | "execution_count": 6 171 | } 172 | ], 173 | "metadata": { 174 | "id": "GCtUUACp66Fc", 175 | "outputId": "fe1c7236-28b3-4ee4-a0e4-81312e7762fb" 176 | } 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 7, 181 | "source": [ 182 | "py_newmm(txt)[:10]" 183 
| ], 184 | "outputs": [ 185 | { 186 | "output_type": "execute_result", 187 | "data": { 188 | "text/plain": [ 189 | "['ไป', 'จอง', 'มา', 'แล้', 'วนา', 'จา', ' ', 'Mitsubishi', ' ', 'Attrage']" 190 | ] 191 | }, 192 | "metadata": {}, 193 | "execution_count": 7 194 | } 195 | ], 196 | "metadata": {} 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 8, 201 | "source": [ 202 | "o3_newmm(txt)[:10]" 203 | ], 204 | "outputs": [ 205 | { 206 | "output_type": "execute_result", 207 | "data": { 208 | "text/plain": [ 209 | "['ไป', 'จอง', 'มา', 'แล้', 'วนา', 'จา', ' ', 'Mitsubishi', ' ', 'Attrage']" 210 | ] 211 | }, 212 | "metadata": {}, 213 | "execution_count": 8 214 | } 215 | ], 216 | "metadata": {} 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "source": [ 221 | "## One Example" 222 | ], 223 | "metadata": { 224 | "id": "Qyrh-uny66Fh" 225 | } 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "source": [ 230 | "### Average Run Time for One Example" 231 | ], 232 | "metadata": { 233 | "id": "9YqrA2Pb66Fj" 234 | } 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 9, 239 | "source": [ 240 | "%timeit py_newmm(txt)" 241 | ], 242 | "outputs": [ 243 | { 244 | "output_type": "stream", 245 | "name": "stdout", 246 | "text": [ 247 | "345 µs ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" 248 | ] 249 | } 250 | ], 251 | "metadata": { 252 | "id": "Iz58c6Ff66Fj", 253 | "outputId": "0d45dec2-bece-494e-8725-31ffdc97e1de" 254 | } 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 10, 259 | "source": [ 260 | "%timeit o3_newmm(txt)" 261 | ], 262 | "outputs": [ 263 | { 264 | "output_type": "stream", 265 | "name": "stdout", 266 | "text": [ 267 | "83.8 µs ± 9.53 µs per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n" 268 | ] 269 | } 270 | ], 271 | "metadata": { 272 | "id": "8jvijfPJ66Fl", 273 | "outputId": "64a89f5b-468e-4af5-da35-6c938019e021" 274 | } 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 11, 279 | "source": [ 280 | "%timeit o3_newmm(txt, parallel=True)" 281 | ], 282 | "outputs": [ 283 | { 284 | "output_type": "stream", 285 | "name": "stdout", 286 | "text": [ 287 | "187 µs ± 13.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "id": "lAdLgqdu66Fm", 293 | "outputId": "42e39b71-9331-4311-a401-61fa68c21fde" 294 | } 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 12, 299 | "source": [ 300 | "%timeit py_newmm(txt, safe_mode=True)" 301 | ], 302 | "outputs": [ 303 | { 304 | "output_type": "stream", 305 | "name": "stdout", 306 | "text": [ 307 | "380 µs ± 45.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" 308 | ] 309 | } 310 | ], 311 | "metadata": {} 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 13, 316 | "source": [ 317 | "%timeit o3_newmm(txt, safe=True)" 318 | ], 319 | "outputs": [ 320 | { 321 | "output_type": "stream", 322 | "name": "stdout", 323 | "text": [ 324 | "80.7 µs ± 3.81 µs per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n" 325 | ] 326 | } 327 | ], 328 | "metadata": {} 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "source": [ 333 | "## All Examples" 334 | ], 335 | "metadata": { 336 | "id": "L0lWERZk66Fm" 337 | } 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "source": [ 342 | "### Check If Results Match for All Examples" 343 | ], 344 | "metadata": { 345 | "id": "GXmhzISZ66Fn" 346 | } 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 14, 351 | "source": [ 352 | "corrects = [o3_newmm(txt) == py_newmm(txt) for txt in datasets['train']['texts']]\n", 353 | "np.mean(corrects), len(corrects)" 354 | ], 355 | "outputs": [], 356 | "metadata": { 357 | "id": "ZY9Mosag66Fn", 358 | "outputId": "46a56ce0-fd20-430a-d9b6-9564f4c25141" 359 | } 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "source": [ 364 | "### Average Run Time Across All Examples" 365 | ], 366 | "metadata": { 367 | "id": "Tyc_cHaf66Fo" 368 | } 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "source": [ 374 | "py_newmms = [time_func(py_newmm, txt) for txt in datasets['train']['texts']]\n", 375 | "o3_newmms = [time_func(o3_newmm, txt) for txt in datasets['train']['texts']]\n", 376 | "\n", 377 | "# o3 newmm is over 2x faster than python newmm, on average\n", 378 | "np.mean(py_newmms), np.mean(o3_newmms), np.mean(py_newmms) / np.mean(o3_newmms)" 379 | ], 380 | "outputs": [], 381 | "metadata": { 382 | "id": "uYeUydsQ66Fo", 383 | "outputId": "b20ed761-fa2b-42b8-8a6c-ff3a1d2dc6b3" 384 | } 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "source": [ 390 | "# look at distribution; o3 newmm also consistently performs better\n", 391 | "plt.hist(py_newmms, bins=30, alpha=0.5)\n", 392 | "plt.hist(o3_newmms, bins=30, alpha=0.5)" 393 | ], 394 | "outputs": [], 395 | "metadata": { 396 | "id": "8hRoDxm966Fp", 397 | "outputId": "c4e8c0fd-97ca-4e3a-ee63-1281f84bb1d9" 398 | } 399 | }, 400 | { 401 | "cell_type": 
"markdown", 402 | "source": [ 403 | "## Run Time as Sequence Length Grows" 404 | ], 405 | "metadata": { 406 | "id": "EMZZ8SgY66Fp" 407 | } 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "source": [ 413 | "txt = datasets['train']['texts'][1]\n", 414 | "txt2 = ''.join(o3_newmm(txt)[:10])\n", 415 | "txt2, len(o3_newmm(txt2))" 416 | ], 417 | "outputs": [], 418 | "metadata": { 419 | "id": "3P_z59rS66Fp", 420 | "outputId": "5951dd75-388c-4f9e-a1df-f0e98f663ec3" 421 | } 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "source": [ 427 | "py_newmms = [time_func(py_newmm, txt2*i) for i in tqdm([10**j for j in range(5)])]\n", 428 | "o3_newmms = [time_func(o3_newmm, txt2*i) for i in tqdm([10**j for j in range(5)])]" 429 | ], 430 | "outputs": [], 431 | "metadata": { 432 | "colab": { 433 | "referenced_widgets": [ 434 | "0689a5cf946049a0ac98bdf9e1353810", 435 | "2b481450056f4c1883c163bf066110a3" 436 | ] 437 | }, 438 | "id": "FEmDkPHL66Fq", 439 | "outputId": "2b2c96ab-7044-423b-9b40-e06ca186a213" 440 | } 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "source": [ 445 | "Performance starts really deviate when sequence length > 10^3 tokens and above.\n", 446 | "\n", 447 | "python newmm is dashed line." 
448 | ], 449 | "metadata": {} 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "source": [ 455 | "positions = [i for i in range(5)]\n", 456 | "labels = [f'10^{i+1}' for i in range(5)]\n", 457 | "plt.xticks(positions, labels)\n", 458 | "plt.plot(py_newmms, linestyle='dashed')\n", 459 | "plt.plot(o3_newmms)" 460 | ], 461 | "outputs": [], 462 | "metadata": { 463 | "id": "sT8GL0oX66Fr", 464 | "outputId": "6bb4acfd-4721-47cb-d8ff-943c67a4cedf" 465 | } 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "source": [ 471 | "# zooming in on inflexion point\n", 472 | "positions = [i for i in range(4)]\n", 473 | "labels = [f'10^{i+1}' for i in range(4)]\n", 474 | "plt.xticks(positions, labels)\n", 475 | "plt.plot(py_newmms[:-1], linestyle='dashed')\n", 476 | "plt.plot(o3_newmms[:-1])" 477 | ], 478 | "outputs": [], 479 | "metadata": { 480 | "id": "5YU5aiNs66Fs" 481 | } 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "source": [], 487 | "outputs": [], 488 | "metadata": {} 489 | } 490 | ], 491 | "metadata": { 492 | "colab": { 493 | "name": "oxidized_segment.ipynb", 494 | "provenance": [] 495 | }, 496 | "kernelspec": { 497 | "name": "python3", 498 | "display_name": "Python 3.9.6 64-bit ('3.9.6')" 499 | }, 500 | "language_info": { 501 | "codemirror_mode": { 502 | "name": "ipython", 503 | "version": 3 504 | }, 505 | "file_extension": ".py", 506 | "mimetype": "text/x-python", 507 | "name": "python", 508 | "nbconvert_exporter": "python", 509 | "pygments_lexer": "ipython3", 510 | "version": "3.9.6" 511 | }, 512 | "interpreter": { 513 | "hash": "a14e36384937f4cd2b884b3a26e89421f14fe79660135b8adf66e600478fad4c" 514 | } 515 | }, 516 | "nbformat": 4, 517 | "nbformat_minor": 1 518 | } 519 | -------------------------------------------------------------------------------- /tests/test_tokenizer.rs: -------------------------------------------------------------------------------- 1 | // 
SPDX-FileCopyrightText: 2024 PyThaiNLP Project 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | /** 5 | * Test the NewmmTokenizer with the default dictionary. 6 | */ 7 | use nlpo3::tokenizer::newmm::NewmmTokenizer; 8 | use nlpo3::tokenizer::tokenizer_trait::Tokenizer; 9 | 10 | const FIRST_TEXT: &str = "นิสสันผ่อนจนเพลียนาวาร่า.."; 11 | const SECOND_TEXT: &str = 12 | "อาชญากรรมทางการแพทย์.. หลอกลวงคนไข้ผ่าตัด ตัดหมอนรองข้อเข่าอำพราง รพ.กรุงเทพภูเก็ตปลอมเวชระเบียน ตอนที่๑."; 13 | const DEFAULT_DICT_PATH: &str = "/words_th.txt"; // relative to cargo 14 | 15 | #[test] 16 | fn test_dict_with_empty_line() { 17 | const DICT_PATH: &str = "/tests/data/dict_with_empty_line.txt"; 18 | let mut relative_dict_path = env!("CARGO_MANIFEST_DIR").to_string(); 19 | relative_dict_path.push_str(DICT_PATH); 20 | let _tokenizer = NewmmTokenizer::new(&relative_dict_path); 21 | } 22 | 23 | #[test] 24 | fn test_from_word_list() { 25 | let words = vec!["ปาลิเมนต์".to_string(), "คอนสติติวชั่น".to_string()]; 26 | let _tokenizer = NewmmTokenizer::from_word_list(words); 27 | } 28 | 29 | #[test] 30 | fn test_long_text_byte_tokenizer() { 31 | let mut relative_dict_path = env!("CARGO_MANIFEST_DIR").to_string(); 32 | relative_dict_path.push_str(DEFAULT_DICT_PATH); 33 | 34 | let text = [ 35 | "ไต้หวัน (แป่ะเอ๋ยี้: Tâi-oân; ไต่อวัน) หรือ ไถวาน ", 36 | "(อักษรโรมัน: Taiwan; จีนตัวย่อ: 台湾; จีนตัวเต็ม: 臺灣/台灣; พินอิน: ", 37 | "Táiwān; ไถวาน) หรือชื่อทางการว่า สาธารณรัฐจีน (จีนตัวย่อ: 中华民国; ", 38 | "จีนตัวเต็ม: 中華民國; พินอิน: Zhōnghuá ", 39 | "Mínguó) เป็นรัฐในทวีปเอเชียตะวันออก[7][8][9] ปัจจุบันประกอบด้วย", 40 | "เกาะใหญ่ 5 แห่ง คือ จินเหมิน (金門), ไต้หวัน, เผิงหู (澎湖), หมาจู่ ", 41 | "(馬祖), และอูชิว (烏坵) กับทั้งเกาะเล็กเกาะน้อยอีกจำนวนหนึ่ง ", 42 | "ท้องที่ดังกล่าวเรียกรวมกันว่า \"พื้นที่ไต้หวัน\" (臺灣地區)\n", 43 | "ไต้หวันด้านตะวันตกติดกับจีนแผ่นดินใหญ่ ด้านตะวันออกและตะวันออก", 44 | "เฉียงเหนือติดกับญี่ปุ่น และด้านใต้ติดกับฟิลิปปินส์ กรุงไทเปเป็น", 45 | "เมืองหลวง 
ส่วนไทเปใหม่เป็นเขตปกครองที่จัดตั้งขึ้นใหม่ กินพื้นที่", 46 | "กรุงไทเปและเป็นเขตซึ่งประชากรหนาแน่นที่สุดในเวลานี้\n", 47 | "เกาะไต้หวันเดิมเป็นที่อยู่ของชนพื้นเมือง และมีชาวจีนจากแผ่นดิน", 48 | "ใหญ่เข้ามาอาศัยร่วมด้วย จนกระทั่งชาววิลันดาและสเปนเดินทางเข้า", 49 | "มาในยุคสำรวจเมื่อศตวรรษที่ 17 และมาตั้งบ้านเรือนกลายเป็นนิคม", 50 | "ใหญ่โต ต่อมาปี 1662 ราชวงศ์หมิงในแผ่นดินใหญ่ถูกราชวงศ์ชิงแทนที่ ", 51 | "เจิ้ง เฉิงกง (鄭成功) ขุนศึกหมิง รวมกำลังหนีมาถึงเกาะไต้หวัน ", 52 | "และรุกไล่ฝรั่งออกไปได้อย่างราบคาบ เขาจึงตั้งราชอาณาจักรตงหนิง ", 53 | "(東寧) ขึ้นบนเกาะเพื่อ \"โค่นชิงฟื้นหมิง\" แต่ในปี 1683 ราชวงศ์", 54 | "ชิงปราบปรามอาณาจักรตงหนิงและเข้าครอบครองไต้หวันเป็นผลสำเร็จ ", 55 | "ไต้หวันจึงกลายเป็นมณฑลหนึ่งของจีน อย่างไรก็ดี ความบาดหมางระหว่าง", 56 | "จีนกับญี่ปุ่นเป็นเหตุให้ญี่ปุ่นได้ไต้หวันไปในปี 1895\n", 57 | "ก่อนเสียไต้หวันคืนแก่จีนหลังสงครามโลกครั้งที่สอง ช่วงนั้น มีการ", 58 | "เปลี่ยนแปลงการปกครองในจีน พรรคก๊กมินตั๋ง ได้เป็นใหญ่ ", 59 | "แต่ไม่นานก็เสียทีให้แก่พรรคคอมมิวนิสต์จีน พรรคก๊กมินตั๋งจึงหนี", 60 | "มายังเกาะไต้หวันและสถาปนาสาธารณรัฐจีนขึ้นบนเกาะแยกต่างหาก ", 61 | "ส่วนฝ่ายคอมมิวนิสต์จีนที่เป็นฝ่ายได้รับชัยชนะได้สถาปนาสาธารณรัฐ", 62 | "ประชาชนจีนบนแผ่นดินใหญ่ อย่างไรก็ดี จีนยังคงถือว่า ไต้หวันเป็น", 63 | "มณฑลหนึ่งของตน และไต้หวันเองก็ยังมิได้รับการยอมรับจากนานาชาติ", 64 | "ว่าเป็นประเทศเอกราชมาจนบัดนี้\n", 65 | "ในช่วงทศวรรษ 1980 ถึงต้นทศวรรษ 1990 การเมืองการปกครอง", 66 | "สาธารณรัฐจีน (ไต้หวัน) เจริญรุ่งเรืองจนเป็นประชาธิปไตยที่มีพรรค", 67 | "การเมืองหลายพรรคและมีการเลือกตั้งทั่วหน้า ในช่วงกลางศตวรรษที่ ", 68 | "20 เศรษฐกิจไต้หวันงอกงามอย่างรวดเร็ว ไต้หวันจึงกลายเป็นประเทศ", 69 | "พัฒนาแล้ว ได้ชื่อว่าเป็นหนึ่งในสี่เสือแห่งเอเชีย มีอุตสาหกรรม", 70 | "ล้ำหน้า และมีเศรษฐกิจใหญ่โตเป็นอันดับที่ 19 ของโลก[11][12] ", 71 | "อุตสาหกรรมที่ใช้เทคโนโลยีชั้นสูงของไต้หวันยังมีบทบาทสำคัญมากใน", 72 | "เศรษฐกิจโลก เป็นเหตุให้ไต้หวันได้เป็นสมาชิกองค์การการค้าโลกและ", 73 | "ความร่วมมือทางเศรษฐกิจเอเชีย-แปซิฟิก เสรีภาพของสื่อมวลชน เสรี", 74 | "ภาพทางเศรษฐกิจ 
การสาธารณสุข[13]การศึกษา และดัชนีการพัฒนามนุษย์ใน", 75 | "ไต้หวันยังได้รับการจัดอยู่ในอันดับสูงด้วย[14][4][15]\n", 76 | "สาธารณรัฐจีน มีลักษณะเป็นกลุ่มเกาะ ภูมิประเทศติดกับทะเล ไม่ติด", 77 | "กับประเทศใดเลย ห่างจากเกาะทางทิศเหนือและทิศตะวันตกเป็นสาธารณรัฐ", 78 | "ประชาชนจีน ทิศใต้เป็นประเทศฟิลิปปินส์และทะเลจีนใต้ ส่วนทิศ", 79 | "ตะวันออกเป็นมหาสมุทรแปซิฟิก\n", 80 | "ในปี ค.ศ. 1638 หลังการพ่ายแพ้ของหลานชายของเจิ้ง เฉิงกง ", 81 | "จากการบุกโจมตีทางทัพเรือของราชวงศ์ชิงแมนจูที่นำทัพโดยชื่อ หลาง", 82 | "จากทางใต้ของมณฑลฝูเจี้ยน ทำให้ราชวงศ์ชิงผนวกยึดเกาะไต้หวันเป็น", 83 | "ส่วนหนึ่งสำเร็จ และวางไว้ภายใต้เขตอำนาจของมณฑลฝูเจี้ยน ราชสำนัก", 84 | "ราชวงศ์ชิงพยายามลดการละเมิดสิทธิ์และความไม่ลงรอยกันในพื้นที่โดย", 85 | "ออกกฎหมายเพื่อจัดการตรวจคนเข้าเมืองและเคารพสิทธิในที่ดินของชน", 86 | "พื้นเมืองไต้หวัน ผู้อพยพจากฝูเจี้ยนทางใต้ส่วนใหญ่ยังคงเดินทางไป", 87 | "ไต้หวัน เขตแดนระหว่างดินแดนที่เสียภาษีและสิ่งที่ถูกพิจารณาว่า", 88 | "เป็นดินแดน \"เขตอันตราย\" เปลี่ยนไปทางทิศตะวันออกโดยชาวพื้นเมือง", 89 | "บางคนเข้ารีตรับวัฒนธรรมแบบจีน ในขณะที่คนอื่นถอยกลับเข้าในภูเขา ", 90 | "ในช่วงเวลานี้มีความขัดแย้งจำนวนมากระหว่างกลุ่มชาวฮั่นด้วยกันเอง", 91 | "จากภูมิภาคต่าง ๆ ของฝูเจี้ยนทางใต้โดยเฉพาะอย่างยิ่งระหว่างเฉวียน", 92 | "โจวกับฉางโจว และระหว่างฝูเจี้ยนตอนใต้และชาวพื้นเมืองไต้หวัน\n", 93 | "พ.ศ. 2454 (ค.ศ. 1911) การจลาจลอู่ฮั่นในประเทศจีน เป็นจุดเริ่มต้น", 94 | "การล่มสลายของราชวงศ์ชิง เมื่อพรรคคอมมิวนิสต์จีนเข้ามีอำนาจในจีน", 95 | "แผ่นดินใหญ่เมื่อ พ.ศ. 
2492 (1949) พรรคก๊กมินตั๋ง พรรคการเมือง", 96 | "ชาตินิยมของจีนที่เป็นฝ่ายแพ้ก็พาผู้คนอพยพหนีออกจากแผ่นดินใหญ่มา", 97 | "ตั้งหลักที่ไต้หวัน เพื่อวางแผนกลับไปครองอำนาจในจีนต่อไป\n", 98 | "ชาวจีนมากกว่า 1 ล้าน 5 แสนคน อพยพตามมาอยู่ที่เกาะไต้หวันในยุคที่", 99 | "เหมา เจ๋อตง มีอำนาจเต็มที่ในจีนแผ่นดินใหญ่ ผู้นำของประเทศทั้งสอง", 100 | "จีนคือผู้นำพรรคคอมมิวนิสต์กับผู้นำสาธารณรัฐจีนบนเกาะไต้หวัน แย่ง", 101 | "กันเป็นกระบอกเสียงของประชาชนจีนในเวทีโลก แต่เสียงของนานาประเทศ", 102 | "ส่วนใหญ่เกรงอิทธิพลของจีนแผ่นดินใหญ่ จึงให้การยอมรับจีนแผ่นดิน", 103 | "ใหญ่มากกว่า\n", 104 | "ในปี พ.ศ. 2514 (ค.ศ. 1971) ก่อนที่นายพล เจียง ไคเช็ก", 105 | "(ภาษาจีน: 蔣中正) จะถึงอสัญกรรมไม่กี่ปี สาธารณรัฐจีนซึ่งเป็น", 106 | "ประเทศที่ร่วมก่อตั้งองค์การสหประชาชาติได้สูญเสียสมาชิกภาพใน", 107 | "ฐานะตัวแทนชาวจีนให้กับสาธารณรัฐประชาชนจีน ในปี พ.ศ. 2521 (1978)", 108 | "สหประชาชาติประกาศรับรองจีนเดียวคือจีนแผ่นดินใหญ่และตัดสัมพันธ์", 109 | "ทางการเมืองกับสาธารณรัฐจีน ทั้งสหรัฐอเมริกาก็ได้ถอนการรับรองว่า", 110 | "สาธารณรัฐจีนมีฐานะเป็นรัฐ ไต้หวันจึงกลายเป็นเพียงดินแดนที่จีน", 111 | "อ้างว่าเป็นส่วนหนึ่งของสาธารณรัฐประชาชนจีนตั้งแต่นั้นเป็นต้นมา\n", 112 | "เมื่อเจียง ไคเช็ก ถึงแก่อสัญกรรมในปี พ.ศ. 
2518 (1975) ลูกชาย", 113 | "ที่ชื่อ เจี่ยง จิงกั๋ว ได้เป็นผู้สืบทอดการปกครอง", 114 | "ไต้หวันต่อและเริ่มกระบวนการ วางรากฐานไปสู่ประชาธิปไตย\n", 115 | "หลังจากที่ประธานาธิบดี เจียง จิงกั๋ว เสียชีวิต ไต้หวันจึงได้เข้า", 116 | "สู่ระบอบประชาธิปไตยเต็มรูปแบบ ประธานาธิบดีคนใหม่ ซึ่งเกิดใน", 117 | "ไต้หวัน ชื่อ หลี่ เติงฮุย ขึ้นบริหารประเทศ โดยการสนับสนุนของ", 118 | "เจี่ยง จิงกั๋ว ทั้งที่ หลี่ เติงฮุย นั้นเคลื่อนไหว", 119 | "สนับสนุนเอกราชไต้หวัน นาย รัฐบาลจีนที่ปักกิ่งได้ตั้ง", 120 | "ฉายาประธานาธิบดีไต้หวันคนใหม่ว่า \"จิ้งจกปากหวาน\" ", 121 | "ช่วงเวลาที่นายหลี่ เติงฮุย เป็นประธานาธิบดี การเมืองของไต้หวัน", 122 | "เกิดการแตกแยกออกเป็น 3 ฝ่ายคือ 1) พวกก๊กมินตั๋ง ที่ต้องการกลับ", 123 | "ไปรวมประเทศกับจีนแผ่นดินใหญ่ (รวมจีนแผ่นดินใหญ่ภายใต้การปกครอง", 124 | "ของสาธารณรัฐจีน) 2) พวกที่ต้องการให้ไต้หวันเป็นประเทศอิสระไม่", 125 | "เกี่ยวข้องกับจีนแผ่นดินใหญ่ และ 3) พวกที่ต้องการดำรงฐานะของ", 126 | "ประเทศไว้ดังเดิมต่อไป\n", 127 | "ไต้หวันกับจีนแผ่นดินใหญ่นัดเจรจาหาทางออกของข้อขัดแย้งทางการเมือง", 128 | "ครั้งแรกที่สิงคโปร์เมื่อปี พ.ศ. 2536 (ค.ศ. 1993) แต่ปรากฏว่าจีน", 129 | "แผ่นดินใหญ่ประวิงเวลาลงนามในสัญญาหลายฉบับที่เป็นข้อตกลงร่วมกัน ", 130 | "ทำให้ผลการเจรจาคราวนั้นไม่ก้าวหน้าไปถึงไหน ความสัมพันธ์ระหว่าง", 131 | "สองจีนเลวร้ายลงทุกที เมื่อประธานาธิบดี หลี่ เติงฮุย เดินทางไป", 132 | "เยือนสหรัฐอเมริกาและได้รับการยอมรับอย่างเอิกเกริก ทำให้จีนแผ่น", 133 | "ดินใหญ่ไม่พอใจอย่างมาก จึงข่มขวัญไต้หวันกับประเทศที่ให้การสนับ", 134 | "สนุนไต้หวัน ด้วยการทำการซ้อมรบขึ้นใกล้ ๆ เกาะไต้หวัน สหรัฐ", 135 | "อเมริกาออกมาแสดงอาการปกป้องคุ้มครองไต้หวันด้วยการส่งกำลังกอง", 136 | "เรือรบของสหรัฐฯ มาป้วนเปี้ยนอยู่ในน่านน้ำที่จีนซ้อมรบ\n", 137 | "ขณะที่โลกกำลังล่อแหลมกับสถานการณ์ที่ตึงเครียดในน่านน้ำจีนมาก", 138 | "ขึ้นทุกทีนั้น ไต้หวันก็จัดให้มีการเลือกตั้งครั้งใหม่ และในการ", 139 | "เลือกตั้งครั้งใหม่นั้นเอง ไต้หวันก็ได้นายหลี่ เติงฮุย เป็น", 140 | "ประธานาธิบดีอีกครั้ง\n", 141 | "ไต้หวันเข้าสู่สภาวะวิกฤต เมื่อเกิดแผ่นดินไหวครั้งร้ายแรงที่สุดใน", 142 | "ประวัติศาสตร์ในเดือนกันยายน พ.ศ. 
2542 (ค.ศ. 1999) ทำให้ประชากร", 143 | "ส่วนมากที่เป็นชาวพื้นเมืองเสียชีวิตไป 2,000 คน ทั้งเมืองมีแต่", 144 | "เศษซากปรักหักพังจากภัยธรรมชาติ และช่วงนี้ไต้หวันต้องเผชิญความ", 145 | "ยากลำบาก จีนแผ่นดินใหญ่ก็เพิ่มความกดดันไม่ให้นานาชาติ", 146 | "เข้ามายุ่งเกี่ยวกับไต้หวันแม้ในยามคับขันเช่นนี้ โดยประกาศว่า ", 147 | "หากมีประเทศใดจะเข้าไปให้ความช่วยเหลือไต้หวัน จะต้องได้รับอนุญาต", 148 | "จากจีนก่อน ซึ่งคำประกาศของจีนแผ่นดินใหญ่สวนทางกับเมตตาธรรมของ", 149 | "ประเทศทั่วโลกที่ต้องการให้ความช่วยเหลือไต้หวัน\n", 150 | "เดือนมีนาคม พ.ศ. 2543 (ค.ศ. 2000) มีการเลือกตั้งใหม่ในไต้หวัน ", 151 | "ชาวไต้หวันเลือกผู้แทนจากพรรคประชาธิปไตยก้าวหน้า คือ นายเฉิน สุย", 152 | "เปี่ยน เป็นประธานาธิบดีคนใหม่ของไต้หวัน ผู้ประกาศนโยบายการเมือง", 153 | "แข็งกร้าวว่าไต้หวันต้องการแยกตัวเป็นอิสระจากจีนแผ่นดินใหญ่ ยุติ", 154 | "ยุคของพรรคชาตินิยมที่ยังฝักใฝ่แผ่นดินใหญ่อยู่ จีนแผ่นดินใหญ่จึง", 155 | "ถือว่าเป็นกบฏต่อการปกครองของจีน เพราะแต่ไหนแต่ไร ไต้หวันไม่เคย", 156 | "ประกาศอย่างเป็นทางการว่าเป็นประเทศอิสระแยกจากจีน และจีนพูดอยู่", 157 | "เสมอว่าไต้หวันเป็นเด็กในปกครองที่ค่อนข้างจะหัวดื้อและเกเร หาก", 158 | "ไต้หวันประกาศว่าเป็นอิสระจากจีนเมื่อใด จีนก็จะยกกำลังจัดการ", 159 | "กับไต้หวันทันที\n", 160 | "ในขณะที่ความสัมพันธ์ทางการเมืองระหว่างสองจีนในสายตาชาวโลก", 161 | "เลวร้ายลง จีนทั้งสองกลับมีการติดต่อทางการค้ากันมากขึ้น มีการ", 162 | "ผ่อนปรนอนุญาตให้ชาวไต้หวันเดินทางไปจีนแผ่นดินใหญ่เพื่อเยี่ยม", 163 | "ญาติได้ เกิดปรากฏการณ์สำคัญคือนักธุรกิจไต้หวันหอบเงินทุนกว่า ", 164 | "20,000 ล้านดอลลาร์สหรัฐ ไปลงทุนดำเนินธุรกิจทางตอนใต้ของจีน", 165 | "แผ่นดินใหญ่ จนกระทั่งขณะนี้ชาวไต้หวันกลายเป็นนักลงทุนรายใหญ่", 166 | "เป็นลำดับ 2 ของจีน\n", 167 | "วันที่ 24 พฤษภาคม 2560 ศาลรัฐธรรมนูญวินิจฉัยว่ากฎหมายสมรส", 168 | "ปัจจุบันในเวลานั้น ละเมิดรัฐธรรมนูญ โดยปฏิเสธสิทธิสมรสของคู่รัก", 169 | "เพศเดียวกันชาวไต้หวัน ศาลวินิจฉัยว่าหากสภานิติบัญญัติไม่ผ่าน", 170 | "การแก้ไขกฎหมายที่เพียงพอต่อกฎหมายสมรสของไต้หวันภายในสองปี ", 171 | "การสมรสเพศเดียวกันจะชอบด้วยกฎหมายโดยอัตโนมัติในไต้หวัน[17] ", 172 | "วันที่ 17 
พฤษภาคม 2562 สภานิติบัญญัติไต้หวันอนุมัติ", 173 | "ร่างกฎหมายทำให้การสมรสเพศเดียวกันชอบด้วยกฎหมาย", 174 | " ทำให้เป็นประเทศแรกในทวีปเอเชียที่ผ่านกฎหมายดังกล่าว[18][19]", 175 | ] 176 | .join(""); 177 | 178 | let tokenizer = NewmmTokenizer::new(&relative_dict_path); 179 | let result = tokenizer.segment(&text, false, true).unwrap(); 180 | let safe_result = tokenizer.segment(&text, true, true).unwrap(); 181 | assert_eq!(result.len(), 1889); 182 | assert_eq!(safe_result.len(), 1991); 183 | } 184 | 185 | #[test] 186 | fn test_standard_short_word() { 187 | let mut relative_dict_path = env!("CARGO_MANIFEST_DIR").to_string(); 188 | relative_dict_path.push_str(DEFAULT_DICT_PATH); 189 | 190 | let tokenizer = NewmmTokenizer::new(&relative_dict_path); 191 | assert_eq!( 192 | tokenizer.segment_to_string("1) ประมวลผลภาษาไทย", false, false), 193 | ["1", ")", " ", "ประมวลผล", "ภาษาไทย"] 194 | ); 195 | assert_eq!( 196 | tokenizer.segment_to_string("มาตรา39", false, false), 197 | ["มาตรา", "39"] 198 | ); 199 | assert_eq!( 200 | tokenizer.segment_to_string("19...", false, false), 201 | ["19", "..."] 202 | ); 203 | assert_eq!( 204 | tokenizer.segment_to_string("19.", false, false), 205 | ["19", "."] 206 | ); 207 | assert_eq!( 208 | tokenizer.segment_to_string("19.84", false, false), 209 | ["19.84"] 210 | ); 211 | assert_eq!( 212 | tokenizer.segment_to_string("127.0.0.1", false, false), 213 | ["127.0.0.1"] 214 | ); 215 | assert_eq!( 216 | tokenizer.segment_to_string("USD1,984.42", false, false), 217 | ["USD", "1,984.42"] 218 | ); 219 | } 220 | 221 | #[test] 222 | fn test_add_or_remove_word() { 223 | let mut relative_dict_path = env!("CARGO_MANIFEST_DIR").to_string(); 224 | relative_dict_path.push_str(DEFAULT_DICT_PATH); 225 | 226 | let mut tokenizer = NewmmTokenizer::new(&relative_dict_path); 227 | tokenizer.add_word(&["ห้องสมุดประชาชนเทศบาลตำบลวิชิต"]); 228 | assert_eq!( 229 | tokenizer.segment_to_string("ห้องสมุดประชาชนเทศบาลตำบลวิชิต", false, false), 230 | 
["ห้องสมุดประชาชนเทศบาลตำบลวิชิต"] 231 | ); 232 | tokenizer.remove_word(&["ห้องสมุดประชาชนเทศบาลตำบลวิชิต", "ห้องสมุดประชาชน", "ประชาชน"]); 233 | assert_eq!( 234 | tokenizer.segment_to_string("ห้องสมุดประชาชนเทศบาลตำบลวิชิต", false, false), 235 | ["ห้องสมุด", "ประชา", "ชน", "เทศบาลตำบล", "วิชิต"] 236 | ); 237 | } 238 | 239 | #[test] 240 | fn test_with_some_real_data() { 241 | let mut relative_dict_path = env!("CARGO_MANIFEST_DIR").to_string(); 242 | relative_dict_path.push_str(DEFAULT_DICT_PATH); 243 | 244 | let tokenizer = NewmmTokenizer::new(&relative_dict_path); 245 | assert_eq!( 246 | tokenizer.segment_to_string(FIRST_TEXT, false, false), 247 | ["นิสสัน", "ผ่อน", "จน", "เพลีย", "นาวา", "ร่า", ".."] 248 | ); 249 | assert_eq!( 250 | tokenizer.segment_to_string(SECOND_TEXT, false, false), 251 | [ 252 | "อาชญากรรม", 253 | "ทางการแพทย์", 254 | "..", 255 | " ", 256 | "หลอกลวง", 257 | "คนไข้", 258 | "ผ่าตัด", 259 | " ", 260 | "ตัด", 261 | "หมอน", 262 | "รอง", 263 | "ข้อ", 264 | "เข่า", 265 | "อำพราง", 266 | " ", 267 | "รพ.", 268 | "กรุงเทพ", 269 | "ภูเก็ต", 270 | "ปลอม", 271 | "เวช", 272 | "ระเบียน", 273 | " ", 274 | "ตอนที่", 275 | "๑", 276 | "." 
277 | ] 278 | ); 279 | } 280 | 281 | #[test] 282 | fn test_thai_number() { 283 | let mut relative_dict_path = env!("CARGO_MANIFEST_DIR").to_string(); 284 | relative_dict_path.push_str(DEFAULT_DICT_PATH); 285 | 286 | let tokenizer = NewmmTokenizer::new(&relative_dict_path); 287 | assert_eq!( 288 | tokenizer.segment_to_string("๑๙...", false, false), 289 | ["๑๙", "..."] 290 | ); 291 | assert_eq!( 292 | tokenizer.segment_to_string("๑๙.", false, false), 293 | ["๑๙", "."] 294 | ); 295 | assert_eq!( 296 | tokenizer.segment_to_string("๑๙.๘๔", false, false), 297 | ["๑๙.๘๔"] 298 | ); 299 | assert_eq!( 300 | tokenizer.segment_to_string("๑๒๗.๐.๐.๑", false, false), 301 | ["๑๒๗.๐.๐.๑"] 302 | ); 303 | assert_eq!( 304 | tokenizer.segment_to_string("USD๑,๙๘๔.๔๒", false, false), 305 | ["USD", "๑,๙๘๔.๔๒"] 306 | ); 307 | } 308 | --------------------------------------------------------------------------------