├── .github └── workflows │ ├── build.yml │ └── deploy.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── cortex.yaml ├── images └── logo.png ├── poetry.lock ├── pyproject.toml ├── pywordsegment ├── __init__.py ├── bigrams.msgpack.gz ├── pywordsegment.pyi └── unigrams.msgpack.gz ├── scripts ├── bigrams_downloader.py ├── serializer.py └── unigrams_downloader.py ├── setup.cfg ├── src └── lib.rs └── tests ├── __init__.py └── test_pywordsegment.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | lint: 7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v3 12 | - name: Install latest rust 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | profile: minimal 17 | override: true 18 | components: clippy 19 | - name: Lint with clippy 20 | uses: actions-rs/cargo@v1 21 | with: 22 | command: clippy 23 | args: --all-targets --all-features 24 | test: 25 | runs-on: ${{ matrix.os }} 26 | needs: lint 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | python-version: 31 | - '3.7' 32 | - '3.8' 33 | - '3.9' 34 | - '3.10' 35 | - '3.11' 36 | os: 37 | - ubuntu-latest 38 | - macos-latest 39 | - windows-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v3 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v3 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install Poetry 48 | uses: abatilo/actions-poetry@v2.1.3 49 | - name: Install Rust 50 | uses: actions-rs/toolchain@v1 51 | with: 52 | profile: minimal 53 | toolchain: stable 54 | override: true 55 | - name: Install dependencies 56 | run: poetry install 57 | - name: Build Python package 58 | run: poetry run maturin develop 59 | - name: Test 60 | run: poetry run pytest -Werror tests 61 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | release: 4 | types: 5 | - released 6 | jobs: 7 | deploy: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: 13 | - '3.7' 14 | - '3.8' 15 | - '3.9' 16 | - '3.10' 17 | - '3.11' 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | - windows-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Rust 30 | uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: stable 34 | override: true 35 | - name: Install Cross-compilers (macOS) 36 | if: matrix.os == 'macos-latest' 37 | run: | 38 | rustup target add x86_64-apple-darwin 39 | rustup target add aarch64-apple-darwin 40 | - name: Publish Package 41 | uses: PyO3/maturin-action@v1 42 | with: 43 | command: publish 44 | args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }} 45 | env: 46 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 47 | if: matrix.os != 'macos-latest' 48 | - name: Publish macOS (x86_64) Package 49 | if: matrix.os == 'macos-latest' 50 
| uses: PyO3/maturin-action@v1 51 | with: 52 | command: publish 53 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist 54 | env: 55 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 56 | - name: Publish macOS (arm64) Package 57 | if: matrix.os == 'macos-latest' 58 | uses: PyO3/maturin-action@v1 59 | with: 60 | command: publish 61 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist 62 | env: 63 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | # Distribution / packaging 8 | .Python 9 | env/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # conflict temp files 32 | *.py.orig 33 | *.mock 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | coverage_html_report/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 65 | 66 | *.iml 67 | 68 | ## Directory-based project format: 69 | .idea/ 70 | 71 | # Tests 72 | generic_tests.py 73 | cloudflare_test.py 74 | 75 | ############################ 76 | #Eclipse Specific GitIgnore# 77 | ############################ 78 | *.pydevproject 79 | .project 80 | .metadata 81 | bin/** 82 | tmp/** 83 | tmp/**/* 84 | *.tmp 85 | *.bak 86 | *.swp 87 | *~.nib 88 | local.properties 89 | .classpath 90 | .settings/ 91 | .loadpath 92 | 93 | 94 | # Git mergetool traces 95 | *.orig 96 | 97 | # VS Code internal directory 98 | .vscode/ 99 | 100 | *.dat 101 | *.code-workspace 102 | .history 103 | 104 | # Intsights development playground 105 | playground/ 106 | 107 | pytest-report\.csv 108 | *.cppimporthash 109 | .rendered.* 110 | Databases.db 111 | 112 | # Node.js 113 | dist/ 114 | node_modules/ 115 | coverage/ 116 | 117 | # Generated by Cargo 118 | # will have compiled files and executables 119 | /target/ 120 | 121 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 122 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 123 | Cargo.lock 124 | 125 | # These are backup files generated by rustfmt 126 | **/*.rs.bk 127 | 128 | *.sqlite3 129 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pywordsegment" 3 | version = "0.4.3" 4 | authors = ["Gal Ben David "] 5 | edition = "2021" 6 | description = "Concatenated-word segmentation Python library written in Rust" 7 | readme = 
"README.md" 8 | repository = "https://github.com/intsights/pywordsegment" 9 | homepage = "https://github.com/intsights/pywordsegment" 10 | license = "MIT" 11 | keywords = [ 12 | "word", 13 | "segment", 14 | "rust", 15 | "pyo3", 16 | ] 17 | 18 | [package.metadata.maturin] 19 | 20 | [lib] 21 | name = "pywordsegment" 22 | crate-type = ["cdylib"] 23 | 24 | [dependencies] 25 | ahash = "0.7" 26 | rmp-serde = "1" 27 | 28 | [dependencies.pyo3] 29 | version = "0.16.5" 30 | features = ["extension-module"] 31 | 32 | [profile.release] 33 | lto = true 34 | panic = "abort" 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Gal Ben David 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Logo 4 | 5 | 6 | Concatenated-word segmentation Python library written in Rust 7 | 8 |
9 | 10 | 11 | ![license](https://img.shields.io/badge/MIT-License-blue) 12 | ![Python](https://img.shields.io/badge/Python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue) 13 | ![OS](https://img.shields.io/badge/OS-Mac%20%7C%20Linux%20%7C%20Windows-blue) 14 | ![Build](https://github.com/intsights/pywordsegment/workflows/Build/badge.svg) 15 | [![PyPi](https://img.shields.io/pypi/v/pywordsegment.svg)](https://pypi.org/project/pywordsegment/) 16 | 17 | ## Table of Contents 18 | 19 | - [Table of Contents](#table-of-contents) 20 | - [About The Project](#about-the-project) 21 | - [Built With](#built-with) 22 | - [Installation](#installation) 23 | - [Usage](#usage) 24 | - [License](#license) 25 | - [Contact](#contact) 26 | 27 | 28 | ## About The Project 29 | 30 | A fast concatenated-word segmentation library written in Rust, inspired by [wordninja](https://github.com/keredson/wordninja) and [wordsegment](https://github.com/grantjenks/python-wordsegment). The binding uses [pyo3](https://github.com/PyO3/pyo3) to interact with the rust package. 31 | 32 | 33 | ### Built With 34 | 35 | * [pyo3](https://github.com/PyO3/pyo3) 36 | 37 | 38 | ### Installation 39 | 40 | ```sh 41 | pip3 install pywordsegment 42 | ``` 43 | 44 | 45 | ## Usage 46 | 47 | ```python 48 | import pywordsegment 49 | 50 | # The internal UNIGRAMS & BIGRAMS corpuses are lazy initialized 51 | # once per the whole module. Multiple WordSegmenter instances would 52 | # not create new dictionaries. 53 | 54 | # Segments a word to its parts 55 | pywordsegment.WordSegmenter.segment( 56 | text="theusashops", 57 | ) 58 | # ["the", "usa", "shops"] 59 | 60 | 61 | # This function checks whether the substring exists as a whole segment 62 | # inside text. 63 | pywordsegment.WordSegmenter.exist_as_segment( 64 | substring="inter", 65 | text="internationalairport", 66 | ) 67 | # False 68 | 69 | pywordsegment.WordSegmenter.exist_as_segment( 70 | substring="inter", 71 | text="intermilan", 72 | ) 73 | # True 74 | ``` 75 | 76 | 77 | ## License 78 | 79 | Distributed under the MIT License. See `LICENSE` for more information. 80 | 81 | 82 | ## Contact 83 | 84 | Gal Ben David - gal@intsights.com 85 | 86 | Project Link: [https://github.com/intsights/pywordsegment](https://github.com/intsights/pywordsegment) 87 | -------------------------------------------------------------------------------- /cortex.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | info: 3 | title: Pywordsegment 4 | description: Concatenated-word segmentation Python library written in Rust 5 | x-cortex-git: 6 | github: 7 | alias: intsightsorg 8 | repository: Intsights/PyWordSegment 9 | x-cortex-tag: pywordsegment 10 | x-cortex-type: service 11 | x-cortex-domain-parents: 12 | - tag: threatintel-phishing 13 | x-cortex-groups: 14 | - exposure:external-ship 15 | - target:library 16 | openapi: 3.0.1 17 | servers: 18 | - url: "/" 19 | -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/images/logo.png -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "colorama" 3 | version = "0.4.6" 4 | description = "Cross-platform colored terminal text." 
5 | category = "dev" 6 | optional = false 7 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 8 | 9 | [[package]] 10 | name = "exceptiongroup" 11 | version = "1.2.0" 12 | description = "Backport of PEP 654 (exception groups)" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=3.7" 16 | 17 | [package.extras] 18 | test = ["pytest (>=6)"] 19 | 20 | [[package]] 21 | name = "importlib-metadata" 22 | version = "6.7.0" 23 | description = "Read metadata from Python packages" 24 | category = "dev" 25 | optional = false 26 | python-versions = ">=3.7" 27 | 28 | [package.dependencies] 29 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 30 | zipp = ">=0.5" 31 | 32 | [package.extras] 33 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] 34 | perf = ["ipython"] 35 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-ruff", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] 36 | 37 | [[package]] 38 | name = "iniconfig" 39 | version = "2.0.0" 40 | description = "brain-dead simple config-ini parsing" 41 | category = "dev" 42 | optional = false 43 | python-versions = ">=3.7" 44 | 45 | [[package]] 46 | name = "maturin" 47 | version = "1.4.0" 48 | description = "Build and publish crates with pyo3, rust-cpython and cffi bindings as well as rust binaries as python packages" 49 | category = "dev" 50 | optional = false 51 | python-versions = ">=3.7" 52 | 53 | [package.dependencies] 54 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} 55 | 56 | [package.extras] 57 | zig = ["ziglang (>=0.10.0,<0.11.0)"] 58 | patchelf = ["patchelf"] 59 | 60 | [[package]] 61 | name = "packaging" 62 | version = "23.2" 63 | description = "Core utilities for Python packages" 64 | category = "dev" 65 | optional = false 66 | python-versions = ">=3.7" 67 | 68 | [[package]] 69 | name = "pluggy" 70 | version = "1.2.0" 71 | description = "plugin and hook calling mechanisms for python" 72 | category = "dev" 73 | optional = false 74 | python-versions = ">=3.7" 75 | 76 | [package.dependencies] 77 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 78 | 79 | [package.extras] 80 | dev = ["pre-commit", "tox"] 81 | testing = ["pytest", "pytest-benchmark"] 82 | 83 | [[package]] 84 | name = "pytest" 85 | version = "7.4.4" 86 | description = "pytest: simple powerful testing with Python" 87 | category = "dev" 88 | optional = false 89 | python-versions = ">=3.7" 90 | 91 | [package.dependencies] 92 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 93 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 94 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 95 | iniconfig = "*" 96 | packaging = "*" 97 | pluggy = ">=0.12,<2.0" 98 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 99 | 100 | [package.extras] 101 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] 102 | 103 | [[package]] 104 | name = "pytest-runner" 105 | version = "6.0.1" 106 | description = "Invoke py.test as distutils command with dependency resolution" 107 | category = "dev" 108 | optional = false 109 | python-versions = ">=3.7" 110 | 111 | 
[package.extras] 112 | docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] 113 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-virtualenv", "types-setuptools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] 114 | 115 | [[package]] 116 | name = "tomli" 117 | version = "2.0.1" 118 | description = "A lil' TOML parser" 119 | category = "dev" 120 | optional = false 121 | python-versions = ">=3.7" 122 | 123 | [[package]] 124 | name = "typing-extensions" 125 | version = "4.7.1" 126 | description = "Backported and Experimental Type Hints for Python 3.7+" 127 | category = "dev" 128 | optional = false 129 | python-versions = ">=3.7" 130 | 131 | [[package]] 132 | name = "zipp" 133 | version = "3.15.0" 134 | description = "Backport of pathlib-compatible object wrapper for zip files" 135 | category = "dev" 136 | optional = false 137 | python-versions = ">=3.7" 138 | 139 | [package.extras] 140 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] 141 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "jaraco.functools", "more-itertools", "big-o", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] 142 | 143 | [metadata] 144 | lock-version = "1.1" 145 | python-versions = "^3.7" 146 | content-hash = "d3751775f5a48f55874329689185792d15525d44f15678cc3bfeb66b5dea0d3d" 147 | 148 | [metadata.files] 149 | colorama = [] 150 | exceptiongroup = [] 151 | importlib-metadata = [] 152 | iniconfig = [] 153 | maturin = [] 154 | packaging = [] 155 | pluggy = [] 156 | pytest = [] 157 | pytest-runner = [] 158 | tomli = [] 159 | typing-extensions = [] 160 | zipp = [] 161 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=0.12,<0.13"] 3 | build-backend = "maturin" 4 | 5 | [tool.maturin] 6 | sdist-include = [ 7 | "Cargo.toml", 8 | "pyproject.toml", 9 | "pywordsegment/*.gz", 10 | "pywordsegment/*.py", 11 | "pywordsegment/*.pyi", 12 | "src/*", 13 | ] 14 | 15 | [tool.poetry] 16 | name = "pywordsegment" 17 | version = "0.4.3" 18 | authors = ["Gal Ben David "] 19 | description = "Concatenated-word segmentation Python library written in Rust" 20 | readme = "README.md" 21 | repository = "https://github.com/intsights/pywordsegment" 22 | homepage = "https://github.com/intsights/pywordsegment" 23 | license = "MIT" 24 | keywords = [ 25 | "word", 26 | "segment", 27 | "rust", 28 | "pyo3" 29 | ] 30 | classifiers = [ 31 | "License :: OSI Approved :: MIT License", 32 | "Operating System :: MacOS", 33 | "Operating System :: Microsoft", 34 | "Operating System :: POSIX :: Linux", 35 | "Programming Language :: Python :: 3.7", 36 | "Programming Language :: Python :: 3.8", 37 | "Programming Language :: Python :: 3.9", 38 | "Programming Language :: Python :: 3.10", 39 | "Programming Language :: Python :: 3.11", 40 | "Programming Language :: Rust", 41 | ] 42 | 43 | [tool.poetry.dependencies] 44 | python = "^3.7" 45 | 46 | [tool.poetry.dev-dependencies] 47 | pytest = "*" 48 | wheel = "*" 49 | pytest-runner = "*" 50 | maturin = "*" 51 | -------------------------------------------------------------------------------- /pywordsegment/__init__.py: 
-------------------------------------------------------------------------------- 1 | import gzip 2 | import importlib.resources 3 | import sys 4 | import typing 5 | 6 | from . import pywordsegment 7 | 8 | PY_VERSION_MAJOR = sys.version_info.major 9 | PY_VERSION_MINOR = sys.version_info.minor 10 | 11 | class WordSegmenter: 12 | word_segmenter: pywordsegment.WordSegmenter = None 13 | 14 | @staticmethod 15 | def load() -> None: 16 | if WordSegmenter.word_segmenter is None: 17 | if PY_VERSION_MAJOR >= 3 and PY_VERSION_MINOR >= 11: 18 | with importlib.resources.files( 19 | __package__, 20 | ).joinpath( 21 | 'unigrams.msgpack.gz', 22 | ).open( 23 | 'rb', 24 | ) as unigrams_msgpack, importlib.resources.files( 25 | __package__, 26 | ).joinpath( 27 | 'bigrams.msgpack.gz', 28 | ).open( 29 | 'rb', 30 | ) as bigrams_msgpack: 31 | unigrams_serialized = gzip.decompress( 32 | data=unigrams_msgpack.read(), 33 | ) 34 | bigrams_serialized = gzip.decompress( 35 | data=bigrams_msgpack.read(), 36 | ) 37 | 38 | else: 39 | unigrams_serialized = gzip.decompress( 40 | data=importlib.resources.read_binary( 41 | package=__package__, 42 | resource='unigrams.msgpack.gz', 43 | ), 44 | ) 45 | 46 | bigrams_serialized = gzip.decompress( 47 | data=importlib.resources.read_binary( 48 | package=__package__, 49 | resource='bigrams.msgpack.gz', 50 | ), 51 | ) 52 | 53 | WordSegmenter.word_segmenter = pywordsegment.WordSegmenter( 54 | unigrams_serialized=unigrams_serialized, 55 | bigrams_serialized=bigrams_serialized, 56 | ) 57 | 58 | @staticmethod 59 | def segment( 60 | text: str, 61 | ) -> typing.List[str]: 62 | if WordSegmenter.word_segmenter is None: 63 | WordSegmenter.load() 64 | 65 | return WordSegmenter.word_segmenter.segment(text) 66 | 67 | @staticmethod 68 | def exist_as_segment( 69 | substring: str, 70 | text: str, 71 | ) -> bool: 72 | if WordSegmenter.word_segmenter is None: 73 | WordSegmenter.load() 74 | 75 | return WordSegmenter.word_segmenter.exist_as_segment(substring, text) 76 | -------------------------------------------------------------------------------- /pywordsegment/bigrams.msgpack.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/pywordsegment/bigrams.msgpack.gz -------------------------------------------------------------------------------- /pywordsegment/pywordsegment.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | 4 | class WordSegmenter: 5 | @staticmethod 6 | def load() -> None: ... 7 | 8 | @staticmethod 9 | def segment( 10 | text: str, 11 | ) -> typing.List[str]: ... 12 | 13 | @staticmethod 14 | def exist_as_segment( 15 | substring: str, 16 | text: str, 17 | ) -> bool: ... 
18 | -------------------------------------------------------------------------------- /pywordsegment/unigrams.msgpack.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/pywordsegment/unigrams.msgpack.gz -------------------------------------------------------------------------------- /scripts/bigrams_downloader.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import sqlite3 3 | import urllib.request 4 | import concurrent.futures 5 | 6 | 7 | def process_url( 8 | url, 9 | ): 10 | print(f'processing {url}') 11 | 12 | db_connection = sqlite3.connect( 13 | database='bigrams.sqlite3', 14 | timeout=100000, 15 | ) 16 | db_cursor = db_connection.cursor() 17 | db_cursor.execute( 18 | ''' 19 | CREATE TABLE IF NOT EXISTS bigrams ( 20 | bigram_first TEXT, 21 | bigram_second TEXT, 22 | count INTEGER, 23 | UNIQUE(bigram_first, bigram_second) 24 | ) 25 | ''' 26 | ) 27 | db_cursor.execute( 28 | ''' 29 | CREATE INDEX IF NOT EXISTS count ON bigrams (count) 30 | ''' 31 | ) 32 | db_connection.commit() 33 | 34 | chunk = [] 35 | with urllib.request.urlopen( 36 | url=url, 37 | ) as response: 38 | with gzip.GzipFile( 39 | fileobj=response, 40 | ) as uncompressed: 41 | for line in uncompressed: 42 | bigram, _, fragments = line.decode().partition('\t') 43 | 44 | bigram_first, _, bigram_second = bigram.lower().partition(' ') 45 | bigram_first, _, _ = bigram_first.rpartition('_') 46 | bigram_second, _, _ = bigram_second.rpartition('_') 47 | 48 | if not bigram_first.isalnum() or not bigram_second.isalnum(): 49 | continue 50 | 51 | count = 0 52 | for frag in fragments.split('\t'): 53 | count += int(frag.split(',')[1]) 54 | 55 | if len(chunk) == 100000: 56 | db_cursor.executemany( 57 | ''' 58 | INSERT INTO bigrams 59 | VALUES (?, ?, ?) 60 | ON CONFLICT (bigram_first, bigram_second) DO 61 | UPDATE SET count = count + ?; 62 | ''', 63 | chunk, 64 | ) 65 | db_connection.commit() 66 | chunk.clear() 67 | else: 68 | chunk.append( 69 | ( 70 | bigram_first, 71 | bigram_second, 72 | count, 73 | count, 74 | ) 75 | ) 76 | 77 | db_cursor.executemany( 78 | ''' 79 | INSERT INTO bigrams 80 | VALUES (?, ?, ?) 
81 | ON CONFLICT (bigram_first, bigram_second) DO 82 | UPDATE SET count = count + ?; 83 | ''', 84 | chunk, 85 | ) 86 | db_connection.commit() 87 | 88 | 89 | futures = [] 90 | with concurrent.futures.ProcessPoolExecutor( 91 | max_workers=30, 92 | ) as executor: 93 | urls = [ 94 | f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/2-{i:05d}-of-00589.gz' 95 | for i in range(0, 589) 96 | ] 97 | for url in urls: 98 | futures.append(executor.submit(process_url, url)) 99 | 100 | for future in concurrent.futures.as_completed(futures): 101 | print(f'finished {future.result()}') 102 | -------------------------------------------------------------------------------- /scripts/serializer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import msgpack 3 | import sqlite3 4 | import math 5 | 6 | 7 | unigrams_connection = sqlite3.connect( 8 | database='unigrams.sqlite3', 9 | timeout=10, 10 | ) 11 | unigrams_db_cursor = unigrams_connection.cursor() 12 | 13 | bigrams_connection = sqlite3.connect( 14 | database='bigrams.sqlite3', 15 | timeout=10, 16 | ) 17 | bigrams_db_cursor = bigrams_connection.cursor() 18 | 19 | unigrams = { 20 | unigram: float(count) 21 | for unigram, count in unigrams_db_cursor.execute( 22 | ''' 23 | SELECT word, count 24 | FROM unigrams 25 | ORDER BY count DESC 26 | LIMIT 1000000; 27 | ''' 28 | ) 29 | } 30 | unigrams_total_count = sum(unigrams.values()) 31 | 32 | bigrams = {} 33 | bigrams_total_count = 0 34 | for bigram_first, bigram_second, count in bigrams_db_cursor.execute( 35 | ''' 36 | SELECT bigram_first, bigram_second, count 37 | FROM bigrams 38 | ORDER BY count DESC 39 | LIMIT 100000; 40 | ''' 41 | ): 42 | bigram_first = bigram_first.decode() 43 | bigram_second = bigram_second.decode() 44 | 45 | if bigram_first in bigrams: 46 | bigrams[bigram_first][bigram_second] = float(count) 47 | else: 48 | bigrams[bigram_first] = { 49 | bigram_second: float(count) 50 | } 51 | 52 | bigrams_total_count += count 53 | 54 | bigrams_processed = {} 55 | for bigram_first, inner in bigrams.items(): 56 | for bigram_second, count in inner.items(): 57 | if bigram_first in unigrams: 58 | if bigram_first not in bigrams_processed: 59 | bigrams_processed[bigram_first] = {} 60 | bigrams_processed[bigram_first][bigram_second] = math.log10( 61 | (count / bigrams_total_count) / 62 | (unigrams[bigram_first] / unigrams_total_count) 63 | ) 64 | 65 | unigrams_processed = { 66 | unigram: math.log10(count / unigrams_total_count) 67 | for unigram, count in unigrams.items() 68 | } 69 | unigrams_processed['unigrams_total_count'] = unigrams_total_count 70 | 71 | with gzip.GzipFile( 72 | filename='unigrams.msgpack.gz', 73 | mode='wb', 74 | ) as compressed_file: 75 | compressed_file.write(msgpack.packb(unigrams_processed)) 76 | 77 | with gzip.GzipFile( 78 | filename='bigrams.msgpack.gz', 79 | mode='wb', 80 | ) as compressed_file: 81 | compressed_file.write(msgpack.packb(bigrams_processed)) 82 | -------------------------------------------------------------------------------- /scripts/unigrams_downloader.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import sqlite3 3 | import urllib.request 4 | 5 | 6 | db_connection = sqlite3.connect( 7 | database='unigrams.sqlite3', 8 | timeout=10, 9 | ) 10 | db_cursor = db_connection.cursor() 11 | db_cursor.execute( 12 | ''' 13 | CREATE TABLE IF NOT EXISTS unigrams ( 14 | unigram TEXT, 15 | count INTEGER, 16 | UNIQUE(unigram) 17 | ) 18 | ''' 19 | ) 20 | 
db_cursor.execute( 21 | ''' 22 | CREATE INDEX IF NOT EXISTS count ON unigrams (count) 23 | ''' 24 | ) 25 | db_connection.commit() 26 | 27 | urls = [ 28 | f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/1-{i:05d}-of-00024.gz' 29 | for i in range(0, 24) 30 | ] 31 | for url in urls: 32 | print(f'processing {url}') 33 | 34 | with urllib.request.urlopen( 35 | url=url, 36 | ) as response: 37 | with gzip.GzipFile( 38 | fileobj=response, 39 | ) as uncompressed: 40 | for line in uncompressed: 41 | fragments = line.decode().split('\t') 42 | unigram = fragments[0].lower() 43 | if not unigram.isalnum(): 44 | continue 45 | 46 | count = 0 47 | for frag in fragments[1:]: 48 | year, number_of_instances, volume = frag.split(',') 49 | count += int(number_of_instances) 50 | 51 | db_cursor.execute( 52 | ''' 53 | INSERT INTO unigrams 54 | VALUES (?, ?) 55 | ON CONFLICT (unigram) DO 56 | UPDATE SET count = count + ?; 57 | ''', 58 | ( 59 | unigram, 60 | count, 61 | count, 62 | ), 63 | ) 64 | 65 | db_connection.commit() 66 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --tb=native -s 6 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use ahash::RandomState; 2 | use pyo3::prelude::*; 3 | use std::collections::HashMap; 4 | 5 | const MAX_WORD_LEN: usize = 24; 6 | 7 | #[pyclass] 8 | struct WordSegmenter { 9 | unigrams: HashMap, 10 | bigrams: HashMap, RandomState>, 11 | unknown_unigrams: [f64; MAX_WORD_LEN + 1], 12 | } 13 | 14 | #[pymethods] 15 | impl WordSegmenter { 16 | #[new] 17 | fn new( 18 | unigrams_serialized: &[u8], 19 | bigrams_serialized: &[u8], 20 | ) -> Self { 21 | let unigrams: HashMap = rmp_serde::from_slice(unigrams_serialized).unwrap(); 22 | let bigrams = rmp_serde::from_slice(bigrams_serialized).unwrap(); 23 | 24 | let total_unigrams_frequency = unigrams.get("unigrams_total_count").unwrap(); 25 | let mut unknown_unigrams = [0.0; MAX_WORD_LEN + 1]; 26 | for (word_len, value) in unknown_unigrams.iter_mut().enumerate() { 27 | *value = (10.0 / (total_unigrams_frequency * 10_f64.powi(word_len as i32))).log10(); 28 | } 29 | 30 | WordSegmenter { 31 | unigrams, 32 | bigrams, 33 | unknown_unigrams, 34 | } 35 | } 36 | 37 | fn segment( 38 | &self, 39 | py: Python, 40 | text: String, 41 | ) -> PyResult> { 42 | let clean_text = text 43 | .to_ascii_lowercase() 44 | .replace( 45 | |c: char| !c.is_ascii_alphanumeric(), 46 | "" 47 | ); 48 | 49 | let words = self.search(&clean_text); 50 | 51 | Ok(words.into_py(py)) 52 | } 53 | 54 | fn exist_as_segment( 55 | &self, 56 | substring: String, 57 | text: String, 58 | ) -> PyResult { 59 | let clean_text = text 60 | .to_ascii_lowercase() 61 | .replace( 62 | |c: char| !c.is_ascii_alphanumeric(), 63 | "" 64 | ); 65 | 66 | let clean_substring = substring 67 | .to_ascii_lowercase() 68 | .replace( 69 | |c: char| !c.is_ascii_alphanumeric(), 70 | "" 71 | ); 72 | 73 | let segmented_text = self.search(&clean_text); 74 | let segmented_substring = self.search(&clean_substring); 75 | 76 | let segmented_substring_pattern = format!("-{}-", segmented_substring.join("-")); 77 | let segmented_text_pattern = format!("-{}-", segmented_text.join("-")); 78 | 79 | Ok(segmented_text_pattern.contains(&segmented_substring_pattern)) 80 | } 81 | } 82 | 83 | impl 
WordSegmenter { 84 | fn score( 85 | &self, 86 | word: &str, 87 | previous: &str, 88 | ) -> f64 { 89 | if !previous.is_empty() { 90 | if let Some(first_bigram_layer) = self.bigrams.get(previous) { 91 | if let Some(bigram_frequency) = first_bigram_layer.get(word) { 92 | return *bigram_frequency; 93 | } 94 | } 95 | } 96 | 97 | match self.unigrams.get(word) { 98 | Some(frequency) => *frequency, 99 | None => self.unknown_unigrams[word.len()], 100 | } 101 | } 102 | 103 | fn search<'a>( 104 | &self, 105 | text: &'a str, 106 | ) -> Vec<&'a str> { 107 | let mut result = Vec::with_capacity(text.len()); 108 | let mut candidates = Vec::with_capacity(text.len()); 109 | 110 | if text.is_empty() { 111 | return result; 112 | } 113 | 114 | for end in 1..=text.len() { 115 | let start = end.saturating_sub(MAX_WORD_LEN); 116 | for split in start..end { 117 | let (prev, prev_score) = match split { 118 | 0 => ("", 0.0), 119 | _ => { 120 | let (prefix_len, prefix_score) = candidates[split - 1]; 121 | let word = &text[split - prefix_len as usize..split]; 122 | (word, prefix_score) 123 | } 124 | }; 125 | 126 | let word = &text[split..end]; 127 | let score = self.score(word, prev) + prev_score; 128 | match candidates.get_mut(end - 1) { 129 | Some((cur_len, cur_score)) if *cur_score < score => { 130 | *cur_len = end - split; 131 | *cur_score = score; 132 | } 133 | None => candidates.push((end - split, score)), 134 | _ => {}, 135 | } 136 | } 137 | } 138 | 139 | let mut end = text.len(); 140 | let (mut best_len, mut _best_score) = candidates[end - 1]; 141 | loop { 142 | let word = &text[end - best_len..end]; 143 | result.insert(0, word); 144 | 145 | end -= best_len; 146 | if end == 0 { 147 | break; 148 | } 149 | 150 | best_len = candidates[end - 1].0; 151 | } 152 | 153 | result 154 | } 155 | } 156 | 157 | #[pymodule] 158 | fn pywordsegment( 159 | _py: Python, 160 | m: &PyModule, 161 | ) -> PyResult<()> { 162 | m.add_class::()?; 163 | 164 | Ok(()) 165 | } 166 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_pywordsegment.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pywordsegment 4 | 5 | 6 | class WordSegmentTestCase( 7 | unittest.TestCase, 8 | ): 9 | def test_segment_1( 10 | self, 11 | ): 12 | self.assertEqual( 13 | first=pywordsegment.WordSegmenter.segment( 14 | text='theusashops', 15 | ), 16 | second=[ 17 | 'the', 18 | 'usa', 19 | 'shops', 20 | ], 21 | ) 22 | 23 | def test_segment_2( 24 | self, 25 | ): 26 | self.assertEqual( 27 | first=pywordsegment.WordSegmenter.segment( 28 | text='choosespain', 29 | ), 30 | second=[ 31 | 'choose', 32 | 'spain', 33 | ], 34 | ) 35 | 36 | def test_segment_3( 37 | self, 38 | ): 39 | self.assertEqual( 40 | first=pywordsegment.WordSegmenter.segment( 41 | text='thisisatest', 42 | ), 43 | second=[ 44 | 'this', 45 | 'is', 46 | 'a', 47 | 'test', 48 | ], 49 | ) 50 | 51 | def test_segment_4( 52 | self, 53 | ): 54 | self.assertEqual( 55 | first=pywordsegment.WordSegmenter.segment( 56 | text='wheninthecourseofhumaneventsitbecomesnecessary', 57 | ), 58 | second=[ 59 | 'when', 60 | 'in', 61 | 'the', 62 | 'course', 63 | 'of', 64 | 'human', 65 | 'events', 66 | 'it', 67 | 'becomes', 68 | 
'necessary', 69 | ], 70 | ) 71 | 72 | def test_segment_5( 73 | self, 74 | ): 75 | self.assertEqual( 76 | first=pywordsegment.WordSegmenter.segment( 77 | text='whorepresents', 78 | ), 79 | second=[ 80 | 'who', 81 | 'represents', 82 | ], 83 | ) 84 | 85 | def test_segment_6( 86 | self, 87 | ): 88 | self.assertEqual( 89 | first=pywordsegment.WordSegmenter.segment( 90 | text='expertsexchange', 91 | ), 92 | second=[ 93 | 'experts', 94 | 'exchange', 95 | ], 96 | ) 97 | 98 | def test_segment_7( 99 | self, 100 | ): 101 | self.assertEqual( 102 | first=pywordsegment.WordSegmenter.segment( 103 | text='speedofart', 104 | ), 105 | second=[ 106 | 'speed', 107 | 'of', 108 | 'art', 109 | ], 110 | ) 111 | 112 | def test_segment_8( 113 | self, 114 | ): 115 | self.assertEqual( 116 | first=pywordsegment.WordSegmenter.segment( 117 | text='nowisthetimeforallgood', 118 | ), 119 | second=[ 120 | 'now', 121 | 'is', 122 | 'the', 123 | 'time', 124 | 'for', 125 | 'all', 126 | 'good', 127 | ], 128 | ) 129 | 130 | def test_segment_9( 131 | self, 132 | ): 133 | self.assertEqual( 134 | first=pywordsegment.WordSegmenter.segment( 135 | text='itisatruthuniversallyacknowledged', 136 | ), 137 | second=[ 138 | 'it', 139 | 'is', 140 | 'a', 141 | 'truth', 142 | 'universally', 143 | 'acknowledged', 144 | ], 145 | ) 146 | 147 | def test_segment_10( 148 | self, 149 | ): 150 | self.assertEqual( 151 | first=pywordsegment.WordSegmenter.segment( 152 | text='itwasabrightcolddayinaprilandtheclockswerestrikingthirteen', 153 | ), 154 | second=[ 155 | 'it', 156 | 'was', 157 | 'a', 158 | 'bright', 159 | 'cold', 160 | 'day', 161 | 'in', 162 | 'april', 163 | 'and', 164 | 'the', 165 | 'clocks', 166 | 'were', 167 | 'striking', 168 | 'thirteen', 169 | ], 170 | ) 171 | 172 | def test_segment_11( 173 | self, 174 | ): 175 | self.assertEqual( 176 | first=pywordsegment.WordSegmenter.segment( 177 | text='CaseTest', 178 | ), 179 | second=[ 180 | 'case', 181 | 'test', 182 | ], 183 | ) 184 | 185 | def test_segment_12( 186 | self, 187 | ): 188 | self.assertEqual( 189 | first=pywordsegment.WordSegmenter.segment( 190 | text='', 191 | ), 192 | second=[], 193 | ) 194 | 195 | def test_segment_13( 196 | self, 197 | ): 198 | self.assertEqual( 199 | first=pywordsegment.WordSegmenter.segment( 200 | text='a', 201 | ), 202 | second=[ 203 | 'a', 204 | ], 205 | ) 206 | 207 | def test_exist_as_segment_1( 208 | self, 209 | ): 210 | self.assertFalse( 211 | expr=pywordsegment.WordSegmenter.exist_as_segment( 212 | substring='man', 213 | text='manual', 214 | ), 215 | ) 216 | self.assertTrue( 217 | expr=pywordsegment.WordSegmenter.exist_as_segment( 218 | substring='man', 219 | text='oneman', 220 | ), 221 | ) 222 | --------------------------------------------------------------------------------
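
The dynamic program behind `WordSegmenter.segment` lives in `src/lib.rs` above (the `search` and `score` methods). For readers more comfortable in Python, here is a minimal re-sketch of the same algorithm, assuming `unigrams` and `bigrams` are plain dicts holding the log10 scores produced by `scripts/serializer.py` and `total` is the raw unigram count. The helper names and signatures are illustrative only and are not part of the package's API.

```python
import math

MAX_WORD_LEN = 24  # same bound as src/lib.rs


def score(word, previous, unigrams, bigrams, total):
    # Prefer the conditional bigram score when the (previous, word) pair is known.
    if previous and word in bigrams.get(previous, {}):
        return bigrams[previous][word]
    if word in unigrams:
        return unigrams[word]
    # Unknown-word penalty: one extra order of magnitude per character.
    return math.log10(10.0 / (total * 10.0 ** len(word)))


def segment(text, unigrams, bigrams, total):
    # Keep only ASCII alphanumerics, lowercased (mirrors the clean-up in lib.rs).
    text = "".join(c for c in text.lower() if c.isascii() and c.isalnum())
    if not text:
        return []

    # best[i] = (length of the best last word ending at i + 1, cumulative score)
    best = [None] * len(text)
    for end in range(1, len(text) + 1):
        for split in range(max(0, end - MAX_WORD_LEN), end):
            if split == 0:
                previous, prefix_score = "", 0.0
            else:
                prev_len, prefix_score = best[split - 1]
                previous = text[split - prev_len:split]
            candidate = prefix_score + score(
                text[split:end], previous, unigrams, bigrams, total,
            )
            if best[end - 1] is None or candidate > best[end - 1][1]:
                best[end - 1] = (end - split, candidate)

    # Backtrack from the end of the string to recover the words.
    words, end = [], len(text)
    while end > 0:
        length = best[end - 1][0]
        words.insert(0, text[end - length:end])
        end -= length
    return words
```

The `MAX_WORD_LEN` bound keeps the inner loop constant-sized, so a full pass is linear in the input length, and the unknown-word penalty shrinks by one order of magnitude per extra character, which strongly discourages long out-of-vocabulary chunks. Fed the shipped corpora, this sketch should reproduce the behavior the README and tests show for the Rust implementation, e.g. splitting `"theusashops"` into `["the", "usa", "shops"]`.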