├── .github
│   └── workflows
│       ├── build.yml
│       └── deploy.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── cortex.yaml
├── images
│   └── logo.png
├── poetry.lock
├── pyproject.toml
├── pywordsegment
│   ├── __init__.py
│   ├── bigrams.msgpack.gz
│   ├── pywordsegment.pyi
│   └── unigrams.msgpack.gz
├── scripts
│   ├── bigrams_downloader.py
│   ├── serializer.py
│   └── unigrams_downloader.py
├── setup.cfg
├── src
│   └── lib.rs
└── tests
    ├── __init__.py
    └── test_pywordsegment.py
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 | on:
3 | - push
4 | - pull_request
5 | jobs:
6 | lint:
7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
8 | runs-on: ubuntu-latest
9 | steps:
10 | - name: Checkout
11 | uses: actions/checkout@v3
12 | - name: Install latest rust
13 | uses: actions-rs/toolchain@v1
14 | with:
15 | toolchain: stable
16 | profile: minimal
17 | override: true
18 | components: clippy
19 | - name: Lint with clippy
20 | uses: actions-rs/cargo@v1
21 | with:
22 | command: clippy
23 | args: --all-targets --all-features
24 | test:
25 | runs-on: ${{ matrix.os }}
26 | needs: lint
27 | strategy:
28 | fail-fast: false
29 | matrix:
30 | python-version:
31 | - '3.7'
32 | - '3.8'
33 | - '3.9'
34 | - '3.10'
35 | - '3.11'
36 | os:
37 | - ubuntu-latest
38 | - macos-latest
39 | - windows-latest
40 | steps:
41 | - name: Checkout
42 | uses: actions/checkout@v3
43 | - name: Set up Python ${{ matrix.python-version }}
44 | uses: actions/setup-python@v3
45 | with:
46 | python-version: ${{ matrix.python-version }}
47 | - name: Install Poetry
48 | uses: abatilo/actions-poetry@v2.1.3
49 | - name: Install Rust
50 | uses: actions-rs/toolchain@v1
51 | with:
52 | profile: minimal
53 | toolchain: stable
54 | override: true
55 | - name: Install dependencies
56 | run: poetry install
57 | - name: Build Python package
58 | run: poetry run maturin develop
59 | - name: Test
60 | run: poetry run pytest -Werror tests
61 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy
2 | on:
3 | release:
4 | types:
5 | - released
6 | jobs:
7 | deploy:
8 | runs-on: ${{ matrix.os }}
9 | strategy:
10 | fail-fast: false
11 | matrix:
12 | python-version:
13 | - '3.7'
14 | - '3.8'
15 | - '3.9'
16 | - '3.10'
17 | - '3.11'
18 | os:
19 | - ubuntu-latest
20 | - macos-latest
21 | - windows-latest
22 | steps:
23 | - name: Checkout
24 | uses: actions/checkout@v3
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: actions/setup-python@v4
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | - name: Install Rust
30 | uses: actions-rs/toolchain@v1
31 | with:
32 | profile: minimal
33 | toolchain: stable
34 | override: true
35 | - name: Install Cross-compilers (macOS)
36 | if: matrix.os == 'macos-latest'
37 | run: |
38 | rustup target add x86_64-apple-darwin
39 | rustup target add aarch64-apple-darwin
40 | - name: Publish Package
41 | uses: PyO3/maturin-action@v1
42 | with:
43 | command: publish
44 | args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }}
45 | env:
46 | MATURIN_PASSWORD: ${{ secrets.pypi_password }}
47 | if: matrix.os != 'macos-latest'
48 | - name: Publish macOS (x86_64) Package
49 | if: matrix.os == 'macos-latest'
50 | uses: PyO3/maturin-action@v1
51 | with:
52 | command: publish
53 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist
54 | env:
55 | MATURIN_PASSWORD: ${{ secrets.pypi_password }}
56 | - name: Publish macOS (arm64) Package
57 | if: matrix.os == 'macos-latest'
58 | uses: PyO3/maturin-action@v1
59 | with:
60 | command: publish
61 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist
62 | env:
63 | MATURIN_PASSWORD: ${{ secrets.pypi_password }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 | # Distribution / packaging
8 | .Python
9 | env/
10 | build/
11 | develop-eggs/
12 | dist/
13 | downloads/
14 | eggs/
15 | .eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # conflict temp files
32 | *.py.orig
33 | *.mock
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | coverage_html_report/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *,cover
49 | .pytest_cache/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 |
58 | # Sphinx documentation
59 | docs/_build/
60 |
61 | # PyBuilder
62 | target/
63 |
64 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
65 |
66 | *.iml
67 |
68 | ## Directory-based project format:
69 | .idea/
70 |
71 | # Tests
72 | generic_tests.py
73 | cloudflare_test.py
74 |
75 | ############################
76 | #Eclipse Specific GitIgnore#
77 | ############################
78 | *.pydevproject
79 | .project
80 | .metadata
81 | bin/**
82 | tmp/**
83 | tmp/**/*
84 | *.tmp
85 | *.bak
86 | *.swp
87 | *~.nib
88 | local.properties
89 | .classpath
90 | .settings/
91 | .loadpath
92 |
93 |
94 | # Git mergetool traces
95 | *.orig
96 |
97 | # VS Code internal directory
98 | .vscode/
99 |
100 | *.dat
101 | *.code-workspace
102 | .history
103 |
104 | # Intsights development playground
105 | playground/
106 |
107 | pytest-report\.csv
108 | *.cppimporthash
109 | .rendered.*
110 | Databases.db
111 |
112 | # Node.js
113 | dist/
114 | node_modules/
115 | coverage/
116 |
117 | # Generated by Cargo
118 | # will have compiled files and executables
119 | /target/
120 |
121 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
122 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
123 | Cargo.lock
124 |
125 | # These are backup files generated by rustfmt
126 | **/*.rs.bk
127 |
128 | *.sqlite3
129 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "pywordsegment"
3 | version = "0.4.3"
4 | authors = ["Gal Ben David "]
5 | edition = "2021"
6 | description = "Concatenated-word segmentation Python library written in Rust"
7 | readme = "README.md"
8 | repository = "https://github.com/intsights/pywordsegment"
9 | homepage = "https://github.com/intsights/pywordsegment"
10 | license = "MIT"
11 | keywords = [
12 | "word",
13 | "segment",
14 | "rust",
15 | "pyo3",
16 | ]
17 |
18 | [package.metadata.maturin]
19 |
20 | [lib]
21 | name = "pywordsegment"
22 | crate-type = ["cdylib"]
23 |
24 | [dependencies]
25 | ahash = "0.7"
26 | rmp-serde = "1"
27 |
28 | [dependencies.pyo3]
29 | version = "0.16.5"
30 | features = ["extension-module"]
31 |
32 | [profile.release]
33 | lto = true
34 | panic = "abort"
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Gal Ben David
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Concatenated-word segmentation Python library written in Rust
7 |
8 |
9 |
10 |
11 | 
12 | 
13 | 
14 | 
15 | [](https://pypi.org/project/pywordsegment/)
16 |
17 | ## Table of Contents
18 |
19 | - [Table of Contents](#table-of-contents)
20 | - [About The Project](#about-the-project)
21 | - [Built With](#built-with)
22 | - [Installation](#installation)
23 | - [Usage](#usage)
24 | - [License](#license)
25 | - [Contact](#contact)
26 |
27 |
28 | ## About The Project
29 |
30 | A fast concatenated-word segmentation library written in Rust, inspired by [wordninja](https://github.com/keredson/wordninja) and [wordsegment](https://github.com/grantjenks/python-wordsegment). The Python binding uses [pyo3](https://github.com/PyO3/pyo3) to interact with the underlying Rust crate.
31 |
32 |
33 | ### Built With
34 |
35 | * [pyo3](https://github.com/PyO3/pyo3)
36 |
37 |
38 | ### Installation
39 |
40 | ```sh
41 | pip3 install pywordsegment
42 | ```
43 |
44 |
45 | ## Usage
46 |
47 | ```python
48 | import pywordsegment
49 |
50 | # The internal unigram & bigram corpora are lazily initialized
51 | # once per module. Multiple WordSegmenter instances do not
52 | # create new dictionaries.
53 |
54 | # Segments concatenated text into its parts
55 | pywordsegment.WordSegmenter.segment(
56 | text="theusashops",
57 | )
58 | # ["the", "usa", "shops"]
59 |
60 |
61 | # This function checks whether the substring exists as a whole segment
62 | # inside text.
63 | pywordsegment.WordSegmenter.exist_as_segment(
64 | substring="inter",
65 | text="internationalairport",
66 | )
67 | # False
68 |
69 | pywordsegment.WordSegmenter.exist_as_segment(
70 | substring="inter",
71 | text="intermilan",
72 | )
73 | # True
74 | ```
75 |
76 |
77 | ## License
78 |
79 | Distributed under the MIT License. See `LICENSE` for more information.
80 |
81 |
82 | ## Contact
83 |
84 | Gal Ben David - gal@intsights.com
85 |
86 | Project Link: [https://github.com/intsights/pywordsegment](https://github.com/intsights/pywordsegment)
87 |
--------------------------------------------------------------------------------
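A small addition to the README's usage section: the segmenter normalizes its input before searching, lowercasing it and dropping non-alphanumeric ASCII characters (see the `replace` calls in `src/lib.rs`), so punctuated or mixed-case input behaves the same as the already-concatenated form. A minimal sketch; the expected output reuses the documented `theusashops` example:

```python
import pywordsegment

# Mixed case and punctuation are stripped before segmentation,
# so both calls below yield the same segments.
print(pywordsegment.WordSegmenter.segment(text="The USA-Shops!"))
# ["the", "usa", "shops"]
print(pywordsegment.WordSegmenter.segment(text="theusashops"))
# ["the", "usa", "shops"]
```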
/cortex.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | info:
3 | title: Pywordsegment
4 | description: Concatenated-word segmentation Python library written in Rust
5 | x-cortex-git:
6 | github:
7 | alias: intsightsorg
8 | repository: Intsights/PyWordSegment
9 | x-cortex-tag: pywordsegment
10 | x-cortex-type: service
11 | x-cortex-domain-parents:
12 | - tag: threatintel-phishing
13 | x-cortex-groups:
14 | - exposure:external-ship
15 | - target:library
16 | openapi: 3.0.1
17 | servers:
18 | - url: "/"
19 |
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/images/logo.png
--------------------------------------------------------------------------------
/poetry.lock:
--------------------------------------------------------------------------------
1 | [[package]]
2 | name = "colorama"
3 | version = "0.4.6"
4 | description = "Cross-platform colored terminal text."
5 | category = "dev"
6 | optional = false
7 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
8 |
9 | [[package]]
10 | name = "exceptiongroup"
11 | version = "1.2.0"
12 | description = "Backport of PEP 654 (exception groups)"
13 | category = "dev"
14 | optional = false
15 | python-versions = ">=3.7"
16 |
17 | [package.extras]
18 | test = ["pytest (>=6)"]
19 |
20 | [[package]]
21 | name = "importlib-metadata"
22 | version = "6.7.0"
23 | description = "Read metadata from Python packages"
24 | category = "dev"
25 | optional = false
26 | python-versions = ">=3.7"
27 |
28 | [package.dependencies]
29 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
30 | zipp = ">=0.5"
31 |
32 | [package.extras]
33 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"]
34 | perf = ["ipython"]
35 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-ruff", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"]
36 |
37 | [[package]]
38 | name = "iniconfig"
39 | version = "2.0.0"
40 | description = "brain-dead simple config-ini parsing"
41 | category = "dev"
42 | optional = false
43 | python-versions = ">=3.7"
44 |
45 | [[package]]
46 | name = "maturin"
47 | version = "1.4.0"
48 | description = "Build and publish crates with pyo3, rust-cpython and cffi bindings as well as rust binaries as python packages"
49 | category = "dev"
50 | optional = false
51 | python-versions = ">=3.7"
52 |
53 | [package.dependencies]
54 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
55 |
56 | [package.extras]
57 | zig = ["ziglang (>=0.10.0,<0.11.0)"]
58 | patchelf = ["patchelf"]
59 |
60 | [[package]]
61 | name = "packaging"
62 | version = "23.2"
63 | description = "Core utilities for Python packages"
64 | category = "dev"
65 | optional = false
66 | python-versions = ">=3.7"
67 |
68 | [[package]]
69 | name = "pluggy"
70 | version = "1.2.0"
71 | description = "plugin and hook calling mechanisms for python"
72 | category = "dev"
73 | optional = false
74 | python-versions = ">=3.7"
75 |
76 | [package.dependencies]
77 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
78 |
79 | [package.extras]
80 | dev = ["pre-commit", "tox"]
81 | testing = ["pytest", "pytest-benchmark"]
82 |
83 | [[package]]
84 | name = "pytest"
85 | version = "7.4.4"
86 | description = "pytest: simple powerful testing with Python"
87 | category = "dev"
88 | optional = false
89 | python-versions = ">=3.7"
90 |
91 | [package.dependencies]
92 | colorama = {version = "*", markers = "sys_platform == \"win32\""}
93 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
94 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
95 | iniconfig = "*"
96 | packaging = "*"
97 | pluggy = ">=0.12,<2.0"
98 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
99 |
100 | [package.extras]
101 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
102 |
103 | [[package]]
104 | name = "pytest-runner"
105 | version = "6.0.1"
106 | description = "Invoke py.test as distutils command with dependency resolution"
107 | category = "dev"
108 | optional = false
109 | python-versions = ">=3.7"
110 |
111 | [package.extras]
112 | docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"]
113 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-virtualenv", "types-setuptools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"]
114 |
115 | [[package]]
116 | name = "tomli"
117 | version = "2.0.1"
118 | description = "A lil' TOML parser"
119 | category = "dev"
120 | optional = false
121 | python-versions = ">=3.7"
122 |
123 | [[package]]
124 | name = "typing-extensions"
125 | version = "4.7.1"
126 | description = "Backported and Experimental Type Hints for Python 3.7+"
127 | category = "dev"
128 | optional = false
129 | python-versions = ">=3.7"
130 |
131 | [[package]]
132 | name = "zipp"
133 | version = "3.15.0"
134 | description = "Backport of pathlib-compatible object wrapper for zip files"
135 | category = "dev"
136 | optional = false
137 | python-versions = ">=3.7"
138 |
139 | [package.extras]
140 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"]
141 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "jaraco.functools", "more-itertools", "big-o", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"]
142 |
143 | [metadata]
144 | lock-version = "1.1"
145 | python-versions = "^3.7"
146 | content-hash = "d3751775f5a48f55874329689185792d15525d44f15678cc3bfeb66b5dea0d3d"
147 |
148 | [metadata.files]
149 | colorama = []
150 | exceptiongroup = []
151 | importlib-metadata = []
152 | iniconfig = []
153 | maturin = []
154 | packaging = []
155 | pluggy = []
156 | pytest = []
157 | pytest-runner = []
158 | tomli = []
159 | typing-extensions = []
160 | zipp = []
161 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["maturin>=0.12,<0.13"]
3 | build-backend = "maturin"
4 |
5 | [tool.maturin]
6 | sdist-include = [
7 | "Cargo.toml",
8 | "pyproject.toml",
9 | "pywordsegment/*.gz",
10 | "pywordsegment/*.py",
11 | "pywordsegment/*.pyi",
12 | "src/*",
13 | ]
14 |
15 | [tool.poetry]
16 | name = "pywordsegment"
17 | version = "0.4.3"
18 | authors = ["Gal Ben David "]
19 | description = "Concatenated-word segmentation Python library written in Rust"
20 | readme = "README.md"
21 | repository = "https://github.com/intsights/pywordsegment"
22 | homepage = "https://github.com/intsights/pywordsegment"
23 | license = "MIT"
24 | keywords = [
25 | "word",
26 | "segment",
27 | "rust",
28 | "pyo3"
29 | ]
30 | classifiers = [
31 | "License :: OSI Approved :: MIT License",
32 | "Operating System :: MacOS",
33 | "Operating System :: Microsoft",
34 | "Operating System :: POSIX :: Linux",
35 | "Programming Language :: Python :: 3.7",
36 | "Programming Language :: Python :: 3.8",
37 | "Programming Language :: Python :: 3.9",
38 | "Programming Language :: Python :: 3.10",
39 | "Programming Language :: Python :: 3.11",
40 | "Programming Language :: Rust",
41 | ]
42 |
43 | [tool.poetry.dependencies]
44 | python = "^3.7"
45 |
46 | [tool.poetry.dev-dependencies]
47 | pytest = "*"
48 | wheel = "*"
49 | pytest-runner = "*"
50 | maturin = "*"
51 |
--------------------------------------------------------------------------------
/pywordsegment/__init__.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import importlib.resources
3 | import sys
4 | import typing
5 |
6 | from . import pywordsegment
7 |
8 | PY_VERSION_MAJOR = sys.version_info.major
9 | PY_VERSION_MINOR = sys.version_info.minor
10 |
11 | class WordSegmenter:
12 | word_segmenter: pywordsegment.WordSegmenter = None
13 |
14 | @staticmethod
15 | def load() -> None:
16 | if WordSegmenter.word_segmenter is None:
17 | if PY_VERSION_MAJOR >= 3 and PY_VERSION_MINOR >= 11:
18 | with importlib.resources.files(
19 | __package__,
20 | ).joinpath(
21 | 'unigrams.msgpack.gz',
22 | ).open(
23 | 'rb',
24 | ) as unigrams_msgpack, importlib.resources.files(
25 | __package__,
26 | ).joinpath(
27 | 'bigrams.msgpack.gz',
28 | ).open(
29 | 'rb',
30 | ) as bigrams_msgpack:
31 | unigrams_serialized = gzip.decompress(
32 | data=unigrams_msgpack.read(),
33 | )
34 | bigrams_serialized = gzip.decompress(
35 | data=bigrams_msgpack.read(),
36 | )
37 |
38 | else:
39 | unigrams_serialized = gzip.decompress(
40 | data=importlib.resources.read_binary(
41 | package=__package__,
42 | resource='unigrams.msgpack.gz',
43 | ),
44 | )
45 |
46 | bigrams_serialized = gzip.decompress(
47 | data=importlib.resources.read_binary(
48 | package=__package__,
49 | resource='bigrams.msgpack.gz',
50 | ),
51 | )
52 |
53 | WordSegmenter.word_segmenter = pywordsegment.WordSegmenter(
54 | unigrams_serialized=unigrams_serialized,
55 | bigrams_serialized=bigrams_serialized,
56 | )
57 |
58 | @staticmethod
59 | def segment(
60 | text: str,
61 | ) -> typing.List[str]:
62 | if WordSegmenter.word_segmenter is None:
63 | WordSegmenter.load()
64 |
65 | return WordSegmenter.word_segmenter.segment(text)
66 |
67 | @staticmethod
68 | def exist_as_segment(
69 | substring: str,
70 | text: str,
71 | ) -> bool:
72 | if WordSegmenter.word_segmenter is None:
73 | WordSegmenter.load()
74 |
75 | return WordSegmenter.word_segmenter.exist_as_segment(substring, text)
76 |
--------------------------------------------------------------------------------
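Because `pywordsegment/__init__.py` deserializes the gzipped corpora lazily on the first `segment()` or `exist_as_segment()` call, a process that cares about first-request latency can trigger the load up front. A small sketch using only the `load()` method defined above; `load()` is a no-op once the class-level instance is cached:

```python
import pywordsegment

# Warm up: deserialize the unigram/bigram corpora at startup instead of
# paying the cost on the first segment() call. Later calls reuse the
# cached class-level WordSegmenter instance.
pywordsegment.WordSegmenter.load()

print(pywordsegment.WordSegmenter.segment(text="theusashops"))
# ["the", "usa", "shops"]
```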
/pywordsegment/bigrams.msgpack.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/pywordsegment/bigrams.msgpack.gz
--------------------------------------------------------------------------------
/pywordsegment/pywordsegment.pyi:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 |
4 | class WordSegmenter:
5 | @staticmethod
6 | def load() -> None: ...
7 |
8 | @staticmethod
9 | def segment(
10 | text: str,
11 | ) -> typing.List[str]: ...
12 |
13 | @staticmethod
14 | def exist_as_segment(
15 | substring: str,
16 | text: str,
17 | ) -> bool: ...
18 |
--------------------------------------------------------------------------------
/pywordsegment/unigrams.msgpack.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/pywordsegment/unigrams.msgpack.gz
--------------------------------------------------------------------------------
/scripts/bigrams_downloader.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import sqlite3
3 | import urllib.request
4 | import concurrent.futures
5 |
6 |
7 | def process_url(
8 | url,
9 | ):
10 | print(f'processing {url}')
11 |
12 | db_connection = sqlite3.connect(
13 | database='bigrams.sqlite3',
14 | timeout=100000,
15 | )
16 | db_cursor = db_connection.cursor()
17 | db_cursor.execute(
18 | '''
19 | CREATE TABLE IF NOT EXISTS bigrams (
20 | bigram_first TEXT,
21 | bigram_second TEXT,
22 | count INTEGER,
23 | UNIQUE(bigram_first, bigram_second)
24 | )
25 | '''
26 | )
27 | db_cursor.execute(
28 | '''
29 | CREATE INDEX IF NOT EXISTS count ON bigrams (count)
30 | '''
31 | )
32 | db_connection.commit()
33 |
34 | chunk = []
35 | with urllib.request.urlopen(
36 | url=url,
37 | ) as response:
38 | with gzip.GzipFile(
39 | fileobj=response,
40 | ) as uncompressed:
41 | for line in uncompressed:
42 | bigram, _, fragments = line.decode().partition('\t')
43 |
44 | bigram_first, _, bigram_second = bigram.lower().partition(' ')
45 | bigram_first, _, _ = bigram_first.rpartition('_')
46 | bigram_second, _, _ = bigram_second.rpartition('_')
47 |
48 | if not bigram_first.isalnum() or not bigram_second.isalnum():
49 | continue
50 |
51 | count = 0
52 | for frag in fragments.split('\t'):
53 | count += int(frag.split(',')[1])
54 |
55 | chunk.append(
56 | (
57 | bigram_first,
58 | bigram_second,
59 | count,
60 | count,
61 | )
62 | )
63 |
64 | if len(chunk) >= 100000:
65 | db_cursor.executemany(
66 | '''
67 | INSERT INTO bigrams
68 | VALUES (?, ?, ?)
69 | ON CONFLICT (bigram_first, bigram_second) DO
70 | UPDATE SET count = count + ?;
71 | ''',
72 | chunk,
73 | )
74 | db_connection.commit()
75 | chunk.clear()
76 |
77 | db_cursor.executemany(
78 | '''
79 | INSERT INTO bigrams
80 | VALUES (?, ?, ?)
81 | ON CONFLICT (bigram_first, bigram_second) DO
82 | UPDATE SET count = count + ?;
83 | ''',
84 | chunk,
85 | )
86 | db_connection.commit()
87 |
88 |
89 | futures = []
90 | with concurrent.futures.ProcessPoolExecutor(
91 | max_workers=30,
92 | ) as executor:
93 | urls = [
94 | f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/2-{i:05d}-of-00589.gz'
95 | for i in range(0, 589)
96 | ]
97 | for url in urls:
98 | futures.append(executor.submit(process_url, url))
99 |
100 | for future in concurrent.futures.as_completed(futures):
101 | print(f'finished {future.result()}')
102 |
--------------------------------------------------------------------------------
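For readers unfamiliar with the Google Books ngram export that `scripts/bigrams_downloader.py` consumes, each line has the shape `ngram<TAB>year,match_count,volume_count<TAB>...`, with tokens optionally carrying a part-of-speech suffix such as `_NOUN`. The sketch below, with a made-up sample line and helper name, mirrors how the script reduces one line to a `(first, second, count)` row:

```python
def parse_bigram_line(line: str):
    # Mirrors scripts/bigrams_downloader.py: split off the ngram, lowercase it,
    # strip the part-of-speech suffix, and sum the per-year match counts.
    bigram, _, fragments = line.partition('\t')
    first, _, second = bigram.lower().partition(' ')
    first, _, _ = first.rpartition('_')
    second, _, _ = second.rpartition('_')
    if not first.isalnum() or not second.isalnum():
        return None
    count = sum(int(frag.split(',')[1]) for frag in fragments.split('\t'))
    return first, second, count


print(parse_bigram_line('of_ADP the_DET\t2000,100,50\t2001,200,60'))
# ('of', 'the', 300)
```

As in the original script, a token without a `_POS` suffix comes back empty from `rpartition('_')`, fails `isalnum()`, and the row is skipped.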
/scripts/serializer.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import msgpack
3 | import sqlite3
4 | import math
5 |
6 |
7 | unigrams_connection = sqlite3.connect(
8 | database='unigrams.sqlite3',
9 | timeout=10,
10 | )
11 | unigrams_db_cursor = unigrams_connection.cursor()
12 |
13 | bigrams_connection = sqlite3.connect(
14 | database='bigrams.sqlite3',
15 | timeout=10,
16 | )
17 | bigrams_db_cursor = bigrams_connection.cursor()
18 |
19 | unigrams = {
20 | unigram: float(count)
21 | for unigram, count in unigrams_db_cursor.execute(
22 | '''
23 | SELECT word, count
24 | FROM unigrams
25 | ORDER BY count DESC
26 | LIMIT 1000000;
27 | '''
28 | )
29 | }
30 | unigrams_total_count = sum(unigrams.values())
31 |
32 | bigrams = {}
33 | bigrams_total_count = 0
34 | for bigram_first, bigram_second, count in bigrams_db_cursor.execute(
35 | '''
36 | SELECT bigram_first, bigram_second, count
37 | FROM bigrams
38 | ORDER BY count DESC
39 | LIMIT 100000;
40 | '''
41 | ):
42 | bigram_first = bigram_first.decode()
43 | bigram_second = bigram_second.decode()
44 |
45 | if bigram_first in bigrams:
46 | bigrams[bigram_first][bigram_second] = float(count)
47 | else:
48 | bigrams[bigram_first] = {
49 | bigram_second: float(count)
50 | }
51 |
52 | bigrams_total_count += count
53 |
54 | bigrams_processed = {}
55 | for bigram_first, inner in bigrams.items():
56 | for bigram_second, count in inner.items():
57 | if bigram_first in unigrams:
58 | if bigram_first not in bigrams_processed:
59 | bigrams_processed[bigram_first] = {}
60 | bigrams_processed[bigram_first][bigram_second] = math.log10(
61 | (count / bigrams_total_count) /
62 | (unigrams[bigram_first] / unigrams_total_count)
63 | )
64 |
65 | unigrams_processed = {
66 | unigram: math.log10(count / unigrams_total_count)
67 | for unigram, count in unigrams.items()
68 | }
69 | unigrams_processed['unigrams_total_count'] = unigrams_total_count
70 |
71 | with gzip.GzipFile(
72 | filename='unigrams.msgpack.gz',
73 | mode='wb',
74 | ) as compressed_file:
75 | compressed_file.write(msgpack.packb(unigrams_processed))
76 |
77 | with gzip.GzipFile(
78 | filename='bigrams.msgpack.gz',
79 | mode='wb',
80 | ) as compressed_file:
81 | compressed_file.write(msgpack.packb(bigrams_processed))
82 |
--------------------------------------------------------------------------------
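The numbers that `scripts/serializer.py` packs into the msgpack files are log10 scores rather than raw counts: each unigram becomes the log10 of its relative frequency, and each bigram becomes the log10 of the joint frequency divided by the first word's frequency (roughly a conditional probability). A toy illustration with invented counts, assuming nothing beyond the formulas in the script above:

```python
import math

# Invented counts standing in for the sqlite tables built by the downloaders.
unigrams = {'the': 500.0, 'usa': 20.0}
unigrams_total = sum(unigrams.values())

bigrams = {'the': {'usa': 5.0}}
bigrams_total = 5.0

# Unigram score: log10 of the word's relative frequency.
unigram_scores = {
    word: math.log10(count / unigrams_total)
    for word, count in unigrams.items()
}

# Bigram score: log10 of P(first, second) / P(first), as in serializer.py.
bigram_scores = {
    first: {
        second: math.log10(
            (count / bigrams_total) / (unigrams[first] / unigrams_total)
        )
        for second, count in inner.items()
    }
    for first, inner in bigrams.items()
    if first in unigrams
}

print(unigram_scores['the'], bigram_scores['the']['usa'])
```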
/scripts/unigrams_downloader.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import sqlite3
3 | import urllib.request
4 |
5 |
6 | db_connection = sqlite3.connect(
7 | database='unigrams.sqlite3',
8 | timeout=10,
9 | )
10 | db_cursor = db_connection.cursor()
11 | db_cursor.execute(
12 | '''
13 | CREATE TABLE IF NOT EXISTS unigrams (
14 | unigram TEXT,
15 | count INTEGER,
16 | UNIQUE(unigram)
17 | )
18 | '''
19 | )
20 | db_cursor.execute(
21 | '''
22 | CREATE INDEX IF NOT EXISTS count ON unigrams (count)
23 | '''
24 | )
25 | db_connection.commit()
26 |
27 | urls = [
28 | f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/1-{i:05d}-of-00024.gz'
29 | for i in range(0, 24)
30 | ]
31 | for url in urls:
32 | print(f'processing {url}')
33 |
34 | with urllib.request.urlopen(
35 | url=url,
36 | ) as response:
37 | with gzip.GzipFile(
38 | fileobj=response,
39 | ) as uncompressed:
40 | for line in uncompressed:
41 | fragments = line.decode().split('\t')
42 | unigram = fragments[0].lower()
43 | if not unigram.isalnum():
44 | continue
45 |
46 | count = 0
47 | for frag in fragments[1:]:
48 | year, number_of_instances, volume = frag.split(',')
49 | count += int(number_of_instances)
50 |
51 | db_cursor.execute(
52 | '''
53 | INSERT INTO unigrams
54 | VALUES (?, ?)
55 | ON CONFLICT (unigram) DO
56 | UPDATE SET count = count + ?;
57 | ''',
58 | (
59 | unigram,
60 | count,
61 | count,
62 | ),
63 | )
64 |
65 | db_connection.commit()
66 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [aliases]
2 | test=pytest
3 |
4 | [tool:pytest]
5 | addopts = --tb=native -s
6 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | use ahash::RandomState;
2 | use pyo3::prelude::*;
3 | use std::collections::HashMap;
4 |
5 | const MAX_WORD_LEN: usize = 24;
6 |
7 | #[pyclass]
8 | struct WordSegmenter {
9 | unigrams: HashMap<String, f64, RandomState>,
10 | bigrams: HashMap<String, HashMap<String, f64, RandomState>, RandomState>,
11 | unknown_unigrams: [f64; MAX_WORD_LEN + 1],
12 | }
13 |
14 | #[pymethods]
15 | impl WordSegmenter {
16 | #[new]
17 | fn new(
18 | unigrams_serialized: &[u8],
19 | bigrams_serialized: &[u8],
20 | ) -> Self {
21 | let unigrams: HashMap<String, f64, RandomState> = rmp_serde::from_slice(unigrams_serialized).unwrap();
22 | let bigrams = rmp_serde::from_slice(bigrams_serialized).unwrap();
23 |
24 | let total_unigrams_frequency = unigrams.get("unigrams_total_count").unwrap();
25 | let mut unknown_unigrams = [0.0; MAX_WORD_LEN + 1];
26 | for (word_len, value) in unknown_unigrams.iter_mut().enumerate() {
27 | *value = (10.0 / (total_unigrams_frequency * 10_f64.powi(word_len as i32))).log10();
28 | }
29 |
30 | WordSegmenter {
31 | unigrams,
32 | bigrams,
33 | unknown_unigrams,
34 | }
35 | }
36 |
37 | fn segment(
38 | &self,
39 | py: Python,
40 | text: String,
41 | ) -> PyResult<Py<PyAny>> {
42 | let clean_text = text
43 | .to_ascii_lowercase()
44 | .replace(
45 | |c: char| !c.is_ascii_alphanumeric(),
46 | ""
47 | );
48 |
49 | let words = self.search(&clean_text);
50 |
51 | Ok(words.into_py(py))
52 | }
53 |
54 | fn exist_as_segment(
55 | &self,
56 | substring: String,
57 | text: String,
58 | ) -> PyResult<bool> {
59 | let clean_text = text
60 | .to_ascii_lowercase()
61 | .replace(
62 | |c: char| !c.is_ascii_alphanumeric(),
63 | ""
64 | );
65 |
66 | let clean_substring = substring
67 | .to_ascii_lowercase()
68 | .replace(
69 | |c: char| !c.is_ascii_alphanumeric(),
70 | ""
71 | );
72 |
73 | let segmented_text = self.search(&clean_text);
74 | let segmented_substring = self.search(&clean_substring);
75 |
76 | let segmented_substring_pattern = format!("-{}-", segmented_substring.join("-"));
77 | let segmented_text_pattern = format!("-{}-", segmented_text.join("-"));
78 |
79 | Ok(segmented_text_pattern.contains(&segmented_substring_pattern))
80 | }
81 | }
82 |
83 | impl WordSegmenter {
84 | fn score(
85 | &self,
86 | word: &str,
87 | previous: &str,
88 | ) -> f64 {
89 | if !previous.is_empty() {
90 | if let Some(first_bigram_layer) = self.bigrams.get(previous) {
91 | if let Some(bigram_frequency) = first_bigram_layer.get(word) {
92 | return *bigram_frequency;
93 | }
94 | }
95 | }
96 |
97 | match self.unigrams.get(word) {
98 | Some(frequency) => *frequency,
99 | None => self.unknown_unigrams[word.len()],
100 | }
101 | }
102 |
103 | fn search<'a>(
104 | &self,
105 | text: &'a str,
106 | ) -> Vec<&'a str> {
107 | let mut result = Vec::with_capacity(text.len());
108 | let mut candidates = Vec::with_capacity(text.len());
109 |
110 | if text.is_empty() {
111 | return result;
112 | }
113 |
114 | for end in 1..=text.len() {
115 | let start = end.saturating_sub(MAX_WORD_LEN);
116 | for split in start..end {
117 | let (prev, prev_score) = match split {
118 | 0 => ("", 0.0),
119 | _ => {
120 | let (prefix_len, prefix_score) = candidates[split - 1];
121 | let word = &text[split - prefix_len as usize..split];
122 | (word, prefix_score)
123 | }
124 | };
125 |
126 | let word = &text[split..end];
127 | let score = self.score(word, prev) + prev_score;
128 | match candidates.get_mut(end - 1) {
129 | Some((cur_len, cur_score)) if *cur_score < score => {
130 | *cur_len = end - split;
131 | *cur_score = score;
132 | }
133 | None => candidates.push((end - split, score)),
134 | _ => {},
135 | }
136 | }
137 | }
138 |
139 | let mut end = text.len();
140 | let (mut best_len, mut _best_score) = candidates[end - 1];
141 | loop {
142 | let word = &text[end - best_len..end];
143 | result.insert(0, word);
144 |
145 | end -= best_len;
146 | if end == 0 {
147 | break;
148 | }
149 |
150 | best_len = candidates[end - 1].0;
151 | }
152 |
153 | result
154 | }
155 | }
156 |
157 | #[pymodule]
158 | fn pywordsegment(
159 | _py: Python,
160 | m: &PyModule,
161 | ) -> PyResult<()> {
162 | m.add_class::<WordSegmenter>()?;
163 |
164 | Ok(())
165 | }
166 |
--------------------------------------------------------------------------------
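The `search` function in `src/lib.rs` is a Viterbi-style dynamic program: for every end position it records the length and cumulative score of the best word ending there (scored against the previous best word, falling back from bigram to unigram to a length-penalised unknown-word score), then walks the recorded lengths backwards. A rough Python re-expression of that logic, assuming `unigrams`/`bigrams` hold the precomputed log10 scores and `total` is the raw unigram total count stored alongside them:

```python
import math

MAX_WORD_LEN = 24


def word_score(word, prev, unigrams, bigrams, total):
    # Prefer the bigram score when the (prev, word) pair is known,
    # then the unigram score, then a length-penalised unknown-word score.
    if prev and word in bigrams.get(prev, {}):
        return bigrams[prev][word]
    if word in unigrams:
        return unigrams[word]
    return math.log10(10.0 / (total * 10 ** len(word)))


def segment(text, unigrams, bigrams, total):
    if not text:
        return []

    # candidates[i] = (length, score) of the best word ending at position i + 1.
    candidates = []
    for end in range(1, len(text) + 1):
        best = None
        for split in range(max(0, end - MAX_WORD_LEN), end):
            if split == 0:
                prev, prev_score = '', 0.0
            else:
                prev_len, prev_score = candidates[split - 1]
                prev = text[split - prev_len:split]
            score = word_score(text[split:end], prev, unigrams, bigrams, total) + prev_score
            if best is None or score > best[1]:
                best = (end - split, score)
        candidates.append(best)

    # Backtrack through the stored best word lengths.
    words, end = [], len(text)
    while end > 0:
        length = candidates[end - 1][0]
        words.insert(0, text[end - length:end])
        end -= length
    return words
```

`exist_as_segment` reuses the same search: it segments both strings, joins each result with `-` delimiters, and checks whether the substring's joined form occurs inside the text's joined form.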
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_pywordsegment.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pywordsegment
4 |
5 |
6 | class WordSegmentTestCase(
7 | unittest.TestCase,
8 | ):
9 | def test_segment_1(
10 | self,
11 | ):
12 | self.assertEqual(
13 | first=pywordsegment.WordSegmenter.segment(
14 | text='theusashops',
15 | ),
16 | second=[
17 | 'the',
18 | 'usa',
19 | 'shops',
20 | ],
21 | )
22 |
23 | def test_segment_2(
24 | self,
25 | ):
26 | self.assertEqual(
27 | first=pywordsegment.WordSegmenter.segment(
28 | text='choosespain',
29 | ),
30 | second=[
31 | 'choose',
32 | 'spain',
33 | ],
34 | )
35 |
36 | def test_segment_3(
37 | self,
38 | ):
39 | self.assertEqual(
40 | first=pywordsegment.WordSegmenter.segment(
41 | text='thisisatest',
42 | ),
43 | second=[
44 | 'this',
45 | 'is',
46 | 'a',
47 | 'test',
48 | ],
49 | )
50 |
51 | def test_segment_4(
52 | self,
53 | ):
54 | self.assertEqual(
55 | first=pywordsegment.WordSegmenter.segment(
56 | text='wheninthecourseofhumaneventsitbecomesnecessary',
57 | ),
58 | second=[
59 | 'when',
60 | 'in',
61 | 'the',
62 | 'course',
63 | 'of',
64 | 'human',
65 | 'events',
66 | 'it',
67 | 'becomes',
68 | 'necessary',
69 | ],
70 | )
71 |
72 | def test_segment_5(
73 | self,
74 | ):
75 | self.assertEqual(
76 | first=pywordsegment.WordSegmenter.segment(
77 | text='whorepresents',
78 | ),
79 | second=[
80 | 'who',
81 | 'represents',
82 | ],
83 | )
84 |
85 | def test_segment_6(
86 | self,
87 | ):
88 | self.assertEqual(
89 | first=pywordsegment.WordSegmenter.segment(
90 | text='expertsexchange',
91 | ),
92 | second=[
93 | 'experts',
94 | 'exchange',
95 | ],
96 | )
97 |
98 | def test_segment_7(
99 | self,
100 | ):
101 | self.assertEqual(
102 | first=pywordsegment.WordSegmenter.segment(
103 | text='speedofart',
104 | ),
105 | second=[
106 | 'speed',
107 | 'of',
108 | 'art',
109 | ],
110 | )
111 |
112 | def test_segment_8(
113 | self,
114 | ):
115 | self.assertEqual(
116 | first=pywordsegment.WordSegmenter.segment(
117 | text='nowisthetimeforallgood',
118 | ),
119 | second=[
120 | 'now',
121 | 'is',
122 | 'the',
123 | 'time',
124 | 'for',
125 | 'all',
126 | 'good',
127 | ],
128 | )
129 |
130 | def test_segment_9(
131 | self,
132 | ):
133 | self.assertEqual(
134 | first=pywordsegment.WordSegmenter.segment(
135 | text='itisatruthuniversallyacknowledged',
136 | ),
137 | second=[
138 | 'it',
139 | 'is',
140 | 'a',
141 | 'truth',
142 | 'universally',
143 | 'acknowledged',
144 | ],
145 | )
146 |
147 | def test_segment_10(
148 | self,
149 | ):
150 | self.assertEqual(
151 | first=pywordsegment.WordSegmenter.segment(
152 | text='itwasabrightcolddayinaprilandtheclockswerestrikingthirteen',
153 | ),
154 | second=[
155 | 'it',
156 | 'was',
157 | 'a',
158 | 'bright',
159 | 'cold',
160 | 'day',
161 | 'in',
162 | 'april',
163 | 'and',
164 | 'the',
165 | 'clocks',
166 | 'were',
167 | 'striking',
168 | 'thirteen',
169 | ],
170 | )
171 |
172 | def test_segment_11(
173 | self,
174 | ):
175 | self.assertEqual(
176 | first=pywordsegment.WordSegmenter.segment(
177 | text='CaseTest',
178 | ),
179 | second=[
180 | 'case',
181 | 'test',
182 | ],
183 | )
184 |
185 | def test_segment_12(
186 | self,
187 | ):
188 | self.assertEqual(
189 | first=pywordsegment.WordSegmenter.segment(
190 | text='',
191 | ),
192 | second=[],
193 | )
194 |
195 | def test_segment_13(
196 | self,
197 | ):
198 | self.assertEqual(
199 | first=pywordsegment.WordSegmenter.segment(
200 | text='a',
201 | ),
202 | second=[
203 | 'a',
204 | ],
205 | )
206 |
207 | def test_exist_as_segment_1(
208 | self,
209 | ):
210 | self.assertFalse(
211 | expr=pywordsegment.WordSegmenter.exist_as_segment(
212 | substring='man',
213 | text='manual',
214 | ),
215 | )
216 | self.assertTrue(
217 | expr=pywordsegment.WordSegmenter.exist_as_segment(
218 | substring='man',
219 | text='oneman',
220 | ),
221 | )
222 |
--------------------------------------------------------------------------------
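A hypothetical extra case, not part of the original suite, that pins down the README's `exist_as_segment` examples in the same unittest style as `tests/test_pywordsegment.py`:

```python
import unittest

import pywordsegment


class ExistAsSegmentReadmeTestCase(
    unittest.TestCase,
):
    def test_readme_examples(
        self,
    ):
        # Per the README: "inter" is not a whole segment of
        # "internationalairport", but it is one in "intermilan".
        self.assertFalse(
            expr=pywordsegment.WordSegmenter.exist_as_segment(
                substring='inter',
                text='internationalairport',
            ),
        )
        self.assertTrue(
            expr=pywordsegment.WordSegmenter.exist_as_segment(
                substring='inter',
                text='intermilan',
            ),
        )


if __name__ == '__main__':
    unittest.main()
```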