├── .github └── workflows │ ├── build.yml │ └── deploy.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── cortex.yaml ├── images └── logo.png ├── poetry.lock ├── pyproject.toml ├── pywordsegment ├── __init__.py ├── bigrams.msgpack.gz ├── pywordsegment.pyi └── unigrams.msgpack.gz ├── scripts ├── bigrams_downloader.py ├── serializer.py └── unigrams_downloader.py ├── setup.cfg ├── src └── lib.rs └── tests ├── __init__.py └── test_pywordsegment.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | lint: 7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v3 12 | - name: Install latest rust 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | profile: minimal 17 | override: true 18 | components: clippy 19 | - name: Lint with clippy 20 | uses: actions-rs/cargo@v1 21 | with: 22 | command: clippy 23 | args: --all-targets --all-features 24 | test: 25 | runs-on: ${{ matrix.os }} 26 | needs: lint 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | python-version: 31 | - '3.7' 32 | - '3.8' 33 | - '3.9' 34 | - '3.10' 35 | - '3.11' 36 | os: 37 | - ubuntu-latest 38 | - macos-latest 39 | - windows-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v3 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v3 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install Poetry 48 | uses: abatilo/actions-poetry@v2.1.3 49 | - name: Install Rust 50 | uses: actions-rs/toolchain@v1 51 | with: 52 | profile: minimal 53 | toolchain: stable 54 | override: true 55 | - name: Install dependencies 56 | run: poetry install 57 | - name: Build Python package 58 | run: poetry run maturin develop 59 | - name: Test 60 | run: poetry run pytest -Werror tests 61 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | release: 4 | types: 5 | - released 6 | jobs: 7 | deploy: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: 13 | - '3.7' 14 | - '3.8' 15 | - '3.9' 16 | - '3.10' 17 | - '3.11' 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | - windows-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Rust 30 | uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: stable 34 | override: true 35 | - name: Install Cross-compilers (macOS) 36 | if: matrix.os == 'macos-latest' 37 | run: | 38 | rustup target add x86_64-apple-darwin 39 | rustup target add aarch64-apple-darwin 40 | - name: Publish Package 41 | uses: PyO3/maturin-action@v1 42 | with: 43 | command: publish 44 | args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }} 45 | env: 46 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 47 | if: matrix.os != 'macos-latest' 48 | - name: Publish macOS (x86_64) Package 49 | if: matrix.os == 'macos-latest' 50 
| uses: PyO3/maturin-action@v1 51 | with: 52 | command: publish 53 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist 54 | env: 55 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 56 | - name: Publish macOS (arm64) Package 57 | if: matrix.os == 'macos-latest' 58 | uses: PyO3/maturin-action@v1 59 | with: 60 | command: publish 61 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist 62 | env: 63 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | # Distribution / packaging 8 | .Python 9 | env/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # conflict temp files 32 | *.py.orig 33 | *.mock 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | coverage_html_report/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 65 | 66 | *.iml 67 | 68 | ## Directory-based project format: 69 | .idea/ 70 | 71 | # Tests 72 | generic_tests.py 73 | cloudflare_test.py 74 | 75 | ############################ 76 | #Eclipse Specific GitIgnore# 77 | ############################ 78 | *.pydevproject 79 | .project 80 | .metadata 81 | bin/** 82 | tmp/** 83 | tmp/**/* 84 | *.tmp 85 | *.bak 86 | *.swp 87 | *~.nib 88 | local.properties 89 | .classpath 90 | .settings/ 91 | .loadpath 92 | 93 | 94 | # Git mergetool traces 95 | *.orig 96 | 97 | # VS Code internal directory 98 | .vscode/ 99 | 100 | *.dat 101 | *.code-workspace 102 | .history 103 | 104 | # Intsights development playground 105 | playground/ 106 | 107 | pytest-report\.csv 108 | *.cppimporthash 109 | .rendered.* 110 | Databases.db 111 | 112 | # Node.js 113 | dist/ 114 | node_modules/ 115 | coverage/ 116 | 117 | # Generated by Cargo 118 | # will have compiled files and executables 119 | /target/ 120 | 121 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 122 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 123 | Cargo.lock 124 | 125 | # These are backup files generated by rustfmt 126 | **/*.rs.bk 127 | 128 | *.sqlite3 129 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pywordsegment" 3 | version = "0.4.3" 4 | authors = ["Gal Ben David "] 5 | edition = "2021" 6 | description = "Concatenated-word segmentation Python library written in Rust" 7 | readme = 
"README.md" 8 | repository = "https://github.com/intsights/pywordsegment" 9 | homepage = "https://github.com/intsights/pywordsegment" 10 | license = "MIT" 11 | keywords = [ 12 | "word", 13 | "segment", 14 | "rust", 15 | "pyo3", 16 | ] 17 | 18 | [package.metadata.maturin] 19 | 20 | [lib] 21 | name = "pywordsegment" 22 | crate-type = ["cdylib"] 23 | 24 | [dependencies] 25 | ahash = "0.7" 26 | rmp-serde = "1" 27 | 28 | [dependencies.pyo3] 29 | version = "0.16.5" 30 | features = ["extension-module"] 31 | 32 | [profile.release] 33 | lto = true 34 | panic = "abort" 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Gal Ben David 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Logo 4 | 5 | 6 | Concatenated-word segmentation Python library written in Rust 7 | 8 |
9 | 10 | 11 | ![license](https://img.shields.io/badge/MIT-License-blue) 12 | ![Python](https://img.shields.io/badge/Python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue) 13 | ![OS](https://img.shields.io/badge/OS-Mac%20%7C%20Linux%20%7C%20Windows-blue) 14 | ![Build](https://github.com/intsights/pywordsegment/workflows/Build/badge.svg) 15 | [![PyPi](https://img.shields.io/pypi/v/pywordsegment.svg)](https://pypi.org/project/pywordsegment/) 16 | 17 | ## Table of Contents 18 | 19 | - [Table of Contents](#table-of-contents) 20 | - [About The Project](#about-the-project) 21 | - [Built With](#built-with) 22 | - [Installation](#installation) 23 | - [Usage](#usage) 24 | - [License](#license) 25 | - [Contact](#contact) 26 | 27 | 28 | ## About The Project 29 | 30 | A fast concatenated-word segmentation library written in Rust, inspired by [wordninja](https://github.com/keredson/wordninja) and [wordsegment](https://github.com/grantjenks/python-wordsegment). The binding uses [pyo3](https://github.com/PyO3/pyo3) to interact with the rust package. 31 | 32 | 33 | ### Built With 34 | 35 | * [pyo3](https://github.com/PyO3/pyo3) 36 | 37 | 38 | ### Installation 39 | 40 | ```sh 41 | pip3 install pywordsegment 42 | ``` 43 | 44 | 45 | ## Usage 46 | 47 | ```python 48 | import pywordsegment 49 | 50 | # The internal UNIGRAMS & BIGRAMS corpuses are lazy initialized 51 | # once per the whole module. Multiple WordSegmenter instances would 52 | # not create new dictionaries. 53 | 54 | # Segments a word to its parts 55 | pywordsegment.WordSegmenter.segment( 56 | text="theusashops", 57 | ) 58 | # ["the", "usa", "shops"] 59 | 60 | 61 | # This function checks whether the substring exists as a whole segment 62 | # inside text. 63 | pywordsegment.WordSegmenter.exist_as_segment( 64 | substring="inter", 65 | text="internationalairport", 66 | ) 67 | # False 68 | 69 | pywordsegment.WordSegmenter.exist_as_segment( 70 | substring="inter", 71 | text="intermilan", 72 | ) 73 | # True 74 | ``` 75 | 76 | 77 | ## License 78 | 79 | Distributed under the MIT License. See `LICENSE` for more information. 80 | 81 | 82 | ## Contact 83 | 84 | Gal Ben David - gal@intsights.com 85 | 86 | Project Link: [https://github.com/intsights/pywordsegment](https://github.com/intsights/pywordsegment) 87 | -------------------------------------------------------------------------------- /cortex.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | info: 3 | title: Pywordsegment 4 | description: Concatenated-word segmentation Python library written in Rust 5 | x-cortex-git: 6 | github: 7 | alias: intsightsorg 8 | repository: Intsights/PyWordSegment 9 | x-cortex-tag: pywordsegment 10 | x-cortex-type: service 11 | x-cortex-domain-parents: 12 | - tag: threatintel-phishing 13 | x-cortex-groups: 14 | - exposure:external-ship 15 | - target:library 16 | openapi: 3.0.1 17 | servers: 18 | - url: "/" 19 | -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/images/logo.png -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "colorama" 3 | version = "0.4.6" 4 | description = "Cross-platform colored terminal text." 
5 | category = "dev" 6 | optional = false 7 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 8 | 9 | [[package]] 10 | name = "exceptiongroup" 11 | version = "1.2.0" 12 | description = "Backport of PEP 654 (exception groups)" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=3.7" 16 | 17 | [package.extras] 18 | test = ["pytest (>=6)"] 19 | 20 | [[package]] 21 | name = "importlib-metadata" 22 | version = "6.7.0" 23 | description = "Read metadata from Python packages" 24 | category = "dev" 25 | optional = false 26 | python-versions = ">=3.7" 27 | 28 | [package.dependencies] 29 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 30 | zipp = ">=0.5" 31 | 32 | [package.extras] 33 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] 34 | perf = ["ipython"] 35 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-ruff", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] 36 | 37 | [[package]] 38 | name = "iniconfig" 39 | version = "2.0.0" 40 | description = "brain-dead simple config-ini parsing" 41 | category = "dev" 42 | optional = false 43 | python-versions = ">=3.7" 44 | 45 | [[package]] 46 | name = "maturin" 47 | version = "1.4.0" 48 | description = "Build and publish crates with pyo3, rust-cpython and cffi bindings as well as rust binaries as python packages" 49 | category = "dev" 50 | optional = false 51 | python-versions = ">=3.7" 52 | 53 | [package.dependencies] 54 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} 55 | 56 | [package.extras] 57 | zig = ["ziglang (>=0.10.0,<0.11.0)"] 58 | patchelf = ["patchelf"] 59 | 60 | [[package]] 61 | name = "packaging" 62 | version = "23.2" 63 | description = "Core utilities for Python packages" 64 | category = "dev" 65 | optional = false 66 | python-versions = ">=3.7" 67 | 68 | [[package]] 69 | name = "pluggy" 70 | version = "1.2.0" 71 | description = "plugin and hook calling mechanisms for python" 72 | category = "dev" 73 | optional = false 74 | python-versions = ">=3.7" 75 | 76 | [package.dependencies] 77 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 78 | 79 | [package.extras] 80 | dev = ["pre-commit", "tox"] 81 | testing = ["pytest", "pytest-benchmark"] 82 | 83 | [[package]] 84 | name = "pytest" 85 | version = "7.4.4" 86 | description = "pytest: simple powerful testing with Python" 87 | category = "dev" 88 | optional = false 89 | python-versions = ">=3.7" 90 | 91 | [package.dependencies] 92 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 93 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 94 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 95 | iniconfig = "*" 96 | packaging = "*" 97 | pluggy = ">=0.12,<2.0" 98 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 99 | 100 | [package.extras] 101 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] 102 | 103 | [[package]] 104 | name = "pytest-runner" 105 | version = "6.0.1" 106 | description = "Invoke py.test as distutils command with dependency resolution" 107 | category = "dev" 108 | optional = false 109 | python-versions = ">=3.7" 110 | 111 | 
[package.extras] 112 | docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] 113 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-virtualenv", "types-setuptools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] 114 | 115 | [[package]] 116 | name = "tomli" 117 | version = "2.0.1" 118 | description = "A lil' TOML parser" 119 | category = "dev" 120 | optional = false 121 | python-versions = ">=3.7" 122 | 123 | [[package]] 124 | name = "typing-extensions" 125 | version = "4.7.1" 126 | description = "Backported and Experimental Type Hints for Python 3.7+" 127 | category = "dev" 128 | optional = false 129 | python-versions = ">=3.7" 130 | 131 | [[package]] 132 | name = "zipp" 133 | version = "3.15.0" 134 | description = "Backport of pathlib-compatible object wrapper for zip files" 135 | category = "dev" 136 | optional = false 137 | python-versions = ">=3.7" 138 | 139 | [package.extras] 140 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] 141 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "jaraco.functools", "more-itertools", "big-o", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] 142 | 143 | [metadata] 144 | lock-version = "1.1" 145 | python-versions = "^3.7" 146 | content-hash = "d3751775f5a48f55874329689185792d15525d44f15678cc3bfeb66b5dea0d3d" 147 | 148 | [metadata.files] 149 | colorama = [] 150 | exceptiongroup = [] 151 | importlib-metadata = [] 152 | iniconfig = [] 153 | maturin = [] 154 | packaging = [] 155 | pluggy = [] 156 | pytest = [] 157 | pytest-runner = [] 158 | tomli = [] 159 | typing-extensions = [] 160 | zipp = [] 161 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=0.12,<0.13"] 3 | build-backend = "maturin" 4 | 5 | [tool.maturin] 6 | sdist-include = [ 7 | "Cargo.toml", 8 | "pyproject.toml", 9 | "pywordsegment/*.gz", 10 | "pywordsegment/*.py", 11 | "pywordsegment/*.pyi", 12 | "src/*", 13 | ] 14 | 15 | [tool.poetry] 16 | name = "pywordsegment" 17 | version = "0.4.3" 18 | authors = ["Gal Ben David "] 19 | description = "Concatenated-word segmentation Python library written in Rust" 20 | readme = "README.md" 21 | repository = "https://github.com/intsights/pywordsegment" 22 | homepage = "https://github.com/intsights/pywordsegment" 23 | license = "MIT" 24 | keywords = [ 25 | "word", 26 | "segment", 27 | "rust", 28 | "pyo3" 29 | ] 30 | classifiers = [ 31 | "License :: OSI Approved :: MIT License", 32 | "Operating System :: MacOS", 33 | "Operating System :: Microsoft", 34 | "Operating System :: POSIX :: Linux", 35 | "Programming Language :: Python :: 3.7", 36 | "Programming Language :: Python :: 3.8", 37 | "Programming Language :: Python :: 3.9", 38 | "Programming Language :: Python :: 3.10", 39 | "Programming Language :: Python :: 3.11", 40 | "Programming Language :: Rust", 41 | ] 42 | 43 | [tool.poetry.dependencies] 44 | python = "^3.7" 45 | 46 | [tool.poetry.dev-dependencies] 47 | pytest = "*" 48 | wheel = "*" 49 | pytest-runner = "*" 50 | maturin = "*" 51 | -------------------------------------------------------------------------------- /pywordsegment/__init__.py: 
-------------------------------------------------------------------------------- 1 | import gzip 2 | import importlib.resources 3 | import sys 4 | import typing 5 | 6 | from . import pywordsegment 7 | 8 | PY_VERSION_MAJOR = sys.version_info.major 9 | PY_VERSION_MINOR = sys.version_info.minor 10 | 11 | class WordSegmenter: 12 | word_segmenter: pywordsegment.WordSegmenter = None 13 | 14 | @staticmethod 15 | def load() -> None: 16 | if WordSegmenter.word_segmenter is None: 17 | if PY_VERSION_MAJOR >= 3 and PY_VERSION_MINOR >= 11: 18 | with importlib.resources.files( 19 | __package__, 20 | ).joinpath( 21 | 'unigrams.msgpack.gz', 22 | ).open( 23 | 'rb', 24 | ) as unigrams_msgpack, importlib.resources.files( 25 | __package__, 26 | ).joinpath( 27 | 'bigrams.msgpack.gz', 28 | ).open( 29 | 'rb', 30 | ) as bigrams_msgpack: 31 | unigrams_serialized = gzip.decompress( 32 | data=unigrams_msgpack.read(), 33 | ) 34 | bigrams_serialized = gzip.decompress( 35 | data=bigrams_msgpack.read(), 36 | ) 37 | 38 | else: 39 | unigrams_serialized = gzip.decompress( 40 | data=importlib.resources.read_binary( 41 | package=__package__, 42 | resource='unigrams.msgpack.gz', 43 | ), 44 | ) 45 | 46 | bigrams_serialized = gzip.decompress( 47 | data=importlib.resources.read_binary( 48 | package=__package__, 49 | resource='bigrams.msgpack.gz', 50 | ), 51 | ) 52 | 53 | WordSegmenter.word_segmenter = pywordsegment.WordSegmenter( 54 | unigrams_serialized=unigrams_serialized, 55 | bigrams_serialized=bigrams_serialized, 56 | ) 57 | 58 | @staticmethod 59 | def segment( 60 | text: str, 61 | ) -> typing.List[str]: 62 | if WordSegmenter.word_segmenter is None: 63 | WordSegmenter.load() 64 | 65 | return WordSegmenter.word_segmenter.segment(text) 66 | 67 | @staticmethod 68 | def exist_as_segment( 69 | substring: str, 70 | text: str, 71 | ) -> bool: 72 | if WordSegmenter.word_segmenter is None: 73 | WordSegmenter.load() 74 | 75 | return WordSegmenter.word_segmenter.exist_as_segment(substring, text) 76 | -------------------------------------------------------------------------------- /pywordsegment/bigrams.msgpack.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/pywordsegment/bigrams.msgpack.gz -------------------------------------------------------------------------------- /pywordsegment/pywordsegment.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | 4 | class WordSegmenter: 5 | @staticmethod 6 | def load() -> None: ... 7 | 8 | @staticmethod 9 | def segment( 10 | text: str, 11 | ) -> typing.List[str]: ... 12 | 13 | @staticmethod 14 | def exist_as_segment( 15 | substring: str, 16 | text: str, 17 | ) -> bool: ... 
18 | -------------------------------------------------------------------------------- /pywordsegment/unigrams.msgpack.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/pywordsegment/unigrams.msgpack.gz -------------------------------------------------------------------------------- /scripts/bigrams_downloader.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import sqlite3 3 | import urllib.request 4 | import concurrent.futures 5 | 6 | 7 | def process_url( 8 | url, 9 | ): 10 | print(f'processing {url}') 11 | 12 | db_connection = sqlite3.connect( 13 | database='bigrams.sqlite3', 14 | timeout=100000, 15 | ) 16 | db_cursor = db_connection.cursor() 17 | db_cursor.execute( 18 | ''' 19 | CREATE TABLE IF NOT EXISTS bigrams ( 20 | bigram_first TEXT, 21 | bigram_second TEXT, 22 | count INTEGER, 23 | UNIQUE(bigram_first, bigram_second) 24 | ) 25 | ''' 26 | ) 27 | db_cursor.execute( 28 | ''' 29 | CREATE INDEX IF NOT EXISTS count ON bigrams (count) 30 | ''' 31 | ) 32 | db_connection.commit() 33 | 34 | chunk = [] 35 | with urllib.request.urlopen( 36 | url=url, 37 | ) as response: 38 | with gzip.GzipFile( 39 | fileobj=response, 40 | ) as uncompressed: 41 | for line in uncompressed: 42 | bigram, _, fragments = line.decode().partition('\t') 43 | 44 | bigram_first, _, bigram_second = bigram.lower().partition(' ') 45 | bigram_first, _, _ = bigram_first.rpartition('_') 46 | bigram_second, _, _ = bigram_second.rpartition('_') 47 | 48 | if not bigram_first.isalnum() or not bigram_second.isalnum(): 49 | continue 50 | 51 | count = 0 52 | for frag in fragments.split('\t'): 53 | count += int(frag.split(',')[1]) 54 | 55 | if len(chunk) == 100000: 56 | db_cursor.executemany( 57 | ''' 58 | INSERT INTO bigrams 59 | VALUES (?, ?, ?) 60 | ON CONFLICT (bigram_first, bigram_second) DO 61 | UPDATE SET count = count + ?; 62 | ''', 63 | chunk, 64 | ) 65 | db_connection.commit() 66 | chunk.clear() 67 | else: 68 | chunk.append( 69 | ( 70 | bigram_first, 71 | bigram_second, 72 | count, 73 | count, 74 | ) 75 | ) 76 | 77 | db_cursor.executemany( 78 | ''' 79 | INSERT INTO bigrams 80 | VALUES (?, ?, ?) 
81 | ON CONFLICT (bigram_first, bigram_second) DO 82 | UPDATE SET count = count + ?; 83 | ''', 84 | chunk, 85 | ) 86 | db_connection.commit() 87 | 88 | 89 | futures = [] 90 | with concurrent.futures.ProcessPoolExecutor( 91 | max_workers=30, 92 | ) as executor: 93 | urls = [ 94 | f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/2-{i:05d}-of-00589.gz' 95 | for i in range(0, 589) 96 | ] 97 | for url in urls: 98 | futures.append(executor.submit(process_url, url)) 99 | 100 | for future in concurrent.futures.as_completed(futures): 101 | print(f'finished {future.result()}') 102 | -------------------------------------------------------------------------------- /scripts/serializer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import msgpack 3 | import sqlite3 4 | import math 5 | 6 | 7 | unigrams_connection = sqlite3.connect( 8 | database='unigrams.sqlite3', 9 | timeout=10, 10 | ) 11 | unigrams_db_cursor = unigrams_connection.cursor() 12 | 13 | bigrams_connection = sqlite3.connect( 14 | database='bigrams.sqlite3', 15 | timeout=10, 16 | ) 17 | bigrams_db_cursor = bigrams_connection.cursor() 18 | 19 | unigrams = { 20 | unigram: float(count) 21 | for unigram, count in unigrams_db_cursor.execute( 22 | ''' 23 | SELECT word, count 24 | FROM unigrams 25 | ORDER BY count DESC 26 | LIMIT 1000000; 27 | ''' 28 | ) 29 | } 30 | unigrams_total_count = sum(unigrams.values()) 31 | 32 | bigrams = {} 33 | bigrams_total_count = 0 34 | for bigram_first, bigram_second, count in bigrams_db_cursor.execute( 35 | ''' 36 | SELECT bigram_first, bigram_second, count 37 | FROM bigrams 38 | ORDER BY count DESC 39 | LIMIT 100000; 40 | ''' 41 | ): 42 | bigram_first = bigram_first.decode() 43 | bigram_second = bigram_second.decode() 44 | 45 | if bigram_first in bigrams: 46 | bigrams[bigram_first][bigram_second] = float(count) 47 | else: 48 | bigrams[bigram_first] = { 49 | bigram_second: float(count) 50 | } 51 | 52 | bigrams_total_count += count 53 | 54 | bigrams_processed = {} 55 | for bigram_first, inner in bigrams.items(): 56 | for bigram_second, count in inner.items(): 57 | if bigram_first in unigrams: 58 | if bigram_first not in bigrams_processed: 59 | bigrams_processed[bigram_first] = {} 60 | bigrams_processed[bigram_first][bigram_second] = math.log10( 61 | (count / bigrams_total_count) / 62 | (unigrams[bigram_first] / unigrams_total_count) 63 | ) 64 | 65 | unigrams_processed = { 66 | unigram: math.log10(count / unigrams_total_count) 67 | for unigram, count in unigrams.items() 68 | } 69 | unigrams_processed['unigrams_total_count'] = unigrams_total_count 70 | 71 | with gzip.GzipFile( 72 | filename='unigrams.msgpack.gz', 73 | mode='wb', 74 | ) as compressed_file: 75 | compressed_file.write(msgpack.packb(unigrams_processed)) 76 | 77 | with gzip.GzipFile( 78 | filename='bigrams.msgpack.gz', 79 | mode='wb', 80 | ) as compressed_file: 81 | compressed_file.write(msgpack.packb(bigrams_processed)) 82 | -------------------------------------------------------------------------------- /scripts/unigrams_downloader.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import sqlite3 3 | import urllib.request 4 | 5 | 6 | db_connection = sqlite3.connect( 7 | database='unigrams.sqlite3', 8 | timeout=10, 9 | ) 10 | db_cursor = db_connection.cursor() 11 | db_cursor.execute( 12 | ''' 13 | CREATE TABLE IF NOT EXISTS unigrams ( 14 | unigram TEXT, 15 | count INTEGER, 16 | UNIQUE(unigram) 17 | ) 18 | ''' 19 | ) 20 | 
db_cursor.execute( 21 | ''' 22 | CREATE INDEX IF NOT EXISTS count ON unigrams (count) 23 | ''' 24 | ) 25 | db_connection.commit() 26 | 27 | urls = [ 28 | f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/1-{i:05d}-of-00024.gz' 29 | for i in range(0, 24) 30 | ] 31 | for url in urls: 32 | print(f'processing {url}') 33 | 34 | with urllib.request.urlopen( 35 | url=url, 36 | ) as response: 37 | with gzip.GzipFile( 38 | fileobj=response, 39 | ) as uncompressed: 40 | for line in uncompressed: 41 | fragments = line.decode().split('\t') 42 | unigram = fragments[0].lower() 43 | if not unigram.isalnum(): 44 | continue 45 | 46 | count = 0 47 | for frag in fragments[1:]: 48 | year, number_of_instances, volume = frag.split(',') 49 | count += int(number_of_instances) 50 | 51 | db_cursor.execute( 52 | ''' 53 | INSERT INTO unigrams 54 | VALUES (?, ?) 55 | ON CONFLICT (unigram) DO 56 | UPDATE SET count = count + ?; 57 | ''', 58 | ( 59 | unigram, 60 | count, 61 | count, 62 | ), 63 | ) 64 | 65 | db_connection.commit() 66 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --tb=native -s 6 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use ahash::RandomState; 2 | use pyo3::prelude::*; 3 | use std::collections::HashMap; 4 | 5 | const MAX_WORD_LEN: usize = 24; 6 | 7 | #[pyclass] 8 | struct WordSegmenter { 9 | unigrams: HashMap, 10 | bigrams: HashMap, RandomState>, 11 | unknown_unigrams: [f64; MAX_WORD_LEN + 1], 12 | } 13 | 14 | #[pymethods] 15 | impl WordSegmenter { 16 | #[new] 17 | fn new( 18 | unigrams_serialized: &[u8], 19 | bigrams_serialized: &[u8], 20 | ) -> Self { 21 | let unigrams: HashMap = rmp_serde::from_slice(unigrams_serialized).unwrap(); 22 | let bigrams = rmp_serde::from_slice(bigrams_serialized).unwrap(); 23 | 24 | let total_unigrams_frequency = unigrams.get("unigrams_total_count").unwrap(); 25 | let mut unknown_unigrams = [0.0; MAX_WORD_LEN + 1]; 26 | for (word_len, value) in unknown_unigrams.iter_mut().enumerate() { 27 | *value = (10.0 / (total_unigrams_frequency * 10_f64.powi(word_len as i32))).log10(); 28 | } 29 | 30 | WordSegmenter { 31 | unigrams, 32 | bigrams, 33 | unknown_unigrams, 34 | } 35 | } 36 | 37 | fn segment( 38 | &self, 39 | py: Python, 40 | text: String, 41 | ) -> PyResult> { 42 | let clean_text = text 43 | .to_ascii_lowercase() 44 | .replace( 45 | |c: char| !c.is_ascii_alphanumeric(), 46 | "" 47 | ); 48 | 49 | let words = self.search(&clean_text); 50 | 51 | Ok(words.into_py(py)) 52 | } 53 | 54 | fn exist_as_segment( 55 | &self, 56 | substring: String, 57 | text: String, 58 | ) -> PyResult { 59 | let clean_text = text 60 | .to_ascii_lowercase() 61 | .replace( 62 | |c: char| !c.is_ascii_alphanumeric(), 63 | "" 64 | ); 65 | 66 | let clean_substring = substring 67 | .to_ascii_lowercase() 68 | .replace( 69 | |c: char| !c.is_ascii_alphanumeric(), 70 | "" 71 | ); 72 | 73 | let segmented_text = self.search(&clean_text); 74 | let segmented_substring = self.search(&clean_substring); 75 | 76 | let segmented_substring_pattern = format!("-{}-", segmented_substring.join("-")); 77 | let segmented_text_pattern = format!("-{}-", segmented_text.join("-")); 78 | 79 | Ok(segmented_text_pattern.contains(&segmented_substring_pattern)) 80 | } 81 | } 82 | 83 | impl 
WordSegmenter { 84 | fn score( 85 | &self, 86 | word: &str, 87 | previous: &str, 88 | ) -> f64 { 89 | if !previous.is_empty() { 90 | if let Some(first_bigram_layer) = self.bigrams.get(previous) { 91 | if let Some(bigram_frequency) = first_bigram_layer.get(word) { 92 | return *bigram_frequency; 93 | } 94 | } 95 | } 96 | 97 | match self.unigrams.get(word) { 98 | Some(frequency) => *frequency, 99 | None => self.unknown_unigrams[word.len()], 100 | } 101 | } 102 | 103 | fn search<'a>( 104 | &self, 105 | text: &'a str, 106 | ) -> Vec<&'a str> { 107 | let mut result = Vec::with_capacity(text.len()); 108 | let mut candidates = Vec::with_capacity(text.len()); 109 | 110 | if text.is_empty() { 111 | return result; 112 | } 113 | 114 | for end in 1..=text.len() { 115 | let start = end.saturating_sub(MAX_WORD_LEN); 116 | for split in start..end { 117 | let (prev, prev_score) = match split { 118 | 0 => ("", 0.0), 119 | _ => { 120 | let (prefix_len, prefix_score) = candidates[split - 1]; 121 | let word = &text[split - prefix_len as usize..split]; 122 | (word, prefix_score) 123 | } 124 | }; 125 | 126 | let word = &text[split..end]; 127 | let score = self.score(word, prev) + prev_score; 128 | match candidates.get_mut(end - 1) { 129 | Some((cur_len, cur_score)) if *cur_score < score => { 130 | *cur_len = end - split; 131 | *cur_score = score; 132 | } 133 | None => candidates.push((end - split, score)), 134 | _ => {}, 135 | } 136 | } 137 | } 138 | 139 | let mut end = text.len(); 140 | let (mut best_len, mut _best_score) = candidates[end - 1]; 141 | loop { 142 | let word = &text[end - best_len..end]; 143 | result.insert(0, word); 144 | 145 | end -= best_len; 146 | if end == 0 { 147 | break; 148 | } 149 | 150 | best_len = candidates[end - 1].0; 151 | } 152 | 153 | result 154 | } 155 | } 156 | 157 | #[pymodule] 158 | fn pywordsegment( 159 | _py: Python, 160 | m: &PyModule, 161 | ) -> PyResult<()> { 162 | m.add_class::()?; 163 | 164 | Ok(()) 165 | } 166 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyWordSegment/b9fda304be80080d2b8ce966a146e2af6fd253b6/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_pywordsegment.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pywordsegment 4 | 5 | 6 | class WordSegmentTestCase( 7 | unittest.TestCase, 8 | ): 9 | def test_segment_1( 10 | self, 11 | ): 12 | self.assertEqual( 13 | first=pywordsegment.WordSegmenter.segment( 14 | text='theusashops', 15 | ), 16 | second=[ 17 | 'the', 18 | 'usa', 19 | 'shops', 20 | ], 21 | ) 22 | 23 | def test_segment_2( 24 | self, 25 | ): 26 | self.assertEqual( 27 | first=pywordsegment.WordSegmenter.segment( 28 | text='choosespain', 29 | ), 30 | second=[ 31 | 'choose', 32 | 'spain', 33 | ], 34 | ) 35 | 36 | def test_segment_3( 37 | self, 38 | ): 39 | self.assertEqual( 40 | first=pywordsegment.WordSegmenter.segment( 41 | text='thisisatest', 42 | ), 43 | second=[ 44 | 'this', 45 | 'is', 46 | 'a', 47 | 'test', 48 | ], 49 | ) 50 | 51 | def test_segment_4( 52 | self, 53 | ): 54 | self.assertEqual( 55 | first=pywordsegment.WordSegmenter.segment( 56 | text='wheninthecourseofhumaneventsitbecomesnecessary', 57 | ), 58 | second=[ 59 | 'when', 60 | 'in', 61 | 'the', 62 | 'course', 63 | 'of', 64 | 'human', 65 | 'events', 66 | 'it', 67 | 'becomes', 68 | 
'necessary', 69 | ], 70 | ) 71 | 72 | def test_segment_5( 73 | self, 74 | ): 75 | self.assertEqual( 76 | first=pywordsegment.WordSegmenter.segment( 77 | text='whorepresents', 78 | ), 79 | second=[ 80 | 'who', 81 | 'represents', 82 | ], 83 | ) 84 | 85 | def test_segment_6( 86 | self, 87 | ): 88 | self.assertEqual( 89 | first=pywordsegment.WordSegmenter.segment( 90 | text='expertsexchange', 91 | ), 92 | second=[ 93 | 'experts', 94 | 'exchange', 95 | ], 96 | ) 97 | 98 | def test_segment_7( 99 | self, 100 | ): 101 | self.assertEqual( 102 | first=pywordsegment.WordSegmenter.segment( 103 | text='speedofart', 104 | ), 105 | second=[ 106 | 'speed', 107 | 'of', 108 | 'art', 109 | ], 110 | ) 111 | 112 | def test_segment_8( 113 | self, 114 | ): 115 | self.assertEqual( 116 | first=pywordsegment.WordSegmenter.segment( 117 | text='nowisthetimeforallgood', 118 | ), 119 | second=[ 120 | 'now', 121 | 'is', 122 | 'the', 123 | 'time', 124 | 'for', 125 | 'all', 126 | 'good', 127 | ], 128 | ) 129 | 130 | def test_segment_9( 131 | self, 132 | ): 133 | self.assertEqual( 134 | first=pywordsegment.WordSegmenter.segment( 135 | text='itisatruthuniversallyacknowledged', 136 | ), 137 | second=[ 138 | 'it', 139 | 'is', 140 | 'a', 141 | 'truth', 142 | 'universally', 143 | 'acknowledged', 144 | ], 145 | ) 146 | 147 | def test_segment_10( 148 | self, 149 | ): 150 | self.assertEqual( 151 | first=pywordsegment.WordSegmenter.segment( 152 | text='itwasabrightcolddayinaprilandtheclockswerestrikingthirteen', 153 | ), 154 | second=[ 155 | 'it', 156 | 'was', 157 | 'a', 158 | 'bright', 159 | 'cold', 160 | 'day', 161 | 'in', 162 | 'april', 163 | 'and', 164 | 'the', 165 | 'clocks', 166 | 'were', 167 | 'striking', 168 | 'thirteen', 169 | ], 170 | ) 171 | 172 | def test_segment_11( 173 | self, 174 | ): 175 | self.assertEqual( 176 | first=pywordsegment.WordSegmenter.segment( 177 | text='CaseTest', 178 | ), 179 | second=[ 180 | 'case', 181 | 'test', 182 | ], 183 | ) 184 | 185 | def test_segment_12( 186 | self, 187 | ): 188 | self.assertEqual( 189 | first=pywordsegment.WordSegmenter.segment( 190 | text='', 191 | ), 192 | second=[], 193 | ) 194 | 195 | def test_segment_13( 196 | self, 197 | ): 198 | self.assertEqual( 199 | first=pywordsegment.WordSegmenter.segment( 200 | text='a', 201 | ), 202 | second=[ 203 | 'a', 204 | ], 205 | ) 206 | 207 | def test_exist_as_segment_1( 208 | self, 209 | ): 210 | self.assertFalse( 211 | expr=pywordsegment.WordSegmenter.exist_as_segment( 212 | substring='man', 213 | text='manual', 214 | ), 215 | ) 216 | self.assertTrue( 217 | expr=pywordsegment.WordSegmenter.exist_as_segment( 218 | substring='man', 219 | text='oneman', 220 | ), 221 | ) 222 | --------------------------------------------------------------------------------
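
The dynamic program behind `WordSegmenter.segment` lives in `src/lib.rs` above (the `search` and `score` methods). For readers more comfortable in Python, here is a minimal re-sketch of the same algorithm, assuming `unigrams` and `bigrams` are plain dicts holding the log10 scores produced by `scripts/serializer.py` and `total` is the raw unigram count. The helper names and signatures are illustrative only and are not part of the package's API.

```python
import math

MAX_WORD_LEN = 24  # same bound as src/lib.rs


def score(word, previous, unigrams, bigrams, total):
    # Prefer the conditional bigram score when the (previous, word) pair is known.
    if previous and word in bigrams.get(previous, {}):
        return bigrams[previous][word]
    if word in unigrams:
        return unigrams[word]
    # Unknown-word penalty: one extra order of magnitude per character.
    return math.log10(10.0 / (total * 10.0 ** len(word)))


def segment(text, unigrams, bigrams, total):
    # Keep only ASCII alphanumerics, lowercased (mirrors the clean-up in lib.rs).
    text = "".join(c for c in text.lower() if c.isascii() and c.isalnum())
    if not text:
        return []

    # best[i] = (length of the best last word ending at i + 1, cumulative score)
    best = [None] * len(text)
    for end in range(1, len(text) + 1):
        for split in range(max(0, end - MAX_WORD_LEN), end):
            if split == 0:
                previous, prefix_score = "", 0.0
            else:
                prev_len, prefix_score = best[split - 1]
                previous = text[split - prev_len:split]
            candidate = prefix_score + score(
                text[split:end], previous, unigrams, bigrams, total,
            )
            if best[end - 1] is None or candidate > best[end - 1][1]:
                best[end - 1] = (end - split, candidate)

    # Backtrack from the end of the string to recover the words.
    words, end = [], len(text)
    while end > 0:
        length = best[end - 1][0]
        words.insert(0, text[end - length:end])
        end -= length
    return words
```

The `MAX_WORD_LEN` bound keeps the inner loop constant-sized, so a full pass is linear in the input length, and the unknown-word penalty shrinks by one order of magnitude per extra character, which strongly discourages long out-of-vocabulary chunks. Fed the shipped corpora, this sketch should reproduce the behavior the README and tests show for the Rust implementation, e.g. splitting `"theusashops"` into `["the", "usa", "shops"]`.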