├── .github
│   └── workflows
│       └── publish.yml
├── .gitignore
├── Cargo.toml
├── README.md
├── docs
│   ├── bm25.png
│   ├── count_vectorizer.png
│   ├── count_vectorizer_char.png
│   ├── flashtext.png
│   ├── logo.png
│   └── tfidf.png
├── pyproject.toml
├── python
│   └── lenlp
│       ├── __init__.py
│       ├── analyzer
│       │   ├── __init__.py
│       │   └── analyze.py
│       ├── counter
│       │   ├── __init__.py
│       │   └── count.py
│       ├── flash
│       │   ├── __init__.py
│       │   └── flash_text.py
│       ├── normalizer
│       │   ├── __init__.py
│       │   └── normalize.py
│       └── sparse
│           ├── __init__.py
│           ├── bm25_vectorizer.py
│           ├── count_vectorizer.py
│           └── tfidf_vectorizer.py
└── rust
    ├── lib.rs
    ├── rsanalyzer.rs
    ├── rscounter.rs
    ├── rsflashtext.rs
    ├── rsnormalizer.rs
    ├── rssparse.rs
    ├── rsstop_words.rs
    └── rsvectorizer.rs
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by maturin v1.5.1
2 | # To update, run
3 | #
4 | # maturin generate-ci github
5 | #
6 | name: CI
7 |
8 | on:
9 | push:
10 | branches:
11 | - main
12 | workflow_dispatch:
13 |
14 | permissions:
15 | contents: read
16 |
17 | jobs:
18 | linux:
19 | runs-on: ${{ matrix.runner }}
20 | strategy:
21 | matrix:
22 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
23 | target: [x86_64, x86, aarch64, armv7, s390x, ppc64le]
24 | runner: [ubuntu-latest]
25 | exclude:
26 | - target: x86
27 | python-version: ['3.8', '3.9', '3.11', '3.12']
28 | - target: aarch64
29 | python-version: ['3.8', '3.9', '3.11', '3.12']
30 | - target: armv7
31 | python-version: ['3.8', '3.9', '3.11', '3.12']
32 | - target: s390x
33 | python-version: ['3.8', '3.9', '3.11', '3.12']
34 | - target: ppc64le
35 | python-version: ['3.8', '3.9', '3.11', '3.12']
36 | steps:
37 | - uses: actions/checkout@v4
38 | - uses: actions/setup-python@v5
39 | with:
40 | python-version: ${{ matrix.python-version }}
41 | - name: Build wheels
42 | uses: PyO3/maturin-action@v1
43 | with:
44 | target: ${{ matrix.target }}
45 | args: --release --out dist --find-interpreter
46 | sccache: 'true'
47 | manylinux: auto
48 | - name: Upload wheels
49 | uses: actions/upload-artifact@v4
50 | with:
51 | name: wheels-linux-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }}
52 | path: dist
53 |
54 | windows:
55 | runs-on: ${{ matrix.runner }}
56 | strategy:
57 | matrix:
58 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
59 | target: [x64, x86]
60 | runner: [windows-latest]
61 | exclude:
62 | - target: x86
63 | python-version: ['3.8', '3.9', '3.11', '3.12']
64 | steps:
65 | - uses: actions/checkout@v4
66 | - uses: actions/setup-python@v5
67 | with:
68 | python-version: ${{ matrix.python-version }}
69 | architecture: ${{ matrix.target }}
70 | - name: Build wheels
71 | uses: PyO3/maturin-action@v1
72 | with:
73 | target: ${{ matrix.target }}
74 | args: --release --out dist --find-interpreter
75 | sccache: 'true'
76 | - name: Upload wheels
77 | uses: actions/upload-artifact@v4
78 | with:
79 | name: wheels-windows-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }}
80 | path: dist
81 |
82 | macos:
83 | runs-on: ${{ matrix.runner }}
84 | strategy:
85 | matrix:
86 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
87 | target: [x86_64, aarch64]
88 | runner: [macos-latest]
89 | exclude:
90 | - target: aarch64
91 | python-version: ['3.8', '3.9', '3.11', '3.12']
92 | steps:
93 | - uses: actions/checkout@v4
94 | - uses: actions/setup-python@v5
95 | with:
96 | python-version: ${{ matrix.python-version }}
97 | - name: Build wheels
98 | uses: PyO3/maturin-action@v1
99 | with:
100 | target: ${{ matrix.target }}
101 | args: --release --out dist --find-interpreter
102 | sccache: 'true'
103 | - name: Upload wheels
104 | uses: actions/upload-artifact@v4
105 | with:
106 | name: wheels-macos-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }}
107 | path: dist
108 |
109 | sdist:
110 | runs-on: ubuntu-latest
111 | steps:
112 | - uses: actions/checkout@v4
113 | - name: Build sdist
114 | uses: PyO3/maturin-action@v1
115 | with:
116 | command: sdist
117 | args: --out dist
118 | - name: Upload sdist
119 | uses: actions/upload-artifact@v4
120 | with:
121 | name: wheels-sdist-${{ github.run_id }}
122 | path: dist
123 |
124 | release:
125 | name: Release
126 | runs-on: ubuntu-latest
127 | needs: [linux, windows, macos, sdist]
128 | steps:
129 | - uses: actions/download-artifact@v4
130 | - name: Publish to PyPI
131 | uses: PyO3/maturin-action@v1
132 | env:
133 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_PASSWORD }}
134 | with:
135 | command: upload
136 | args: --non-interactive --skip-existing wheels-*/*
137 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | Cargo.lock
3 | .DS_Store
4 |
5 | *.json
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | *.test
13 | *.onx
14 | *.qonx
15 | *.DS_Store
16 | *.pyc
17 | *.ipynb_checkpoints
18 | *.pickle
19 | *.pkl
20 | *.icloud
21 | cache/
22 | # C extensions
23 | *.so
24 | test/
25 |
26 | # Distribution / packaging
27 | .Python
28 | build/
29 | develop-eggs/
30 | dist/
31 | downloads/
32 | eggs/
33 | .eggs/
34 | lib/
35 | lib64/
36 | parts/
37 | sdist/
38 | var/
39 | wheels/
40 | pip-wheel-metadata/
41 | share/python-wheels/
42 | *.egg-info/
43 | .installed.cfg
44 | *.egg
45 | MANIFEST
46 |
47 | # PyInstaller
48 | # Usually these files are written by a python script from a template
49 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
50 | *.manifest
51 | *.spec
52 |
53 | # Installer logs
54 | pip-log.txt
55 | pip-delete-this-directory.txt
56 |
57 | # Unit test / coverage reports
58 | htmlcov/
59 | .tox/
60 | .nox/
61 | .coverage
62 | .coverage.*
63 | .cache
64 | nosetests.xml
65 | coverage.xml
66 | *.cover
67 | *.py,cover
68 | .hypothesis/
69 | .pytest_cache/
70 |
71 | # Translations
72 | *.mo
73 | *.pot
74 |
75 | # Django stuff:
76 | *.log
77 | local_settings.py
78 | db.sqlite3
79 | db.sqlite3-journal
80 |
81 | # Flask stuff:
82 | instance/
83 | .webassets-cache
84 |
85 | # Scrapy stuff:
86 | .scrapy
87 |
88 | # Sphinx documentation
89 | docs/_build/
90 |
91 | # PyBuilder
92 | target/
93 |
94 | # Jupyter Notebook
95 | .ipynb_checkpoints
96 |
97 | # IPython
98 | profile_default/
99 | ipython_config.py
100 |
101 | # pyenv
102 | .python-version
103 |
104 | # pipenv
105 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
106 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
107 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
108 | # install all needed dependencies.
109 | #Pipfile.lock
110 |
111 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
112 | __pypackages__/
113 |
114 | # Celery stuff
115 | celerybeat-schedule
116 | celerybeat.pid
117 |
118 | # SageMath parsed files
119 | *.sage.py
120 |
121 | # Environments
122 | .env
123 | .venv
124 | env/
125 | venv/
126 | ENV/
127 | env.bak/
128 | venv.bak/
129 |
130 | # Spyder project settings
131 | .spyderproject
132 | .spyproject
133 |
134 | # Rope project settings
135 | .ropeproject
136 |
137 | # mkdocs documentation
138 | /site
139 |
140 | # mypy
141 | .mypy_cache/
142 | .dmypy.json
143 | dmypy.json
144 |
145 | # Pyre type checker
146 | .pyre/
147 | test.ipynb
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "_rslenlp"
3 | edition = "2021"
4 |
5 | [lib]
6 | name = "_rslenlp"
7 | crate-type = ["cdylib"]
8 | path = "rust/lib.rs"
9 |
10 | [dependencies]
11 | unidecode = "0.3.0"
12 | rayon = "1.10.0"
13 | pyo3 = { version = "0.24.2", features = [
14 | "extension-module",
15 | "generate-import-lib",
16 | ] }
17 | serde = { version = "1.0.202", features = ["derive"] }
18 | serde_json = { version = "1.0.117" }
19 | bincode = "1.3.3"
20 | ndarray = "0.15"
21 | numpy = "0.24"
22 |
23 | [profile.dev]
24 | opt-level = 0
25 |
26 | [profile.release]
27 | opt-level = 3
28 |
29 | [tool.maturin]
30 | features = ["pyo3/extension-module"]
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # LeNLP
3 | Natural Language Processing toolbox for Python with Rust
4 |
5 |
6 | ![LeNLP logo](docs/logo.png)
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | LeNLP is a toolkit dedicated to natural language processing (NLP). It provides optimized and parallelized functions in Rust for use in Python, offering high performance and ease of integration.
15 |
16 | ## Installation
17 |
18 | You can install LeNLP from PyPI:
19 |
20 | ```
21 | pip install lenlp
22 | ```
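
To build LeNLP from source instead, the project relies on [Maturin](https://github.com/PyO3/maturin) as its build backend; a minimal sketch, assuming a Rust toolchain is available:

```
pip install maturin
maturin develop --release
```

`maturin develop --release` compiles the Rust extension and installs `lenlp` into the current virtual environment.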
23 |
24 | ## Sections
25 |
26 | - [Installation](#installation)
27 | - [Quick Start](#quick-start)
28 | - [Sparse Module](#sparse-module)
29 | - [CountVectorizer](#countvectorizer)
30 | - [TfidfVectorizer](#tfidfvectorizer)
31 | - [BM25Vectorizer](#bm25vectorizer)
32 | - [FlashText](#flashtext)
33 | - [Extras](#extras)
34 | - [Counter](#counter)
35 | - [Normalizer](#normalizer)
36 |
37 | ## Quick Start
38 |
39 | ### Sparse Module
40 |
41 | The `sparse` module offers a variety of vectorizers and transformers for text data. These sparse matrices are `scipy.sparse.csr_matrix` objects, optimized for memory usage and speed. They can be used as drop-in replacements for `scikit-learn` vectorizers.
42 |
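Because the output is a regular `scipy.sparse.csr_matrix`, it can be fed directly to `scikit-learn` utilities. A minimal retrieval sketch (the documents and the query below are made up for illustration):

```python
from sklearn.metrics.pairwise import cosine_similarity

from lenlp import sparse

documents = ["Toulouse is in the south of France", "Paris is the capital of France"]
queries = ["paris"]

vectorizer = sparse.TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))
index = vectorizer.fit_transform(documents)  # scipy.sparse.csr_matrix
scores = cosine_similarity(vectorizer.transform(queries), index)  # one similarity row per query
```
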
43 | #### CountVectorizer
44 |
45 | The `CountVectorizer` converts a list of texts into a sparse matrix of token counts. This is a Rust implementation of the `CountVectorizer` from `scikit-learn`.
46 |
47 | ```python
48 | from lenlp import sparse
49 |
50 | vectorizer = sparse.CountVectorizer(
51 | ngram_range=(3, 5), # range of n-grams
52 | analyzer="char_wb", # word, char, char_wb
53 | normalize=True, # lowercase and strip accents
54 | stop_words=["based"], # list of stop words
55 | )
56 | ```
57 |
58 | You can fit the vectorizer and transform a list of texts into a sparse matrix of token counts:
59 |
60 | ```python
61 | X = [
62 | "Hello World",
63 | "Rust based vectorizer"
64 | ]
65 |
66 | matrix = vectorizer.fit_transform(X)
67 | ```
68 |
69 | Or use separate calls:
70 |
71 | ```python
72 | vectorizer.fit(X)
73 | matrix = vectorizer.transform(X)
74 | ```
75 |
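As in the library's doctests, the learned vocabulary is exposed through the `vocabulary` property, and small matrices can be densified for inspection:

```python
len(vectorizer.vocabulary)  # number of n-grams kept after fitting
matrix.toarray()  # dense numpy view, only sensible for tiny corpora
```
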
76 | Benchmark:
77 |
78 | 
79 |
80 | LeNLP CountVectorizer versus Sklearn CountVectorizer `fit_transform` with `char` analyzer.
81 |
82 | #### TfidfVectorizer
83 |
84 | The `TfidfVectorizer` converts a list of texts into a sparse matrix of tf-idf weights, implemented in Rust.
85 |
86 | ```python
87 | from lenlp import sparse
88 |
89 | vectorizer = sparse.TfidfVectorizer(
90 | ngram_range=(3, 5), # Range of n-grams
91 | analyzer="char_wb", # Options: word, char, char_wb
92 | normalize=True, # Lowercase and strip accents
93 | stop_words=["based"] # List of stop words
94 | )
95 | ```
96 |
97 | Fit the vectorizer and transform texts:
98 |
99 | ```python
100 | X = [
101 | "Hello World",
102 | "Rust based vectorizer"
103 | ]
104 |
105 | matrix = vectorizer.fit_transform(X)
106 | ```
107 |
108 | Or use separate calls:
109 |
110 | ```python
111 | vectorizer.fit(X)
112 | matrix = vectorizer.transform(X)
113 | ```
114 |
115 | Benchmark:
116 |
117 | 
118 |
119 | LeNLP TfidfVectorizer versus Sklearn TfidfVectorizer `fit_transform` with `char` analyzer.
120 |
121 | #### BM25Vectorizer
122 |
123 | The `BM25Vectorizer` converts texts into a sparse matrix of BM25 weights, which typically rank documents better than raw counts or tf-idf weights.
124 |
125 | ```python
126 | from lenlp import sparse
127 |
128 | vectorizer = sparse.BM25Vectorizer(
129 | ngram_range=(3, 5), # Range of n-grams
130 | analyzer="char_wb", # Options: word, char, char_wb
131 | normalize=True, # Lowercase and strip accents
132 | stop_words=["based"] # List of stop words
133 | )
134 | ```
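
The BM25 weighting itself can be tuned through the `k1`, `b` and `epsilon` arguments; the values below are the defaults from the implementation:

```python
vectorizer = sparse.BM25Vectorizer(
    k1=1.5,  # how quickly term frequency saturates
    b=0.75,  # strength of document-length normalization
    epsilon=0,  # additive smoothing applied to the weights
)
```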
135 |
136 | Fit the vectorizer and transform texts:
137 |
138 | ```python
139 | X = [
140 | "Hello World",
141 | "Rust based vectorizer"
142 | ]
143 |
144 | matrix = vectorizer.fit_transform(X)
145 | ```
146 |
147 | Or use separate calls:
148 |
149 | ```python
150 | vectorizer.fit(X)
151 | matrix = vectorizer.transform(X)
152 | ```
153 |
154 | Benchmark:
155 |
156 | 
157 |
158 |
159 | LeNLP BM25Vectorizer versus LeNLP TfidfVectorizer `fit_transform` with `char` analyzer. There is no BM25Vectorizer counterpart in Sklearn.
160 |
161 | ### FlashText
162 |
163 | The `flash` module allows for efficient keyword extraction from texts. It implements the FlashText algorithm described in the paper *[Replace or Retrieve Keywords In Documents At Scale](https://arxiv.org/pdf/1711.00046)*.
164 |
165 | ```python
166 | from lenlp import flash
167 |
168 | flash_text = flash.FlashText(
169 | normalize=True # remove accents and lowercase
170 | )
171 |
172 | # Add keywords we want to retrieve:
173 | flash_text.add(["paris", "bordeaux", "toulouse"])
174 | ```
175 |
176 | Extract keywords and their positions from sentences:
177 |
178 | ```python
179 | sentences = [
180 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
181 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
182 | ]
183 |
184 | flash_text.extract(sentences)
185 | ```
186 |
187 | Output:
188 |
189 | ```python
190 | [[('toulouse', 0, 8), ('bordeaux', 60, 68), ('bordeaux', 74, 82)],
191 | [('paris', 0, 5), ('bordeaux', 62, 70), ('toulouse', 76, 84)]]
192 | ```
193 |
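Keywords can also be mapped to a single canonical label through the `clean_name` argument of `add`; a small sketch (the aliases below are made up for illustration):

```python
flash_text = flash.FlashText(normalize=True)
flash_text.add(["st tropez", "saint tropez"], clean_name="saint-tropez")

flash_text.extract(["Saint Tropez is in the south of France"])
# matches for either alias are reported under the label "saint-tropez"
```
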
194 | The FlashText algorithm is highly efficient and significantly faster than regular expressions for keyword extraction. With `normalize=True`, LeNLP's implementation removes accents, strips punctuation, and lowercases input documents before matching.
195 |
196 | Benchmark:
197 |
198 | 
199 |
200 | LeNLP FlashText is benchmarked versus the official implementation of [FlashText](https://github.com/vi3k6i5/flashtext).
201 |
202 | ### Extras
203 |
204 | #### Counter
205 |
206 | The `counter` module converts a list of texts into dictionaries of token counts.
207 |
208 | ```python
209 | from lenlp import counter
210 |
211 | sentences = [
212 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
213 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
214 | ]
215 |
216 | counter.count(
217 | sentences,
218 | ngram_range=(1, 1), # Range of n-grams
219 | analyzer="word", # Options: word, char, char_wb
220 | normalize=True, # Lowercase and strip accents
221 | stop_words=["its", "in", "is", "of", "the", "and", "to", "a"] # List of stop words
222 | )
223 | ```
224 |
225 | Output:
226 |
227 | ```python
228 | [{'compared': 1,
229 | 'south': 1,
230 | 'city': 1,
231 | 'toulouse': 1,
232 | 'bordeaux': 2,
233 | 'france': 1},
234 | {'toulouse': 1,
235 | 'france': 1,
236 | 'capital': 1,
237 | 'paris': 1,
238 | 'north': 1,
239 | 'compared': 1,
240 | 'bordeaux': 1}]
241 | ```
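
The `count` function also accepts a `sort` flag, used throughout the library's doctests, which returns each dictionary with its keys in sorted order:

```python
counter.count("Hello, world!", sort=True)
# {'hello': 1, 'world': 1}
```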
242 |
243 | #### Normalizer
244 |
245 | The `normalizer` module normalizes texts by removing accents, stripping punctuation, and lowercasing them.
246 |
247 | ```python
248 | from lenlp import normalizer
249 |
250 | sentences = [
251 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux",
252 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse",
253 | ]
254 |
255 | normalizer.normalize(sentences)
256 | ```
257 |
258 | Output:
259 |
260 | ```python
261 | [
262 | 'toulouse is a city in france its in the south compared to bordeaux and bordeaux',
263 | 'paris is the capital of france its in the north compared to bordeaux and toulouse',
264 | ]
265 | ```
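
A single string can be normalized as well, as in the module's doctests:

```python
normalizer.normalize("Hello, world!")
# 'hello world'
```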
266 |
267 | ## References
268 |
269 | - *[FlashText](https://github.com/vi3k6i5/flashtext)*
270 | - *[Scikit Learn](https://github.com/scikit-learn/scikit-learn)*
271 | - *[PyO3](https://github.com/PyO3/pyo3)*
272 | - *[Maturin](https://github.com/PyO3/maturin)*
273 |
274 |
--------------------------------------------------------------------------------
/docs/bm25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/bm25.png
--------------------------------------------------------------------------------
/docs/count_vectorizer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/count_vectorizer.png
--------------------------------------------------------------------------------
/docs/count_vectorizer_char.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/count_vectorizer_char.png
--------------------------------------------------------------------------------
/docs/flashtext.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/flashtext.png
--------------------------------------------------------------------------------
/docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/logo.png
--------------------------------------------------------------------------------
/docs/tfidf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/tfidf.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["maturin >= 1.5.1"]
3 | build-backend = "maturin"
4 |
5 | [project]
6 | name = "lenlp"
7 | version = "1.2.0"
8 | description = "Natural Language Processing library for Python from Rust."
9 |
10 | authors = [{ name = "Raphael Sourty", email = "raphael.sourty@gmail.com" }]
11 |
12 |
13 | keywords = []
14 |
15 | classifiers = [
16 | "Programming Language :: Python :: 3",
17 | "Programming Language :: Rust",
18 | "Operating System :: OS Independent",
19 | ]
20 |
21 | requires-python = ">=3.8"
22 |
23 | dependencies = ["scikit-learn >= 1.5.0", "scipy >= 1.13.1"]
24 |
25 | [project.urls]
26 | Homepage = "https://github.com/raphaelsty/lenlp"
27 | Documentation = "https://github.com/raphaelsty/lenlp"
28 | Repository = "https://github.com/raphaelsty/lenlp"
29 |
30 | [project.optional-dependencies]
31 | dev = [
32 | "maturin >= 1.5.1",
33 | "pytest-cov >= 5.0.0",
34 | "pytest >= 7.4.4",
35 | "ruff >= 0.1.15",
36 | ]
37 | [tool.maturin]
38 | bindings = "pyo3"
39 | features = ["pyo3/extension-module"]
40 | python-source = "python"
41 | module-name = "lenlp._rslenlp"
42 |
43 | [tool.include]
44 | include = ["Cargo.toml", "pyproject.toml", "README.md", "rust/*"]
45 |
46 | [tool.pytest.ini_options]
47 | filterwarnings = [
48 | "ignore::DeprecationWarning",
49 | "ignore::RuntimeWarning",
50 | "ignore::UserWarning",
51 | ]
52 | addopts = [
53 | "--doctest-modules",
54 | "--verbose",
55 | "-ra",
56 | "--cov-config=.coveragerc",
57 | "-m not web and not slow",
58 | ]
59 | doctest_optionflags = ["NORMALIZE_WHITESPACE", "NUMBER"]
60 | norecursedirs = ["build", "docs", "node_modules"]
61 | markers = [
62 | "web: tests that require using the Internet",
63 | "slow: tests that take a long time to run",
64 | ]
65 |
--------------------------------------------------------------------------------
/python/lenlp/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "analyzer",
3 | "counter",
4 | "flash",
5 | "normalizer",
6 | "sparse",
7 | ]
8 |
--------------------------------------------------------------------------------
/python/lenlp/analyzer/__init__.py:
--------------------------------------------------------------------------------
1 | from .analyze import analyze
2 |
3 | __all__ = [
4 | "analyze",
5 | ]
6 |
--------------------------------------------------------------------------------
/python/lenlp/analyzer/analyze.py:
--------------------------------------------------------------------------------
1 | from lenlp._rslenlp import rschar_ngrams_many, rschar_wb_ngrams_many, rssplit_words_many
2 |
3 | __all__ = ["analyze"]
4 |
5 |
6 | def analyze(
7 | x: str | list[str],
8 | analyzer: str = "word",
9 | ngram_range: tuple[int, int] = (1, 1),
10 | ) -> list[str] | list[list[str]]:
11 | """Split text or list of texts into words or characters.
12 |
13 | Parameters
14 | ----------
15 | x
16 | str or list of str.
17 | analyzer
18 | {word, char, char_wb}, default=word.
19 | Whether the feature should be made of word n-gram or character n-grams. Option
20 | char_wb creates character n-grams only from text inside word boundaries;
21 | n-grams at the edges of words are padded with space.
22 | ngram_range
23 |         tuple (min_n, max_n), default=(1, 1).
24 | The lower and upper boundary of the range of n-values for different n-grams to
25 | be extracted. All values of n such that min_n <= n <= max_n will be used.
26 | Examples
27 | --------
28 | >>> from lenlp import analyzer
29 |
30 | >>> analyzer.analyze("Hello, world!", analyzer="word")
31 | ['Hello,', 'world!']
32 |
33 | >>> analyzer.analyze("Hello, world!", analyzer="char_wb", ngram_range=(3, 3))
34 | ['Hel', 'ell', 'llo', 'lo,', 'o, ', ', w', ' wo', 'wor', 'orl', 'rld', 'ld!']
35 |
36 | >>> analyzer.analyze(["hello, world", "good"], analyzer="char", ngram_range=(2, 3))
37 | [['he', 'el', 'll', 'lo', 'o,', ', ', ' w', 'wo', 'or', 'rl', 'ld', 'hel', 'ell', 'llo', 'lo,', 'o, ', ', w', ' wo', 'wor', 'orl', 'rld'], ['go', 'oo', 'od', 'goo', 'ood']]
38 |
39 | """
40 | return_string = True if isinstance(x, str) else False
41 | x = [x] if isinstance(x, str) else x
42 | n_sizes = list(range(ngram_range[0], ngram_range[1] + 1))
43 |
44 | match analyzer:
45 | case "word":
46 | y = rssplit_words_many(x, n_sizes=n_sizes)
47 | case "char":
48 | y = rschar_ngrams_many(x, n_sizes=n_sizes)
49 | case "char_wb":
50 | y = rschar_wb_ngrams_many(x, n_sizes=n_sizes)
51 |
52 | return y[0] if return_string else y
53 |
--------------------------------------------------------------------------------
/python/lenlp/counter/__init__.py:
--------------------------------------------------------------------------------
1 | from .count import count
2 |
3 | __all__ = ["count"]
4 |
--------------------------------------------------------------------------------
/python/lenlp/counter/count.py:
--------------------------------------------------------------------------------
1 | from lenlp._rslenlp import (
2 | rsvectorize_char_ngrams_many,
3 | rsvectorize_char_wb_ngrams_many,
4 | rsvectorize_split_words_many,
5 | )
6 |
7 | __all__ = ["count"]
8 |
9 |
10 | def count(
11 | x: str | list[str],
12 | analyzer: str = "word",
13 | ngram_range: tuple[int, int] = (1, 1),
14 | normalize: bool = True,
15 |     stop_words: list[str] | None = None,
16 | sort: bool = False,
17 | ) -> dict[str, int] | list[dict[str, int]]:
18 | """Count the frequency of words in a text or in a list of texts. Tokens are unordered within
19 | the same text.
20 |
21 | Parameters
22 | ----------
23 | x
24 | str or list of str.
25 | analyzer
26 | {word, char, char_wb}, default=word.
27 | Whether the feature should be made of word n-gram or character n-grams. Option
28 | char_wb creates character n-grams only from text inside word boundaries;
29 | n-grams at the edges of words are padded with space.
30 | ngram_range
31 |         tuple (min_n, max_n), default=(1, 1).
32 | The lower and upper boundary of the range of n-values for different n-grams to
33 | be extracted. All values of n such that min_n <= n <= max_n will be used.
34 | normalize
35 | bool, default=True.
36 | Whether to normalize the text before counting. It will lowercase the text and remove
37 | punctuation.
38 | stop_words
39 | list of str, default=None.
40 | A list of stop words that will be removed from the text.
41 |
42 | Examples
43 | --------
44 | >>> from lenlp import counter
45 |
46 | >>> counter.count("Hello, world!", sort=True)
47 | {'hello': 1, 'world': 1}
48 |
49 | >>> counter.count("Hello, world!", ngram_range=(2, 2), sort=True, normalize=False)
50 | {'Hello, world!': 1}
51 |
52 | >>> counter.count(["Hello, world!", "How are you?"], stop_words=["are", "you"], sort=True)
53 | [{'hello': 1, 'world': 1}, {'how': 1}]
54 |
55 | >>> counter.count(["Hello, world!", "hello"], analyzer="char_wb", ngram_range=(3, 7), stop_words=["hello"], sort=True)
56 | [{'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1}, {}]
57 |
58 | >>> counter.count("Hello, world!", analyzer="char_wb", ngram_range=(3, 7), sort=True)
59 | {' wo': 1, ' wor': 1, ' worl': 1, ' world': 1, 'ell': 1, 'ello': 1, 'ello ': 1, 'ello w': 1, 'ello wo': 1, 'hel': 1, 'hell': 1, 'hello': 1, 'hello ': 1, 'hello w': 1, 'llo': 1, 'llo ': 1, 'llo w': 1, 'llo wo': 1, 'llo wor': 1, 'lo ': 1, 'lo w': 1, 'lo wo': 1, 'lo wor': 1, 'lo worl': 1, 'o w': 1, 'o wo': 1, 'o wor': 1, 'o worl': 1, 'o world': 1, 'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1}
60 |
61 | >>> counter.count("Hello, world!", analyzer="char", ngram_range=(3, 7), sort=True)
62 | {' wo': 1, ' wor': 1, ' worl': 1, ' world': 1, 'ell': 1, 'ello': 1, 'ello ': 1, 'ello w': 1, 'ello wo': 1, 'hel': 1, 'hell': 1, 'hello': 1, 'hello ': 1, 'hello w': 1, 'llo': 1, 'llo ': 1, 'llo w': 1, 'llo wo': 1, 'llo wor': 1, 'lo ': 1, 'lo w': 1, 'lo wo': 1, 'lo wor': 1, 'lo worl': 1, 'o w': 1, 'o wo': 1, 'o wor': 1, 'o worl': 1, 'o world': 1, 'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1}
63 |
64 | >>> counter.count(["Hello, world!", "hello"], analyzer="char", ngram_range=(3, 7), stop_words=["hello"], sort=True)
65 | [{'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1}, {}]
66 |
67 | """
68 | return_string = True if isinstance(x, str) else False
69 | x = [x] if isinstance(x, str) else x
70 | n_sizes = list(range(ngram_range[0], ngram_range[1] + 1))
71 |
72 | match analyzer:
73 | case "word":
74 | y = rsvectorize_split_words_many(
75 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize
76 | )
77 | case "char":
78 | y = rsvectorize_char_ngrams_many(
79 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize
80 | )
81 |
82 | case "char_wb":
83 | y = rsvectorize_char_wb_ngrams_many(
84 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize
85 | )
86 |
87 | if sort:
88 | y = [dict(sorted(d.items())) for d in y]
89 |
90 | return y[0] if return_string else y
91 |
--------------------------------------------------------------------------------
/python/lenlp/flash/__init__.py:
--------------------------------------------------------------------------------
1 | from .flash_text import FlashText
2 |
3 | __all__ = ["FlashText"]
4 |
--------------------------------------------------------------------------------
/python/lenlp/flash/flash_text.py:
--------------------------------------------------------------------------------
1 | from lenlp._rslenlp import RSKeywordProcessor
2 |
3 | __all__ = ["FlashText"]
4 |
5 |
6 | class FlashText:
7 |     """FlashText retrieves keywords from text.
8 |
9 | Parameters
10 | ----------
11 | lowercase
12 | bool, default=True.
13 | Whether to lowercase the text before extracting keywords.
14 | normalize
15 | bool, default=True.
16 | Whether to normalize the text before extracting keywords. It will lowercase the text
17 | and remove punctuation.
18 |
19 | Examples
20 | --------
21 | >>> from lenlp import flash
22 |
23 | >>> flash_text = flash.FlashText(normalize=True)
24 | >>> flash_text = flash_text.add(["hello", "world"])
25 |
26 | >>> flash_text.extract(["Hello, world!", "world", "hello"])
27 | [[('hello', 0, 5), ('world', 7, 12)], [('world', 0, 5)], [('hello', 0, 5)]]
28 |
29 | """
30 |
31 | def __init__(
32 | self,
33 | lowercase: bool = True,
34 | normalize: bool = True,
35 | ) -> None:
36 | self.flash = RSKeywordProcessor(lowercase=lowercase, normalize=normalize)
37 |
38 | def add(
39 | self,
40 | x: str | list[str],
41 |         clean_name: str | None = None,
42 |     ) -> "FlashText":
43 |         """Add one or more keywords to the FlashText object and return it."""
44 | x = [x] if isinstance(x, str) else x
45 | self.flash.add_keywords_many(x, clean_name)
46 | return self
47 |
48 |     def extract(self, x: str | list[str]) -> list[tuple[str, int, int]] | list[list[tuple[str, int, int]]]:
49 |         """Extract keywords and their character offsets from one or more sentences."""
50 | is_string = isinstance(x, str)
51 | x = [x] if isinstance(x, str) else x
52 | y = self.flash.extract_keywords_many(x)
53 | return y[0] if is_string else y
54 |
--------------------------------------------------------------------------------
/python/lenlp/normalizer/__init__.py:
--------------------------------------------------------------------------------
1 | from .normalize import normalize
2 |
3 | __all__ = ["normalize"]
4 |
--------------------------------------------------------------------------------
/python/lenlp/normalizer/normalize.py:
--------------------------------------------------------------------------------
1 | from lenlp._rslenlp import rsnormalize, rsnormalize_many
2 |
3 | __all__ = ["normalize"]
4 |
5 |
6 | def normalize(x: str | list[str]) -> str | list[str]:
7 |     """Lowercase, remove punctuation, and unidecode a single text or a list of texts.
8 |
9 | Examples
10 | --------
11 | >>> from lenlp import normalizer
12 |
13 | >>> normalizer.normalize("Hello, world!")
14 | 'hello world'
15 |
16 | >>> normalizer.normalize(["Hello, world!", "How are you?"])
17 | ['hello world', 'how are you']
18 |
19 | """
20 | return rsnormalize(x) if isinstance(x, str) else rsnormalize_many(x)
21 |
--------------------------------------------------------------------------------
/python/lenlp/sparse/__init__.py:
--------------------------------------------------------------------------------
1 | from .bm25_vectorizer import BM25Vectorizer
2 | from .count_vectorizer import CountVectorizer
3 | from .tfidf_vectorizer import TfidfVectorizer
4 |
5 | __all__ = ["BM25Vectorizer", "CountVectorizer", "TfidfVectorizer"]
6 |
--------------------------------------------------------------------------------
/python/lenlp/sparse/bm25_vectorizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.sparse import csr_matrix
3 | from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
4 |
5 | from .tfidf_vectorizer import TfidfVectorizer
6 |
7 |
8 | class BM25Vectorizer(TfidfVectorizer):
9 | """BM25Vectorizer is a class that converts a collection of text documents to a sparse
10 | bm25 matrix.
11 |
12 | Parameters
13 | ----------
14 | analyzer
15 | {word, char, char_wb}, default=word.
16 | Whether the feature should be made of word n-gram or character n-grams. Option
17 | char_wb creates character n-grams only from text inside word boundaries;
18 | n-grams at the edges of words are padded with space.
19 | ngram_range
20 | tuple (min_n, max_n), default=(1, 1).
21 | The lower and upper boundary of the range of n-values for different n-grams to
22 | be extracted. All values of n such that min_n <= n <= max_n will be used.
23 | normalize
24 | bool, default=True.
25 | Whether to normalize the text before counting. It will lowercase the text and remove
26 | punctuation.
27 | stop_words
28 | list of str, default=None.
29 | A list of stop words that will be removed from the text.
30 | b
31 |         The impact of document length normalization. Default is `0.75`; higher values
32 |         penalize longer documents more.
33 |     k1
34 |         How quickly the impact of term frequency saturates. Default is `1.5`; higher
35 |         values make term frequency more influential.
36 | epsilon
37 | Smoothing term. Default is `0`.
38 |
39 | Examples
40 | --------
41 | >>> from lenlp import sparse
42 |
43 | >>> bm25_vectorizer = sparse.BM25Vectorizer(
44 | ... analyzer="word",
45 | ... normalize=True,
46 | ... stop_words=None,
47 | ... )
48 |
49 | >>> x = ["Hello, world!", "How are you?"]
50 |
51 | >>> bm25_vectorizer = bm25_vectorizer.fit(x)
52 | >>> matrix = bm25_vectorizer.transform(x)
53 | >>> matrix.shape
54 | (2, 5)
55 |
56 | >>> len(bm25_vectorizer.vocabulary)
57 | 5
58 |
59 | >>> matrix = bm25_vectorizer.fit_transform(x)
60 | >>> matrix.shape
61 | (2, 5)
62 |
63 | """
64 |
65 | def __init__(
66 | self,
67 | analyzer: str = "word",
68 | ngram_range: tuple[int, int] = (1, 1),
69 | normalize: bool = True,
70 |         stop_words: list[str] | None = None,
71 | k1: float = 1.5,
72 | b: float = 0.75,
73 | epsilon: float = 0,
74 | ) -> None:
75 | super().__init__(
76 | analyzer=analyzer,
77 | ngram_range=ngram_range,
78 | normalize=normalize,
79 | stop_words=stop_words,
80 | )
81 |
82 | self.k1 = k1
83 | self.b = b
84 | self.epsilon = epsilon
85 | self.average_len = None
86 |
87 | def update(self, matrix: csr_matrix) -> csr_matrix:
88 | """Update the idf values."""
89 | self.tf = (matrix > 0).sum(axis=0)
90 | len_documents = (matrix).sum(axis=1)
91 | self.average_len = len_documents.mean()
92 | self.count = matrix.shape[0]
93 |
94 | self.idf = np.squeeze(
95 | a=np.asarray(
96 | a=np.log((self.count - self.tf + 0.5) / (self.tf + 0.5) + 1),
97 | dtype=np.float32,
98 | )
99 | )
100 |
101 | def _transform(self, matrix: csr_matrix) -> csr_matrix:
102 | """Transform a count matrix to a bm25 matrix."""
103 | len_documents = (matrix).sum(axis=1)
104 | regularization = np.squeeze(
105 | a=np.asarray(
106 | a=(
107 | self.k1 * (1 - self.b + self.b * (len_documents / self.average_len))
108 | ).flatten()
109 | )
110 | )
111 |
112 | denominator = matrix.tocsc()
113 | denominator.data += np.take(a=regularization, indices=denominator.indices)
114 | matrix.data = (
115 | (matrix.data * (self.k1 + 1)) / denominator.tocsr().data
116 | ) + self.epsilon
117 |
118 | matrix = matrix.multiply(other=self.idf).tocsr()
119 | inplace_csr_row_normalize_l2(matrix)
120 | return matrix
121 |
--------------------------------------------------------------------------------
/python/lenlp/sparse/count_vectorizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.sparse import csr_matrix
3 |
4 | from lenlp._rslenlp import SparseMatrixBuilder
5 |
6 | __all__ = ["CountVectorizer"]
7 |
8 |
9 | class CountVectorizer:
10 | """CountVectorizer is a class that converts a collection of text documents to a sparse
11 | matrix.
12 |
13 | Parameters
14 | ----------
15 | analyzer
16 | {word, char, char_wb}, default=word.
17 | Whether the feature should be made of word n-gram or character n-grams. Option
18 | char_wb creates character n-grams only from text inside word boundaries;
19 | n-grams at the edges of words are padded with space.
20 | ngram_range
21 | tuple (min_n, max_n), default=(1, 1).
22 | The lower and upper boundary of the range of n-values for different n-grams to
23 | be extracted. All values of n such that min_n <= n <= max_n will be used.
24 | normalize
25 | bool, default=True.
26 | Whether to normalize the text before counting. It will lowercase the text and remove
27 | punctuation.
28 | stop_words
29 | list of str, default=None.
30 | A list of stop words that will be removed from the text.
31 |
32 | Examples
33 | --------
34 | >>> from lenlp import sparse
35 |
36 | >>> count_vectorizer = sparse.CountVectorizer(
37 | ... analyzer="word",
38 | ... normalize=True,
39 | ... stop_words=None,
40 | ... )
41 |
42 | >>> x = ["Hello, world!", "How are you?"]
43 |
44 | >>> count_vectorizer = count_vectorizer.fit(x)
45 |
46 | >>> matrix = count_vectorizer.transform(x)
47 | >>> matrix.shape
48 | (2, 5)
49 |
50 | >>> matrix.toarray()
51 | array([[1., 1., 0., 0., 0.],
52 | [0., 0., 1., 1., 1.]], dtype=float32)
53 |
54 | >>> len(count_vectorizer.vocabulary)
55 | 5
56 |
57 | >>> matrix = count_vectorizer.fit_transform(x)
58 | >>> matrix.shape
59 | (2, 5)
60 |
61 | """
62 |
63 | def __init__(
64 | self,
65 | analyzer: str = "word",
66 | ngram_range: tuple[int, int] = (1, 1),
67 | normalize: bool = True,
68 |         stop_words: list[str] | None = None,
69 | ) -> None:
70 | assert analyzer in ("word", "char", "char_wb")
71 |
72 | self.sparse_matrix = SparseMatrixBuilder(
73 | analyzer=analyzer,
74 | n_sizes=list(range(ngram_range[0], ngram_range[1] + 1)),
75 | normalize=normalize,
76 | stop_words=stop_words,
77 | )
78 |
79 | self.fitted = False
80 |
81 | @property
82 | def vocabulary(self) -> dict[str, int]:
83 | """Get the vocabulary of the CountVectorizer object."""
84 | return self.sparse_matrix.get_vocab()
85 |
86 |     def fit(self, raw_documents: list[str]) -> "CountVectorizer":
87 | """Learn the vocabulary dictionary and return the CountVectorizer object."""
88 | self.fitted = True
89 | self.sparse_matrix.fit(raw_documents)
90 | return self
91 |
92 | def transform(self, raw_documents: list[str]) -> csr_matrix:
93 | """Transform documents to document-term matrix."""
94 | if not self.fitted:
95 | raise ValueError("Call fit method before calling transform method.")
96 |
97 | values, row_indices, column_indices = self.sparse_matrix.transform(
98 | raw_documents
99 | )
100 |
101 | return csr_matrix(
102 | arg1=(values, (row_indices, column_indices)),
103 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
104 | dtype=np.float32,
105 | )
106 |
107 | def fit_transform(self, raw_documents: list[str]) -> csr_matrix:
108 |         """Learn the vocabulary dictionary and return the document-term matrix."""
109 | self.fitted = True
110 |
111 | values, row_indices, column_indices = self.sparse_matrix.fit_transform(
112 | raw_documents
113 | )
114 |
115 | return csr_matrix(
116 | arg1=(values, (row_indices, column_indices)),
117 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
118 | dtype=np.float32,
119 | )
120 |
--------------------------------------------------------------------------------
/python/lenlp/sparse/tfidf_vectorizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.sparse import csr_matrix
3 | from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
4 |
5 | from .count_vectorizer import CountVectorizer
6 |
7 |
8 | class TfidfVectorizer(CountVectorizer):
9 | """TfidfVectorizer is a class that converts a collection of text documents to a sparse
10 | tfidf matrix.
11 |
12 | Parameters
13 | ----------
14 | analyzer
15 | {word, char, char_wb}, default=word.
16 | Whether the feature should be made of word n-gram or character n-grams. Option
17 | char_wb creates character n-grams only from text inside word boundaries;
18 | n-grams at the edges of words are padded with space.
19 | ngram_range
20 | tuple (min_n, max_n), default=(1, 1).
21 | The lower and upper boundary of the range of n-values for different n-grams to
22 | be extracted. All values of n such that min_n <= n <= max_n will be used.
23 | normalize
24 | bool, default=True.
25 | Whether to normalize the text before counting. It will lowercase the text and remove
26 | punctuation.
27 | stop_words
28 | list of str, default=None.
29 | A list of stop words that will be removed from the text.
30 |
31 | Examples
32 | --------
33 | >>> from lenlp import sparse
34 |
35 | >>> tfidf_vectorizer = sparse.TfidfVectorizer(
36 | ... analyzer="word",
37 | ... normalize=True,
38 | ... stop_words=None,
39 | ... )
40 |
41 | >>> x = ["Hello, world!", "How are you?"]
42 |
43 | >>> tfidf_vectorizer = tfidf_vectorizer.fit(x)
44 | >>> matrix = tfidf_vectorizer.transform(x)
45 | >>> matrix.shape
46 | (2, 5)
47 |
48 | >>> len(tfidf_vectorizer.vocabulary)
49 | 5
50 |
51 | >>> matrix = tfidf_vectorizer.fit_transform(x)
52 | >>> matrix.shape
53 | (2, 5)
54 |
55 | """
56 |
57 | def __init__(
58 | self,
59 | analyzer: str = "word",
60 | ngram_range: tuple[int, int] = (1, 1),
61 | normalize: bool = True,
62 |         stop_words: list[str] | None = None,
63 | ) -> None:
64 | super().__init__(
65 | analyzer=analyzer,
66 | ngram_range=ngram_range,
67 | normalize=normalize,
68 | stop_words=stop_words,
69 | )
70 |
71 | self.idf = None
72 |
73 |     def fit(self, raw_documents: list[str]) -> "TfidfVectorizer":
74 | matrix = super().fit_transform(raw_documents=raw_documents)
75 | self.update(matrix=matrix)
76 | return self
77 |
78 | def update(self, matrix: csr_matrix) -> csr_matrix:
79 | """Update the idf values."""
80 | tf = (matrix > 0).sum(axis=0)
81 | self.idf = (
82 | np.squeeze(a=np.asarray(a=np.log((matrix.shape[0] + 1.0) / (tf + 1.0)))) + 1
83 | )
84 |
85 | def _transform(self, matrix: csr_matrix) -> csr_matrix:
86 |         """Transform a count matrix to a tf-idf matrix."""
87 | matrix.data *= np.take(
88 | a=self.idf,
89 | indices=matrix.indices,
90 | )
91 |
92 | inplace_csr_row_normalize_l2(matrix)
93 | return matrix
94 |
95 | def transform(self, raw_documents: list[str]) -> csr_matrix:
96 | """Transform documents to document-term matrix."""
97 | values, row_indices, column_indices = self.sparse_matrix.transform(
98 | raw_documents
99 | )
100 | return self._transform(
101 | matrix=csr_matrix(
102 | arg1=(values, (row_indices, column_indices)),
103 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
104 | dtype=np.float32,
105 | )
106 | )
107 |
108 | def fit_transform(self, raw_documents: list[str]) -> csr_matrix:
109 |         """Learn the vocabulary and return the tf-idf document-term matrix."""
110 | values, row_indices, column_indices = self.sparse_matrix.fit_transform(
111 | raw_documents
112 | )
113 |
114 | matrix = csr_matrix(
115 | arg1=(values, (row_indices, column_indices)),
116 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()),
117 | dtype=np.float32,
118 | )
119 |
120 | self.update(matrix=matrix)
121 |
122 | return self._transform(
123 | matrix=matrix,
124 | )
125 |
--------------------------------------------------------------------------------
/rust/lib.rs:
--------------------------------------------------------------------------------
1 | use pyo3::prelude::*;
2 | use pyo3::types::PyModule;
3 |
4 | mod rsanalyzer;
5 | mod rscounter;
6 | mod rsflashtext;
7 | mod rsnormalizer;
8 | mod rssparse;
9 | mod rsstop_words;
10 | mod rsvectorizer;
11 |
12 | #[pymodule]
13 | fn _rslenlp(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
14 | rsanalyzer::register_functions(m)?;
15 | rscounter::register_functions(m)?;
16 | rsflashtext::register_functions(m)?;
17 | rsnormalizer::register_functions(m)?;
18 | rssparse::register_functions(m)?;
19 | rsstop_words::register_functions(m)?;
20 | rsvectorizer::register_functions(m)?;
21 | Ok(())
22 | }
23 |
--------------------------------------------------------------------------------
/rust/rsanalyzer.rs:
--------------------------------------------------------------------------------
1 | use pyo3::prelude::*;
2 | use pyo3::types::PyModule;
3 | use pyo3::wrap_pyfunction;
4 | use rayon::prelude::*;
5 |
6 | /// Splits text into words.
7 | ///
8 | /// # Arguments
9 | ///
10 | /// * `text` - The input text.
11 | /// * `n_sizes` - The size of the n-grams.
12 | ///
13 | /// # Returns
14 | ///
15 | /// A vector of words.
16 | #[pyfunction]
17 | pub fn rssplit_words(text: &str, n_sizes: Vec<usize>) -> Vec<String> {
18 |     let mut ngrams: Vec<String> = Vec::new();
19 |
20 | for &n in &n_sizes {
21 | let words: Vec<&str> = text.split_whitespace().collect();
22 | for window in words.windows(n) {
23 | ngrams.push(window.join(" "));
24 | }
25 | }
26 |
27 | ngrams
28 | }
29 |
30 | /// Same as `rssplit_words` but for many texts at once.
31 | #[pyfunction]
32 | pub fn rssplit_words_many(texts: Vec<String>, n_sizes: Vec<usize>) -> Vec<Vec<String>> {
33 | texts
34 | .par_iter()
35 | .map(|text: &String| rssplit_words(text, n_sizes.clone()))
36 | .collect()
37 | }
38 |
39 | /// Computes character n-grams.
40 | ///
41 | /// # Arguments
42 | ///
43 | /// * `text` - The input text.
44 | /// * `n_sizes` - The size of the n-grams.
45 | ///
46 | /// # Returns
47 | ///
48 | /// A vector of character n-grams.
49 | #[pyfunction]
50 | pub fn rschar_ngrams(text: &str, n_sizes: Vec<usize>) -> Vec<String> {
51 |     let mut ngrams: Vec<String> = Vec::new();
52 |
53 |     for &n in &n_sizes {
54 |         let chars: Vec<char> = text.chars().collect();
55 |         for window in chars.windows(n) {
56 |             ngrams.push(window.iter().collect::<String>());
57 | }
58 | }
59 |
60 | ngrams
61 | }
62 |
63 | /// Same as `rschar_ngrams` but for many texts at once.
64 | #[pyfunction]
65 | pub fn rschar_ngrams_many(texts: Vec<String>, n_sizes: Vec<usize>) -> Vec<Vec<String>> {
66 | texts
67 | .par_iter()
68 | .map(|text: &String| rschar_ngrams(text, n_sizes.clone()))
69 | .collect()
70 | }
71 |
72 | /// Character n-grams with word-boundary handling.
73 | #[pyfunction]
74 | pub fn rschar_wb_ngrams(text: &str, n_sizes: Vec<usize>) -> Vec<String> {
75 |     let mut ngrams: Vec<String> = Vec::new();
76 |     let chars: Vec<char> = text.chars().collect();
77 |
78 | for &n in &n_sizes {
79 | if n > chars.len() {
80 | continue;
81 | }
82 | for window in chars.windows(n) {
83 |             ngrams.push(window.iter().collect::<String>());
84 | }
85 | }
86 |
87 | ngrams
88 | }
89 |
90 | /// Same as `rschar_wb_ngrams` but for many texts at once.
91 | #[pyfunction]
92 | pub fn rschar_wb_ngrams_many(texts: Vec<String>, n_sizes: Vec<usize>) -> Vec<Vec<String>> {
93 | texts
94 | .par_iter()
95 | .map(|text: &String| rschar_wb_ngrams(text, n_sizes.clone()))
96 | .collect()
97 | }
98 |
99 | /// Registers all the above functions in a Python sub-module.
100 | ///
101 | /// Called from your `#[pymodule]` entry-point.
102 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
103 | m.add_function(wrap_pyfunction!(rssplit_words, m)?)?;
104 | m.add_function(wrap_pyfunction!(rssplit_words_many, m)?)?;
105 | m.add_function(wrap_pyfunction!(rschar_ngrams, m)?)?;
106 | m.add_function(wrap_pyfunction!(rschar_ngrams_many, m)?)?;
107 | m.add_function(wrap_pyfunction!(rschar_wb_ngrams, m)?)?;
108 | m.add_function(wrap_pyfunction!(rschar_wb_ngrams_many, m)?)?;
109 | Ok(())
110 | }
111 |
--------------------------------------------------------------------------------
/rust/rscounter.rs:
--------------------------------------------------------------------------------
1 | use pyo3::prelude::*;
2 | use pyo3::wrap_pyfunction;
3 | use rayon::prelude::*;
4 | use std::collections::HashMap;
5 |
6 | /// Counts the number of times each word appears in the input text.
7 | ///
8 | /// # Arguments
9 | ///
10 | /// * `text` - The input text as a vector of words.
11 | ///
12 | /// # Returns
13 | ///
14 | /// A hashmap with the words as keys and the number of times they appear as values.
15 | #[pyfunction]
16 | pub fn rscount(text: Vec<String>) -> HashMap<String, usize> {
17 | let mut word_counter = HashMap::new();
18 | for word in text {
19 | *word_counter.entry(word).or_insert(0) += 1;
20 | }
21 | word_counter
22 | }
23 |
24 | /// Counts the number of times each word appears for each input text.
25 | ///
26 | /// # Arguments
27 | ///
28 | /// * `texts` - The input texts as a vector of vectors of words.
29 | ///
30 | /// # Returns
31 | ///
32 | /// A vector of hashmaps with the words as keys and the number of times they appear as values.
33 | #[pyfunction]
34 | pub fn rscount_many(texts: Vec<Vec<String>>) -> Vec<HashMap<String, usize>> {
35 | texts.par_iter().map(|text| rscount(text.clone())).collect()
36 | }
37 |
38 | /// Registers all the above functions in a Python sub-module.
39 | ///
40 | /// Called from your `#[pymodule]` entry-point.
41 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
42 | m.add_function(wrap_pyfunction!(rscount, m)?)?;
43 | m.add_function(wrap_pyfunction!(rscount_many, m)?)?;
44 | Ok(())
45 | }
46 |
47 | #[cfg(test)]
48 | mod tests {
49 | use super::*;
50 |
51 | #[test]
52 | fn test_rscount() {
53 | let text = vec![
54 | "hello".to_string(),
55 | "world".to_string(),
56 | "hello".to_string(),
57 | "hello".to_string(),
58 | ];
59 | let result = rscount(text);
60 | let mut expected = HashMap::new();
61 | expected.insert("hello".to_string(), 3);
62 | expected.insert("world".to_string(), 1);
63 | assert_eq!(result, expected);
64 | }
65 |
66 | #[test]
67 | fn test_rscount_many() {
68 | let texts = vec![
69 | vec!["hello".to_string(), "world".to_string()],
70 | vec![
71 | "hello".to_string(),
72 | "world".to_string(),
73 | "hello".to_string(),
74 | ],
75 | ];
76 | let result = rscount_many(texts);
77 | let mut expected = Vec::new();
78 | let mut map1 = HashMap::new();
79 | map1.insert("hello".to_string(), 1);
80 | map1.insert("world".to_string(), 1);
81 | let mut map2 = HashMap::new();
82 | map2.insert("hello".to_string(), 2);
83 | map2.insert("world".to_string(), 1);
84 | expected.push(map1);
85 | expected.push(map2);
86 | assert_eq!(result, expected);
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/rust/rsflashtext.rs:
--------------------------------------------------------------------------------
1 | use pyo3::prelude::*;
2 |
3 | use std::collections::HashMap;
4 | use std::collections::HashSet;
5 |
6 | use rayon::prelude::*;
7 | use unidecode::unidecode;
8 |
9 | #[pyclass()]
10 | pub struct RSKeywordProcessor {
11 | keyword: String,
12 |     non_word_boundaries: HashSet<char>,
13 |     keyword_trie_dict: HashMap<char, RSTrieNode>,
14 | lowercase: bool,
15 | normalize: bool,
16 | terms_in_trie: usize,
17 | }
18 |
19 | #[pyclass()]
20 | pub struct RSTrieNode {
21 |     children: HashMap<char, RSTrieNode>,
22 |     is_end: bool,
23 |     clean_name: Option<String>,
24 | }
25 |
26 | impl RSTrieNode {
27 | pub fn new() -> Self {
28 | RSTrieNode {
29 | children: HashMap::new(),
30 | is_end: false,
31 | clean_name: None,
32 | }
33 | }
34 | }
35 |
36 | #[pymethods]
37 | impl RSKeywordProcessor {
38 | #[new]
39 | pub fn new(lowercase: bool, normalize: bool) -> Self {
40 | let keyword: String = "_keyword_".to_string();
41 |         let non_word_boundaries: HashSet<char> = {
42 |             let mut set: HashSet<char> = HashSet::new();
43 | set.extend('0'..='9');
44 | set.extend('a'..='z');
45 | set.extend('A'..='Z');
46 | set.insert('_');
47 | set
48 | };
49 |
50 | RSKeywordProcessor {
51 | keyword,
52 | non_word_boundaries,
53 | keyword_trie_dict: HashMap::new(),
54 | lowercase,
55 | normalize,
56 | terms_in_trie: 0,
57 | }
58 | }
59 |
60 | pub fn add_keywords_many(
61 | &mut self,
62 |         keywords: Vec<String>,
63 |         clean_name: Option<&str>,
64 |     ) -> Vec<bool> {
65 | keywords
66 | .iter()
67 | .map(|keyword: &String| self.add_keyword(&keyword, clean_name))
68 | .collect()
69 | }
70 |
71 | pub fn add_keyword(&mut self, keyword: &str, clean_name: Option<&str>) -> bool {
72 | let clean_name: &str = clean_name.unwrap_or(keyword);
73 | let keyword: String = if self.normalize {
74 | unidecode(keyword)
75 | .to_lowercase()
76 | .chars()
77 | .filter(|c| !c.is_ascii_punctuation())
78 |                 .collect::<String>()
79 | .trim()
80 | .to_string()
81 | } else if self.lowercase {
82 | keyword.to_lowercase()
83 | } else {
84 | keyword.to_string()
85 | };
86 |
87 |         let mut current_node: &mut HashMap<char, RSTrieNode> = &mut self.keyword_trie_dict;
88 | for char in keyword.chars() {
89 | current_node = &mut current_node
90 | .entry(char)
91 | .or_insert_with(RSTrieNode::new)
92 | .children;
93 | }
94 |
95 | if !current_node.contains_key(&self.keyword.chars().next().unwrap()) {
96 | self.terms_in_trie += 1;
97 | current_node.insert(
98 | self.keyword.chars().next().unwrap(),
99 | RSTrieNode {
100 | children: HashMap::new(),
101 | is_end: true,
102 | clean_name: Some(clean_name.to_string()),
103 | },
104 | );
105 | true
106 | } else {
107 | false
108 | }
109 | }
110 |
111 | pub fn extract_keywords_many(
112 | &self,
113 |         sentences: Vec<String>,
114 |     ) -> Vec<Vec<(String, usize, usize)>> {
115 | sentences
116 | .par_iter()
117 | .map(|sentence: &String| self.extract_keywords(&sentence))
118 | .collect()
119 | }
120 |
121 | pub fn extract_keywords(&self, sentence: &str) -> Vec<(String, usize, usize)> {
122 | // Map from the index in the normalized sentence to the index in the original sentence
123 |         let mut index_map: Vec<usize> = Vec::with_capacity(sentence.len());
124 | let mut original_idx = 0;
125 |
126 | let normalized_sentence: String = if self.normalize {
127 | let mut normalized = String::new();
128 | for c in sentence.chars() {
129 | if c.is_ascii_punctuation() {
130 | original_idx += c.len_utf8();
131 | continue;
132 | }
133 | let normalized_char = unidecode::unidecode_char(c).to_lowercase();
134 | for nc in normalized_char.chars() {
135 | normalized.push(nc);
136 | index_map.push(original_idx);
137 | }
138 | original_idx += c.len_utf8();
139 | }
140 | normalized.to_string()
141 | } else if self.lowercase {
142 | sentence.to_lowercase()
143 | } else {
144 | sentence.to_string()
145 | };
146 |
147 | let mut extracted_keywords: Vec<(String, usize, usize)> = Vec::new();
148 |         let mut current_node: &HashMap<char, RSTrieNode> = &self.keyword_trie_dict;
149 | let mut start_pos: usize = 0;
150 | let mut end_pos: usize = 0;
151 |
152 | let mut idx: usize = 0;
153 | let sentence_len: usize = normalized_sentence.len();
154 | while idx < sentence_len {
155 | let char: char = normalized_sentence.chars().nth(idx).unwrap();
156 | if !self.non_word_boundaries.contains(&char) {
157 | if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) {
158 | if node.is_end {
159 | let clean_name: &String = node.clean_name.as_ref().unwrap();
160 | let original_start_pos = index_map[start_pos];
161 | let original_end_pos = index_map[end_pos - 1] + 1;
162 | extracted_keywords.push((
163 | clean_name.clone(),
164 | original_start_pos,
165 | original_end_pos,
166 | ));
167 | }
168 | }
169 | current_node = &self.keyword_trie_dict;
170 | start_pos = idx + 1;
171 | } else if let Some(node) = current_node.get(&char) {
172 | current_node = &node.children;
173 | end_pos = idx + 1;
174 | } else {
175 | current_node = &self.keyword_trie_dict;
176 | start_pos = idx + 1;
177 | }
178 | idx += 1;
179 | }
180 |
181 | // Check if the last segment is a keyword
182 | if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) {
183 | if node.is_end {
184 | let clean_name: &String = node.clean_name.as_ref().unwrap();
185 | let original_start_pos = index_map[start_pos];
186 | let original_end_pos = index_map[end_pos - 1] + 1;
187 | extracted_keywords.push((clean_name.clone(), original_start_pos, original_end_pos));
188 | }
189 | }
190 |
191 | extracted_keywords
192 | }
193 | }
194 |
195 | /// Registers all the above functions in a Python sub-module.
196 | ///
197 | /// Called from your `#[pymodule]` entry-point.
198 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
199 |     m.add_class::<RSKeywordProcessor>()?;
200 |     m.add_class::<RSTrieNode>()?;
201 | Ok(())
202 | }
203 |
--------------------------------------------------------------------------------
/rust/rsnormalizer.rs:
--------------------------------------------------------------------------------
1 | use pyo3::prelude::*;
2 | use pyo3::wrap_pyfunction;
3 | use rayon::prelude::*;
4 | use unidecode::unidecode;
5 |
6 | /// Normalize text by converting to lowercase, removing punctuation, and trimming whitespace.
7 | ///
8 | /// # Arguments
9 | ///
10 | /// * `text` - A string slice that holds the text to normalize.
11 | ///
12 | /// # Returns
13 | ///
14 | /// A String that holds the normalized text.
15 | #[pyfunction]
16 | pub fn rsnormalize(text: &str) -> String {
17 | unidecode(text)
18 | .to_lowercase()
19 | .chars()
20 | .filter(|c| !c.is_ascii_punctuation())
21 |         .collect::<String>()
22 | .trim()
23 | .to_string()
24 | }
25 |
26 | /// Normalize multiple texts.
27 | ///
28 | /// # Arguments
29 | ///
30 | /// * `texts` - A vector of strings that holds the texts to normalize.
31 | ///
32 | /// # Returns
33 | ///
34 | /// A vector of strings that holds the normalized texts.
35 | #[pyfunction]
36 | pub fn rsnormalize_many(texts: Vec<String>) -> Vec<String> {
37 | texts.par_iter().map(|text| rsnormalize(text)).collect()
38 | }
39 |
40 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
41 | m.add_function(wrap_pyfunction!(rsnormalize, m)?)?;
42 | m.add_function(wrap_pyfunction!(rsnormalize_many, m)?)?;
43 | Ok(())
44 | }
45 |
46 | #[cfg(test)]
47 | mod tests {
48 | use super::*;
49 |
50 | #[test]
51 | fn test_rsnormalize() {
52 | assert_eq!(rsnormalize("Hello World! 😀"), "hello world");
53 | assert_eq!(rsnormalize("1,2,3,4"), "1234");
54 | }
55 |
56 | #[test]
57 | fn test_rsnormalize_many() {
58 | let input = vec!["Hello World! 😀".to_string(), "Goodbye, World!".to_string()];
59 | let expected = vec!["hello world".to_string(), "goodbye world".to_string()];
60 | assert_eq!(rsnormalize_many(input), expected);
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/rust/rssparse.rs:
--------------------------------------------------------------------------------
1 | use crate::rsvectorizer::rsvectorize_many;
2 | use bincode::{deserialize, serialize};
3 | use numpy::PyArray1;
4 | use pyo3::prelude::*;
5 | use pyo3::types::{PyBytes, PyModule};
6 | use serde::{Deserialize, Serialize};
7 | use std::collections::HashMap;
8 |
9 | // ---------------------------------------------------------------------------
10 | // Sparse-matrix builder
11 | // ---------------------------------------------------------------------------
12 |
13 | #[derive(Clone, Debug, Serialize, Deserialize)]
14 | #[pyclass(module = "lenlp.sparse.count_vectorizer")]
15 | pub struct SparseMatrixBuilder {
16 | analyzer: String,
17 | n_sizes: Vec<usize>,
18 | stop_words: Option<Vec<String>>,
19 | normalize: Option<bool>,
20 | vocab: HashMap<String, usize>,
21 | num_cols: usize,
22 | }
23 |
24 | #[pymethods]
25 | impl SparseMatrixBuilder {
26 | #[new]
27 | pub fn new(
28 | n_sizes: Vec<usize>,
29 | analyzer: String,
30 | stop_words: Option<Vec<String>>,
31 | normalize: Option<bool>,
32 | ) -> Self {
33 | Self {
34 | vocab: HashMap::new(),
35 | n_sizes,
36 | analyzer,
37 | stop_words,
38 | normalize,
39 | num_cols: 0,
40 | }
41 | }
42 |
43 | /// Fit the vocabulary and return the (values, rows, columns) arrays used to build a SciPy CSR matrix.
44 | pub fn fit_transform(
45 | &mut self,
46 | texts: Vec<String>,
47 | py: Python<'_>,
48 | ) -> (
49 | Py<PyArray1<f32>>,
50 | Py<PyArray1<usize>>,
51 | Py<PyArray1<usize>>,
52 | ) {
53 | self.vocab = HashMap::new();
54 | let texts: Vec<HashMap<String, f32>> = rsvectorize_many(
55 | texts,
56 | self.n_sizes.clone(),
57 | self.analyzer.clone(),
58 | self.stop_words.clone(),
59 | self.normalize,
60 | );
61 |
62 | self._fit(texts.clone());
63 |
64 | // SciPy csr_matrix objects are faster to build from NumPy arrays.
65 | let (vec1, vec2, vec3) = self._transform(texts);
66 | (
67 | PyArray1::from_vec_bound(py, vec1).into(),
68 | PyArray1::from_vec_bound(py, vec2).into(),
69 | PyArray1::from_vec_bound(py, vec3).into(),
70 | )
71 | }
72 |
73 | pub fn fit(&mut self, texts: Vec<String>) {
74 | self.vocab = HashMap::new();
75 | let texts: Vec<HashMap<String, f32>> = rsvectorize_many(
76 | texts,
77 | self.n_sizes.clone(),
78 | self.analyzer.clone(),
79 | self.stop_words.clone(),
80 | self.normalize,
81 | );
82 |
83 | self._fit(texts);
84 | }
85 |
86 | fn _fit(&mut self, texts: Vec<HashMap<String, f32>>) {
87 | let mut col_index: usize = 0;
88 | for doc in &texts {
89 | for token in doc.keys() {
90 | if !self.vocab.contains_key(token) {
91 | self.vocab.insert(token.clone(), col_index);
92 | col_index += 1;
93 | }
94 | }
95 | }
96 | self.num_cols = col_index;
97 | }
98 |
99 | pub fn transform(
100 | &self,
101 | texts: Vec<String>,
102 | py: Python<'_>,
103 | ) -> (
104 | Py<PyArray1<f32>>,
105 | Py<PyArray1<usize>>,
106 | Py<PyArray1<usize>>,
107 | ) {
108 | let texts: Vec<HashMap<String, f32>> = rsvectorize_many(
109 | texts,
110 | self.n_sizes.clone(),
111 | self.analyzer.clone(),
112 | self.stop_words.clone(),
113 | self.normalize,
114 | );
115 |
116 | // SciPy csr_matrix objects are faster to build from NumPy arrays.
117 | let (vec1, vec2, vec3) = self._transform(texts);
118 | (
119 | PyArray1::from_vec_bound(py, vec1).into(),
120 | PyArray1::from_vec_bound(py, vec2).into(),
121 | PyArray1::from_vec_bound(py, vec3).into(),
122 | )
123 | }
124 |
125 | fn _transform(
126 | &self,
127 | texts: Vec<HashMap<String, f32>>,
128 | ) -> (Vec<f32>, Vec<usize>, Vec<usize>) {
129 | let mut values: Vec<f32> = Vec::new();
130 | let mut row_indices: Vec<usize> = Vec::new();
131 | let mut column_indices: Vec<usize> = Vec::new();
132 |
133 | for (row_idx, doc) in texts.iter().enumerate() {
134 | for (token, &count) in doc.iter() {
135 | if let Some(&col_idx) = self.vocab.get(token) {
136 | values.push(count);
137 | row_indices.push(row_idx);
138 | column_indices.push(col_idx);
139 | }
140 | }
141 | }
142 |
143 | (values, row_indices, column_indices)
144 | }
145 |
146 | // ---------------------------------------------------------------------
147 | // Accessors
148 | // ---------------------------------------------------------------------
149 | pub fn get_vocab(&self) -> HashMap<String, usize> {
150 | self.vocab.clone()
151 | }
152 |
153 | pub fn get_num_cols(&self) -> usize {
154 | self.num_cols
155 | }
156 |
157 | // ---------------------------------------------------------------------
158 | // Pickle support
159 | // ---------------------------------------------------------------------
160 |
161 | pub fn __setstate__(&mut self, state: &Bound<'_, PyBytes>) -> PyResult<()> {
162 | *self = deserialize(state.as_bytes()).unwrap();
163 | Ok(())
164 | }
165 |
166 | pub fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyBytes>> {
167 | Ok(PyBytes::new(py, &serialize(&self).unwrap()))
168 | }
169 |
170 | pub fn __getnewargs__(
171 | &self,
172 | ) -> PyResult<(Vec<usize>, String, Option<Vec<String>>, Option<bool>)> {
173 | Ok((
174 | self.n_sizes.clone(),
175 | self.analyzer.clone(),
176 | self.stop_words.clone(),
177 | self.normalize,
178 | ))
179 | }
180 | }
181 |
182 | // ---------------------------------------------------------------------------
183 | // Module registration
184 | // ---------------------------------------------------------------------------
185 |
186 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
187 | m.add_class::<SparseMatrixBuilder>()?;
188 | Ok(())
189 | }
190 |
--------------------------------------------------------------------------------
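Usage note: fit_transform and transform above return three NumPy arrays (values, row indices, column indices) rather than a matrix object; the Python wrappers in lenlp.sparse assemble them into a scipy.sparse.csr_matrix. The sketch below shows the general idea, including a pickle round-trip enabled by __getstate__/__setstate__/__getnewargs__. The import path of the compiled SparseMatrixBuilder class is a placeholder assumption here, not the real lenlp entry point.

import pickle

from scipy.sparse import csr_matrix

from some_compiled_extension import SparseMatrixBuilder  # hypothetical import path

texts = ["the cat sat", "the dog sat"]

# Arguments mirror SparseMatrixBuilder::new: n_sizes, analyzer, stop_words, normalize.
builder = SparseMatrixBuilder([1], "word", None, True)

values, rows, cols = builder.fit_transform(texts)
matrix = csr_matrix((values, (rows, cols)), shape=(len(texts), builder.get_num_cols()))

# The pickle hooks serialize the fitted state with bincode under the hood.
restored = pickle.loads(pickle.dumps(builder))
assert restored.get_vocab() == builder.get_vocab()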
/rust/rsstop_words.rs:
--------------------------------------------------------------------------------
1 | use pyo3::prelude::*;
2 | use pyo3::wrap_pyfunction;
3 | use rayon::prelude::*;
4 | use std::collections::HashSet;
5 |
6 | /// Function to filter stop words from a string
7 | ///
8 | /// # Arguments
9 | ///
10 | /// * `text` - The input text.
11 | /// * `stop_words` - The stop words to filter.
12 | ///
13 | /// # Returns
14 | ///
15 | /// A string with the stop words removed.
16 | #[pyfunction]
17 | pub fn rsfilter_stop_words(text: &str, stop_words: Vec<String>) -> String {
18 | // Use HashSet for better performance in membership checks
19 | let stop_words_set: HashSet<_> = stop_words.into_iter().collect();
20 | text.split_whitespace()
21 | .filter(|word: &&str| !stop_words_set.contains(*word))
22 | .collect::<Vec<&str>>()
23 | .join(" ")
24 | }
25 |
26 | /// Function to filter stop words from multiple strings
27 | ///
28 | /// # Arguments
29 | ///
30 | /// * `texts` - The input texts.
31 | /// * `stop_words` - The stop words to filter.
32 | ///
33 | /// # Returns
34 | ///
35 | /// A vector of strings with the stop words removed.
36 | #[pyfunction]
37 | pub fn rsfilter_stop_words_many(texts: Vec<String>, stop_words: Vec<String>) -> Vec<String> {
38 | // Use HashSet for better performance in membership checks
39 | let stop_words_set: HashSet<_> = stop_words.into_iter().collect();
40 | texts
41 | .into_par_iter()
42 | .map(|sentence: String| {
43 | sentence
44 | .split_whitespace()
45 | .filter(|word: &&str| !stop_words_set.contains(*word))
46 | .collect::<Vec<&str>>()
47 | .join(" ")
48 | })
49 | .collect()
50 | }
51 |
52 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
53 | m.add_function(wrap_pyfunction!(rsfilter_stop_words, m)?)?;
54 | m.add_function(wrap_pyfunction!(rsfilter_stop_words_many, m)?)?;
55 | Ok(())
56 | }
57 |
--------------------------------------------------------------------------------
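For reference, the stop-word filtering above amounts to splitting on whitespace, dropping words found in a set, and re-joining with single spaces. A minimal pure-Python sketch (not the lenlp API):

from __future__ import annotations


def filter_stop_words(text: str, stop_words: list[str]) -> str:
    # A set gives O(1) membership checks, mirroring the Rust HashSet.
    stop = set(stop_words)
    return " ".join(word for word in text.split() if word not in stop)


assert filter_stop_words("the cat sat on the mat", ["the", "on"]) == "cat sat mat"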
/rust/rsvectorizer.rs:
--------------------------------------------------------------------------------
1 | use pyo3::prelude::*;
2 | use pyo3::wrap_pyfunction;
3 | use rayon::prelude::*;
4 | use std::collections::HashMap;
5 |
6 | use crate::rsanalyzer::rschar_ngrams;
7 | use crate::rsanalyzer::rschar_wb_ngrams;
8 | use crate::rsanalyzer::rssplit_words;
9 | use crate::rscounter::rscount;
10 | use crate::rsnormalizer::rsnormalize_many;
11 | use crate::rsstop_words::rsfilter_stop_words_many;
12 |
13 | pub fn process_texts(
14 | texts: Vec<String>,
15 | normalize: Option<bool>,
16 | stop_words: Option<Vec<String>>,
17 | ) -> Vec<String> {
18 | let texts: Vec<String> = match normalize {
19 | Some(true) => rsnormalize_many(texts),
20 | _ => texts,
21 | };
22 |
23 | match stop_words {
24 | Some(stop_words) => rsfilter_stop_words_many(texts, stop_words),
25 | None => texts,
26 | }
27 | }
28 |
29 | #[pyfunction]
30 | pub fn rsvectorize_split_words_many(
31 | texts: Vec<String>,
32 | n_sizes: Vec<usize>,
33 | stop_words: Option<Vec<String>>,
34 | normalize: Option<bool>,
35 | ) -> Vec<HashMap<String, f32>> {
36 | let texts: Vec<String> = process_texts(texts, normalize, stop_words);
37 | texts
38 | .par_iter()
39 | .map(|text: &String| rscount(rssplit_words(text, n_sizes.clone())))
40 | .collect()
41 | }
42 |
43 | #[pyfunction]
44 | pub fn rsvectorize_char_ngrams_many(
45 | texts: Vec<String>,
46 | n_sizes: Vec<usize>,
47 | stop_words: Option<Vec<String>>,
48 | normalize: Option<bool>,
49 | ) -> Vec<HashMap<String, f32>> {
50 | let texts: Vec<String> = process_texts(texts, normalize, stop_words);
51 | texts
52 | .par_iter()
53 | .map(|text: &String| rscount(rschar_ngrams(text, n_sizes.clone())))
54 | .collect()
55 | }
56 |
57 | #[pyfunction]
58 | pub fn rsvectorize_char_wb_ngrams_many(
59 | texts: Vec<String>,
60 | n_sizes: Vec<usize>,
61 | stop_words: Option<Vec<String>>,
62 | normalize: Option<bool>,
63 | ) -> Vec<HashMap<String, f32>> {
64 | let texts: Vec<String> = process_texts(texts, normalize, stop_words);
65 | texts
66 | .par_iter()
67 | .map(|text: &String| rscount(rschar_wb_ngrams(text, n_sizes.clone())))
68 | .collect()
69 | }
70 |
71 | // Main vectorization function
72 | #[pyfunction]
73 | pub fn rsvectorize_many(
74 | texts: Vec<String>,
75 | n_sizes: Vec<usize>,
76 | analyzer: String,
77 | stop_words: Option<Vec<String>>,
78 | normalize: Option<bool>,
79 | ) -> Vec<HashMap<String, f32>> {
80 | match analyzer.as_str() {
81 | "word" => rsvectorize_split_words_many(texts, n_sizes, stop_words, normalize),
82 | "char" => rsvectorize_char_ngrams_many(texts, n_sizes, stop_words, normalize),
83 | "char_wb" => rsvectorize_char_wb_ngrams_many(texts, n_sizes, stop_words, normalize),
84 | _ => panic!("Invalid analyzer type"),
85 | }
86 | }
87 |
88 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
89 | m.add_function(wrap_pyfunction!(rsvectorize_split_words_many, m)?)?;
90 | m.add_function(wrap_pyfunction!(rsvectorize_char_ngrams_many, m)?)?;
91 | m.add_function(wrap_pyfunction!(rsvectorize_char_wb_ngrams_many, m)?)?;
92 | m.add_function(wrap_pyfunction!(rsvectorize_many, m)?)?;
93 |
94 | Ok(())
95 | }
96 |
--------------------------------------------------------------------------------
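Usage note: rsvectorize_many returns one token-to-count mapping per input text, and the analyzer string selects between word n-grams, character n-grams, and word-boundary character n-grams (the exact n-gram definitions live in crate::rsanalyzer). The sketch below illustrates the dispatch pattern and the shape of the output in plain Python; the "word" analyzer is approximated here as whitespace tokens, which is an assumption for illustration only.

from __future__ import annotations

from collections import Counter


def vectorize_word(texts: list[str]) -> list[dict[str, int]]:
    # Approximation of the "word" analyzer: count whitespace tokens per document.
    return [dict(Counter(text.split())) for text in texts]


ANALYZERS = {
    "word": vectorize_word,
    # "char" and "char_wb" would plug in character n-gram analyzers here.
}


def vectorize_many(texts: list[str], analyzer: str) -> list[dict[str, int]]:
    if analyzer not in ANALYZERS:
        raise ValueError(f"Invalid analyzer type: {analyzer}")
    return ANALYZERS[analyzer](texts)


print(vectorize_many(["the cat sat", "the dog"], "word"))
# [{'the': 1, 'cat': 1, 'sat': 1}, {'the': 1, 'dog': 1}]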