├── .github └── workflows │ └── publish.yml ├── .gitignore ├── Cargo.toml ├── README.md ├── docs ├── bm25.png ├── count_vectorizer.png ├── count_vectorizer_char.png ├── flashtext.png ├── logo.png └── tfidf.png ├── pyproject.toml ├── python └── lenlp │ ├── __init__.py │ ├── analyzer │ ├── __init__.py │ └── analyze.py │ ├── counter │ ├── __init__.py │ └── count.py │ ├── flash │ ├── __init__.py │ └── flash_text.py │ ├── normalizer │ ├── __init__.py │ └── normalize.py │ └── sparse │ ├── __init__.py │ ├── bm25_vectorizer.py │ ├── count_vectorizer.py │ └── tfidf_vectorizer.py └── rust ├── lib.rs ├── rsanalyzer.rs ├── rscounter.rs ├── rsflashtext.rs ├── rsnormalizer.rs ├── rssparse.rs ├── rsstop_words.rs └── rsvectorizer.rs /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by maturin v1.5.1 2 | # To update, run 3 | # 4 | # maturin generate-ci github 5 | # 6 | name: CI 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | workflow_dispatch: 13 | 14 | permissions: 15 | contents: read 16 | 17 | jobs: 18 | linux: 19 | runs-on: ${{ matrix.runner }} 20 | strategy: 21 | matrix: 22 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] 23 | target: [x86_64, x86, aarch64, armv7, s390x, ppc64le] 24 | runner: [ubuntu-latest] 25 | exclude: 26 | - target: x86 27 | python-version: ['3.8', '3.9', '3.11', '3.12'] 28 | - target: aarch64 29 | python-version: ['3.8', '3.9', '3.11', '3.12'] 30 | - target: armv7 31 | python-version: ['3.8', '3.9', '3.11', '3.12'] 32 | - target: s390x 33 | python-version: ['3.8', '3.9', '3.11', '3.12'] 34 | - target: ppc64le 35 | python-version: ['3.8', '3.9', '3.11', '3.12'] 36 | steps: 37 | - uses: actions/checkout@v4 38 | - uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | - name: Build wheels 42 | uses: PyO3/maturin-action@v1 43 | with: 44 | target: ${{ matrix.target }} 45 | args: --release --out dist --find-interpreter 46 | sccache: 'true' 47 | manylinux: auto 48 | - name: Upload wheels 49 | uses: actions/upload-artifact@v4 50 | with: 51 | name: wheels-linux-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }} 52 | path: dist 53 | 54 | windows: 55 | runs-on: ${{ matrix.runner }} 56 | strategy: 57 | matrix: 58 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] 59 | target: [x64, x86] 60 | runner: [windows-latest] 61 | exclude: 62 | - target: x86 63 | python-version: ['3.8', '3.9', '3.11', '3.12'] 64 | steps: 65 | - uses: actions/checkout@v4 66 | - uses: actions/setup-python@v5 67 | with: 68 | python-version: ${{ matrix.python-version }} 69 | architecture: ${{ matrix.target }} 70 | - name: Build wheels 71 | uses: PyO3/maturin-action@v1 72 | with: 73 | target: ${{ matrix.target }} 74 | args: --release --out dist --find-interpreter 75 | sccache: 'true' 76 | - name: Upload wheels 77 | uses: actions/upload-artifact@v4 78 | with: 79 | name: wheels-windows-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }} 80 | path: dist 81 | 82 | macos: 83 | runs-on: ${{ matrix.runner }} 84 | strategy: 85 | matrix: 86 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] 87 | target: [x86_64, aarch64] 88 | runner: [macos-latest] 89 | exclude: 90 | - target: aarch64 91 | python-version: ['3.8', '3.9', '3.11', '3.12'] 92 | steps: 93 | - uses: actions/checkout@v4 94 | - uses: actions/setup-python@v5 95 | with: 96 | python-version: ${{ matrix.python-version }} 97 | - name: Build wheels 98 | uses: 
PyO3/maturin-action@v1 99 | with: 100 | target: ${{ matrix.target }} 101 | args: --release --out dist --find-interpreter 102 | sccache: 'true' 103 | - name: Upload wheels 104 | uses: actions/upload-artifact@v4 105 | with: 106 | name: wheels-macos-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }} 107 | path: dist 108 | 109 | sdist: 110 | runs-on: ubuntu-latest 111 | steps: 112 | - uses: actions/checkout@v4 113 | - name: Build sdist 114 | uses: PyO3/maturin-action@v1 115 | with: 116 | command: sdist 117 | args: --out dist 118 | - name: Upload sdist 119 | uses: actions/upload-artifact@v4 120 | with: 121 | name: wheels-sdist-${{ github.run_id }} 122 | path: dist 123 | 124 | release: 125 | name: Release 126 | runs-on: ubuntu-latest 127 | needs: [linux, windows, macos, sdist] 128 | steps: 129 | - uses: actions/download-artifact@v4 130 | - name: Publish to PyPI 131 | uses: PyO3/maturin-action@v1 132 | env: 133 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_PASSWORD }} 134 | with: 135 | command: upload 136 | args: --non-interactive --skip-existing wheels-*/* 137 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | .DS_Store 4 | 5 | *.json 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | *.test 13 | *.onx 14 | *.qonx 15 | *.DS_Store 16 | *.pyc 17 | *.ipynb_checkpoints 18 | *.pickle 19 | *.pkl 20 | *.icloud 21 | cache/ 22 | # C extensions 23 | *.so 24 | test/ 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | pip-wheel-metadata/ 41 | share/python-wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | MANIFEST 46 | 47 | # PyInstaller 48 | # Usually these files are written by a python script from a template 49 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 50 | *.manifest 51 | *.spec 52 | 53 | # Installer logs 54 | pip-log.txt 55 | pip-delete-this-directory.txt 56 | 57 | # Unit test / coverage reports 58 | htmlcov/ 59 | .tox/ 60 | .nox/ 61 | .coverage 62 | .coverage.* 63 | .cache 64 | nosetests.xml 65 | coverage.xml 66 | *.cover 67 | *.py,cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # IPython 98 | profile_default/ 99 | ipython_config.py 100 | 101 | # pyenv 102 | .python-version 103 | 104 | # pipenv 105 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 106 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 107 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 108 | # install all needed dependencies. 109 | #Pipfile.lock 110 | 111 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 112 | __pypackages__/ 113 | 114 | # Celery stuff 115 | celerybeat-schedule 116 | celerybeat.pid 117 | 118 | # SageMath parsed files 119 | *.sage.py 120 | 121 | # Environments 122 | .env 123 | .venv 124 | env/ 125 | venv/ 126 | ENV/ 127 | env.bak/ 128 | venv.bak/ 129 | 130 | # Spyder project settings 131 | .spyderproject 132 | .spyproject 133 | 134 | # Rope project settings 135 | .ropeproject 136 | 137 | # mkdocs documentation 138 | /site 139 | 140 | # mypy 141 | .mypy_cache/ 142 | .dmypy.json 143 | dmypy.json 144 | 145 | # Pyre type checker 146 | .pyre/ 147 | test.ipynb -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "_rslenlp" 3 | edition = "2021" 4 | 5 | [lib] 6 | name = "_rslenlp" 7 | crate-type = ["cdylib"] 8 | path = "rust/lib.rs" 9 | 10 | [dependencies] 11 | unidecode = "0.3.0" 12 | rayon = "1.10.0" 13 | pyo3 = { version = "0.24.2", features = [ 14 | "extension-module", 15 | "generate-import-lib", 16 | ] } 17 | serde = { version = "1.0.202", features = ["derive"] } 18 | serde_json = { version = "1.0.117" } 19 | bincode = "1.3.3" 20 | ndarray = "0.15" 21 | numpy = "0.24" 22 | 23 | [profile.dev] 24 | opt-level = 0 25 | 26 | [profile.release] 27 | opt-level = 3 28 | 29 | [tool.maturin] 30 | features = ["pyo3/extension-module"] 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | # LeNLP
3 | Natural Language Processing toolbox for Python with Rust
4 | 
5 | 
6 | [logo: docs/logo.png]
7 | 
8 | 
9 | 
10 | [license badge]
11 | 
12 | 13 | 14 | LeNLP is a toolkit dedicated to natural language processing (NLP). It provides optimized and parallelized functions in Rust for use in Python, offering high performance and ease of integration. 15 | 16 | ## Installation 17 | 18 | We can install LeNLP using: 19 | 20 | ``` 21 | pip install lenlp 22 | ``` 23 | 24 | ## Sections 25 | 26 | - [Installation](#installation) 27 | - [Quick Start](#quick-start) 28 | - [Sparse Module](#sparse-module) 29 | - [CountVectorizer](#countvectorizer) 30 | - [TfidfVectorizer](#TfidfVectorizer) 31 | - [BM25Vectorizer](#bm25vectorizer) 32 | - [FlashText](#flashtext) 33 | - [Extras](#extras) 34 | - [Counter](#counter) 35 | - [Normalizer](#normalizer) 36 | 37 | ## Quick Start 38 | 39 | ### Sparse Module 40 | 41 | The `sparse` module offers a variety of vectorizers and transformers for text data. These sparse matrices are `scipy.sparse.csr_matrix` objects, optimized for memory usage and speed. They can be used as drop-in replacements for `scikit-learn` vectorizers. 42 | 43 | #### CountVectorizer 44 | 45 | The `CountVectorizer` converts a list of texts into a sparse matrix of token counts. This is a Rust implementation of the `CountVectorizer` from `scikit-learn`. 46 | 47 | ```python 48 | from lenlp import sparse 49 | 50 | vectorizer = sparse.CountVectorizer( 51 | ngram_range=(3, 5), # range of n-grams 52 | analyzer="char_wb", # word, char, char_wb 53 | normalize=True, # lowercase and strip accents 54 | stop_words=["based"], # list of stop words 55 | ) 56 | ``` 57 | 58 | You can fit the vectorizer and transform a list of texts into a sparse matrix of token counts: 59 | 60 | ```python 61 | X = [ 62 | "Hello World", 63 | "Rust based vectorizer" 64 | ] 65 | 66 | matrix = vectorizer.fit_transform(X) 67 | ``` 68 | 69 | Or use separate calls: 70 | 71 | ```python 72 | vectorizer.fit(X) 73 | matrix = vectorizer.transform(X) 74 | ``` 75 | 76 | Benchmark: 77 | 78 |

79 | 80 | LeNLP CountVectorizer versus Sklearn CountVectorizer `fit_transform` with `char` analyzer. 81 | 82 | #### TfidfVectorizer 83 | 84 | The `TfidfVectorizer` converts a list of texts into a sparse matrix of tf-idf weights, implemented in Rust. 85 | 86 | ```python 87 | from lenlp import sparse 88 | 89 | vectorizer = sparse.TfidfVectorizer( 90 | ngram_range=(3, 5), # Range of n-grams 91 | analyzer="char_wb", # Options: word, char, char_wb 92 | normalize=True, # Lowercase and strip accents 93 | stop_words=["based"] # List of stop words 94 | ) 95 | ``` 96 | 97 | Fit the vectorizer and transform texts: 98 | 99 | ```python 100 | X = [ 101 | "Hello World", 102 | "Rust based vectorizer" 103 | ] 104 | 105 | matrix = vectorizer.fit_transform(X) 106 | ``` 107 | 108 | Or use separate calls: 109 | 110 | ```python 111 | vectorizer.fit(X) 112 | matrix = vectorizer.transform(X) 113 | ``` 114 | 115 | Benchmark: 116 | 117 |

118 | 119 | LeNLP TfidfVectorizer versus Sklearn TfidfVectorizer `fit_transform` with `char` analyzer. 120 | 121 | #### BM25Vectorizer 122 | 123 | The `BM25Vectorizer` converts texts into a sparse matrix of BM25 weights, which are more accurate than tf-idf and count weights. 124 | 125 | ```python 126 | from lenlp import sparse 127 | 128 | vectorizer = sparse.BM25Vectorizer( 129 | ngram_range=(3, 5), # Range of n-grams 130 | analyzer="char_wb", # Options: word, char, char_wb 131 | normalize=True, # Lowercase and strip accents 132 | stop_words=["based"] # List of stop words 133 | ) 134 | ``` 135 | 136 | Fit the vectorizer and transform texts: 137 | 138 | ```python 139 | X = [ 140 | "Hello World", 141 | "Rust based vectorizer" 142 | ] 143 | 144 | matrix = vectorizer.fit_transform(X) 145 | ``` 146 | 147 | Or use separate calls: 148 | 149 | ```python 150 | vectorizer.fit(X) 151 | matrix = vectorizer.transform(X) 152 | ``` 153 | 154 | Benchmark: 155 | 156 |

157 | 158 | 159 | LeNLP BM25Vectorizer versus LeNLP TfidfVectorizer `fit_transform` with `char` analyzer. BM25Vectorizer counterpart is not available in Sklearn. 160 | 161 | ### FlashText 162 | 163 | The `flashtext` module allows for efficient keyword extraction from texts. It implements the FlashText algorithm as described in the paper *[Replace or Retrieve Keywords In Documents At Scale](https://arxiv.org/pdf/1711.00046)*. 164 | 165 | ```python 166 | from lenlp import flash 167 | 168 | flash_text = flash.FlashText( 169 | normalize=True # remove accents and lowercase 170 | ) 171 | 172 | # Add keywords we want to retrieve: 173 | flash_text.add(["paris", "bordeaux", "toulouse"]) 174 | ``` 175 | 176 | Extract keywords and their positions from sentences: 177 | 178 | ```python 179 | sentences = [ 180 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux", 181 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse", 182 | ] 183 | 184 | flash_text.extract(sentences) 185 | ``` 186 | 187 | Output: 188 | 189 | ```python 190 | [[('toulouse', 0, 8), ('bordeaux', 60, 68), ('bordeaux', 74, 82)], 191 | [('paris', 0, 5), ('bordeaux', 62, 70), ('toulouse', 76, 84)]] 192 | ``` 193 | 194 | The FlashText algorithm is highly efficient, significantly faster than regular expressions for keyword extraction. LeNLP's implementation normalizes input documents by removing accents and converting to lowercase to enhance keyword extraction. 195 | 196 | Benchmark: 197 | 198 |

199 | 200 | LeNLP FlashText is benchmarked versus the official implementation of [FlashText](https://github.com/vi3k6i5/flashtext). 201 | 202 | ### Extras 203 | 204 | #### Counter 205 | 206 | The counter module allows to convert a list of texts into a dictionary of token counts. 207 | 208 | ```python 209 | from lenlp import counter 210 | 211 | sentences = [ 212 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux", 213 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse", 214 | ] 215 | 216 | counter.count( 217 | sentences, 218 | ngram_range=(1, 1), # Range of n-grams 219 | analyzer="word", # Options: word, char, char_wb 220 | normalize=True, # Lowercase and strip accents 221 | stop_words=["its", "in", "is", "of", "the", "and", "to", "a"] # List of stop words 222 | ) 223 | ``` 224 | 225 | Output: 226 | 227 | ```python 228 | [{'compared': 1, 229 | 'south': 1, 230 | 'city': 1, 231 | 'toulouse': 1, 232 | 'bordeaux': 2, 233 | 'france': 1}, 234 | {'toulouse': 1, 235 | 'france': 1, 236 | 'capital': 1, 237 | 'paris': 1, 238 | 'north': 1, 239 | 'compared': 1, 240 | 'bordeaux': 1}] 241 | ``` 242 | 243 | #### Normalizer 244 | 245 | The normalizer module allows to normalize a list of texts by removing accents and converting to lowercase. 246 | 247 | ```python 248 | from lenlp import normalizer 249 | 250 | sentences = [ 251 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux", 252 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse", 253 | ] 254 | 255 | normalizer.normalize(sentences) 256 | ``` 257 | 258 | Output: 259 | 260 | ```python 261 | [ 262 | 'toulouse is a city in france its in the south compared to bordeaux and bordeaux', 263 | 'paris is the capital of france its in the north compared to bordeaux and toulouse', 264 | ] 265 | ``` 266 | 267 | ## References 268 | 269 | - *[FlashText](https://github.com/vi3k6i5/flashtext)* 270 | - *[Scikit Learn](https://github.com/scikit-learn/scikit-learn)* 271 | - *[PyO3](https://github.com/PyO3/pyo3)* 272 | - *[Maturin](https://github.com/PyO3/maturin)* 273 | 274 | -------------------------------------------------------------------------------- /docs/bm25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/bm25.png -------------------------------------------------------------------------------- /docs/count_vectorizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/count_vectorizer.png -------------------------------------------------------------------------------- /docs/count_vectorizer_char.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/count_vectorizer_char.png -------------------------------------------------------------------------------- /docs/flashtext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/flashtext.png -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/logo.png -------------------------------------------------------------------------------- /docs/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/tfidf.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin >= 1.5.1"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "lenlp" 7 | version = "1.2.0" 8 | description = "Natural Language Processing library for Python from Rust." 9 | 10 | authors = [{ name = "Raphael Sourty", email = "raphael.sourty@gmail.com" }] 11 | 12 | 13 | keywords = [] 14 | 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "Programming Language :: Rust", 18 | "Operating System :: OS Independent", 19 | ] 20 | 21 | requires-python = ">=3.8" 22 | 23 | dependencies = ["scikit-learn >= 1.5.0", "scipy >= 1.13.1"] 24 | 25 | [project.urls] 26 | Homepage = "https://github.com/raphaelsty/lenlp" 27 | Documentation = "https://github.com/raphaelsty/lenlp" 28 | Repository = "https://github.com/raphaelsty/lenlp" 29 | 30 | [project.optional-dependencies] 31 | dev = [ 32 | "maturin >= 1.5.1", 33 | "pytest-cov >= 5.0.0", 34 | "pytest >= 7.4.4", 35 | "ruff >= 0.1.15", 36 | ] 37 | [tool.maturin] 38 | bindings = "pyo3" 39 | features = ["pyo3/extension-module"] 40 | python-source = "python" 41 | module-name = "lenlp._rslenlp" 42 | 43 | [tool.include] 44 | include = ["Cargo.toml", "pyproject.toml", "README.md", "rust/*"] 45 | 46 | [tool.pytest.ini_options] 47 | filterwarnings = [ 48 | "ignore::DeprecationWarning", 49 | "ignore::RuntimeWarning", 50 | "ignore::UserWarning", 51 | ] 52 | addopts = [ 53 | "--doctest-modules", 54 | "--verbose", 55 | "-ra", 56 | "--cov-config=.coveragerc", 57 | "-m not web and not slow", 58 | ] 59 | doctest_optionflags = ["NORMALIZE_WHITESPACE", "NUMBER"] 60 | norecursedirs = ["build", "docs", "node_modules"] 61 | markers = [ 62 | "web: tests that require using the Internet", 63 | "slow: tests that take a long time to run", 64 | ] 65 | -------------------------------------------------------------------------------- /python/lenlp/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "analyzer", 3 | "counter", 4 | "flash", 5 | "normalizer", 6 | "sparse", 7 | ] 8 | -------------------------------------------------------------------------------- /python/lenlp/analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | from .analyze import analyze 2 | 3 | __all__ = [ 4 | "analyze", 5 | ] 6 | -------------------------------------------------------------------------------- /python/lenlp/analyzer/analyze.py: -------------------------------------------------------------------------------- 1 | from lenlp._rslenlp import rschar_ngrams_many, rschar_wb_ngrams_many, rssplit_words_many 2 | 3 | __all__ = ["analyze"] 4 | 5 | 6 | def analyze( 7 | x: str | list[str], 8 | analyzer: str = "word", 9 | ngram_range: tuple[int, int] = (1, 1), 10 | ) -> str | list[str]: 11 | """Split text or list of texts into words or characters. 12 | 13 | Parameters 14 | ---------- 15 | x 16 | str or list of str. 17 | analyzer 18 | {word, char, char_wb}, default=word. 
19 | Whether the feature should be made of word n-gram or character n-grams. Option 20 | char_wb creates character n-grams only from text inside word boundaries; 21 | n-grams at the edges of words are padded with space. 22 | ngram_range 23 | tuple (min_n, max_n), default=(1). 24 | The lower and upper boundary of the range of n-values for different n-grams to 25 | be extracted. All values of n such that min_n <= n <= max_n will be used. 26 | Examples 27 | -------- 28 | >>> from lenlp import analyzer 29 | 30 | >>> analyzer.analyze("Hello, world!", analyzer="word") 31 | ['Hello,', 'world!'] 32 | 33 | >>> analyzer.analyze("Hello, world!", analyzer="char_wb", ngram_range=(3, 3)) 34 | ['Hel', 'ell', 'llo', 'lo,', 'o, ', ', w', ' wo', 'wor', 'orl', 'rld', 'ld!'] 35 | 36 | >>> analyzer.analyze(["hello, world", "good"], analyzer="char", ngram_range=(2, 3)) 37 | [['he', 'el', 'll', 'lo', 'o,', ', ', ' w', 'wo', 'or', 'rl', 'ld', 'hel', 'ell', 'llo', 'lo,', 'o, ', ', w', ' wo', 'wor', 'orl', 'rld'], ['go', 'oo', 'od', 'goo', 'ood']] 38 | 39 | """ 40 | return_string = True if isinstance(x, str) else False 41 | x = [x] if isinstance(x, str) else x 42 | n_sizes = list(range(ngram_range[0], ngram_range[1] + 1)) 43 | 44 | match analyzer: 45 | case "word": 46 | y = rssplit_words_many(x, n_sizes=n_sizes) 47 | case "char": 48 | y = rschar_ngrams_many(x, n_sizes=n_sizes) 49 | case "char_wb": 50 | y = rschar_wb_ngrams_many(x, n_sizes=n_sizes) 51 | 52 | return y[0] if return_string else y 53 | -------------------------------------------------------------------------------- /python/lenlp/counter/__init__.py: -------------------------------------------------------------------------------- 1 | from .count import count 2 | 3 | __all__ = ["count"] 4 | -------------------------------------------------------------------------------- /python/lenlp/counter/count.py: -------------------------------------------------------------------------------- 1 | from lenlp._rslenlp import ( 2 | rsvectorize_char_ngrams_many, 3 | rsvectorize_char_wb_ngrams_many, 4 | rsvectorize_split_words_many, 5 | ) 6 | 7 | __all__ = ["count"] 8 | 9 | 10 | def count( 11 | x: str | list[str], 12 | analyzer: str = "word", 13 | ngram_range: tuple[int, int] = (1, 1), 14 | normalize: bool = True, 15 | stop_words: list[str] = None, 16 | sort: bool = False, 17 | ) -> dict[str, int]: 18 | """Count the frequency of words in a text or in a list of texts. Tokens are unordered within 19 | the same text. 20 | 21 | Parameters 22 | ---------- 23 | x 24 | str or list of str. 25 | analyzer 26 | {word, char, char_wb}, default=word. 27 | Whether the feature should be made of word n-gram or character n-grams. Option 28 | char_wb creates character n-grams only from text inside word boundaries; 29 | n-grams at the edges of words are padded with space. 30 | ngram_range 31 | tuple (min_n, max_n), default=1. 32 | The lower and upper boundary of the range of n-values for different n-grams to 33 | be extracted. All values of n such that min_n <= n <= max_n will be used. 34 | normalize 35 | bool, default=True. 36 | Whether to normalize the text before counting. It will lowercase the text and remove 37 | punctuation. 38 | stop_words 39 | list of str, default=None. 40 | A list of stop words that will be removed from the text. 
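sort
    bool, default=False.
    Whether to sort each output dictionary by token before returning it. The
    underlying Rust counter returns tokens in arbitrary order, so sorting gives a
    deterministic output (used by the doctests below).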
41 | 42 | Examples 43 | -------- 44 | >>> from lenlp import counter 45 | 46 | >>> counter.count("Hello, world!", sort=True) 47 | {'hello': 1, 'world': 1} 48 | 49 | >>> counter.count("Hello, world!", ngram_range=(2, 2), sort=True, normalize=False) 50 | {'Hello, world!': 1} 51 | 52 | >>> counter.count(["Hello, world!", "How are you?"], stop_words=["are", "you"], sort=True) 53 | [{'hello': 1, 'world': 1}, {'how': 1}] 54 | 55 | >>> counter.count(["Hello, world!", "hello"], analyzer="char_wb", ngram_range=(3, 7), stop_words=["hello"], sort=True) 56 | [{'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1}, {}] 57 | 58 | >>> counter.count("Hello, world!", analyzer="char_wb", ngram_range=(3, 7), sort=True) 59 | {' wo': 1, ' wor': 1, ' worl': 1, ' world': 1, 'ell': 1, 'ello': 1, 'ello ': 1, 'ello w': 1, 'ello wo': 1, 'hel': 1, 'hell': 1, 'hello': 1, 'hello ': 1, 'hello w': 1, 'llo': 1, 'llo ': 1, 'llo w': 1, 'llo wo': 1, 'llo wor': 1, 'lo ': 1, 'lo w': 1, 'lo wo': 1, 'lo wor': 1, 'lo worl': 1, 'o w': 1, 'o wo': 1, 'o wor': 1, 'o worl': 1, 'o world': 1, 'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1} 60 | 61 | >>> counter.count("Hello, world!", analyzer="char", ngram_range=(3, 7), sort=True) 62 | {' wo': 1, ' wor': 1, ' worl': 1, ' world': 1, 'ell': 1, 'ello': 1, 'ello ': 1, 'ello w': 1, 'ello wo': 1, 'hel': 1, 'hell': 1, 'hello': 1, 'hello ': 1, 'hello w': 1, 'llo': 1, 'llo ': 1, 'llo w': 1, 'llo wo': 1, 'llo wor': 1, 'lo ': 1, 'lo w': 1, 'lo wo': 1, 'lo wor': 1, 'lo worl': 1, 'o w': 1, 'o wo': 1, 'o wor': 1, 'o worl': 1, 'o world': 1, 'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1} 63 | 64 | >>> counter.count(["Hello, world!", "hello"], analyzer="char", ngram_range=(3, 7), stop_words=["hello"], sort=True) 65 | [{'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1}, {}] 66 | 67 | """ 68 | return_string = True if isinstance(x, str) else False 69 | x = [x] if isinstance(x, str) else x 70 | n_sizes = list(range(ngram_range[0], ngram_range[1] + 1)) 71 | 72 | match analyzer: 73 | case "word": 74 | y = rsvectorize_split_words_many( 75 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize 76 | ) 77 | case "char": 78 | y = rsvectorize_char_ngrams_many( 79 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize 80 | ) 81 | 82 | case "char_wb": 83 | y = rsvectorize_char_wb_ngrams_many( 84 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize 85 | ) 86 | 87 | if sort: 88 | y = [dict(sorted(d.items())) for d in y] 89 | 90 | return y[0] if return_string else y 91 | -------------------------------------------------------------------------------- /python/lenlp/flash/__init__.py: -------------------------------------------------------------------------------- 1 | from .flash_text import FlashText 2 | 3 | __all__ = ["FlashText"] 4 | -------------------------------------------------------------------------------- /python/lenlp/flash/flash_text.py: -------------------------------------------------------------------------------- 1 | from lenlp._rslenlp import RSKeywordProcessor 2 | 3 | __all__ = ["FlashText"] 4 | 5 | 6 | class FlashText: 7 | """FlashText retrieve keywords from text. 8 | 9 | Parameters 10 | ---------- 11 | lowercase 12 | bool, default=True. 13 | Whether to lowercase the text before extracting keywords. 14 | normalize 15 | bool, default=True. 16 | Whether to normalize the text before extracting keywords. It will lowercase the text 17 | and remove punctuation. 
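Note: when normalize is True the text is also lowercased as part of
normalization, so the lowercase flag has no additional effect in that case.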
18 | 19 | Examples 20 | -------- 21 | >>> from lenlp import flash 22 | 23 | >>> flash_text = flash.FlashText(normalize=True) 24 | >>> flash_text = flash_text.add(["hello", "world"]) 25 | 26 | >>> flash_text.extract(["Hello, world!", "world", "hello"]) 27 | [[('hello', 0, 5), ('world', 7, 12)], [('world', 0, 5)], [('hello', 0, 5)]] 28 | 29 | """ 30 | 31 | def __init__( 32 | self, 33 | lowercase: bool = True, 34 | normalize: bool = True, 35 | ) -> None: 36 | self.flash = RSKeywordProcessor(lowercase=lowercase, normalize=normalize) 37 | 38 | def add( 39 | self, 40 | x: str | list[str], 41 | clean_name: str | None = None, 42 | ) -> None: 43 | """Add a keyword to the FlashText object.""" 44 | x = [x] if isinstance(x, str) else x 45 | self.flash.add_keywords_many(x, clean_name) 46 | return self 47 | 48 | def extract(self, x: str | list[str]) -> list[str]: 49 | """Extract keywords from a sentence.""" 50 | is_string = isinstance(x, str) 51 | x = [x] if isinstance(x, str) else x 52 | y = self.flash.extract_keywords_many(x) 53 | return y[0] if is_string else y 54 | -------------------------------------------------------------------------------- /python/lenlp/normalizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalize import normalize 2 | 3 | __all__ = ["normalize"] 4 | -------------------------------------------------------------------------------- /python/lenlp/normalizer/normalize.py: -------------------------------------------------------------------------------- 1 | from lenlp._rslenlp import rsnormalize, rsnormalize_many 2 | 3 | __all__ = ["normalize"] 4 | 5 | 6 | def normalize(x: str | list[str]) -> str: 7 | """Lowercase, remove punctation and unidecode single text. 8 | 9 | Examples 10 | -------- 11 | >>> from lenlp import normalizer 12 | 13 | >>> normalizer.normalize("Hello, world!") 14 | 'hello world' 15 | 16 | >>> normalizer.normalize(["Hello, world!", "How are you?"]) 17 | ['hello world', 'how are you'] 18 | 19 | """ 20 | return rsnormalize(x) if isinstance(x, str) else rsnormalize_many(x) 21 | -------------------------------------------------------------------------------- /python/lenlp/sparse/__init__.py: -------------------------------------------------------------------------------- 1 | from .bm25_vectorizer import BM25Vectorizer 2 | from .count_vectorizer import CountVectorizer 3 | from .tfidf_vectorizer import TfidfVectorizer 4 | 5 | __all__ = ["BM25Vectorizer", "CountVectorizer", "TfidfVectorizer"] 6 | -------------------------------------------------------------------------------- /python/lenlp/sparse/bm25_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2 4 | 5 | from .tfidf_vectorizer import TfidfVectorizer 6 | 7 | 8 | class BM25Vectorizer(TfidfVectorizer): 9 | """BM25Vectorizer is a class that converts a collection of text documents to a sparse 10 | bm25 matrix. 11 | 12 | Parameters 13 | ---------- 14 | analyzer 15 | {word, char, char_wb}, default=word. 16 | Whether the feature should be made of word n-gram or character n-grams. Option 17 | char_wb creates character n-grams only from text inside word boundaries; 18 | n-grams at the edges of words are padded with space. 19 | ngram_range 20 | tuple (min_n, max_n), default=(1, 1). 21 | The lower and upper boundary of the range of n-values for different n-grams to 22 | be extracted. 
All values of n such that min_n <= n <= max_n will be used. 23 | normalize 24 | bool, default=True. 25 | Whether to normalize the text before counting. It will lowercase the text and remove 26 | punctuation. 27 | stop_words 28 | list of str, default=None. 29 | A list of stop words that will be removed from the text. 30 | b 31 | The impact of document length normalization. Default is `0.75`, Higher will 32 | penalize longer documents more. 33 | k1 34 | How quickly the impact of term frequency saturates. Default is `1.5`, Higher 35 | will make term frequency more influential. 36 | epsilon 37 | Smoothing term. Default is `0`. 38 | 39 | Examples 40 | -------- 41 | >>> from lenlp import sparse 42 | 43 | >>> bm25_vectorizer = sparse.BM25Vectorizer( 44 | ... analyzer="word", 45 | ... normalize=True, 46 | ... stop_words=None, 47 | ... ) 48 | 49 | >>> x = ["Hello, world!", "How are you?"] 50 | 51 | >>> bm25_vectorizer = bm25_vectorizer.fit(x) 52 | >>> matrix = bm25_vectorizer.transform(x) 53 | >>> matrix.shape 54 | (2, 5) 55 | 56 | >>> len(bm25_vectorizer.vocabulary) 57 | 5 58 | 59 | >>> matrix = bm25_vectorizer.fit_transform(x) 60 | >>> matrix.shape 61 | (2, 5) 62 | 63 | """ 64 | 65 | def __init__( 66 | self, 67 | analyzer: str = "word", 68 | ngram_range: tuple[int, int] = (1, 1), 69 | normalize: bool = True, 70 | stop_words: list[str] = None, 71 | k1: float = 1.5, 72 | b: float = 0.75, 73 | epsilon: float = 0, 74 | ) -> None: 75 | super().__init__( 76 | analyzer=analyzer, 77 | ngram_range=ngram_range, 78 | normalize=normalize, 79 | stop_words=stop_words, 80 | ) 81 | 82 | self.k1 = k1 83 | self.b = b 84 | self.epsilon = epsilon 85 | self.average_len = None 86 | 87 | def update(self, matrix: csr_matrix) -> csr_matrix: 88 | """Update the idf values.""" 89 | self.tf = (matrix > 0).sum(axis=0) 90 | len_documents = (matrix).sum(axis=1) 91 | self.average_len = len_documents.mean() 92 | self.count = matrix.shape[0] 93 | 94 | self.idf = np.squeeze( 95 | a=np.asarray( 96 | a=np.log((self.count - self.tf + 0.5) / (self.tf + 0.5) + 1), 97 | dtype=np.float32, 98 | ) 99 | ) 100 | 101 | def _transform(self, matrix: csr_matrix) -> csr_matrix: 102 | """Transform a count matrix to a bm25 matrix.""" 103 | len_documents = (matrix).sum(axis=1) 104 | regularization = np.squeeze( 105 | a=np.asarray( 106 | a=( 107 | self.k1 * (1 - self.b + self.b * (len_documents / self.average_len)) 108 | ).flatten() 109 | ) 110 | ) 111 | 112 | denominator = matrix.tocsc() 113 | denominator.data += np.take(a=regularization, indices=denominator.indices) 114 | matrix.data = ( 115 | (matrix.data * (self.k1 + 1)) / denominator.tocsr().data 116 | ) + self.epsilon 117 | 118 | matrix = matrix.multiply(other=self.idf).tocsr() 119 | inplace_csr_row_normalize_l2(matrix) 120 | return matrix 121 | -------------------------------------------------------------------------------- /python/lenlp/sparse/count_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | 4 | from lenlp._rslenlp import SparseMatrixBuilder 5 | 6 | __all__ = ["CountVectorizer"] 7 | 8 | 9 | class CountVectorizer: 10 | """CountVectorizer is a class that converts a collection of text documents to a sparse 11 | matrix. 12 | 13 | Parameters 14 | ---------- 15 | analyzer 16 | {word, char, char_wb}, default=word. 17 | Whether the feature should be made of word n-gram or character n-grams. 
Option 18 | char_wb creates character n-grams only from text inside word boundaries; 19 | n-grams at the edges of words are padded with space. 20 | ngram_range 21 | tuple (min_n, max_n), default=(1, 1). 22 | The lower and upper boundary of the range of n-values for different n-grams to 23 | be extracted. All values of n such that min_n <= n <= max_n will be used. 24 | normalize 25 | bool, default=True. 26 | Whether to normalize the text before counting. It will lowercase the text and remove 27 | punctuation. 28 | stop_words 29 | list of str, default=None. 30 | A list of stop words that will be removed from the text. 31 | 32 | Examples 33 | -------- 34 | >>> from lenlp import sparse 35 | 36 | >>> count_vectorizer = sparse.CountVectorizer( 37 | ... analyzer="word", 38 | ... normalize=True, 39 | ... stop_words=None, 40 | ... ) 41 | 42 | >>> x = ["Hello, world!", "How are you?"] 43 | 44 | >>> count_vectorizer = count_vectorizer.fit(x) 45 | 46 | >>> matrix = count_vectorizer.transform(x) 47 | >>> matrix.shape 48 | (2, 5) 49 | 50 | >>> matrix.toarray() 51 | array([[1., 1., 0., 0., 0.], 52 | [0., 0., 1., 1., 1.]], dtype=float32) 53 | 54 | >>> len(count_vectorizer.vocabulary) 55 | 5 56 | 57 | >>> matrix = count_vectorizer.fit_transform(x) 58 | >>> matrix.shape 59 | (2, 5) 60 | 61 | """ 62 | 63 | def __init__( 64 | self, 65 | analyzer: str = "word", 66 | ngram_range: tuple[int, int] = (1, 1), 67 | normalize: bool = True, 68 | stop_words: list[str] = None, 69 | ) -> None: 70 | assert analyzer in ("word", "char", "char_wb") 71 | 72 | self.sparse_matrix = SparseMatrixBuilder( 73 | analyzer=analyzer, 74 | n_sizes=list(range(ngram_range[0], ngram_range[1] + 1)), 75 | normalize=normalize, 76 | stop_words=stop_words, 77 | ) 78 | 79 | self.fitted = False 80 | 81 | @property 82 | def vocabulary(self) -> dict[str, int]: 83 | """Get the vocabulary of the CountVectorizer object.""" 84 | return self.sparse_matrix.get_vocab() 85 | 86 | def fit(self, raw_documents: list[str]) -> None: 87 | """Learn the vocabulary dictionary and return the CountVectorizer object.""" 88 | self.fitted = True 89 | self.sparse_matrix.fit(raw_documents) 90 | return self 91 | 92 | def transform(self, raw_documents: list[str]) -> csr_matrix: 93 | """Transform documents to document-term matrix.""" 94 | if not self.fitted: 95 | raise ValueError("Call fit method before calling transform method.") 96 | 97 | values, row_indices, column_indices = self.sparse_matrix.transform( 98 | raw_documents 99 | ) 100 | 101 | return csr_matrix( 102 | arg1=(values, (row_indices, column_indices)), 103 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), 104 | dtype=np.float32, 105 | ) 106 | 107 | def fit_transform(self, raw_documents: list[str]) -> csr_matrix: 108 | """Learn the vocabulary dictionary and return the CountVectorizer object.""" 109 | self.fitted = True 110 | 111 | values, row_indices, column_indices = self.sparse_matrix.fit_transform( 112 | raw_documents 113 | ) 114 | 115 | return csr_matrix( 116 | arg1=(values, (row_indices, column_indices)), 117 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), 118 | dtype=np.float32, 119 | ) 120 | -------------------------------------------------------------------------------- /python/lenlp/sparse/tfidf_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2 4 | 5 | from .count_vectorizer import 
CountVectorizer 6 | 7 | 8 | class TfidfVectorizer(CountVectorizer): 9 | """TfidfVectorizer is a class that converts a collection of text documents to a sparse 10 | tfidf matrix. 11 | 12 | Parameters 13 | ---------- 14 | analyzer 15 | {word, char, char_wb}, default=word. 16 | Whether the feature should be made of word n-gram or character n-grams. Option 17 | char_wb creates character n-grams only from text inside word boundaries; 18 | n-grams at the edges of words are padded with space. 19 | ngram_range 20 | tuple (min_n, max_n), default=(1, 1). 21 | The lower and upper boundary of the range of n-values for different n-grams to 22 | be extracted. All values of n such that min_n <= n <= max_n will be used. 23 | normalize 24 | bool, default=True. 25 | Whether to normalize the text before counting. It will lowercase the text and remove 26 | punctuation. 27 | stop_words 28 | list of str, default=None. 29 | A list of stop words that will be removed from the text. 30 | 31 | Examples 32 | -------- 33 | >>> from lenlp import sparse 34 | 35 | >>> tfidf_vectorizer = sparse.TfidfVectorizer( 36 | ... analyzer="word", 37 | ... normalize=True, 38 | ... stop_words=None, 39 | ... ) 40 | 41 | >>> x = ["Hello, world!", "How are you?"] 42 | 43 | >>> tfidf_vectorizer = tfidf_vectorizer.fit(x) 44 | >>> matrix = tfidf_vectorizer.transform(x) 45 | >>> matrix.shape 46 | (2, 5) 47 | 48 | >>> len(tfidf_vectorizer.vocabulary) 49 | 5 50 | 51 | >>> matrix = tfidf_vectorizer.fit_transform(x) 52 | >>> matrix.shape 53 | (2, 5) 54 | 55 | """ 56 | 57 | def __init__( 58 | self, 59 | analyzer: str = "word", 60 | ngram_range: tuple[int, int] = (1, 1), 61 | normalize: bool = True, 62 | stop_words: list[str] = None, 63 | ) -> None: 64 | super().__init__( 65 | analyzer=analyzer, 66 | ngram_range=ngram_range, 67 | normalize=normalize, 68 | stop_words=stop_words, 69 | ) 70 | 71 | self.idf = None 72 | 73 | def fit(self, raw_documents: list[str]) -> None: 74 | matrix = super().fit_transform(raw_documents=raw_documents) 75 | self.update(matrix=matrix) 76 | return self 77 | 78 | def update(self, matrix: csr_matrix) -> csr_matrix: 79 | """Update the idf values.""" 80 | tf = (matrix > 0).sum(axis=0) 81 | self.idf = ( 82 | np.squeeze(a=np.asarray(a=np.log((matrix.shape[0] + 1.0) / (tf + 1.0)))) + 1 83 | ) 84 | 85 | def _transform(self, matrix: csr_matrix) -> csr_matrix: 86 | """Transform a count matrix to a bm25 matrix.""" 87 | matrix.data *= np.take( 88 | a=self.idf, 89 | indices=matrix.indices, 90 | ) 91 | 92 | inplace_csr_row_normalize_l2(matrix) 93 | return matrix 94 | 95 | def transform(self, raw_documents: list[str]) -> csr_matrix: 96 | """Transform documents to document-term matrix.""" 97 | values, row_indices, column_indices = self.sparse_matrix.transform( 98 | raw_documents 99 | ) 100 | return self._transform( 101 | matrix=csr_matrix( 102 | arg1=(values, (row_indices, column_indices)), 103 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), 104 | dtype=np.float32, 105 | ) 106 | ) 107 | 108 | def fit_transform(self, raw_documents: list[str]) -> csr_matrix: 109 | """Learn the vocabulary dictionary and return the CountVectorizer object.""" 110 | values, row_indices, column_indices = self.sparse_matrix.fit_transform( 111 | raw_documents 112 | ) 113 | 114 | matrix = csr_matrix( 115 | arg1=(values, (row_indices, column_indices)), 116 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), 117 | dtype=np.float32, 118 | ) 119 | 120 | self.update(matrix=matrix) 121 | 122 | return self._transform( 123 | matrix=matrix, 
124 | ) 125 | -------------------------------------------------------------------------------- /rust/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyModule; // NEW 3 | 4 | mod rsanalyzer; 5 | mod rscounter; 6 | mod rsflashtext; 7 | mod rsnormalizer; 8 | mod rssparse; 9 | mod rsstop_words; 10 | mod rsvectorizer; 11 | 12 | #[pymodule] 13 | fn _rslenlp(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 14 | rsanalyzer::register_functions(m)?; 15 | rscounter::register_functions(m)?; 16 | rsflashtext::register_functions(m)?; 17 | rsnormalizer::register_functions(m)?; 18 | rssparse::register_functions(m)?; 19 | rsstop_words::register_functions(m)?; 20 | rsvectorizer::register_functions(m)?; 21 | Ok(()) 22 | } 23 | -------------------------------------------------------------------------------- /rust/rsanalyzer.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyModule; // <- NEW: we now use the smart-pointer Bound<…, PyModule> 3 | use pyo3::wrap_pyfunction; 4 | use rayon::prelude::*; 5 | 6 | /// Splits text into words. 7 | /// 8 | /// # Arguments 9 | /// 10 | /// * `text` - The input text. 11 | /// * `n_sizes` - The size of the n-grams. 12 | /// 13 | /// # Returns 14 | /// 15 | /// A vector of words. 16 | #[pyfunction] 17 | pub fn rssplit_words(text: &str, n_sizes: Vec) -> Vec { 18 | let mut ngrams: Vec = Vec::new(); 19 | 20 | for &n in &n_sizes { 21 | let words: Vec<&str> = text.split_whitespace().collect(); 22 | for window in words.windows(n) { 23 | ngrams.push(window.join(" ")); 24 | } 25 | } 26 | 27 | ngrams 28 | } 29 | 30 | /// Same as `rssplit_words` but for many texts at once. 31 | #[pyfunction] 32 | pub fn rssplit_words_many(texts: Vec, n_sizes: Vec) -> Vec> { 33 | texts 34 | .par_iter() 35 | .map(|text: &String| rssplit_words(text, n_sizes.clone())) 36 | .collect() 37 | } 38 | 39 | /// Computes character n-grams. 40 | /// 41 | /// # Arguments 42 | /// 43 | /// * `texts` - A vector of input texts. 44 | /// * `n_sizes` - The size of the n-grams. 45 | /// 46 | /// # Returns 47 | /// 48 | /// A vector of character n-grams. 49 | #[pyfunction] 50 | pub fn rschar_ngrams(text: &str, n_sizes: Vec) -> Vec { 51 | let mut ngrams: Vec = Vec::new(); 52 | 53 | for &n in &n_sizes { 54 | let chars: Vec = text.chars().collect(); 55 | for window in chars.windows(n) { 56 | ngrams.push(window.iter().collect::()); 57 | } 58 | } 59 | 60 | ngrams 61 | } 62 | 63 | /// Same as `rschar_ngrams` but for many texts at once. 64 | #[pyfunction] 65 | pub fn rschar_ngrams_many(texts: Vec, n_sizes: Vec) -> Vec> { 66 | texts 67 | .par_iter() 68 | .map(|text: &String| rschar_ngrams(text, n_sizes.clone())) 69 | .collect() 70 | } 71 | 72 | /// Character n-grams with word-boundary handling. 73 | #[pyfunction] 74 | pub fn rschar_wb_ngrams(text: &str, n_sizes: Vec) -> Vec { 75 | let mut ngrams: Vec = Vec::new(); 76 | let chars: Vec = text.chars().collect(); 77 | 78 | for &n in &n_sizes { 79 | if n > chars.len() { 80 | continue; 81 | } 82 | for window in chars.windows(n) { 83 | ngrams.push(window.iter().collect::()); 84 | } 85 | } 86 | 87 | ngrams 88 | } 89 | 90 | /// Same as `rschar_wb_ngrams` but for many texts at once. 
91 | #[pyfunction] 92 | pub fn rschar_wb_ngrams_many(texts: Vec, n_sizes: Vec) -> Vec> { 93 | texts 94 | .par_iter() 95 | .map(|text: &String| rschar_wb_ngrams(text, n_sizes.clone())) 96 | .collect() 97 | } 98 | 99 | /// Registers all the above functions in a Python sub-module. 100 | /// 101 | /// Called from your `#[pymodule]` entry-point. 102 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> { 103 | m.add_function(wrap_pyfunction!(rssplit_words, m)?)?; 104 | m.add_function(wrap_pyfunction!(rssplit_words_many, m)?)?; 105 | m.add_function(wrap_pyfunction!(rschar_ngrams, m)?)?; 106 | m.add_function(wrap_pyfunction!(rschar_ngrams_many, m)?)?; 107 | m.add_function(wrap_pyfunction!(rschar_wb_ngrams, m)?)?; 108 | m.add_function(wrap_pyfunction!(rschar_wb_ngrams_many, m)?)?; 109 | Ok(()) 110 | } 111 | -------------------------------------------------------------------------------- /rust/rscounter.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::wrap_pyfunction; 3 | use rayon::prelude::*; 4 | use std::collections::HashMap; 5 | 6 | /// Counts the number of times each word appears in the input text. 7 | /// 8 | /// # Arguments 9 | /// 10 | /// * `text` - The input text as a vector of words. 11 | /// 12 | /// # Returns 13 | /// 14 | /// A hashmap with the words as keys and the number of times they appear as values. 15 | #[pyfunction] 16 | pub fn rscount(text: Vec) -> HashMap { 17 | let mut word_counter = HashMap::new(); 18 | for word in text { 19 | *word_counter.entry(word).or_insert(0) += 1; 20 | } 21 | word_counter 22 | } 23 | 24 | /// Counts the number of times each word appears for each input text. 25 | /// 26 | /// # Arguments 27 | /// 28 | /// * `texts` - The input texts as a vector of vectors of words. 29 | /// 30 | /// # Returns 31 | /// 32 | /// A vector of hashmaps with the words as keys and the number of times they appear as values. 33 | #[pyfunction] 34 | pub fn rscount_many(texts: Vec>) -> Vec> { 35 | texts.par_iter().map(|text| rscount(text.clone())).collect() 36 | } 37 | 38 | /// Registers all the above functions in a Python sub-module. 39 | /// 40 | /// Called from your `#[pymodule]` entry-point. 
41 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> { 42 | m.add_function(wrap_pyfunction!(rscount, m)?)?; 43 | m.add_function(wrap_pyfunction!(rscount_many, m)?)?; 44 | Ok(()) 45 | } 46 | 47 | #[cfg(test)] 48 | mod tests { 49 | use super::*; 50 | 51 | #[test] 52 | fn test_rscount() { 53 | let text = vec![ 54 | "hello".to_string(), 55 | "world".to_string(), 56 | "hello".to_string(), 57 | "hello".to_string(), 58 | ]; 59 | let result = rscount(text); 60 | let mut expected = HashMap::new(); 61 | expected.insert("hello".to_string(), 3); 62 | expected.insert("world".to_string(), 1); 63 | assert_eq!(result, expected); 64 | } 65 | 66 | #[test] 67 | fn test_rscount_many() { 68 | let texts = vec![ 69 | vec!["hello".to_string(), "world".to_string()], 70 | vec![ 71 | "hello".to_string(), 72 | "world".to_string(), 73 | "hello".to_string(), 74 | ], 75 | ]; 76 | let result = rscount_many(texts); 77 | let mut expected = Vec::new(); 78 | let mut map1 = HashMap::new(); 79 | map1.insert("hello".to_string(), 1); 80 | map1.insert("world".to_string(), 1); 81 | let mut map2 = HashMap::new(); 82 | map2.insert("hello".to_string(), 2); 83 | map2.insert("world".to_string(), 1); 84 | expected.push(map1); 85 | expected.push(map2); 86 | assert_eq!(result, expected); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /rust/rsflashtext.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | 3 | use std::collections::HashMap; 4 | use std::collections::HashSet; 5 | 6 | use rayon::prelude::*; 7 | use unidecode::unidecode; 8 | 9 | #[pyclass()] 10 | pub struct RSKeywordProcessor { 11 | keyword: String, 12 | non_word_boundaries: HashSet, 13 | keyword_trie_dict: HashMap, 14 | lowercase: bool, 15 | normalize: bool, 16 | terms_in_trie: usize, 17 | } 18 | 19 | #[pyclass()] 20 | pub struct RSTrieNode { 21 | children: HashMap, 22 | is_end: bool, 23 | clean_name: Option, 24 | } 25 | 26 | impl RSTrieNode { 27 | pub fn new() -> Self { 28 | RSTrieNode { 29 | children: HashMap::new(), 30 | is_end: false, 31 | clean_name: None, 32 | } 33 | } 34 | } 35 | 36 | #[pymethods] 37 | impl RSKeywordProcessor { 38 | #[new] 39 | pub fn new(lowercase: bool, normalize: bool) -> Self { 40 | let keyword: String = "_keyword_".to_string(); 41 | let non_word_boundaries: HashSet = { 42 | let mut set: HashSet = HashSet::new(); 43 | set.extend('0'..='9'); 44 | set.extend('a'..='z'); 45 | set.extend('A'..='Z'); 46 | set.insert('_'); 47 | set 48 | }; 49 | 50 | RSKeywordProcessor { 51 | keyword, 52 | non_word_boundaries, 53 | keyword_trie_dict: HashMap::new(), 54 | lowercase, 55 | normalize, 56 | terms_in_trie: 0, 57 | } 58 | } 59 | 60 | pub fn add_keywords_many( 61 | &mut self, 62 | keywords: Vec, 63 | clean_name: Option<&str>, 64 | ) -> Vec { 65 | keywords 66 | .iter() 67 | .map(|keyword: &String| self.add_keyword(&keyword, clean_name)) 68 | .collect() 69 | } 70 | 71 | pub fn add_keyword(&mut self, keyword: &str, clean_name: Option<&str>) -> bool { 72 | let clean_name: &str = clean_name.unwrap_or(keyword); 73 | let keyword: String = if self.normalize { 74 | unidecode(keyword) 75 | .to_lowercase() 76 | .chars() 77 | .filter(|c| !c.is_ascii_punctuation()) 78 | .collect::() 79 | .trim() 80 | .to_string() 81 | } else if self.lowercase { 82 | keyword.to_lowercase() 83 | } else { 84 | keyword.to_string() 85 | }; 86 | 87 | let mut current_node: &mut HashMap = &mut self.keyword_trie_dict; 88 | for char in keyword.chars() { 89 | current_node 
= &mut current_node 90 | .entry(char) 91 | .or_insert_with(RSTrieNode::new) 92 | .children; 93 | } 94 | 95 | if !current_node.contains_key(&self.keyword.chars().next().unwrap()) { 96 | self.terms_in_trie += 1; 97 | current_node.insert( 98 | self.keyword.chars().next().unwrap(), 99 | RSTrieNode { 100 | children: HashMap::new(), 101 | is_end: true, 102 | clean_name: Some(clean_name.to_string()), 103 | }, 104 | ); 105 | true 106 | } else { 107 | false 108 | } 109 | } 110 | 111 | pub fn extract_keywords_many( 112 | &self, 113 | sentences: Vec, 114 | ) -> Vec> { 115 | sentences 116 | .par_iter() 117 | .map(|sentence: &String| self.extract_keywords(&sentence)) 118 | .collect() 119 | } 120 | 121 | pub fn extract_keywords(&self, sentence: &str) -> Vec<(String, usize, usize)> { 122 | // Map from the index in the normalized sentence to the index in the original sentence 123 | let mut index_map: Vec = Vec::with_capacity(sentence.len()); 124 | let mut original_idx = 0; 125 | 126 | let normalized_sentence: String = if self.normalize { 127 | let mut normalized = String::new(); 128 | for c in sentence.chars() { 129 | if c.is_ascii_punctuation() { 130 | original_idx += c.len_utf8(); 131 | continue; 132 | } 133 | let normalized_char = unidecode::unidecode_char(c).to_lowercase(); 134 | for nc in normalized_char.chars() { 135 | normalized.push(nc); 136 | index_map.push(original_idx); 137 | } 138 | original_idx += c.len_utf8(); 139 | } 140 | normalized.to_string() 141 | } else if self.lowercase { 142 | sentence.to_lowercase() 143 | } else { 144 | sentence.to_string() 145 | }; 146 | 147 | let mut extracted_keywords: Vec<(String, usize, usize)> = Vec::new(); 148 | let mut current_node: &HashMap = &self.keyword_trie_dict; 149 | let mut start_pos: usize = 0; 150 | let mut end_pos: usize = 0; 151 | 152 | let mut idx: usize = 0; 153 | let sentence_len: usize = normalized_sentence.len(); 154 | while idx < sentence_len { 155 | let char: char = normalized_sentence.chars().nth(idx).unwrap(); 156 | if !self.non_word_boundaries.contains(&char) { 157 | if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) { 158 | if node.is_end { 159 | let clean_name: &String = node.clean_name.as_ref().unwrap(); 160 | let original_start_pos = index_map[start_pos]; 161 | let original_end_pos = index_map[end_pos - 1] + 1; 162 | extracted_keywords.push(( 163 | clean_name.clone(), 164 | original_start_pos, 165 | original_end_pos, 166 | )); 167 | } 168 | } 169 | current_node = &self.keyword_trie_dict; 170 | start_pos = idx + 1; 171 | } else if let Some(node) = current_node.get(&char) { 172 | current_node = &node.children; 173 | end_pos = idx + 1; 174 | } else { 175 | current_node = &self.keyword_trie_dict; 176 | start_pos = idx + 1; 177 | } 178 | idx += 1; 179 | } 180 | 181 | // Check if the last segment is a keyword 182 | if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) { 183 | if node.is_end { 184 | let clean_name: &String = node.clean_name.as_ref().unwrap(); 185 | let original_start_pos = index_map[start_pos]; 186 | let original_end_pos = index_map[end_pos - 1] + 1; 187 | extracted_keywords.push((clean_name.clone(), original_start_pos, original_end_pos)); 188 | } 189 | } 190 | 191 | extracted_keywords 192 | } 193 | } 194 | 195 | /// Registers all the above functions in a Python sub-module. 196 | /// 197 | /// Called from your `#[pymodule]` entry-point. 
198 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> { 199 | m.add_class::()?; 200 | m.add_class::()?; 201 | Ok(()) 202 | } 203 | -------------------------------------------------------------------------------- /rust/rsnormalizer.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::wrap_pyfunction; 3 | use rayon::prelude::*; 4 | use unidecode::unidecode; 5 | 6 | /// Normalize text by converting to lowercase, removing punctuation, and trimming whitespace. 7 | /// 8 | /// # Arguments 9 | /// 10 | /// * `text` - A string slice that holds the text to normalize. 11 | /// 12 | /// # Returns 13 | /// 14 | /// A String that holds the normalized text. 15 | #[pyfunction] 16 | pub fn rsnormalize(text: &str) -> String { 17 | unidecode(text) 18 | .to_lowercase() 19 | .chars() 20 | .filter(|c| !c.is_ascii_punctuation()) 21 | .collect::() 22 | .trim() 23 | .to_string() 24 | } 25 | 26 | /// Normalize multiple texts. 27 | /// 28 | /// # Arguments 29 | /// 30 | /// * `texts` - A vector of strings that holds the texts to normalize. 31 | /// 32 | /// # Returns 33 | /// 34 | /// A vector of strings that holds the normalized texts. 35 | #[pyfunction] 36 | pub fn rsnormalize_many(texts: Vec) -> Vec { 37 | texts.par_iter().map(|text| rsnormalize(text)).collect() 38 | } 39 | 40 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> { 41 | m.add_function(wrap_pyfunction!(rsnormalize, m)?)?; 42 | m.add_function(wrap_pyfunction!(rsnormalize_many, m)?)?; 43 | Ok(()) 44 | } 45 | 46 | #[cfg(test)] 47 | mod tests { 48 | use super::*; 49 | 50 | #[test] 51 | fn test_rsnormalize() { 52 | assert_eq!(rsnormalize("Hello World! 😀"), "hello world"); 53 | assert_eq!(rsnormalize("1,2,3,4"), "1234"); 54 | } 55 | 56 | #[test] 57 | fn test_rsnormalize_many() { 58 | let input = vec!["Hello World! 😀".to_string(), "Goodbye, World!".to_string()]; 59 | let expected = vec!["hello world".to_string(), "goodbye world".to_string()]; 60 | assert_eq!(rsnormalize_many(input), expected); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /rust/rssparse.rs: -------------------------------------------------------------------------------- 1 | use crate::rsvectorizer::rsvectorize_many; 2 | use bincode::{deserialize, serialize}; 3 | use numpy::PyArray1; 4 | use pyo3::prelude::*; 5 | use pyo3::types::{PyBytes, PyModule}; // NEW 6 | use serde::{Deserialize, Serialize}; 7 | use std::collections::HashMap; 8 | 9 | // --------------------------------------------------------------------------- 10 | // Sparse-matrix builder 11 | // --------------------------------------------------------------------------- 12 | 13 | #[derive(Clone, Debug, Serialize, Deserialize)] 14 | #[pyclass(module = "lenlp.sparse.count_vectorizer")] 15 | pub struct SparseMatrixBuilder { 16 | analyzer: String, 17 | n_sizes: Vec, 18 | stop_words: Option>, 19 | normalize: Option, 20 | vocab: HashMap, 21 | num_cols: usize, 22 | } 23 | 24 | #[pymethods] 25 | impl SparseMatrixBuilder { 26 | #[new] 27 | pub fn new( 28 | n_sizes: Vec, 29 | analyzer: String, 30 | stop_words: Option>, 31 | normalize: Option, 32 | ) -> Self { 33 | Self { 34 | vocab: HashMap::new(), 35 | n_sizes, 36 | analyzer, 37 | stop_words, 38 | normalize, 39 | num_cols: 0, 40 | } 41 | } 42 | 43 | /// Build the vocabulary and return the CSR triplet arrays. 
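///
/// The three arrays returned are `(values, row_indices, column_indices)`; the
/// Python wrapper feeds them to `scipy.sparse.csr_matrix((values, (rows, cols)),
/// shape=(n_docs, num_cols))` (see `CountVectorizer` in
/// `python/lenlp/sparse/count_vectorizer.py`).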
--------------------------------------------------------------------------------
/rust/rssparse.rs:
--------------------------------------------------------------------------------

use crate::rsvectorizer::rsvectorize_many;
use bincode::{deserialize, serialize};
use numpy::PyArray1;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyModule}; // NEW
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

// ---------------------------------------------------------------------------
// Sparse-matrix builder
// ---------------------------------------------------------------------------

#[derive(Clone, Debug, Serialize, Deserialize)]
#[pyclass(module = "lenlp.sparse.count_vectorizer")]
pub struct SparseMatrixBuilder {
    analyzer: String,
    n_sizes: Vec<usize>,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
    vocab: HashMap<String, usize>,
    num_cols: usize,
}

#[pymethods]
impl SparseMatrixBuilder {
    #[new]
    pub fn new(
        n_sizes: Vec<usize>,
        analyzer: String,
        stop_words: Option<Vec<String>>,
        normalize: Option<bool>,
    ) -> Self {
        Self {
            vocab: HashMap::new(),
            n_sizes,
            analyzer,
            stop_words,
            normalize,
            num_cols: 0,
        }
    }

    /// Build the vocabulary and return the (values, rows, columns) triplet arrays
    /// used to construct a `scipy.sparse.csr_matrix`.
    pub fn fit_transform(
        &mut self,
        texts: Vec<String>,
        py: Python<'_>,
    ) -> (
        Py<PyArray1<usize>>,
        Py<PyArray1<usize>>,
        Py<PyArray1<usize>>,
    ) {
        self.vocab = HashMap::new();
        let texts: Vec<HashMap<String, usize>> = rsvectorize_many(
            texts,
            self.n_sizes.clone(),
            self.analyzer.clone(),
            self.stop_words.clone(),
            self.normalize,
        );

        self._fit(texts.clone());

        // A scipy csr_matrix is faster to build from numpy arrays.
        let (vec1, vec2, vec3) = self._transform(texts);
        (
            PyArray1::from_vec_bound(py, vec1).into(),
            PyArray1::from_vec_bound(py, vec2).into(),
            PyArray1::from_vec_bound(py, vec3).into(),
        )
    }

    pub fn fit(&mut self, texts: Vec<String>) {
        self.vocab = HashMap::new();
        let texts: Vec<HashMap<String, usize>> = rsvectorize_many(
            texts,
            self.n_sizes.clone(),
            self.analyzer.clone(),
            self.stop_words.clone(),
            self.normalize,
        );

        self._fit(texts);
    }

    fn _fit(&mut self, texts: Vec<HashMap<String, usize>>) {
        let mut col_index: usize = 0;
        for doc in &texts {
            for token in doc.keys() {
                if !self.vocab.contains_key(token) {
                    self.vocab.insert(token.clone(), col_index);
                    col_index += 1;
                }
            }
        }
        self.num_cols = col_index;
    }

    pub fn transform(
        &self,
        texts: Vec<String>,
        py: Python<'_>,
    ) -> (
        Py<PyArray1<usize>>,
        Py<PyArray1<usize>>,
        Py<PyArray1<usize>>,
    ) {
        let texts: Vec<HashMap<String, usize>> = rsvectorize_many(
            texts,
            self.n_sizes.clone(),
            self.analyzer.clone(),
            self.stop_words.clone(),
            self.normalize,
        );

        // A scipy csr_matrix is faster to build from numpy arrays.
        let (vec1, vec2, vec3) = self._transform(texts);
        (
            PyArray1::from_vec_bound(py, vec1).into(),
            PyArray1::from_vec_bound(py, vec2).into(),
            PyArray1::from_vec_bound(py, vec3).into(),
        )
    }

    fn _transform(
        &self,
        texts: Vec<HashMap<String, usize>>,
    ) -> (Vec<usize>, Vec<usize>, Vec<usize>) {
        let mut values: Vec<usize> = Vec::new();
        let mut row_indices: Vec<usize> = Vec::new();
        let mut column_indices: Vec<usize> = Vec::new();

        for (row_idx, doc) in texts.iter().enumerate() {
            for (token, &count) in doc.iter() {
                if let Some(&col_idx) = self.vocab.get(token) {
                    values.push(count);
                    row_indices.push(row_idx);
                    column_indices.push(col_idx);
                }
            }
        }

        (values, row_indices, column_indices)
    }

    // ---------------------------------------------------------------------
    // Accessors
    // ---------------------------------------------------------------------
    pub fn get_vocab(&self) -> HashMap<String, usize> {
        self.vocab.clone()
    }

    pub fn get_num_cols(&self) -> usize {
        self.num_cols
    }

    // ---------------------------------------------------------------------
    // Pickle support
    // ---------------------------------------------------------------------

    pub fn __setstate__(&mut self, state: &Bound<'_, PyBytes>) -> PyResult<()> {
        *self = deserialize(state.as_bytes()).unwrap();
        Ok(())
    }

    pub fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<&'py PyBytes> {
        Ok(PyBytes::new(py, &serialize(&self).unwrap()))
    }

    pub fn __getnewargs__(
        &self,
    ) -> PyResult<(Vec<usize>, String, Option<Vec<String>>, Option<bool>)> {
        Ok((
            self.n_sizes.clone(),
            self.analyzer.clone(),
            self.stop_words.clone(),
            self.normalize,
        ))
    }
}
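// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the upstream file): the three returned
// arrays are the (values, row_indices, column_indices) triplets that the
// Python layer feeds to `scipy.sparse.csr_matrix`. A minimal consistency
// check, assuming the "word" analyzer with n_sizes = [1] yields at least one
// whitespace-separated token:
// ---------------------------------------------------------------------------
#[cfg(test)]
mod sparse_examples {
    use super::*;

    #[test]
    fn fit_then_transform_produces_aligned_triplets() {
        let mut builder =
            SparseMatrixBuilder::new(vec![1], "word".to_string(), None, Some(true));
        builder.fit(vec!["the cat sat".to_string(), "the dog sat".to_string()]);
        assert!(builder.get_num_cols() > 0);

        let docs = rsvectorize_many(
            vec!["the cat".to_string()],
            vec![1],
            "word".to_string(),
            None,
            Some(true),
        );
        let (values, rows, cols) = builder._transform(docs);

        // One (value, row, col) entry per (document, token) pair found in the vocab.
        assert_eq!(values.len(), rows.len());
        assert_eq!(rows.len(), cols.len());
        assert!(rows.iter().all(|&r| r == 0)); // single document -> row 0
        assert!(cols.iter().all(|&c| c < builder.get_num_cols()));
    }
}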
// ---------------------------------------------------------------------------
// Module registration
// ---------------------------------------------------------------------------

pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<SparseMatrixBuilder>()?;
    Ok(())
}

--------------------------------------------------------------------------------
/rust/rsstop_words.rs:
--------------------------------------------------------------------------------

use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use rayon::prelude::*;
use std::collections::HashSet;

/// Filter stop words out of a string.
///
/// # Arguments
///
/// * `text` - The input text.
/// * `stop_words` - The stop words to filter out.
///
/// # Returns
///
/// A string with the stop words removed.
#[pyfunction]
pub fn rsfilter_stop_words(text: &str, stop_words: Vec<String>) -> String {
    // Use a HashSet for constant-time membership checks.
    let stop_words_set: HashSet<_> = stop_words.into_iter().collect();
    text.split_whitespace()
        .filter(|word: &&str| !stop_words_set.contains(*word))
        .collect::<Vec<&str>>()
        .join(" ")
}

/// Filter stop words out of multiple strings in parallel.
///
/// # Arguments
///
/// * `texts` - The input texts.
/// * `stop_words` - The stop words to filter out.
///
/// # Returns
///
/// A vector of strings with the stop words removed.
#[pyfunction]
pub fn rsfilter_stop_words_many(texts: Vec<String>, stop_words: Vec<String>) -> Vec<String> {
    // Use a HashSet for constant-time membership checks.
    let stop_words_set: HashSet<_> = stop_words.into_iter().collect();
    texts
        .into_par_iter()
        .map(|sentence: String| {
            sentence
                .split_whitespace()
                .filter(|word: &&str| !stop_words_set.contains(*word))
                .collect::<Vec<&str>>()
                .join(" ")
        })
        .collect()
}

pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(rsfilter_stop_words, m)?)?;
    m.add_function(wrap_pyfunction!(rsfilter_stop_words_many, m)?)?;
    Ok(())
}
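// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the upstream file): the filters match
// whitespace-separated tokens exactly, so casing matters and no normalization
// is applied at this stage.
// ---------------------------------------------------------------------------
#[cfg(test)]
mod stop_word_examples {
    use super::*;

    #[test]
    fn filters_exact_matches_only() {
        let stop_words = vec!["the".to_string(), "a".to_string()];
        assert_eq!(
            rsfilter_stop_words("the cat sat on a mat", stop_words.clone()),
            "cat sat on mat"
        );
        // "The" (capitalised) is not in the stop-word list, so it is kept.
        assert_eq!(rsfilter_stop_words("The cat", stop_words), "The cat");
    }
}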
--------------------------------------------------------------------------------
/rust/rsvectorizer.rs:
--------------------------------------------------------------------------------

use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use rayon::prelude::*;
use std::collections::HashMap;

use crate::rsanalyzer::rschar_ngrams;
use crate::rsanalyzer::rschar_wb_ngrams;
use crate::rsanalyzer::rssplit_words;
use crate::rscounter::rscount;
use crate::rsnormalizer::rsnormalize_many;
use crate::rsstop_words::rsfilter_stop_words_many;

/// Optionally normalize the texts, then optionally remove stop words.
pub fn process_texts(
    texts: Vec<String>,
    normalize: Option<bool>,
    stop_words: Option<Vec<String>>,
) -> Vec<String> {
    let texts: Vec<String> = match normalize {
        Some(true) => rsnormalize_many(texts),
        _ => texts,
    };

    match stop_words {
        Some(stop_words) => rsfilter_stop_words_many(texts, stop_words),
        None => texts,
    }
}

#[pyfunction]
pub fn rsvectorize_split_words_many(
    texts: Vec<String>,
    n_sizes: Vec<usize>,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
) -> Vec<HashMap<String, usize>> {
    let texts: Vec<String> = process_texts(texts, normalize, stop_words);
    texts
        .par_iter()
        .map(|text: &String| rscount(rssplit_words(text, n_sizes.clone())))
        .collect()
}

#[pyfunction]
pub fn rsvectorize_char_ngrams_many(
    texts: Vec<String>,
    n_sizes: Vec<usize>,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
) -> Vec<HashMap<String, usize>> {
    let texts: Vec<String> = process_texts(texts, normalize, stop_words);
    texts
        .par_iter()
        .map(|text: &String| rscount(rschar_ngrams(text, n_sizes.clone())))
        .collect()
}

#[pyfunction]
pub fn rsvectorize_char_wb_ngrams_many(
    texts: Vec<String>,
    n_sizes: Vec<usize>,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
) -> Vec<HashMap<String, usize>> {
    let texts: Vec<String> = process_texts(texts, normalize, stop_words);
    texts
        .par_iter()
        .map(|text: &String| rscount(rschar_wb_ngrams(text, n_sizes.clone())))
        .collect()
}

// Main vectorization function: dispatch on the analyzer name ("word", "char", "char_wb").
#[pyfunction]
pub fn rsvectorize_many(
    texts: Vec<String>,
    n_sizes: Vec<usize>,
    analyzer: String,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
) -> Vec<HashMap<String, usize>> {
    match analyzer.as_str() {
        "word" => rsvectorize_split_words_many(texts, n_sizes, stop_words, normalize),
        "char" => rsvectorize_char_ngrams_many(texts, n_sizes, stop_words, normalize),
        "char_wb" => rsvectorize_char_wb_ngrams_many(texts, n_sizes, stop_words, normalize),
        _ => panic!("Invalid analyzer type"),
    }
}

pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(rsvectorize_split_words_many, m)?)?;
    m.add_function(wrap_pyfunction!(rsvectorize_char_ngrams_many, m)?)?;
    m.add_function(wrap_pyfunction!(rsvectorize_char_wb_ngrams_many, m)?)?;
    m.add_function(wrap_pyfunction!(rsvectorize_many, m)?)?;

    Ok(())
}

--------------------------------------------------------------------------------