├── .github └── workflows │ └── publish.yml ├── .gitignore ├── Cargo.toml ├── README.md ├── docs ├── bm25.png ├── count_vectorizer.png ├── count_vectorizer_char.png ├── flashtext.png ├── logo.png └── tfidf.png ├── pyproject.toml ├── python └── lenlp │ ├── __init__.py │ ├── analyzer │ ├── __init__.py │ └── analyze.py │ ├── counter │ ├── __init__.py │ └── count.py │ ├── flash │ ├── __init__.py │ └── flash_text.py │ ├── normalizer │ ├── __init__.py │ └── normalize.py │ └── sparse │ ├── __init__.py │ ├── bm25_vectorizer.py │ ├── count_vectorizer.py │ └── tfidf_vectorizer.py └── rust ├── lib.rs ├── rsanalyzer.rs ├── rscounter.rs ├── rsflashtext.rs ├── rsnormalizer.rs ├── rssparse.rs ├── rsstop_words.rs └── rsvectorizer.rs /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by maturin v1.5.1 2 | # To update, run 3 | # 4 | # maturin generate-ci github 5 | # 6 | name: CI 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | workflow_dispatch: 13 | 14 | permissions: 15 | contents: read 16 | 17 | jobs: 18 | linux: 19 | runs-on: ${{ matrix.runner }} 20 | strategy: 21 | matrix: 22 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] 23 | target: [x86_64, x86, aarch64, armv7, s390x, ppc64le] 24 | runner: [ubuntu-latest] 25 | exclude: 26 | - target: x86 27 | python-version: ['3.8', '3.9', '3.11', '3.12'] 28 | - target: aarch64 29 | python-version: ['3.8', '3.9', '3.11', '3.12'] 30 | - target: armv7 31 | python-version: ['3.8', '3.9', '3.11', '3.12'] 32 | - target: s390x 33 | python-version: ['3.8', '3.9', '3.11', '3.12'] 34 | - target: ppc64le 35 | python-version: ['3.8', '3.9', '3.11', '3.12'] 36 | steps: 37 | - uses: actions/checkout@v4 38 | - uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | - name: Build wheels 42 | uses: PyO3/maturin-action@v1 43 | with: 44 | target: ${{ matrix.target }} 45 | args: --release --out dist --find-interpreter 46 | sccache: 'true' 47 | manylinux: auto 48 | - name: Upload wheels 49 | uses: actions/upload-artifact@v4 50 | with: 51 | name: wheels-linux-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }} 52 | path: dist 53 | 54 | windows: 55 | runs-on: ${{ matrix.runner }} 56 | strategy: 57 | matrix: 58 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] 59 | target: [x64, x86] 60 | runner: [windows-latest] 61 | exclude: 62 | - target: x86 63 | python-version: ['3.8', '3.9', '3.11', '3.12'] 64 | steps: 65 | - uses: actions/checkout@v4 66 | - uses: actions/setup-python@v5 67 | with: 68 | python-version: ${{ matrix.python-version }} 69 | architecture: ${{ matrix.target }} 70 | - name: Build wheels 71 | uses: PyO3/maturin-action@v1 72 | with: 73 | target: ${{ matrix.target }} 74 | args: --release --out dist --find-interpreter 75 | sccache: 'true' 76 | - name: Upload wheels 77 | uses: actions/upload-artifact@v4 78 | with: 79 | name: wheels-windows-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }} 80 | path: dist 81 | 82 | macos: 83 | runs-on: ${{ matrix.runner }} 84 | strategy: 85 | matrix: 86 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] 87 | target: [x86_64, aarch64] 88 | runner: [macos-latest] 89 | exclude: 90 | - target: aarch64 91 | python-version: ['3.8', '3.9', '3.11', '3.12'] 92 | steps: 93 | - uses: actions/checkout@v4 94 | - uses: actions/setup-python@v5 95 | with: 96 | python-version: ${{ matrix.python-version }} 97 | - name: Build wheels 98 | uses: 
PyO3/maturin-action@v1 99 | with: 100 | target: ${{ matrix.target }} 101 | args: --release --out dist --find-interpreter 102 | sccache: 'true' 103 | - name: Upload wheels 104 | uses: actions/upload-artifact@v4 105 | with: 106 | name: wheels-macos-${{ matrix.target }}-py${{ matrix.python-version }}-${{ github.run_id }} 107 | path: dist 108 | 109 | sdist: 110 | runs-on: ubuntu-latest 111 | steps: 112 | - uses: actions/checkout@v4 113 | - name: Build sdist 114 | uses: PyO3/maturin-action@v1 115 | with: 116 | command: sdist 117 | args: --out dist 118 | - name: Upload sdist 119 | uses: actions/upload-artifact@v4 120 | with: 121 | name: wheels-sdist-${{ github.run_id }} 122 | path: dist 123 | 124 | release: 125 | name: Release 126 | runs-on: ubuntu-latest 127 | needs: [linux, windows, macos, sdist] 128 | steps: 129 | - uses: actions/download-artifact@v4 130 | - name: Publish to PyPI 131 | uses: PyO3/maturin-action@v1 132 | env: 133 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_PASSWORD }} 134 | with: 135 | command: upload 136 | args: --non-interactive --skip-existing wheels-*/* 137 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | .DS_Store 4 | 5 | *.json 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | *.test 13 | *.onx 14 | *.qonx 15 | *.DS_Store 16 | *.pyc 17 | *.ipynb_checkpoints 18 | *.pickle 19 | *.pkl 20 | *.icloud 21 | cache/ 22 | # C extensions 23 | *.so 24 | test/ 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | pip-wheel-metadata/ 41 | share/python-wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | MANIFEST 46 | 47 | # PyInstaller 48 | # Usually these files are written by a python script from a template 49 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 50 | *.manifest 51 | *.spec 52 | 53 | # Installer logs 54 | pip-log.txt 55 | pip-delete-this-directory.txt 56 | 57 | # Unit test / coverage reports 58 | htmlcov/ 59 | .tox/ 60 | .nox/ 61 | .coverage 62 | .coverage.* 63 | .cache 64 | nosetests.xml 65 | coverage.xml 66 | *.cover 67 | *.py,cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # IPython 98 | profile_default/ 99 | ipython_config.py 100 | 101 | # pyenv 102 | .python-version 103 | 104 | # pipenv 105 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 106 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 107 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 108 | # install all needed dependencies. 109 | #Pipfile.lock 110 | 111 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 112 | __pypackages__/ 113 | 114 | # Celery stuff 115 | celerybeat-schedule 116 | celerybeat.pid 117 | 118 | # SageMath parsed files 119 | *.sage.py 120 | 121 | # Environments 122 | .env 123 | .venv 124 | env/ 125 | venv/ 126 | ENV/ 127 | env.bak/ 128 | venv.bak/ 129 | 130 | # Spyder project settings 131 | .spyderproject 132 | .spyproject 133 | 134 | # Rope project settings 135 | .ropeproject 136 | 137 | # mkdocs documentation 138 | /site 139 | 140 | # mypy 141 | .mypy_cache/ 142 | .dmypy.json 143 | dmypy.json 144 | 145 | # Pyre type checker 146 | .pyre/ 147 | test.ipynb -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "_rslenlp" 3 | edition = "2021" 4 | 5 | [lib] 6 | name = "_rslenlp" 7 | crate-type = ["cdylib"] 8 | path = "rust/lib.rs" 9 | 10 | [dependencies] 11 | unidecode = "0.3.0" 12 | rayon = "1.10.0" 13 | pyo3 = { version = "0.24.2", features = [ 14 | "extension-module", 15 | "generate-import-lib", 16 | ] } 17 | serde = { version = "1.0.202", features = ["derive"] } 18 | serde_json = { version = "1.0.117" } 19 | bincode = "1.3.3" 20 | ndarray = "0.15" 21 | numpy = "0.24" 22 | 23 | [profile.dev] 24 | opt-level = 0 25 | 26 | [profile.release] 27 | opt-level = 3 28 | 29 | [tool.maturin] 30 | features = ["pyo3/extension-module"] 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | # LeNLP
3 | Natural Language Processing toolbox for Python with Rust
4 | 
5 | 
6 | [logo: docs/logo.png]
7 | 
8 | 
9 | 
10 | [license badge]
11 | 
12 | 13 | 14 | LeNLP is a toolkit dedicated to natural language processing (NLP). It provides optimized and parallelized functions in Rust for use in Python, offering high performance and ease of integration. 15 | 16 | ## Installation 17 | 18 | We can install LeNLP using: 19 | 20 | ``` 21 | pip install lenlp 22 | ``` 23 | 24 | ## Sections 25 | 26 | - [Installation](#installation) 27 | - [Quick Start](#quick-start) 28 | - [Sparse Module](#sparse-module) 29 | - [CountVectorizer](#countvectorizer) 30 | - [TfidfVectorizer](#TfidfVectorizer) 31 | - [BM25Vectorizer](#bm25vectorizer) 32 | - [FlashText](#flashtext) 33 | - [Extras](#extras) 34 | - [Counter](#counter) 35 | - [Normalizer](#normalizer) 36 | 37 | ## Quick Start 38 | 39 | ### Sparse Module 40 | 41 | The `sparse` module offers a variety of vectorizers and transformers for text data. These sparse matrices are `scipy.sparse.csr_matrix` objects, optimized for memory usage and speed. They can be used as drop-in replacements for `scikit-learn` vectorizers. 42 | 43 | #### CountVectorizer 44 | 45 | The `CountVectorizer` converts a list of texts into a sparse matrix of token counts. This is a Rust implementation of the `CountVectorizer` from `scikit-learn`. 46 | 47 | ```python 48 | from lenlp import sparse 49 | 50 | vectorizer = sparse.CountVectorizer( 51 | ngram_range=(3, 5), # range of n-grams 52 | analyzer="char_wb", # word, char, char_wb 53 | normalize=True, # lowercase and strip accents 54 | stop_words=["based"], # list of stop words 55 | ) 56 | ``` 57 | 58 | You can fit the vectorizer and transform a list of texts into a sparse matrix of token counts: 59 | 60 | ```python 61 | X = [ 62 | "Hello World", 63 | "Rust based vectorizer" 64 | ] 65 | 66 | matrix = vectorizer.fit_transform(X) 67 | ``` 68 | 69 | Or use separate calls: 70 | 71 | ```python 72 | vectorizer.fit(X) 73 | matrix = vectorizer.transform(X) 74 | ``` 75 | 76 | Benchmark: 77 | 78 |

79 | 80 | LeNLP CountVectorizer versus Sklearn CountVectorizer `fit_transform` with `char` analyzer. 81 | 82 | #### TfidfVectorizer 83 | 84 | The `TfidfVectorizer` converts a list of texts into a sparse matrix of tf-idf weights, implemented in Rust. 85 | 86 | ```python 87 | from lenlp import sparse 88 | 89 | vectorizer = sparse.TfidfVectorizer( 90 | ngram_range=(3, 5), # Range of n-grams 91 | analyzer="char_wb", # Options: word, char, char_wb 92 | normalize=True, # Lowercase and strip accents 93 | stop_words=["based"] # List of stop words 94 | ) 95 | ``` 96 | 97 | Fit the vectorizer and transform texts: 98 | 99 | ```python 100 | X = [ 101 | "Hello World", 102 | "Rust based vectorizer" 103 | ] 104 | 105 | matrix = vectorizer.fit_transform(X) 106 | ``` 107 | 108 | Or use separate calls: 109 | 110 | ```python 111 | vectorizer.fit(X) 112 | matrix = vectorizer.transform(X) 113 | ``` 114 | 115 | Benchmark: 116 | 117 |

118 | 119 | LeNLP TfidfVectorizer versus Sklearn TfidfVectorizer `fit_transform` with `char` analyzer. 120 | 121 | #### BM25Vectorizer 122 | 123 | The `BM25Vectorizer` converts texts into a sparse matrix of BM25 weights, which are more accurate than tf-idf and count weights. 124 | 125 | ```python 126 | from lenlp import sparse 127 | 128 | vectorizer = sparse.BM25Vectorizer( 129 | ngram_range=(3, 5), # Range of n-grams 130 | analyzer="char_wb", # Options: word, char, char_wb 131 | normalize=True, # Lowercase and strip accents 132 | stop_words=["based"] # List of stop words 133 | ) 134 | ``` 135 | 136 | Fit the vectorizer and transform texts: 137 | 138 | ```python 139 | X = [ 140 | "Hello World", 141 | "Rust based vectorizer" 142 | ] 143 | 144 | matrix = vectorizer.fit_transform(X) 145 | ``` 146 | 147 | Or use separate calls: 148 | 149 | ```python 150 | vectorizer.fit(X) 151 | matrix = vectorizer.transform(X) 152 | ``` 153 | 154 | Benchmark: 155 | 156 |

157 | 158 | 159 | LeNLP BM25Vectorizer versus LeNLP TfidfVectorizer `fit_transform` with `char` analyzer. BM25Vectorizer counterpart is not available in Sklearn. 160 | 161 | ### FlashText 162 | 163 | The `flashtext` module allows for efficient keyword extraction from texts. It implements the FlashText algorithm as described in the paper *[Replace or Retrieve Keywords In Documents At Scale](https://arxiv.org/pdf/1711.00046)*. 164 | 165 | ```python 166 | from lenlp import flash 167 | 168 | flash_text = flash.FlashText( 169 | normalize=True # remove accents and lowercase 170 | ) 171 | 172 | # Add keywords we want to retrieve: 173 | flash_text.add(["paris", "bordeaux", "toulouse"]) 174 | ``` 175 | 176 | Extract keywords and their positions from sentences: 177 | 178 | ```python 179 | sentences = [ 180 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux", 181 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse", 182 | ] 183 | 184 | flash_text.extract(sentences) 185 | ``` 186 | 187 | Output: 188 | 189 | ```python 190 | [[('toulouse', 0, 8), ('bordeaux', 60, 68), ('bordeaux', 74, 82)], 191 | [('paris', 0, 5), ('bordeaux', 62, 70), ('toulouse', 76, 84)]] 192 | ``` 193 | 194 | The FlashText algorithm is highly efficient, significantly faster than regular expressions for keyword extraction. LeNLP's implementation normalizes input documents by removing accents and converting to lowercase to enhance keyword extraction. 195 | 196 | Benchmark: 197 | 198 |

199 | 200 | LeNLP FlashText is benchmarked versus the official implementation of [FlashText](https://github.com/vi3k6i5/flashtext). 201 | 202 | ### Extras 203 | 204 | #### Counter 205 | 206 | The counter module allows to convert a list of texts into a dictionary of token counts. 207 | 208 | ```python 209 | from lenlp import counter 210 | 211 | sentences = [ 212 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux", 213 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse", 214 | ] 215 | 216 | counter.count( 217 | sentences, 218 | ngram_range=(1, 1), # Range of n-grams 219 | analyzer="word", # Options: word, char, char_wb 220 | normalize=True, # Lowercase and strip accents 221 | stop_words=["its", "in", "is", "of", "the", "and", "to", "a"] # List of stop words 222 | ) 223 | ``` 224 | 225 | Output: 226 | 227 | ```python 228 | [{'compared': 1, 229 | 'south': 1, 230 | 'city': 1, 231 | 'toulouse': 1, 232 | 'bordeaux': 2, 233 | 'france': 1}, 234 | {'toulouse': 1, 235 | 'france': 1, 236 | 'capital': 1, 237 | 'paris': 1, 238 | 'north': 1, 239 | 'compared': 1, 240 | 'bordeaux': 1}] 241 | ``` 242 | 243 | #### Normalizer 244 | 245 | The normalizer module allows to normalize a list of texts by removing accents and converting to lowercase. 246 | 247 | ```python 248 | from lenlp import normalizer 249 | 250 | sentences = [ 251 | "Toulouse is a city in France, it's in the south compared to bordeaux, and bordeaux", 252 | "Paris is the capital of France, it's in the north compared to bordeaux, and toulouse", 253 | ] 254 | 255 | normalizer.normalize(sentences) 256 | ``` 257 | 258 | Output: 259 | 260 | ```python 261 | [ 262 | 'toulouse is a city in france its in the south compared to bordeaux and bordeaux', 263 | 'paris is the capital of france its in the north compared to bordeaux and toulouse', 264 | ] 265 | ``` 266 | 267 | ## References 268 | 269 | - *[FlashText](https://github.com/vi3k6i5/flashtext)* 270 | - *[Scikit Learn](https://github.com/scikit-learn/scikit-learn)* 271 | - *[PyO3](https://github.com/PyO3/pyo3)* 272 | - *[Maturin](https://github.com/PyO3/maturin)* 273 | 274 | -------------------------------------------------------------------------------- /docs/bm25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/bm25.png -------------------------------------------------------------------------------- /docs/count_vectorizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/count_vectorizer.png -------------------------------------------------------------------------------- /docs/count_vectorizer_char.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/count_vectorizer_char.png -------------------------------------------------------------------------------- /docs/flashtext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/flashtext.png -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/logo.png -------------------------------------------------------------------------------- /docs/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphaelsty/LeNLP/9f079a8454717d96fb0c3a8d8d7fd3a1ee809efa/docs/tfidf.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin >= 1.5.1"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "lenlp" 7 | version = "1.2.0" 8 | description = "Natural Language Processing library for Python from Rust." 9 | 10 | authors = [{ name = "Raphael Sourty", email = "raphael.sourty@gmail.com" }] 11 | 12 | 13 | keywords = [] 14 | 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "Programming Language :: Rust", 18 | "Operating System :: OS Independent", 19 | ] 20 | 21 | requires-python = ">=3.8" 22 | 23 | dependencies = ["scikit-learn >= 1.5.0", "scipy >= 1.13.1"] 24 | 25 | [project.urls] 26 | Homepage = "https://github.com/raphaelsty/lenlp" 27 | Documentation = "https://github.com/raphaelsty/lenlp" 28 | Repository = "https://github.com/raphaelsty/lenlp" 29 | 30 | [project.optional-dependencies] 31 | dev = [ 32 | "maturin >= 1.5.1", 33 | "pytest-cov >= 5.0.0", 34 | "pytest >= 7.4.4", 35 | "ruff >= 0.1.15", 36 | ] 37 | [tool.maturin] 38 | bindings = "pyo3" 39 | features = ["pyo3/extension-module"] 40 | python-source = "python" 41 | module-name = "lenlp._rslenlp" 42 | 43 | [tool.include] 44 | include = ["Cargo.toml", "pyproject.toml", "README.md", "rust/*"] 45 | 46 | [tool.pytest.ini_options] 47 | filterwarnings = [ 48 | "ignore::DeprecationWarning", 49 | "ignore::RuntimeWarning", 50 | "ignore::UserWarning", 51 | ] 52 | addopts = [ 53 | "--doctest-modules", 54 | "--verbose", 55 | "-ra", 56 | "--cov-config=.coveragerc", 57 | "-m not web and not slow", 58 | ] 59 | doctest_optionflags = ["NORMALIZE_WHITESPACE", "NUMBER"] 60 | norecursedirs = ["build", "docs", "node_modules"] 61 | markers = [ 62 | "web: tests that require using the Internet", 63 | "slow: tests that take a long time to run", 64 | ] 65 | -------------------------------------------------------------------------------- /python/lenlp/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "analyzer", 3 | "counter", 4 | "flash", 5 | "normalizer", 6 | "sparse", 7 | ] 8 | -------------------------------------------------------------------------------- /python/lenlp/analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | from .analyze import analyze 2 | 3 | __all__ = [ 4 | "analyze", 5 | ] 6 | -------------------------------------------------------------------------------- /python/lenlp/analyzer/analyze.py: -------------------------------------------------------------------------------- 1 | from lenlp._rslenlp import rschar_ngrams_many, rschar_wb_ngrams_many, rssplit_words_many 2 | 3 | __all__ = ["analyze"] 4 | 5 | 6 | def analyze( 7 | x: str | list[str], 8 | analyzer: str = "word", 9 | ngram_range: tuple[int, int] = (1, 1), 10 | ) -> str | list[str]: 11 | """Split text or list of texts into words or characters. 12 | 13 | Parameters 14 | ---------- 15 | x 16 | str or list of str. 17 | analyzer 18 | {word, char, char_wb}, default=word. 
19 | Whether the feature should be made of word n-gram or character n-grams. Option 20 | char_wb creates character n-grams only from text inside word boundaries; 21 | n-grams at the edges of words are padded with space. 22 | ngram_range 23 | tuple (min_n, max_n), default=(1). 24 | The lower and upper boundary of the range of n-values for different n-grams to 25 | be extracted. All values of n such that min_n <= n <= max_n will be used. 26 | Examples 27 | -------- 28 | >>> from lenlp import analyzer 29 | 30 | >>> analyzer.analyze("Hello, world!", analyzer="word") 31 | ['Hello,', 'world!'] 32 | 33 | >>> analyzer.analyze("Hello, world!", analyzer="char_wb", ngram_range=(3, 3)) 34 | ['Hel', 'ell', 'llo', 'lo,', 'o, ', ', w', ' wo', 'wor', 'orl', 'rld', 'ld!'] 35 | 36 | >>> analyzer.analyze(["hello, world", "good"], analyzer="char", ngram_range=(2, 3)) 37 | [['he', 'el', 'll', 'lo', 'o,', ', ', ' w', 'wo', 'or', 'rl', 'ld', 'hel', 'ell', 'llo', 'lo,', 'o, ', ', w', ' wo', 'wor', 'orl', 'rld'], ['go', 'oo', 'od', 'goo', 'ood']] 38 | 39 | """ 40 | return_string = True if isinstance(x, str) else False 41 | x = [x] if isinstance(x, str) else x 42 | n_sizes = list(range(ngram_range[0], ngram_range[1] + 1)) 43 | 44 | match analyzer: 45 | case "word": 46 | y = rssplit_words_many(x, n_sizes=n_sizes) 47 | case "char": 48 | y = rschar_ngrams_many(x, n_sizes=n_sizes) 49 | case "char_wb": 50 | y = rschar_wb_ngrams_many(x, n_sizes=n_sizes) 51 | 52 | return y[0] if return_string else y 53 | -------------------------------------------------------------------------------- /python/lenlp/counter/__init__.py: -------------------------------------------------------------------------------- 1 | from .count import count 2 | 3 | __all__ = ["count"] 4 | -------------------------------------------------------------------------------- /python/lenlp/counter/count.py: -------------------------------------------------------------------------------- 1 | from lenlp._rslenlp import ( 2 | rsvectorize_char_ngrams_many, 3 | rsvectorize_char_wb_ngrams_many, 4 | rsvectorize_split_words_many, 5 | ) 6 | 7 | __all__ = ["count"] 8 | 9 | 10 | def count( 11 | x: str | list[str], 12 | analyzer: str = "word", 13 | ngram_range: tuple[int, int] = (1, 1), 14 | normalize: bool = True, 15 | stop_words: list[str] = None, 16 | sort: bool = False, 17 | ) -> dict[str, int]: 18 | """Count the frequency of words in a text or in a list of texts. Tokens are unordered within 19 | the same text. 20 | 21 | Parameters 22 | ---------- 23 | x 24 | str or list of str. 25 | analyzer 26 | {word, char, char_wb}, default=word. 27 | Whether the feature should be made of word n-gram or character n-grams. Option 28 | char_wb creates character n-grams only from text inside word boundaries; 29 | n-grams at the edges of words are padded with space. 30 | ngram_range 31 | tuple (min_n, max_n), default=1. 32 | The lower and upper boundary of the range of n-values for different n-grams to 33 | be extracted. All values of n such that min_n <= n <= max_n will be used. 34 | normalize 35 | bool, default=True. 36 | Whether to normalize the text before counting. It will lowercase the text and remove 37 | punctuation. 38 | stop_words 39 | list of str, default=None. 40 | A list of stop words that will be removed from the text. 
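sort
    bool, default=False.
    Whether to sort each output dictionary by token before returning it. The
    underlying Rust counter returns tokens in arbitrary order, so sorting gives a
    deterministic output (used by the doctests below).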
41 | 42 | Examples 43 | -------- 44 | >>> from lenlp import counter 45 | 46 | >>> counter.count("Hello, world!", sort=True) 47 | {'hello': 1, 'world': 1} 48 | 49 | >>> counter.count("Hello, world!", ngram_range=(2, 2), sort=True, normalize=False) 50 | {'Hello, world!': 1} 51 | 52 | >>> counter.count(["Hello, world!", "How are you?"], stop_words=["are", "you"], sort=True) 53 | [{'hello': 1, 'world': 1}, {'how': 1}] 54 | 55 | >>> counter.count(["Hello, world!", "hello"], analyzer="char_wb", ngram_range=(3, 7), stop_words=["hello"], sort=True) 56 | [{'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1}, {}] 57 | 58 | >>> counter.count("Hello, world!", analyzer="char_wb", ngram_range=(3, 7), sort=True) 59 | {' wo': 1, ' wor': 1, ' worl': 1, ' world': 1, 'ell': 1, 'ello': 1, 'ello ': 1, 'ello w': 1, 'ello wo': 1, 'hel': 1, 'hell': 1, 'hello': 1, 'hello ': 1, 'hello w': 1, 'llo': 1, 'llo ': 1, 'llo w': 1, 'llo wo': 1, 'llo wor': 1, 'lo ': 1, 'lo w': 1, 'lo wo': 1, 'lo wor': 1, 'lo worl': 1, 'o w': 1, 'o wo': 1, 'o wor': 1, 'o worl': 1, 'o world': 1, 'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1} 60 | 61 | >>> counter.count("Hello, world!", analyzer="char", ngram_range=(3, 7), sort=True) 62 | {' wo': 1, ' wor': 1, ' worl': 1, ' world': 1, 'ell': 1, 'ello': 1, 'ello ': 1, 'ello w': 1, 'ello wo': 1, 'hel': 1, 'hell': 1, 'hello': 1, 'hello ': 1, 'hello w': 1, 'llo': 1, 'llo ': 1, 'llo w': 1, 'llo wo': 1, 'llo wor': 1, 'lo ': 1, 'lo w': 1, 'lo wo': 1, 'lo wor': 1, 'lo worl': 1, 'o w': 1, 'o wo': 1, 'o wor': 1, 'o worl': 1, 'o world': 1, 'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1} 63 | 64 | >>> counter.count(["Hello, world!", "hello"], analyzer="char", ngram_range=(3, 7), stop_words=["hello"], sort=True) 65 | [{'orl': 1, 'orld': 1, 'rld': 1, 'wor': 1, 'worl': 1, 'world': 1}, {}] 66 | 67 | """ 68 | return_string = True if isinstance(x, str) else False 69 | x = [x] if isinstance(x, str) else x 70 | n_sizes = list(range(ngram_range[0], ngram_range[1] + 1)) 71 | 72 | match analyzer: 73 | case "word": 74 | y = rsvectorize_split_words_many( 75 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize 76 | ) 77 | case "char": 78 | y = rsvectorize_char_ngrams_many( 79 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize 80 | ) 81 | 82 | case "char_wb": 83 | y = rsvectorize_char_wb_ngrams_many( 84 | x, n_sizes=n_sizes, stop_words=stop_words, normalize=normalize 85 | ) 86 | 87 | if sort: 88 | y = [dict(sorted(d.items())) for d in y] 89 | 90 | return y[0] if return_string else y 91 | -------------------------------------------------------------------------------- /python/lenlp/flash/__init__.py: -------------------------------------------------------------------------------- 1 | from .flash_text import FlashText 2 | 3 | __all__ = ["FlashText"] 4 | -------------------------------------------------------------------------------- /python/lenlp/flash/flash_text.py: -------------------------------------------------------------------------------- 1 | from lenlp._rslenlp import RSKeywordProcessor 2 | 3 | __all__ = ["FlashText"] 4 | 5 | 6 | class FlashText: 7 | """FlashText retrieve keywords from text. 8 | 9 | Parameters 10 | ---------- 11 | lowercase 12 | bool, default=True. 13 | Whether to lowercase the text before extracting keywords. 14 | normalize 15 | bool, default=True. 16 | Whether to normalize the text before extracting keywords. It will lowercase the text 17 | and remove punctuation. 
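Note: when normalize is True the text is also lowercased as part of
normalization, so the lowercase flag has no additional effect in that case.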
18 | 19 | Examples 20 | -------- 21 | >>> from lenlp import flash 22 | 23 | >>> flash_text = flash.FlashText(normalize=True) 24 | >>> flash_text = flash_text.add(["hello", "world"]) 25 | 26 | >>> flash_text.extract(["Hello, world!", "world", "hello"]) 27 | [[('hello', 0, 5), ('world', 7, 12)], [('world', 0, 5)], [('hello', 0, 5)]] 28 | 29 | """ 30 | 31 | def __init__( 32 | self, 33 | lowercase: bool = True, 34 | normalize: bool = True, 35 | ) -> None: 36 | self.flash = RSKeywordProcessor(lowercase=lowercase, normalize=normalize) 37 | 38 | def add( 39 | self, 40 | x: str | list[str], 41 | clean_name: str | None = None, 42 | ) -> None: 43 | """Add a keyword to the FlashText object.""" 44 | x = [x] if isinstance(x, str) else x 45 | self.flash.add_keywords_many(x, clean_name) 46 | return self 47 | 48 | def extract(self, x: str | list[str]) -> list[str]: 49 | """Extract keywords from a sentence.""" 50 | is_string = isinstance(x, str) 51 | x = [x] if isinstance(x, str) else x 52 | y = self.flash.extract_keywords_many(x) 53 | return y[0] if is_string else y 54 | -------------------------------------------------------------------------------- /python/lenlp/normalizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalize import normalize 2 | 3 | __all__ = ["normalize"] 4 | -------------------------------------------------------------------------------- /python/lenlp/normalizer/normalize.py: -------------------------------------------------------------------------------- 1 | from lenlp._rslenlp import rsnormalize, rsnormalize_many 2 | 3 | __all__ = ["normalize"] 4 | 5 | 6 | def normalize(x: str | list[str]) -> str: 7 | """Lowercase, remove punctation and unidecode single text. 8 | 9 | Examples 10 | -------- 11 | >>> from lenlp import normalizer 12 | 13 | >>> normalizer.normalize("Hello, world!") 14 | 'hello world' 15 | 16 | >>> normalizer.normalize(["Hello, world!", "How are you?"]) 17 | ['hello world', 'how are you'] 18 | 19 | """ 20 | return rsnormalize(x) if isinstance(x, str) else rsnormalize_many(x) 21 | -------------------------------------------------------------------------------- /python/lenlp/sparse/__init__.py: -------------------------------------------------------------------------------- 1 | from .bm25_vectorizer import BM25Vectorizer 2 | from .count_vectorizer import CountVectorizer 3 | from .tfidf_vectorizer import TfidfVectorizer 4 | 5 | __all__ = ["BM25Vectorizer", "CountVectorizer", "TfidfVectorizer"] 6 | -------------------------------------------------------------------------------- /python/lenlp/sparse/bm25_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2 4 | 5 | from .tfidf_vectorizer import TfidfVectorizer 6 | 7 | 8 | class BM25Vectorizer(TfidfVectorizer): 9 | """BM25Vectorizer is a class that converts a collection of text documents to a sparse 10 | bm25 matrix. 11 | 12 | Parameters 13 | ---------- 14 | analyzer 15 | {word, char, char_wb}, default=word. 16 | Whether the feature should be made of word n-gram or character n-grams. Option 17 | char_wb creates character n-grams only from text inside word boundaries; 18 | n-grams at the edges of words are padded with space. 19 | ngram_range 20 | tuple (min_n, max_n), default=(1, 1). 21 | The lower and upper boundary of the range of n-values for different n-grams to 22 | be extracted. 
All values of n such that min_n <= n <= max_n will be used. 23 | normalize 24 | bool, default=True. 25 | Whether to normalize the text before counting. It will lowercase the text and remove 26 | punctuation. 27 | stop_words 28 | list of str, default=None. 29 | A list of stop words that will be removed from the text. 30 | b 31 | The impact of document length normalization. Default is `0.75`, Higher will 32 | penalize longer documents more. 33 | k1 34 | How quickly the impact of term frequency saturates. Default is `1.5`, Higher 35 | will make term frequency more influential. 36 | epsilon 37 | Smoothing term. Default is `0`. 38 | 39 | Examples 40 | -------- 41 | >>> from lenlp import sparse 42 | 43 | >>> bm25_vectorizer = sparse.BM25Vectorizer( 44 | ... analyzer="word", 45 | ... normalize=True, 46 | ... stop_words=None, 47 | ... ) 48 | 49 | >>> x = ["Hello, world!", "How are you?"] 50 | 51 | >>> bm25_vectorizer = bm25_vectorizer.fit(x) 52 | >>> matrix = bm25_vectorizer.transform(x) 53 | >>> matrix.shape 54 | (2, 5) 55 | 56 | >>> len(bm25_vectorizer.vocabulary) 57 | 5 58 | 59 | >>> matrix = bm25_vectorizer.fit_transform(x) 60 | >>> matrix.shape 61 | (2, 5) 62 | 63 | """ 64 | 65 | def __init__( 66 | self, 67 | analyzer: str = "word", 68 | ngram_range: tuple[int, int] = (1, 1), 69 | normalize: bool = True, 70 | stop_words: list[str] = None, 71 | k1: float = 1.5, 72 | b: float = 0.75, 73 | epsilon: float = 0, 74 | ) -> None: 75 | super().__init__( 76 | analyzer=analyzer, 77 | ngram_range=ngram_range, 78 | normalize=normalize, 79 | stop_words=stop_words, 80 | ) 81 | 82 | self.k1 = k1 83 | self.b = b 84 | self.epsilon = epsilon 85 | self.average_len = None 86 | 87 | def update(self, matrix: csr_matrix) -> csr_matrix: 88 | """Update the idf values.""" 89 | self.tf = (matrix > 0).sum(axis=0) 90 | len_documents = (matrix).sum(axis=1) 91 | self.average_len = len_documents.mean() 92 | self.count = matrix.shape[0] 93 | 94 | self.idf = np.squeeze( 95 | a=np.asarray( 96 | a=np.log((self.count - self.tf + 0.5) / (self.tf + 0.5) + 1), 97 | dtype=np.float32, 98 | ) 99 | ) 100 | 101 | def _transform(self, matrix: csr_matrix) -> csr_matrix: 102 | """Transform a count matrix to a bm25 matrix.""" 103 | len_documents = (matrix).sum(axis=1) 104 | regularization = np.squeeze( 105 | a=np.asarray( 106 | a=( 107 | self.k1 * (1 - self.b + self.b * (len_documents / self.average_len)) 108 | ).flatten() 109 | ) 110 | ) 111 | 112 | denominator = matrix.tocsc() 113 | denominator.data += np.take(a=regularization, indices=denominator.indices) 114 | matrix.data = ( 115 | (matrix.data * (self.k1 + 1)) / denominator.tocsr().data 116 | ) + self.epsilon 117 | 118 | matrix = matrix.multiply(other=self.idf).tocsr() 119 | inplace_csr_row_normalize_l2(matrix) 120 | return matrix 121 | -------------------------------------------------------------------------------- /python/lenlp/sparse/count_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | 4 | from lenlp._rslenlp import SparseMatrixBuilder 5 | 6 | __all__ = ["CountVectorizer"] 7 | 8 | 9 | class CountVectorizer: 10 | """CountVectorizer is a class that converts a collection of text documents to a sparse 11 | matrix. 12 | 13 | Parameters 14 | ---------- 15 | analyzer 16 | {word, char, char_wb}, default=word. 17 | Whether the feature should be made of word n-gram or character n-grams. 
Option 18 | char_wb creates character n-grams only from text inside word boundaries; 19 | n-grams at the edges of words are padded with space. 20 | ngram_range 21 | tuple (min_n, max_n), default=(1, 1). 22 | The lower and upper boundary of the range of n-values for different n-grams to 23 | be extracted. All values of n such that min_n <= n <= max_n will be used. 24 | normalize 25 | bool, default=True. 26 | Whether to normalize the text before counting. It will lowercase the text and remove 27 | punctuation. 28 | stop_words 29 | list of str, default=None. 30 | A list of stop words that will be removed from the text. 31 | 32 | Examples 33 | -------- 34 | >>> from lenlp import sparse 35 | 36 | >>> count_vectorizer = sparse.CountVectorizer( 37 | ... analyzer="word", 38 | ... normalize=True, 39 | ... stop_words=None, 40 | ... ) 41 | 42 | >>> x = ["Hello, world!", "How are you?"] 43 | 44 | >>> count_vectorizer = count_vectorizer.fit(x) 45 | 46 | >>> matrix = count_vectorizer.transform(x) 47 | >>> matrix.shape 48 | (2, 5) 49 | 50 | >>> matrix.toarray() 51 | array([[1., 1., 0., 0., 0.], 52 | [0., 0., 1., 1., 1.]], dtype=float32) 53 | 54 | >>> len(count_vectorizer.vocabulary) 55 | 5 56 | 57 | >>> matrix = count_vectorizer.fit_transform(x) 58 | >>> matrix.shape 59 | (2, 5) 60 | 61 | """ 62 | 63 | def __init__( 64 | self, 65 | analyzer: str = "word", 66 | ngram_range: tuple[int, int] = (1, 1), 67 | normalize: bool = True, 68 | stop_words: list[str] = None, 69 | ) -> None: 70 | assert analyzer in ("word", "char", "char_wb") 71 | 72 | self.sparse_matrix = SparseMatrixBuilder( 73 | analyzer=analyzer, 74 | n_sizes=list(range(ngram_range[0], ngram_range[1] + 1)), 75 | normalize=normalize, 76 | stop_words=stop_words, 77 | ) 78 | 79 | self.fitted = False 80 | 81 | @property 82 | def vocabulary(self) -> dict[str, int]: 83 | """Get the vocabulary of the CountVectorizer object.""" 84 | return self.sparse_matrix.get_vocab() 85 | 86 | def fit(self, raw_documents: list[str]) -> None: 87 | """Learn the vocabulary dictionary and return the CountVectorizer object.""" 88 | self.fitted = True 89 | self.sparse_matrix.fit(raw_documents) 90 | return self 91 | 92 | def transform(self, raw_documents: list[str]) -> csr_matrix: 93 | """Transform documents to document-term matrix.""" 94 | if not self.fitted: 95 | raise ValueError("Call fit method before calling transform method.") 96 | 97 | values, row_indices, column_indices = self.sparse_matrix.transform( 98 | raw_documents 99 | ) 100 | 101 | return csr_matrix( 102 | arg1=(values, (row_indices, column_indices)), 103 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), 104 | dtype=np.float32, 105 | ) 106 | 107 | def fit_transform(self, raw_documents: list[str]) -> csr_matrix: 108 | """Learn the vocabulary dictionary and return the CountVectorizer object.""" 109 | self.fitted = True 110 | 111 | values, row_indices, column_indices = self.sparse_matrix.fit_transform( 112 | raw_documents 113 | ) 114 | 115 | return csr_matrix( 116 | arg1=(values, (row_indices, column_indices)), 117 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), 118 | dtype=np.float32, 119 | ) 120 | -------------------------------------------------------------------------------- /python/lenlp/sparse/tfidf_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2 4 | 5 | from .count_vectorizer import 
CountVectorizer 6 | 7 | 8 | class TfidfVectorizer(CountVectorizer): 9 | """TfidfVectorizer is a class that converts a collection of text documents to a sparse 10 | tfidf matrix. 11 | 12 | Parameters 13 | ---------- 14 | analyzer 15 | {word, char, char_wb}, default=word. 16 | Whether the feature should be made of word n-gram or character n-grams. Option 17 | char_wb creates character n-grams only from text inside word boundaries; 18 | n-grams at the edges of words are padded with space. 19 | ngram_range 20 | tuple (min_n, max_n), default=(1, 1). 21 | The lower and upper boundary of the range of n-values for different n-grams to 22 | be extracted. All values of n such that min_n <= n <= max_n will be used. 23 | normalize 24 | bool, default=True. 25 | Whether to normalize the text before counting. It will lowercase the text and remove 26 | punctuation. 27 | stop_words 28 | list of str, default=None. 29 | A list of stop words that will be removed from the text. 30 | 31 | Examples 32 | -------- 33 | >>> from lenlp import sparse 34 | 35 | >>> tfidf_vectorizer = sparse.TfidfVectorizer( 36 | ... analyzer="word", 37 | ... normalize=True, 38 | ... stop_words=None, 39 | ... ) 40 | 41 | >>> x = ["Hello, world!", "How are you?"] 42 | 43 | >>> tfidf_vectorizer = tfidf_vectorizer.fit(x) 44 | >>> matrix = tfidf_vectorizer.transform(x) 45 | >>> matrix.shape 46 | (2, 5) 47 | 48 | >>> len(tfidf_vectorizer.vocabulary) 49 | 5 50 | 51 | >>> matrix = tfidf_vectorizer.fit_transform(x) 52 | >>> matrix.shape 53 | (2, 5) 54 | 55 | """ 56 | 57 | def __init__( 58 | self, 59 | analyzer: str = "word", 60 | ngram_range: tuple[int, int] = (1, 1), 61 | normalize: bool = True, 62 | stop_words: list[str] = None, 63 | ) -> None: 64 | super().__init__( 65 | analyzer=analyzer, 66 | ngram_range=ngram_range, 67 | normalize=normalize, 68 | stop_words=stop_words, 69 | ) 70 | 71 | self.idf = None 72 | 73 | def fit(self, raw_documents: list[str]) -> None: 74 | matrix = super().fit_transform(raw_documents=raw_documents) 75 | self.update(matrix=matrix) 76 | return self 77 | 78 | def update(self, matrix: csr_matrix) -> csr_matrix: 79 | """Update the idf values.""" 80 | tf = (matrix > 0).sum(axis=0) 81 | self.idf = ( 82 | np.squeeze(a=np.asarray(a=np.log((matrix.shape[0] + 1.0) / (tf + 1.0)))) + 1 83 | ) 84 | 85 | def _transform(self, matrix: csr_matrix) -> csr_matrix: 86 | """Transform a count matrix to a bm25 matrix.""" 87 | matrix.data *= np.take( 88 | a=self.idf, 89 | indices=matrix.indices, 90 | ) 91 | 92 | inplace_csr_row_normalize_l2(matrix) 93 | return matrix 94 | 95 | def transform(self, raw_documents: list[str]) -> csr_matrix: 96 | """Transform documents to document-term matrix.""" 97 | values, row_indices, column_indices = self.sparse_matrix.transform( 98 | raw_documents 99 | ) 100 | return self._transform( 101 | matrix=csr_matrix( 102 | arg1=(values, (row_indices, column_indices)), 103 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), 104 | dtype=np.float32, 105 | ) 106 | ) 107 | 108 | def fit_transform(self, raw_documents: list[str]) -> csr_matrix: 109 | """Learn the vocabulary dictionary and return the CountVectorizer object.""" 110 | values, row_indices, column_indices = self.sparse_matrix.fit_transform( 111 | raw_documents 112 | ) 113 | 114 | matrix = csr_matrix( 115 | arg1=(values, (row_indices, column_indices)), 116 | shape=(len(raw_documents), self.sparse_matrix.get_num_cols()), 117 | dtype=np.float32, 118 | ) 119 | 120 | self.update(matrix=matrix) 121 | 122 | return self._transform( 123 | matrix=matrix, 
124 | ) 125 | -------------------------------------------------------------------------------- /rust/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyModule; // NEW 3 | 4 | mod rsanalyzer; 5 | mod rscounter; 6 | mod rsflashtext; 7 | mod rsnormalizer; 8 | mod rssparse; 9 | mod rsstop_words; 10 | mod rsvectorizer; 11 | 12 | #[pymodule] 13 | fn _rslenlp(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 14 | rsanalyzer::register_functions(m)?; 15 | rscounter::register_functions(m)?; 16 | rsflashtext::register_functions(m)?; 17 | rsnormalizer::register_functions(m)?; 18 | rssparse::register_functions(m)?; 19 | rsstop_words::register_functions(m)?; 20 | rsvectorizer::register_functions(m)?; 21 | Ok(()) 22 | } 23 | -------------------------------------------------------------------------------- /rust/rsanalyzer.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyModule; // <- NEW: we now use the smart-pointer Bound<…, PyModule> 3 | use pyo3::wrap_pyfunction; 4 | use rayon::prelude::*; 5 | 6 | /// Splits text into words. 7 | /// 8 | /// # Arguments 9 | /// 10 | /// * `text` - The input text. 11 | /// * `n_sizes` - The size of the n-grams. 12 | /// 13 | /// # Returns 14 | /// 15 | /// A vector of words. 16 | #[pyfunction] 17 | pub fn rssplit_words(text: &str, n_sizes: Vec) -> Vec { 18 | let mut ngrams: Vec = Vec::new(); 19 | 20 | for &n in &n_sizes { 21 | let words: Vec<&str> = text.split_whitespace().collect(); 22 | for window in words.windows(n) { 23 | ngrams.push(window.join(" ")); 24 | } 25 | } 26 | 27 | ngrams 28 | } 29 | 30 | /// Same as `rssplit_words` but for many texts at once. 31 | #[pyfunction] 32 | pub fn rssplit_words_many(texts: Vec, n_sizes: Vec) -> Vec> { 33 | texts 34 | .par_iter() 35 | .map(|text: &String| rssplit_words(text, n_sizes.clone())) 36 | .collect() 37 | } 38 | 39 | /// Computes character n-grams. 40 | /// 41 | /// # Arguments 42 | /// 43 | /// * `texts` - A vector of input texts. 44 | /// * `n_sizes` - The size of the n-grams. 45 | /// 46 | /// # Returns 47 | /// 48 | /// A vector of character n-grams. 49 | #[pyfunction] 50 | pub fn rschar_ngrams(text: &str, n_sizes: Vec) -> Vec { 51 | let mut ngrams: Vec = Vec::new(); 52 | 53 | for &n in &n_sizes { 54 | let chars: Vec = text.chars().collect(); 55 | for window in chars.windows(n) { 56 | ngrams.push(window.iter().collect::()); 57 | } 58 | } 59 | 60 | ngrams 61 | } 62 | 63 | /// Same as `rschar_ngrams` but for many texts at once. 64 | #[pyfunction] 65 | pub fn rschar_ngrams_many(texts: Vec, n_sizes: Vec) -> Vec> { 66 | texts 67 | .par_iter() 68 | .map(|text: &String| rschar_ngrams(text, n_sizes.clone())) 69 | .collect() 70 | } 71 | 72 | /// Character n-grams with word-boundary handling. 73 | #[pyfunction] 74 | pub fn rschar_wb_ngrams(text: &str, n_sizes: Vec) -> Vec { 75 | let mut ngrams: Vec = Vec::new(); 76 | let chars: Vec = text.chars().collect(); 77 | 78 | for &n in &n_sizes { 79 | if n > chars.len() { 80 | continue; 81 | } 82 | for window in chars.windows(n) { 83 | ngrams.push(window.iter().collect::()); 84 | } 85 | } 86 | 87 | ngrams 88 | } 89 | 90 | /// Same as `rschar_wb_ngrams` but for many texts at once. 
91 | #[pyfunction] 92 | pub fn rschar_wb_ngrams_many(texts: Vec, n_sizes: Vec) -> Vec> { 93 | texts 94 | .par_iter() 95 | .map(|text: &String| rschar_wb_ngrams(text, n_sizes.clone())) 96 | .collect() 97 | } 98 | 99 | /// Registers all the above functions in a Python sub-module. 100 | /// 101 | /// Called from your `#[pymodule]` entry-point. 102 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> { 103 | m.add_function(wrap_pyfunction!(rssplit_words, m)?)?; 104 | m.add_function(wrap_pyfunction!(rssplit_words_many, m)?)?; 105 | m.add_function(wrap_pyfunction!(rschar_ngrams, m)?)?; 106 | m.add_function(wrap_pyfunction!(rschar_ngrams_many, m)?)?; 107 | m.add_function(wrap_pyfunction!(rschar_wb_ngrams, m)?)?; 108 | m.add_function(wrap_pyfunction!(rschar_wb_ngrams_many, m)?)?; 109 | Ok(()) 110 | } 111 | -------------------------------------------------------------------------------- /rust/rscounter.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::wrap_pyfunction; 3 | use rayon::prelude::*; 4 | use std::collections::HashMap; 5 | 6 | /// Counts the number of times each word appears in the input text. 7 | /// 8 | /// # Arguments 9 | /// 10 | /// * `text` - The input text as a vector of words. 11 | /// 12 | /// # Returns 13 | /// 14 | /// A hashmap with the words as keys and the number of times they appear as values. 15 | #[pyfunction] 16 | pub fn rscount(text: Vec) -> HashMap { 17 | let mut word_counter = HashMap::new(); 18 | for word in text { 19 | *word_counter.entry(word).or_insert(0) += 1; 20 | } 21 | word_counter 22 | } 23 | 24 | /// Counts the number of times each word appears for each input text. 25 | /// 26 | /// # Arguments 27 | /// 28 | /// * `texts` - The input texts as a vector of vectors of words. 29 | /// 30 | /// # Returns 31 | /// 32 | /// A vector of hashmaps with the words as keys and the number of times they appear as values. 33 | #[pyfunction] 34 | pub fn rscount_many(texts: Vec>) -> Vec> { 35 | texts.par_iter().map(|text| rscount(text.clone())).collect() 36 | } 37 | 38 | /// Registers all the above functions in a Python sub-module. 39 | /// 40 | /// Called from your `#[pymodule]` entry-point. 
41 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> { 42 | m.add_function(wrap_pyfunction!(rscount, m)?)?; 43 | m.add_function(wrap_pyfunction!(rscount_many, m)?)?; 44 | Ok(()) 45 | } 46 | 47 | #[cfg(test)] 48 | mod tests { 49 | use super::*; 50 | 51 | #[test] 52 | fn test_rscount() { 53 | let text = vec![ 54 | "hello".to_string(), 55 | "world".to_string(), 56 | "hello".to_string(), 57 | "hello".to_string(), 58 | ]; 59 | let result = rscount(text); 60 | let mut expected = HashMap::new(); 61 | expected.insert("hello".to_string(), 3); 62 | expected.insert("world".to_string(), 1); 63 | assert_eq!(result, expected); 64 | } 65 | 66 | #[test] 67 | fn test_rscount_many() { 68 | let texts = vec![ 69 | vec!["hello".to_string(), "world".to_string()], 70 | vec![ 71 | "hello".to_string(), 72 | "world".to_string(), 73 | "hello".to_string(), 74 | ], 75 | ]; 76 | let result = rscount_many(texts); 77 | let mut expected = Vec::new(); 78 | let mut map1 = HashMap::new(); 79 | map1.insert("hello".to_string(), 1); 80 | map1.insert("world".to_string(), 1); 81 | let mut map2 = HashMap::new(); 82 | map2.insert("hello".to_string(), 2); 83 | map2.insert("world".to_string(), 1); 84 | expected.push(map1); 85 | expected.push(map2); 86 | assert_eq!(result, expected); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /rust/rsflashtext.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | 3 | use std::collections::HashMap; 4 | use std::collections::HashSet; 5 | 6 | use rayon::prelude::*; 7 | use unidecode::unidecode; 8 | 9 | #[pyclass()] 10 | pub struct RSKeywordProcessor { 11 | keyword: String, 12 | non_word_boundaries: HashSet, 13 | keyword_trie_dict: HashMap, 14 | lowercase: bool, 15 | normalize: bool, 16 | terms_in_trie: usize, 17 | } 18 | 19 | #[pyclass()] 20 | pub struct RSTrieNode { 21 | children: HashMap, 22 | is_end: bool, 23 | clean_name: Option, 24 | } 25 | 26 | impl RSTrieNode { 27 | pub fn new() -> Self { 28 | RSTrieNode { 29 | children: HashMap::new(), 30 | is_end: false, 31 | clean_name: None, 32 | } 33 | } 34 | } 35 | 36 | #[pymethods] 37 | impl RSKeywordProcessor { 38 | #[new] 39 | pub fn new(lowercase: bool, normalize: bool) -> Self { 40 | let keyword: String = "_keyword_".to_string(); 41 | let non_word_boundaries: HashSet = { 42 | let mut set: HashSet = HashSet::new(); 43 | set.extend('0'..='9'); 44 | set.extend('a'..='z'); 45 | set.extend('A'..='Z'); 46 | set.insert('_'); 47 | set 48 | }; 49 | 50 | RSKeywordProcessor { 51 | keyword, 52 | non_word_boundaries, 53 | keyword_trie_dict: HashMap::new(), 54 | lowercase, 55 | normalize, 56 | terms_in_trie: 0, 57 | } 58 | } 59 | 60 | pub fn add_keywords_many( 61 | &mut self, 62 | keywords: Vec, 63 | clean_name: Option<&str>, 64 | ) -> Vec { 65 | keywords 66 | .iter() 67 | .map(|keyword: &String| self.add_keyword(&keyword, clean_name)) 68 | .collect() 69 | } 70 | 71 | pub fn add_keyword(&mut self, keyword: &str, clean_name: Option<&str>) -> bool { 72 | let clean_name: &str = clean_name.unwrap_or(keyword); 73 | let keyword: String = if self.normalize { 74 | unidecode(keyword) 75 | .to_lowercase() 76 | .chars() 77 | .filter(|c| !c.is_ascii_punctuation()) 78 | .collect::() 79 | .trim() 80 | .to_string() 81 | } else if self.lowercase { 82 | keyword.to_lowercase() 83 | } else { 84 | keyword.to_string() 85 | }; 86 | 87 | let mut current_node: &mut HashMap = &mut self.keyword_trie_dict; 88 | for char in keyword.chars() { 89 | current_node 
= &mut current_node 90 | .entry(char) 91 | .or_insert_with(RSTrieNode::new) 92 | .children; 93 | } 94 | 95 | if !current_node.contains_key(&self.keyword.chars().next().unwrap()) { 96 | self.terms_in_trie += 1; 97 | current_node.insert( 98 | self.keyword.chars().next().unwrap(), 99 | RSTrieNode { 100 | children: HashMap::new(), 101 | is_end: true, 102 | clean_name: Some(clean_name.to_string()), 103 | }, 104 | ); 105 | true 106 | } else { 107 | false 108 | } 109 | } 110 | 111 | pub fn extract_keywords_many( 112 | &self, 113 | sentences: Vec, 114 | ) -> Vec> { 115 | sentences 116 | .par_iter() 117 | .map(|sentence: &String| self.extract_keywords(&sentence)) 118 | .collect() 119 | } 120 | 121 | pub fn extract_keywords(&self, sentence: &str) -> Vec<(String, usize, usize)> { 122 | // Map from the index in the normalized sentence to the index in the original sentence 123 | let mut index_map: Vec = Vec::with_capacity(sentence.len()); 124 | let mut original_idx = 0; 125 | 126 | let normalized_sentence: String = if self.normalize { 127 | let mut normalized = String::new(); 128 | for c in sentence.chars() { 129 | if c.is_ascii_punctuation() { 130 | original_idx += c.len_utf8(); 131 | continue; 132 | } 133 | let normalized_char = unidecode::unidecode_char(c).to_lowercase(); 134 | for nc in normalized_char.chars() { 135 | normalized.push(nc); 136 | index_map.push(original_idx); 137 | } 138 | original_idx += c.len_utf8(); 139 | } 140 | normalized.to_string() 141 | } else if self.lowercase { 142 | sentence.to_lowercase() 143 | } else { 144 | sentence.to_string() 145 | }; 146 | 147 | let mut extracted_keywords: Vec<(String, usize, usize)> = Vec::new(); 148 | let mut current_node: &HashMap = &self.keyword_trie_dict; 149 | let mut start_pos: usize = 0; 150 | let mut end_pos: usize = 0; 151 | 152 | let mut idx: usize = 0; 153 | let sentence_len: usize = normalized_sentence.len(); 154 | while idx < sentence_len { 155 | let char: char = normalized_sentence.chars().nth(idx).unwrap(); 156 | if !self.non_word_boundaries.contains(&char) { 157 | if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) { 158 | if node.is_end { 159 | let clean_name: &String = node.clean_name.as_ref().unwrap(); 160 | let original_start_pos = index_map[start_pos]; 161 | let original_end_pos = index_map[end_pos - 1] + 1; 162 | extracted_keywords.push(( 163 | clean_name.clone(), 164 | original_start_pos, 165 | original_end_pos, 166 | )); 167 | } 168 | } 169 | current_node = &self.keyword_trie_dict; 170 | start_pos = idx + 1; 171 | } else if let Some(node) = current_node.get(&char) { 172 | current_node = &node.children; 173 | end_pos = idx + 1; 174 | } else { 175 | current_node = &self.keyword_trie_dict; 176 | start_pos = idx + 1; 177 | } 178 | idx += 1; 179 | } 180 | 181 | // Check if the last segment is a keyword 182 | if let Some(node) = current_node.get(&self.keyword.chars().next().unwrap()) { 183 | if node.is_end { 184 | let clean_name: &String = node.clean_name.as_ref().unwrap(); 185 | let original_start_pos = index_map[start_pos]; 186 | let original_end_pos = index_map[end_pos - 1] + 1; 187 | extracted_keywords.push((clean_name.clone(), original_start_pos, original_end_pos)); 188 | } 189 | } 190 | 191 | extracted_keywords 192 | } 193 | } 194 | 195 | /// Registers all the above functions in a Python sub-module. 196 | /// 197 | /// Called from your `#[pymodule]` entry-point. 
198 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> { 199 | m.add_class::()?; 200 | m.add_class::()?; 201 | Ok(()) 202 | } 203 | -------------------------------------------------------------------------------- /rust/rsnormalizer.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::wrap_pyfunction; 3 | use rayon::prelude::*; 4 | use unidecode::unidecode; 5 | 6 | /// Normalize text by converting to lowercase, removing punctuation, and trimming whitespace. 7 | /// 8 | /// # Arguments 9 | /// 10 | /// * `text` - A string slice that holds the text to normalize. 11 | /// 12 | /// # Returns 13 | /// 14 | /// A String that holds the normalized text. 15 | #[pyfunction] 16 | pub fn rsnormalize(text: &str) -> String { 17 | unidecode(text) 18 | .to_lowercase() 19 | .chars() 20 | .filter(|c| !c.is_ascii_punctuation()) 21 | .collect::() 22 | .trim() 23 | .to_string() 24 | } 25 | 26 | /// Normalize multiple texts. 27 | /// 28 | /// # Arguments 29 | /// 30 | /// * `texts` - A vector of strings that holds the texts to normalize. 31 | /// 32 | /// # Returns 33 | /// 34 | /// A vector of strings that holds the normalized texts. 35 | #[pyfunction] 36 | pub fn rsnormalize_many(texts: Vec) -> Vec { 37 | texts.par_iter().map(|text| rsnormalize(text)).collect() 38 | } 39 | 40 | pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> { 41 | m.add_function(wrap_pyfunction!(rsnormalize, m)?)?; 42 | m.add_function(wrap_pyfunction!(rsnormalize_many, m)?)?; 43 | Ok(()) 44 | } 45 | 46 | #[cfg(test)] 47 | mod tests { 48 | use super::*; 49 | 50 | #[test] 51 | fn test_rsnormalize() { 52 | assert_eq!(rsnormalize("Hello World! 😀"), "hello world"); 53 | assert_eq!(rsnormalize("1,2,3,4"), "1234"); 54 | } 55 | 56 | #[test] 57 | fn test_rsnormalize_many() { 58 | let input = vec!["Hello World! 😀".to_string(), "Goodbye, World!".to_string()]; 59 | let expected = vec!["hello world".to_string(), "goodbye world".to_string()]; 60 | assert_eq!(rsnormalize_many(input), expected); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /rust/rssparse.rs: -------------------------------------------------------------------------------- 1 | use crate::rsvectorizer::rsvectorize_many; 2 | use bincode::{deserialize, serialize}; 3 | use numpy::PyArray1; 4 | use pyo3::prelude::*; 5 | use pyo3::types::{PyBytes, PyModule}; // NEW 6 | use serde::{Deserialize, Serialize}; 7 | use std::collections::HashMap; 8 | 9 | // --------------------------------------------------------------------------- 10 | // Sparse-matrix builder 11 | // --------------------------------------------------------------------------- 12 | 13 | #[derive(Clone, Debug, Serialize, Deserialize)] 14 | #[pyclass(module = "lenlp.sparse.count_vectorizer")] 15 | pub struct SparseMatrixBuilder { 16 | analyzer: String, 17 | n_sizes: Vec, 18 | stop_words: Option>, 19 | normalize: Option, 20 | vocab: HashMap, 21 | num_cols: usize, 22 | } 23 | 24 | #[pymethods] 25 | impl SparseMatrixBuilder { 26 | #[new] 27 | pub fn new( 28 | n_sizes: Vec, 29 | analyzer: String, 30 | stop_words: Option>, 31 | normalize: Option, 32 | ) -> Self { 33 | Self { 34 | vocab: HashMap::new(), 35 | n_sizes, 36 | analyzer, 37 | stop_words, 38 | normalize, 39 | num_cols: 0, 40 | } 41 | } 42 | 43 | /// Build the vocabulary and return the CSR triplet arrays. 
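///
/// The three arrays returned are `(values, row_indices, column_indices)`; the
/// Python wrapper feeds them to `scipy.sparse.csr_matrix((values, (rows, cols)),
/// shape=(n_docs, num_cols))` (see `CountVectorizer` in
/// `python/lenlp/sparse/count_vectorizer.py`).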
--------------------------------------------------------------------------------
/rust/rssparse.rs:
--------------------------------------------------------------------------------

use crate::rsvectorizer::rsvectorize_many;
use bincode::{deserialize, serialize};
use numpy::PyArray1;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyModule}; // NEW
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

// ---------------------------------------------------------------------------
// Sparse-matrix builder
// ---------------------------------------------------------------------------

#[derive(Clone, Debug, Serialize, Deserialize)]
#[pyclass(module = "lenlp.sparse.count_vectorizer")]
pub struct SparseMatrixBuilder {
    analyzer: String,
    n_sizes: Vec<usize>,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
    vocab: HashMap<String, usize>,
    num_cols: usize,
}

#[pymethods]
impl SparseMatrixBuilder {
    #[new]
    pub fn new(
        n_sizes: Vec<usize>,
        analyzer: String,
        stop_words: Option<Vec<String>>,
        normalize: Option<bool>,
    ) -> Self {
        Self {
            vocab: HashMap::new(),
            n_sizes,
            analyzer,
            stop_words,
            normalize,
            num_cols: 0,
        }
    }

    /// Build the vocabulary and return the (values, rows, columns) triplet arrays
    /// used to construct a `scipy.sparse.csr_matrix`.
    pub fn fit_transform(
        &mut self,
        texts: Vec<String>,
        py: Python<'_>,
    ) -> (
        Py<PyArray1<usize>>,
        Py<PyArray1<usize>>,
        Py<PyArray1<usize>>,
    ) {
        self.vocab = HashMap::new();
        let texts: Vec<HashMap<String, usize>> = rsvectorize_many(
            texts,
            self.n_sizes.clone(),
            self.analyzer.clone(),
            self.stop_words.clone(),
            self.normalize,
        );

        self._fit(texts.clone());

        // A scipy csr_matrix is faster to build from numpy arrays.
        let (vec1, vec2, vec3) = self._transform(texts);
        (
            PyArray1::from_vec_bound(py, vec1).into(),
            PyArray1::from_vec_bound(py, vec2).into(),
            PyArray1::from_vec_bound(py, vec3).into(),
        )
    }

    pub fn fit(&mut self, texts: Vec<String>) {
        self.vocab = HashMap::new();
        let texts: Vec<HashMap<String, usize>> = rsvectorize_many(
            texts,
            self.n_sizes.clone(),
            self.analyzer.clone(),
            self.stop_words.clone(),
            self.normalize,
        );

        self._fit(texts);
    }

    fn _fit(&mut self, texts: Vec<HashMap<String, usize>>) {
        let mut col_index: usize = 0;
        for doc in &texts {
            for token in doc.keys() {
                if !self.vocab.contains_key(token) {
                    self.vocab.insert(token.clone(), col_index);
                    col_index += 1;
                }
            }
        }
        self.num_cols = col_index;
    }

    pub fn transform(
        &self,
        texts: Vec<String>,
        py: Python<'_>,
    ) -> (
        Py<PyArray1<usize>>,
        Py<PyArray1<usize>>,
        Py<PyArray1<usize>>,
    ) {
        let texts: Vec<HashMap<String, usize>> = rsvectorize_many(
            texts,
            self.n_sizes.clone(),
            self.analyzer.clone(),
            self.stop_words.clone(),
            self.normalize,
        );

        // A scipy csr_matrix is faster to build from numpy arrays.
        let (vec1, vec2, vec3) = self._transform(texts);
        (
            PyArray1::from_vec_bound(py, vec1).into(),
            PyArray1::from_vec_bound(py, vec2).into(),
            PyArray1::from_vec_bound(py, vec3).into(),
        )
    }

    fn _transform(
        &self,
        texts: Vec<HashMap<String, usize>>,
    ) -> (Vec<usize>, Vec<usize>, Vec<usize>) {
        let mut values: Vec<usize> = Vec::new();
        let mut row_indices: Vec<usize> = Vec::new();
        let mut column_indices: Vec<usize> = Vec::new();

        for (row_idx, doc) in texts.iter().enumerate() {
            for (token, &count) in doc.iter() {
                if let Some(&col_idx) = self.vocab.get(token) {
                    values.push(count);
                    row_indices.push(row_idx);
                    column_indices.push(col_idx);
                }
            }
        }

        (values, row_indices, column_indices)
    }

    // ---------------------------------------------------------------------
    // Accessors
    // ---------------------------------------------------------------------
    pub fn get_vocab(&self) -> HashMap<String, usize> {
        self.vocab.clone()
    }

    pub fn get_num_cols(&self) -> usize {
        self.num_cols
    }

    // ---------------------------------------------------------------------
    // Pickle support
    // ---------------------------------------------------------------------

    pub fn __setstate__(&mut self, state: &Bound<'_, PyBytes>) -> PyResult<()> {
        *self = deserialize(state.as_bytes()).unwrap();
        Ok(())
    }

    pub fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<&'py PyBytes> {
        Ok(PyBytes::new(py, &serialize(&self).unwrap()))
    }

    pub fn __getnewargs__(
        &self,
    ) -> PyResult<(Vec<usize>, String, Option<Vec<String>>, Option<bool>)> {
        Ok((
            self.n_sizes.clone(),
            self.analyzer.clone(),
            self.stop_words.clone(),
            self.normalize,
        ))
    }
}
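// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the upstream file): the three returned
// arrays are the (values, row_indices, column_indices) triplets that the
// Python layer feeds to `scipy.sparse.csr_matrix`. A minimal consistency
// check, assuming the "word" analyzer with n_sizes = [1] yields at least one
// whitespace-separated token:
// ---------------------------------------------------------------------------
#[cfg(test)]
mod sparse_examples {
    use super::*;

    #[test]
    fn fit_then_transform_produces_aligned_triplets() {
        let mut builder =
            SparseMatrixBuilder::new(vec![1], "word".to_string(), None, Some(true));
        builder.fit(vec!["the cat sat".to_string(), "the dog sat".to_string()]);
        assert!(builder.get_num_cols() > 0);

        let docs = rsvectorize_many(
            vec!["the cat".to_string()],
            vec![1],
            "word".to_string(),
            None,
            Some(true),
        );
        let (values, rows, cols) = builder._transform(docs);

        // One (value, row, col) entry per (document, token) pair found in the vocab.
        assert_eq!(values.len(), rows.len());
        assert_eq!(rows.len(), cols.len());
        assert!(rows.iter().all(|&r| r == 0)); // single document -> row 0
        assert!(cols.iter().all(|&c| c < builder.get_num_cols()));
    }
}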
// ---------------------------------------------------------------------------
// Module registration
// ---------------------------------------------------------------------------

pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<SparseMatrixBuilder>()?;
    Ok(())
}

--------------------------------------------------------------------------------
/rust/rsstop_words.rs:
--------------------------------------------------------------------------------

use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use rayon::prelude::*;
use std::collections::HashSet;

/// Filter stop words out of a string.
///
/// # Arguments
///
/// * `text` - The input text.
/// * `stop_words` - The stop words to filter out.
///
/// # Returns
///
/// A string with the stop words removed.
#[pyfunction]
pub fn rsfilter_stop_words(text: &str, stop_words: Vec<String>) -> String {
    // Use a HashSet for constant-time membership checks.
    let stop_words_set: HashSet<_> = stop_words.into_iter().collect();
    text.split_whitespace()
        .filter(|word: &&str| !stop_words_set.contains(*word))
        .collect::<Vec<&str>>()
        .join(" ")
}

/// Filter stop words out of multiple strings in parallel.
///
/// # Arguments
///
/// * `texts` - The input texts.
/// * `stop_words` - The stop words to filter out.
///
/// # Returns
///
/// A vector of strings with the stop words removed.
#[pyfunction]
pub fn rsfilter_stop_words_many(texts: Vec<String>, stop_words: Vec<String>) -> Vec<String> {
    // Use a HashSet for constant-time membership checks.
    let stop_words_set: HashSet<_> = stop_words.into_iter().collect();
    texts
        .into_par_iter()
        .map(|sentence: String| {
            sentence
                .split_whitespace()
                .filter(|word: &&str| !stop_words_set.contains(*word))
                .collect::<Vec<&str>>()
                .join(" ")
        })
        .collect()
}

pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(rsfilter_stop_words, m)?)?;
    m.add_function(wrap_pyfunction!(rsfilter_stop_words_many, m)?)?;
    Ok(())
}
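// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the upstream file): the filters match
// whitespace-separated tokens exactly, so casing matters and no normalization
// is applied at this stage.
// ---------------------------------------------------------------------------
#[cfg(test)]
mod stop_word_examples {
    use super::*;

    #[test]
    fn filters_exact_matches_only() {
        let stop_words = vec!["the".to_string(), "a".to_string()];
        assert_eq!(
            rsfilter_stop_words("the cat sat on a mat", stop_words.clone()),
            "cat sat on mat"
        );
        // "The" (capitalised) is not in the stop-word list, so it is kept.
        assert_eq!(rsfilter_stop_words("The cat", stop_words), "The cat");
    }
}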
--------------------------------------------------------------------------------
/rust/rsvectorizer.rs:
--------------------------------------------------------------------------------

use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use rayon::prelude::*;
use std::collections::HashMap;

use crate::rsanalyzer::rschar_ngrams;
use crate::rsanalyzer::rschar_wb_ngrams;
use crate::rsanalyzer::rssplit_words;
use crate::rscounter::rscount;
use crate::rsnormalizer::rsnormalize_many;
use crate::rsstop_words::rsfilter_stop_words_many;

/// Optionally normalize the texts, then optionally remove stop words.
pub fn process_texts(
    texts: Vec<String>,
    normalize: Option<bool>,
    stop_words: Option<Vec<String>>,
) -> Vec<String> {
    let texts: Vec<String> = match normalize {
        Some(true) => rsnormalize_many(texts),
        _ => texts,
    };

    match stop_words {
        Some(stop_words) => rsfilter_stop_words_many(texts, stop_words),
        None => texts,
    }
}

#[pyfunction]
pub fn rsvectorize_split_words_many(
    texts: Vec<String>,
    n_sizes: Vec<usize>,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
) -> Vec<HashMap<String, usize>> {
    let texts: Vec<String> = process_texts(texts, normalize, stop_words);
    texts
        .par_iter()
        .map(|text: &String| rscount(rssplit_words(text, n_sizes.clone())))
        .collect()
}

#[pyfunction]
pub fn rsvectorize_char_ngrams_many(
    texts: Vec<String>,
    n_sizes: Vec<usize>,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
) -> Vec<HashMap<String, usize>> {
    let texts: Vec<String> = process_texts(texts, normalize, stop_words);
    texts
        .par_iter()
        .map(|text: &String| rscount(rschar_ngrams(text, n_sizes.clone())))
        .collect()
}

#[pyfunction]
pub fn rsvectorize_char_wb_ngrams_many(
    texts: Vec<String>,
    n_sizes: Vec<usize>,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
) -> Vec<HashMap<String, usize>> {
    let texts: Vec<String> = process_texts(texts, normalize, stop_words);
    texts
        .par_iter()
        .map(|text: &String| rscount(rschar_wb_ngrams(text, n_sizes.clone())))
        .collect()
}

// Main vectorization function: dispatch on the analyzer name ("word", "char", "char_wb").
#[pyfunction]
pub fn rsvectorize_many(
    texts: Vec<String>,
    n_sizes: Vec<usize>,
    analyzer: String,
    stop_words: Option<Vec<String>>,
    normalize: Option<bool>,
) -> Vec<HashMap<String, usize>> {
    match analyzer.as_str() {
        "word" => rsvectorize_split_words_many(texts, n_sizes, stop_words, normalize),
        "char" => rsvectorize_char_ngrams_many(texts, n_sizes, stop_words, normalize),
        "char_wb" => rsvectorize_char_wb_ngrams_many(texts, n_sizes, stop_words, normalize),
        _ => panic!("Invalid analyzer type"),
    }
}

pub fn register_functions(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(rsvectorize_split_words_many, m)?)?;
    m.add_function(wrap_pyfunction!(rsvectorize_char_ngrams_many, m)?)?;
    m.add_function(wrap_pyfunction!(rsvectorize_char_wb_ngrams_many, m)?)?;
    m.add_function(wrap_pyfunction!(rsvectorize_many, m)?)?;

    Ok(())
}

--------------------------------------------------------------------------------