├── .gitmodules ├── MANIFEST.in ├── flake8.ini ├── .gitignore ├── .editorconfig ├── AUTHORS.rst ├── .github └── workflows │ ├── pypi.yml │ └── main.yml ├── Makefile ├── LICENSE ├── ChangeLog.rst ├── pyproject.toml ├── src ├── stop_words │ └── __init__.py └── tests.py └── README.rst /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/stop_words/stop-words"] 2 | path = src/stop_words/stop-words 3 | url = https://github.com/Alir3z4/stop-words.git 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.rst 3 | include ChangeLog.rst 4 | include AUTHORS.rst 5 | recursive-include stop_words/stop-words *.txt 6 | -------------------------------------------------------------------------------- /flake8.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude = 4 | venv, 5 | .venv, 6 | cache, 7 | build, 8 | src/stop_words/stop-words/**, 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE specific 2 | .idea/ 3 | .spyderproject 4 | # file 5 | *.pyc 6 | manage.py 7 | dump.json 8 | MANIFEST 9 | # dir 10 | build/ 11 | dist/ 12 | *.egg-info/ 13 | logs/ 14 | bin/ 15 | develop-eggs/ 16 | eggs/ 17 | coverage.xml 18 | .coverage 19 | build 20 | src/stop_words/_version.py 21 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_size = 2 6 | indent_style = space 7 | end_of_line = lf 8 | charset = utf-8 9 | max_line_length = 120 10 | insert_final_newline = true 11 | 12 | [*.py] 13 | indent_size = 4 14 | trim_trailing_whitespace = true 15 | 16 | 17 | [*.rst] 18 | trim_trailing_whitespace = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ``python-stop-words`` was originally created in middle 2014 at home, the bedroom 2 | division of the Alireza's place somewhere on planet earth maybe. 3 | 4 | The PRIMARY AUTHORS are (and/or have been): 5 | 6 | * Alireza Savand 7 | * François‎ 8 | 9 | And here is an inevitably incomplete list of MUCH-APPRECIATED CONTRIBUTORS -- 10 | people who have submitted patches, reported bugs, added translations, helped 11 | answer newbie questions, and generally made ``python-stop-words`` that much better: 12 | 13 | * Alireza Savand 14 | * Julien Fache 15 | * David Miró 16 | * Taras Labiak 17 | 18 | 19 | A big THANK YOU goes to: 20 | 21 | * François‎ for convincing Alireza to start the project. 22 | * Guido van Rossum for creating Python. 23 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [released] 6 | 7 | jobs: 8 | release: 9 | name: Release 10 | environment: 11 | name: pypi 12 | url: https://pypi.org/project/stop-words 13 | permissions: 14 | id-token: write 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v5 19 | with: 20 | submodules: true 21 | 22 | - uses: actions/setup-python@v6 23 | with: 24 | python-version: '3.13' 25 | 26 | - name: Build 27 | run: | 28 | python -m pip install build 29 | make update-submodules build 30 | 31 | - name: Publish package distributions to PyPI 32 | uses: pypa/gh-action-pypi-publish@release/v1 33 | with: 34 | verbose: true 35 | print-hash: true 36 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help install test coverage build clean format check-format lint precommit update-submodules 2 | 3 | .DEFAULT_GOAL := help 4 | 5 | help: ## Display this help message 6 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 7 | 8 | install: update-submodules ## Install development dependencies 9 | pip install -e '.[dev]' 10 | 11 | update-submodules: ## Update all git submodules 12 | git submodule sync --recursive 13 | git submodule update --init --remote --recursive 14 | 15 | test: ## Run test suite 16 | python -m unittest discover -s src/ -v 17 | 18 | coverage: ## Generate coverage report 19 | coverage run -m unittest discover -s src/ 20 | coverage report 21 | coverage xml 22 | 23 | build: ## Build source and wheel distributions 24 | python -m build 25 | 26 | clean: ## Remove build artifacts and temporary files 27 | rm -rf build/ dist/ *.egg-info/ **/*.egg-info/ .coverage coverage.xml .mypy_cache/ 88 28 | 29 | format: ## Auto-format code with isort and black 30 | isort . 31 | black . 32 | 33 | check-format: ## Check code formatting with isort and black 34 | isort --check-only --diff . 35 | black --check --diff . 36 | 37 | lint: ## Run all code quality checks 38 | flake8 --config=flake8.ini . 39 | mypy src/ --install-types --non-interactive 40 | 41 | precommit: format lint ## Full pre-commit checks (format + lint) 42 | 43 | ##@ Development Targets 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Alireza Savand, Contributors 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the {organization} nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /ChangeLog.rst: -------------------------------------------------------------------------------- 1 | 2025.11.4 2 | ========= 3 | 4 | * Sync with latest of https://github.com/Alir3z4/stop-words. 5 | * Add much more tests and cleaned up the code. 6 | * Modernized Python packaging and publishing. 7 | 8 | 9 | 2018.7.23 10 | ========= 11 | 12 | * Fixed #14: `languages.json` is missing, if you don't git clone with `--recursive`. 13 | * Feature: Support latest version of Python (3.7+). 14 | * Feature #22: Enforces packaging of eggs into folders. 15 | * Update the `stop-words` repository to get the latest languages. 16 | * Fixed Travis failing and tests due to bootstrap. 17 | 18 | 19 | 2015.2.23.1 20 | =========== 21 | 22 | * Fixed #9: Missing ``languages.json`` file that breaks the installation. 23 | 24 | 25 | 2015.2.23 26 | ========= 27 | 28 | * Feature: Using the cache is optional 29 | * Feature: Filtering stopwords 30 | 31 | 2015.2.21 32 | ========= 33 | 34 | * Feature: ``LANGUAGE_MAPPING`` is loads from stop-words/languages.json 35 | * Fixed: Made paths OS-independent 36 | 37 | 38 | 2015.1.31 39 | ========= 40 | 41 | * Feature #5: Decode error AND Add ``catalan`` language to ``LANGUAGE_MAPPING`. 42 | * Feature: Update `stop-words` dictionary. 43 | 44 | 45 | 2015.1.22 46 | ========= 47 | 48 | * Feature: Tests 49 | * Feature: Python 3 support 50 | * Feature: Dev installation via zc.buildout 51 | * Feature: Continuous integration via Travis 52 | 53 | 54 | 2015.1.19 55 | ========= 56 | 57 | * Feature #3: Handle language code, cache and custom errors 58 | 59 | 60 | 2014.5.26 61 | ========= 62 | 63 | * Initial release. 64 | * Package on pypi. 65 | * github.com/Alir3z4/stop-words as submodule. 66 | 67 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | # Controls when the action will run. 4 | on: 5 | # Triggers the workflow on push or pull request events but only for the master branch 6 | push: 7 | branches: [ master ] 8 | pull_request: 9 | branches: [ master ] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | jobs: 15 | code-quality: 16 | runs-on: ubuntu-latest 17 | 18 | name: "Linting" 19 | steps: 20 | - uses: actions/checkout@v5 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: setup python 25 | uses: actions/setup-python@v6 26 | with: 27 | python-version: '3.13' 28 | 29 | - name: Install dependencies 30 | run: make install 31 | 32 | - name: Linting 33 | run: make lint 34 | 35 | test: 36 | # The type of runner that the job will run on 37 | runs-on: ubuntu-latest 38 | strategy: 39 | matrix: 40 | python-version: ["3.11", "3.12", "3.13", "3.14"] 41 | env: 42 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 43 | name: "Python ${{ matrix.python-version }}" 44 | 45 | steps: 46 | - name: Check out code 47 | uses: actions/checkout@v5 48 | with: 49 | submodules: true 50 | 51 | - name: Set up Python ${{ matrix.python-version }} 52 | uses: actions/setup-python@v6 53 | with: 54 | python-version: ${{ matrix.python-version }} 55 | 56 | - name: Install dependencies 57 | run: make install 58 | 59 | - name: Run tests 60 | run: make coverage 61 | 62 | - name: Upload coverage to Codecov 63 | uses: codecov/codecov-action@v5 64 | with: 65 | flags: unittests-${{ matrix.python-version }} 66 | fail_ci_if_error: true # default = false 67 | verbose: true # default = false 68 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.2", "setuptools_scm[toml]>=3.4.3"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "stop-words" 7 | description = "Get list of common stop words in various languages in Python" 8 | readme = "README.rst" 9 | authors = [{name = "Alireza Savand", email = "alireza.savand@gmail.com"}] 10 | license = "BSD-3-Clause" 11 | classifiers = [ 12 | "Programming Language :: Python", 13 | "Intended Audience :: Developers", 14 | "Operating System :: OS Independent", 15 | "Topic :: Software Development", 16 | "Development Status :: 6 - Mature", 17 | "Programming Language :: Python :: 3", 18 | "Topic :: Text Processing", 19 | "Topic :: Text Processing :: Filters", 20 | ] 21 | requires-python = ">=3.11" 22 | dynamic = ["version"] 23 | 24 | [project.urls] 25 | Homepage = "https://github.com/Alir3z4/python-stop-words" 26 | Repository = "https://github.com/Alir3z4/python-stop-words.git" 27 | Issues = "https://github.com/Alir3z4/python-stop-words/issues" 28 | Changelog = "https://github.com/Alir3z4/python-stop-words/blob/main/ChangeLog.rst" 29 | 30 | [project.optional-dependencies] 31 | dev = [ 32 | "black==25.9.0", 33 | "mypy==1.18.2", 34 | "flake8==7.3.0", 35 | "coverage==7.11.0", 36 | ] 37 | 38 | [tool.setuptools_scm] 39 | write_to = "src/stop_words/_version.py" 40 | 41 | [tool.setuptools] 42 | packages = ["stop_words"] 43 | package-dir = {"" = "src"} 44 | package-data = {stop_words = [ 45 | "stop-words/*.txt", 46 | "stop-words/languages.json", 47 | ]} 48 | 49 | 50 | [tool.mypy] 51 | python_version = "3.13" 52 | exclude_gitignore = true 53 | 54 | [tool.coverage.run] 55 | cover_pylib = false 56 | omit = [ 57 | "*site-packages*", 58 | "*distutils*", 59 | "venv/*", 60 | ".venv/*", 61 | "_version.py", 62 | ] 63 | 64 | [tool.coverage.report] 65 | precision = 3 66 | show_missing = true 67 | ignore_errors = true 68 | # Regexes for lines to exclude from consideration 69 | exclude_lines = [ 70 | # Have to re-enable the standard pragma 71 | "pragma: no cover", 72 | 73 | # Don't complain about missing debug-only code: 74 | "def __repr__", 75 | "def __str__", 76 | "if self\\.debug", 77 | 78 | # Don't complain if tests don't hit defensive assertion code: 79 | "raise AssertionError", 80 | "raise NotImplementedError", 81 | 82 | # Don't complain if non-runnable code isn't run: 83 | "if 0:", 84 | "if __name__ == .__main__.:", 85 | ] 86 | skip_covered = true 87 | 88 | 89 | [tool.black] 90 | line-length = 120 91 | target-version = ['py313'] 92 | extend-exclude = ''' 93 | /( 94 | build 95 | | \.venv 96 | | src/stop_words/stop-words 97 | )/ 98 | ''' 99 | 100 | 101 | [tool.isort] 102 | line_length = 120 103 | extend_skip = ["src/stop_words/stop-words", "src/stop_words/_version.py"] 104 | sections = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" 105 | indent = 4 106 | multi_line_output = 3 107 | include_trailing_comma = true 108 | order_by_type = true 109 | combine_as_imports = true 110 | lines_after_imports = 2 111 | float_to_top = true 112 | atomic = true 113 | -------------------------------------------------------------------------------- /src/stop_words/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stop Words Library 3 | 4 | A module for loading and managing stop words across multiple languages. 5 | Stop words are common words that are typically filtered out in text processing. 6 | 7 | This module provides: 8 | - Loading stop words from language-specific files 9 | - Caching for performance optimization 10 | - Custom filtering system for post-processing stop words 11 | - Language code mapping (e.g., 'en' -> 'english') 12 | """ 13 | 14 | import json 15 | from pathlib import Path 16 | from typing import Callable 17 | 18 | 19 | # Directory configuration 20 | CURRENT_DIR = Path(__file__).resolve().parent 21 | STOP_WORDS_DIR = CURRENT_DIR / "stop-words" 22 | 23 | # Global caches 24 | STOP_WORDS_CACHE: dict[str, list[str]] = {} 25 | _filters: dict[str | None, list[Callable[[list[str], str | None], list[str]]]] = {None: []} 26 | 27 | # Load language mapping configuration 28 | _languages_file = STOP_WORDS_DIR / "languages.json" 29 | with _languages_file.open("r", encoding="utf-8") as f: 30 | LANGUAGE_MAPPING: dict[str, str] = json.load(f) 31 | 32 | AVAILABLE_LANGUAGES: list[str] = list(LANGUAGE_MAPPING.values()) 33 | 34 | 35 | class StopWordError(Exception): 36 | """Raised when a requested language is unavailable or files are unreadable.""" 37 | 38 | pass 39 | 40 | 41 | def get_version() -> str: 42 | """ 43 | Get the version of the stop words library. 44 | 45 | :returns: The version string from _version module. 46 | """ 47 | from ._version import __version__ # type: ignore 48 | 49 | return __version__ 50 | 51 | 52 | def get_stop_words(language: str, *, cache: bool = True) -> list[str]: 53 | """ 54 | Load stop words for a specified language. 55 | 56 | :param language: Language code (e.g., 'en', 'es') or full name (e.g., 'english', 'spanish'). 57 | Supports both ISO codes and full language names via LANGUAGE_MAPPING. 58 | :param cache: If True, cache the results for faster subsequent access. Defaults to True. 59 | 60 | :returns: A list of stop words for the specified language. Returns a copy to prevent external modification. 61 | :raises StopWordError: If the language is not available or the file cannot be read. 62 | 63 | Example: 64 | >>> words = get_stop_words('en') 65 | >>> 'the' in words 66 | True 67 | """ 68 | # Normalize language code to full name 69 | try: 70 | language = LANGUAGE_MAPPING[language] 71 | except KeyError: 72 | if language not in AVAILABLE_LANGUAGES: 73 | raise StopWordError( 74 | f'Language "{language}" is unavailable. ' 75 | f'Available languages: {", ".join(sorted(AVAILABLE_LANGUAGES))}' 76 | ) 77 | 78 | # Return cached version if available 79 | if cache and language in STOP_WORDS_CACHE: 80 | return STOP_WORDS_CACHE[language].copy() 81 | 82 | # Load stop words from file 83 | language_file = STOP_WORDS_DIR / f"{language}.txt" 84 | 85 | try: 86 | with language_file.open("r", encoding="utf-8") as f: 87 | stop_words = [line.strip() for line in f if line.strip()] 88 | stop_words = apply_filters(stop_words, language) 89 | except (IOError, OSError) as e: 90 | raise StopWordError(f'File "{language_file}" is unreadable. Check your installation. Error: {e}') from e 91 | 92 | # Cache if requested 93 | if cache: 94 | STOP_WORDS_CACHE[language] = stop_words 95 | 96 | return stop_words.copy() 97 | 98 | 99 | def apply_filters(stopwords: list[str], language: str | None) -> list[str]: 100 | """ 101 | Apply registered filters to stop words. 102 | 103 | Filters can modify, remove, or add stop words. Language-specific filters 104 | are applied first, followed by global filters (registered with language=None). 105 | 106 | :param stopwords: List of stop words to filter. 107 | :param language: Language code for language-specific filters. 108 | 109 | :returns: Filtered list of stop words. 110 | """ 111 | # Apply language-specific filters 112 | if language in _filters: 113 | for func in _filters[language]: 114 | stopwords = func(stopwords, language) 115 | 116 | # Apply global filters 117 | for func in _filters[None]: 118 | stopwords = func(stopwords, language) 119 | 120 | return stopwords 121 | 122 | 123 | def add_filter(func: Callable[[list[str], str | None], list[str]], *, language: str | None = None) -> None: 124 | """ 125 | Register a filter function for stop word post-processing. 126 | 127 | Language-specific filters receive: func(stopwords: list[str]) -> list[str] 128 | Global filters receive: func(stopwords: list[str], language: str) -> list[str] 129 | 130 | Note: Filters only apply to newly loaded stop words, not cached ones. 131 | Clear the cache with STOP_WORDS_CACHE.clear() to reapply filters. 132 | 133 | :param func: Callable that takes a list of stop words and returns a modified list. 134 | :param language: Language code for language-specific filter, or None for global filter. 135 | 136 | Example: 137 | >>> # Add a filter to uppercase all stop words for English 138 | >>> add_filter(lambda words: [w.upper() for w in words], 'english') 139 | >>> # Add a global filter to remove single-character words 140 | >>> add_filter(lambda words, lang: [w for w in words if len(w) > 1]) 141 | """ 142 | if language is None: 143 | _filters[None].append(func) 144 | return 145 | 146 | if language not in _filters: 147 | _filters[language] = [] 148 | 149 | _filters[language].append(func) 150 | 151 | 152 | def remove_filter(func: Callable[[list[str], str | None], list[str]], *, language: str | None = None) -> bool: 153 | """ 154 | Unregister a previously registered filter function. 155 | 156 | :param func: The filter function to remove. 157 | :param language: Language code or None for global filters. 158 | 159 | :returns: True if the filter was found and removed, False otherwise. 160 | """ 161 | if language not in _filters or func not in _filters[language]: 162 | return False 163 | 164 | _filters[language].remove(func) 165 | return True 166 | 167 | 168 | def safe_get_stop_words(language: str) -> list[str]: 169 | """ 170 | Safely load stop words, returning an empty list on error. 171 | 172 | This is a convenience wrapper around get_stop_words() that catches 173 | StopWordError exceptions and returns an empty list instead. 174 | 175 | :param language: Language code or full name. 176 | 177 | :returns: Stop words for the language, or empty list if unavailable. 178 | 179 | Example: 180 | >>> words = safe_get_stop_words('unknown_language') 181 | >>> words 182 | [] 183 | """ 184 | try: 185 | return get_stop_words(language) 186 | except StopWordError: 187 | return [] 188 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Python Stop Words 3 | ================= 4 | 5 | .. image:: https://img.shields.io/pypi/v/stop-words.svg 6 | :target: https://pypi.org/project/stop-words/ 7 | :alt: PyPI version 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/stop-words.svg 10 | :target: https://pypi.org/project/stop-words/ 11 | :alt: Python versions 12 | 13 | .. image:: https://img.shields.io/pypi/l/stop-words.svg 14 | :target: https://github.com/Alir3z4/python-stop-words/blob/master/LICENSE 15 | :alt: License 16 | 17 | .. contents:: Table of Contents 18 | :depth: 2 19 | :local: 20 | 21 | Overview 22 | -------- 23 | 24 | A Python library providing curated lists of stop words across 34+ languages. Stop words are common words (like "the", "is", "at") that are typically filtered out in natural language processing and text analysis tasks. 25 | 26 | **Key Features:** 27 | 28 | * **34+ Languages** - Extensive language support. 29 | * **Performance** - Built-in caching for fast repeated access. 30 | * **Flexible** - Custom filtering system for advanced use cases. 31 | * **Zero Dependencies** - Lightweight with no external requirements. 32 | 33 | 34 | Available Languages 35 | ------------------- 36 | 37 | All the available languages supported by https://github.com/Alir3z4/stop-words 38 | 39 | Each language is identified by both its ISO 639-1 language code (e.g., ``en``) and full name (e.g., ``english``). 40 | 41 | 42 | Installation 43 | ------------ 44 | 45 | **Via pip (Recommended):** 46 | 47 | .. code-block:: bash 48 | 49 | $ pip install stop-words 50 | 51 | **Via Git:** 52 | 53 | .. code-block:: bash 54 | 55 | $ git clone --recursive https://github.com/Alir3z4/python-stop-words.git 56 | $ cd python-stop-words 57 | $ pip install -e . 58 | 59 | **Requirements:** 60 | 61 | * Usually any version of Python that supports type hints and probably has not been marked as EOL. 62 | 63 | 64 | Quick Start 65 | ----------- 66 | 67 | Basic Usage 68 | ~~~~~~~~~~~ 69 | 70 | .. code-block:: python 71 | 72 | from stop_words import get_stop_words 73 | 74 | # Get English stop words using language code 75 | stop_words = get_stop_words('en') 76 | 77 | # Or use the full language name 78 | stop_words = get_stop_words('english') 79 | 80 | # Use in text processing 81 | text = "The quick brown fox jumps over the lazy dog" 82 | words = text.lower().split() 83 | filtered_words = [word for word in words if word not in stop_words] 84 | print(filtered_words) # ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'] 85 | 86 | 87 | Safe Loading 88 | ~~~~~~~~~~~~ 89 | 90 | Use ``safe_get_stop_words()`` when you're not sure if a language is supported: 91 | 92 | .. code-block:: python 93 | 94 | from stop_words import safe_get_stop_words 95 | 96 | # Returns empty list instead of raising an exception 97 | stop_words = safe_get_stop_words('klingon') # Returns [] 98 | 99 | # Works normally with supported languages 100 | stop_words = safe_get_stop_words('fr') # Returns French stop words 101 | 102 | 103 | Advanced Usage 104 | -------------- 105 | 106 | Checking Available Languages 107 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 108 | 109 | .. code-block:: python 110 | 111 | from stop_words import AVAILABLE_LANGUAGES, LANGUAGE_MAPPING 112 | 113 | # List all available languages 114 | print(AVAILABLE_LANGUAGES) 115 | # ['arabic', 'bulgarian', 'catalan', ...] 116 | 117 | # View language code mappings 118 | print(LANGUAGE_MAPPING) 119 | # {'en': 'english', 'fr': 'french', ...} 120 | 121 | 122 | Caching Control 123 | ~~~~~~~~~~~~~~~ 124 | 125 | By default, stop words are cached for performance. You can control this behavior: 126 | 127 | .. code-block:: python 128 | 129 | from stop_words import get_stop_words, STOP_WORDS_CACHE 130 | 131 | # Disable caching for this call 132 | stop_words = get_stop_words('en', cache=False) 133 | 134 | # Clear the cache manually 135 | STOP_WORDS_CACHE.clear() 136 | 137 | # Check what's cached 138 | print(STOP_WORDS_CACHE.keys()) # ['english', 'french', ...] 139 | 140 | 141 | Custom Filters 142 | ~~~~~~~~~~~~~~ 143 | 144 | Apply custom transformations to stop words using the filter system: 145 | 146 | .. code-block:: python 147 | 148 | from stop_words import get_stop_words, add_filter, remove_filter 149 | 150 | # Add a global filter (applies to all languages) 151 | def remove_short_words(words, language): 152 | """Remove words shorter than 3 characters.""" 153 | return [w for w in words if len(w) >= 3] 154 | 155 | add_filter(remove_short_words) 156 | stop_words = get_stop_words('en', cache=False) 157 | 158 | # Add a language-specific filter 159 | def uppercase_words(words): 160 | """Convert all words to uppercase.""" 161 | return [w.upper() for w in words] 162 | 163 | add_filter(uppercase_words, language='english') 164 | stop_words = get_stop_words('en', cache=False) 165 | 166 | # Remove a filter when done 167 | remove_filter(uppercase_words, language='english') 168 | 169 | **Note:** Filters only apply to newly loaded stop words, not cached ones. Use ``cache=False`` or clear the cache to apply new filters. 170 | 171 | 172 | Practical Examples 173 | ------------------ 174 | 175 | Text Preprocessing 176 | ~~~~~~~~~~~~~~~~~~ 177 | 178 | .. code-block:: python 179 | 180 | from stop_words import get_stop_words 181 | import re 182 | 183 | def preprocess_text(text, language='en'): 184 | """Clean and filter text for NLP tasks.""" 185 | stop_words = set(get_stop_words(language)) 186 | 187 | # Convert to lowercase and extract words 188 | words = re.findall(r'\b\w+\b', text.lower()) 189 | 190 | # Remove stop words 191 | filtered_words = [w for w in words if w not in stop_words] 192 | 193 | return filtered_words 194 | 195 | text = "The quick brown fox jumps over the lazy dog" 196 | print(preprocess_text(text)) 197 | # ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'] 198 | 199 | 200 | Multilingual Processing 201 | ~~~~~~~~~~~~~~~~~~~~~~~ 202 | 203 | .. code-block:: python 204 | 205 | from stop_words import get_stop_words 206 | 207 | def filter_multilingual_text(texts_dict): 208 | """Process texts in multiple languages. 209 | 210 | Args: 211 | texts_dict: Dictionary mapping language codes to text strings 212 | 213 | Returns: 214 | Dictionary with filtered words for each language 215 | """ 216 | results = {} 217 | 218 | for lang_code, text in texts_dict.items(): 219 | stop_words = set(get_stop_words(lang_code)) 220 | words = text.lower().split() 221 | filtered = [w for w in words if w not in stop_words] 222 | results[lang_code] = filtered 223 | 224 | return results 225 | 226 | texts = { 227 | 'en': 'The cat is on the table', 228 | 'fr': 'Le chat est sur la table', 229 | 'es': 'El gato está en la mesa' 230 | } 231 | 232 | print(filter_multilingual_text(texts)) 233 | 234 | 235 | Keyword Extraction 236 | ~~~~~~~~~~~~~~~~~~ 237 | 238 | .. code-block:: python 239 | 240 | from stop_words import get_stop_words 241 | from collections import Counter 242 | import re 243 | 244 | def extract_keywords(text, language='en', top_n=10): 245 | """Extract the most common meaningful words from text.""" 246 | stop_words = set(get_stop_words(language)) 247 | 248 | # Extract words and filter 249 | words = re.findall(r'\b\w+\b', text.lower()) 250 | meaningful_words = [w for w in words if w not in stop_words and len(w) > 2] 251 | 252 | # Count and return top keywords 253 | word_counts = Counter(meaningful_words) 254 | return word_counts.most_common(top_n) 255 | 256 | article = """ 257 | Python is a high-level programming language. Python is known for its 258 | simplicity and readability. Many developers choose Python for data science. 259 | """ 260 | 261 | keywords = extract_keywords(article) 262 | print(keywords) 263 | # [('python', 3), ('language', 1), ('high-level', 1), ...] 264 | 265 | 266 | API Reference 267 | ------------- 268 | 269 | Functions 270 | ~~~~~~~~~ 271 | 272 | ``get_stop_words(language, *, cache=True)`` 273 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 274 | 275 | Load stop words for a specified language. 276 | 277 | **Parameters:** 278 | 279 | * ``language`` (str): Language code (e.g., 'en') or full name (e.g., 'english') 280 | * ``cache`` (bool, optional): Enable caching. Defaults to True. 281 | 282 | **Returns:** 283 | 284 | * ``list[str]``: List of stop words 285 | 286 | **Raises:** 287 | 288 | * ``StopWordError``: If language is unavailable or files are unreadable 289 | 290 | **Example:** 291 | 292 | .. code-block:: python 293 | 294 | stop_words = get_stop_words('en') 295 | stop_words = get_stop_words('french', cache=False) 296 | 297 | 298 | ``safe_get_stop_words(language)`` 299 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 300 | 301 | Safely load stop words, returning empty list on error. 302 | 303 | **Parameters:** 304 | 305 | * ``language`` (str): Language code or full name 306 | 307 | **Returns:** 308 | 309 | * ``list[str]``: Stop words, or empty list if unavailable 310 | 311 | **Example:** 312 | 313 | .. code-block:: python 314 | 315 | stop_words = safe_get_stop_words('unknown') # Returns [] 316 | 317 | 318 | ``add_filter(func, language=None)`` 319 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 320 | 321 | Register a filter function for stop word post-processing. 322 | 323 | **Parameters:** 324 | 325 | * ``func`` (Callable): Filter function 326 | * ``language`` (str | None, optional): Language code or None for global filter 327 | 328 | **Filter Signatures:** 329 | 330 | * Language-specific: ``func(stopwords: list[str]) -> list[str]`` 331 | * Global: ``func(stopwords: list[str], language: str) -> list[str]`` 332 | 333 | **Example:** 334 | 335 | .. code-block:: python 336 | 337 | def remove_short(words, lang): 338 | return [w for w in words if len(w) > 3] 339 | 340 | add_filter(remove_short) # Global filter 341 | 342 | 343 | ``remove_filter(func, language=None)`` 344 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 345 | 346 | Remove a previously registered filter. 347 | 348 | **Parameters:** 349 | 350 | * ``func`` (Callable): The filter function to remove 351 | * ``language`` (str | None, optional): Language code or None 352 | 353 | **Returns:** 354 | 355 | * ``bool``: True if removed, False if not found 356 | 357 | **Example:** 358 | 359 | .. code-block:: python 360 | 361 | success = remove_filter(my_filter, language='english') 362 | 363 | 364 | Constants 365 | ~~~~~~~~~ 366 | 367 | ``AVAILABLE_LANGUAGES`` 368 | ^^^^^^^^^^^^^^^^^^^^^^^^ 369 | 370 | List of all supported language names. 371 | 372 | .. code-block:: python 373 | 374 | ['arabic', 'bulgarian', 'catalan', ...] 375 | 376 | 377 | ``LANGUAGE_MAPPING`` 378 | ^^^^^^^^^^^^^^^^^^^^ 379 | 380 | Dictionary mapping language codes to full names. 381 | 382 | .. code-block:: python 383 | 384 | {'en': 'english', 'fr': 'french', 'de': 'german', ...} 385 | 386 | 387 | ``STOP_WORDS_CACHE`` 388 | ^^^^^^^^^^^^^^^^^^^^^ 389 | 390 | Dictionary storing cached stop words. Can be manually cleared. 391 | 392 | .. code-block:: python 393 | 394 | STOP_WORDS_CACHE.clear() # Clear all cached data 395 | 396 | 397 | Exceptions 398 | ~~~~~~~~~~ 399 | 400 | ``StopWordError`` 401 | ^^^^^^^^^^^^^^^^^ 402 | 403 | Raised when a language is unavailable or files cannot be read. 404 | 405 | .. code-block:: python 406 | 407 | try: 408 | stop_words = get_stop_words('invalid') 409 | except StopWordError as e: 410 | print(f"Error: {e}") 411 | 412 | 413 | Performance Tips 414 | ---------------- 415 | 416 | 1. **Use caching** - Keep ``cache=True`` (default) for repeated access to the same language 417 | 2. **Reuse stop word sets** - Convert to ``set()`` once for O(1) lookup performance: 418 | 419 | .. code-block:: python 420 | 421 | stop_words_set = set(get_stop_words('en')) 422 | # Fast membership testing 423 | is_stop_word = 'the' in stop_words_set 424 | 425 | 3. **Preload languages** - Load stop words during initialization, not in tight loops 426 | 4. **Use safe_get_stop_words** - Avoid try/except overhead when language availability is uncertain 427 | 428 | 429 | Troubleshooting 430 | --------------- 431 | 432 | **"Language unavailable" error** 433 | 434 | * Check spelling and use either the language code or full name 435 | * Verify the language is in ``AVAILABLE_LANGUAGES`` 436 | * See the `Available Languages`_ table above 437 | 438 | **"File is unreadable" error** 439 | 440 | * Ensure the package installed correctly: ``pip install --force-reinstall stop-words`` 441 | * Check file permissions in the installation directory 442 | * Verify the ``stop-words`` subdirectory exists in the package 443 | 444 | **Filters not applying** 445 | 446 | * Filters only affect newly loaded stop words 447 | * Clear the cache: ``STOP_WORDS_CACHE.clear()`` 448 | * Use ``cache=False`` when testing filters 449 | 450 | **Performance issues** 451 | 452 | * Ensure caching is enabled (default behavior) 453 | * Convert stop word lists to sets for faster lookups 454 | * Preload stop words outside of loops 455 | 456 | 457 | Contributing 458 | ------------ 459 | 460 | Contributions are welcome! Here's how you can help: 461 | 462 | 1. **Add new languages** - Submit stop word lists for unsupported languages via https://github.com/Alir3z4/stop-words 463 | 2. **Improve existing lists** - Suggest additions or removals for existing languages via https://github.com/Alir3z4/stop-words 464 | 3. **Report bugs** - Open issues on GitHub 465 | 4. **Submit PRs** - Fix bugs or add features 466 | 467 | **Repository:** https://github.com/Alir3z4/python-stop-words 468 | 469 | 470 | License 471 | ------- 472 | 473 | This project is licensed under the BSD 3-Clause License. See ``LICENSE`` file for details. 474 | 475 | 476 | Changelog 477 | --------- 478 | 479 | See `ChangeLog.rst `_ for version history. 480 | 481 | 482 | Support 483 | ------- 484 | 485 | * **Issues:** https://github.com/Alir3z4/python-stop-words/issues 486 | * **PyPI:** https://pypi.org/project/stop-words/ 487 | 488 | 489 | Credits 490 | ------- 491 | 492 | * Maintained by `Alireza Savand `_ 493 | * Stop word lists compiled from various open sources 494 | * Contributors: See `GitHub contributors `_ 495 | 496 | 497 | Related Projects 498 | ---------------- 499 | * `Stop Words `_ - List of common stop words in various languages. 500 | * `NLTK `_ - Natural Language Toolkit with extensive NLP features 501 | * `spaCy `_ - Industrial-strength NLP library 502 | * `TextBlob `_ - Simplified text processing 503 | 504 | 505 | Indices and Tables 506 | ------------------ 507 | 508 | * `Available Languages`_ 509 | * `Quick Start`_ 510 | * `Advanced Usage`_ 511 | * `API Reference`_ 512 | -------------------------------------------------------------------------------- /src/tests.py: -------------------------------------------------------------------------------- 1 | import random 2 | from pathlib import Path 3 | from unittest import TestCase 4 | 5 | import stop_words 6 | from stop_words import ( 7 | AVAILABLE_LANGUAGES, 8 | LANGUAGE_MAPPING, 9 | STOP_WORDS_CACHE, 10 | StopWordError, 11 | add_filter, 12 | get_stop_words, 13 | get_version, 14 | remove_filter, 15 | safe_get_stop_words, 16 | ) 17 | 18 | 19 | class TestStopWordsBasic(TestCase): 20 | """Test basic stop word loading functionality.""" 21 | 22 | NUMBER_OF_ENGLISH_STOP_WORDS = 1333 23 | 24 | def test_get_stop_words_returns_list(self) -> None: 25 | """Stop words should be returned as a list.""" 26 | sw = get_stop_words("english") 27 | self.assertIsInstance(sw, list) 28 | self.assertEqual(len(sw), self.NUMBER_OF_ENGLISH_STOP_WORDS) 29 | 30 | def test_get_stop_words_contains_strings(self) -> None: 31 | """All stop words should be strings.""" 32 | sw = get_stop_words("english") 33 | self.assertTrue(all(isinstance(word, str) for word in sw)) 34 | 35 | def test_get_stop_words_no_empty_strings(self) -> None: 36 | """Stop words should not contain empty strings.""" 37 | sw = get_stop_words("english") 38 | self.assertTrue(all(word.strip() for word in sw)) 39 | 40 | def test_get_stop_words_language_mapping(self) -> None: 41 | """Language codes should map to full language names.""" 42 | sw_code = get_stop_words("en") 43 | sw_full = get_stop_words("english") 44 | self.assertEqual(len(sw_code), self.NUMBER_OF_ENGLISH_STOP_WORDS) 45 | self.assertEqual(sw_code, sw_full) 46 | 47 | def test_common_english_stop_words(self) -> None: 48 | """Common English stop words should be present.""" 49 | sw = get_stop_words("en") 50 | common_words = ["the", "a", "an", "and", "or", "but", "is", "are"] 51 | for word in common_words: 52 | self.assertIn(word, sw, f"Expected '{word}' in English stop words") 53 | 54 | def test_get_version(self) -> None: 55 | self.assertIsNotNone(get_version()) 56 | 57 | 58 | class TestStopWordsCache(TestCase): 59 | """Test caching behavior.""" 60 | 61 | def setUp(self) -> None: 62 | """Clear cache before each test.""" 63 | STOP_WORDS_CACHE.clear() 64 | 65 | def test_cache_enabled_by_default(self) -> None: 66 | """Cache should be enabled by default.""" 67 | self.assertNotIn("french", STOP_WORDS_CACHE) 68 | get_stop_words("fr") 69 | self.assertIn("french", STOP_WORDS_CACHE) 70 | 71 | def test_cache_disabled(self) -> None: 72 | """Cache should not be used when cache=False.""" 73 | self.assertNotIn("german", STOP_WORDS_CACHE) 74 | get_stop_words("de", cache=False) 75 | self.assertNotIn("german", STOP_WORDS_CACHE) 76 | 77 | def test_cache_persists_across_calls(self) -> None: 78 | """Cached stop words should persist across calls.""" 79 | original_dir = stop_words.STOP_WORDS_DIR 80 | 81 | # Load and cache 82 | sw1 = get_stop_words("fr") 83 | self.assertIn("french", STOP_WORDS_CACHE) 84 | 85 | # Break the file system path 86 | stop_words.STOP_WORDS_DIR = Path("non-existent-directory") 87 | 88 | # Should still work from cache 89 | sw2 = get_stop_words("french") 90 | self.assertEqual(sw1, sw2) 91 | 92 | # Restore 93 | stop_words.STOP_WORDS_DIR = original_dir 94 | 95 | def test_cache_miss_raises_error(self) -> None: 96 | """Cache miss with invalid path should raise error.""" 97 | original_dir = stop_words.STOP_WORDS_DIR 98 | stop_words.STOP_WORDS_DIR = Path("non-existent-directory") 99 | 100 | with self.assertRaises(StopWordError): 101 | get_stop_words("spanish") 102 | 103 | self.assertNotIn("spanish", STOP_WORDS_CACHE) 104 | stop_words.STOP_WORDS_DIR = original_dir 105 | 106 | def test_returns_copy_not_reference(self) -> None: 107 | """get_stop_words should return a copy, not the cached reference.""" 108 | sw1 = get_stop_words("en") 109 | sw2 = get_stop_words("en") 110 | 111 | # Modify one list 112 | sw1.append("custom_word") 113 | 114 | # The other should be unchanged 115 | self.assertNotIn("custom_word", sw2) 116 | 117 | # Cache should also be unchanged 118 | sw3 = get_stop_words("en") 119 | self.assertNotIn("custom_word", sw3) 120 | 121 | 122 | class TestStopWordsErrors(TestCase): 123 | """Test error handling.""" 124 | 125 | def test_unavailable_language_raises_error(self) -> None: 126 | """Unknown languages should raise StopWordError.""" 127 | with self.assertRaises(StopWordError) as ctx: 128 | get_stop_words("sindarin") 129 | self.assertIn("sindarin", str(ctx.exception).lower()) 130 | 131 | def test_missing_file_raises_error(self) -> None: 132 | """Missing language files should raise StopWordError.""" 133 | original_dir = stop_words.STOP_WORDS_DIR 134 | stop_words.STOP_WORDS_DIR = Path("non-existent-directory") 135 | 136 | with self.assertRaises(StopWordError) as ctx: 137 | get_stop_words("german", cache=False) 138 | 139 | self.assertIn("unreadable", str(ctx.exception).lower()) 140 | stop_words.STOP_WORDS_DIR = original_dir 141 | 142 | def test_safe_get_stop_words_no_exception(self) -> None: 143 | """safe_get_stop_words should never raise exceptions.""" 144 | result = safe_get_stop_words("klingon") 145 | self.assertEqual(result, []) 146 | self.assertIsInstance(result, list) 147 | 148 | def test_safe_get_stop_words_with_valid_language(self) -> None: 149 | """safe_get_stop_words should work with valid languages.""" 150 | result = safe_get_stop_words("en") 151 | self.assertGreater(len(result), 0) 152 | 153 | def test_error_message_includes_available_languages(self) -> None: 154 | """Error message should hint at available languages.""" 155 | with self.assertRaises(StopWordError) as ctx: 156 | get_stop_words("notreal") 157 | error_msg = str(ctx.exception).lower() 158 | self.assertIn("available", error_msg) 159 | 160 | 161 | class TestStopWordsFilters(TestCase): 162 | """Test the filter system.""" 163 | 164 | def setUp(self) -> None: 165 | """Clear cache and filters before each test.""" 166 | STOP_WORDS_CACHE.clear() 167 | stop_words._filters.clear() 168 | stop_words._filters[None] = [] 169 | 170 | def tearDown(self) -> None: 171 | """Clean up filters after each test.""" 172 | stop_words._filters.clear() 173 | stop_words._filters[None] = [] 174 | 175 | def test_global_filter_removes_words(self) -> None: 176 | """Global filters should modify all languages.""" 177 | 178 | def remove_short_words(words: list[str], _lang: str | None = None) -> list[str]: 179 | return [w for w in words if len(w) > 3] 180 | 181 | add_filter(remove_short_words) 182 | sw = get_stop_words("en", cache=False) 183 | 184 | self.assertTrue(all(len(word) > 3 for word in sw)) 185 | 186 | def test_language_specific_filter(self) -> None: 187 | """Language-specific filters should only affect that language.""" 188 | 189 | def uppercase_filter(words: list[str], _language: str | None = None) -> list[str]: 190 | return [w.upper() for w in words] 191 | 192 | add_filter(uppercase_filter, language="english") 193 | 194 | # English should be uppercase 195 | en_words = get_stop_words("en", cache=False) 196 | self.assertTrue(all(w.isupper() for w in en_words if not w.isnumeric())) 197 | 198 | # Other languages should be unaffected 199 | fr_words = get_stop_words("fr", cache=False) 200 | self.assertFalse(all(w.isupper() for w in fr_words)) 201 | 202 | def test_multiple_filters_chain(self) -> None: 203 | """Multiple filters should be applied in sequence.""" 204 | 205 | def add_prefix(words: list[str], _lang: str | None = None) -> list[str]: 206 | return [f"prefix_{w}" for w in words] 207 | 208 | def add_suffix(words: list[str], _lang: str | None = None) -> list[str]: 209 | return [f"{w}_suffix" for w in words] 210 | 211 | add_filter(add_prefix) 212 | add_filter(add_suffix) 213 | 214 | sw = get_stop_words("en", cache=False) 215 | sample_word = sw[0] 216 | 217 | self.assertTrue(sample_word.startswith("prefix_")) 218 | self.assertTrue(sample_word.endswith("_suffix")) 219 | 220 | def test_remove_filter_returns_true(self) -> None: 221 | """Removing an existing filter should return True.""" 222 | 223 | def dummy_filter(words: list[str], _lang: str | None = None) -> list[str]: 224 | return words 225 | 226 | add_filter(dummy_filter) 227 | 228 | # Calling it to get the `dummy_filter` actually execute. 229 | get_stop_words("en") 230 | 231 | result = remove_filter(dummy_filter) 232 | self.assertTrue(result) 233 | 234 | def test_remove_nonexistent_filter_returns_false(self) -> None: 235 | """Removing a non-existent filter should return False.""" 236 | 237 | def dummy_filter(words: list[str], _lang: str | None = None) -> list[str]: 238 | return words # pragma: no cover 239 | 240 | result = remove_filter(dummy_filter) 241 | self.assertFalse(result) 242 | 243 | def test_remove_filter_with_language(self) -> None: 244 | """Language-specific filter removal should work.""" 245 | 246 | def lang_filter(words: list[str], _language: str | None = None) -> list[str]: 247 | return words 248 | 249 | add_filter(lang_filter, language="english") 250 | 251 | # Calling it to get the `lang_filter` actually execute. 252 | get_stop_words("en") 253 | 254 | result = remove_filter(lang_filter, language="english") 255 | self.assertTrue(result) 256 | 257 | # Should return False when trying to remove again 258 | result = remove_filter(lang_filter, language="english") 259 | self.assertFalse(result) 260 | 261 | def test_filter_with_random_letter_removal(self) -> None: 262 | """Original test: remove words containing a random letter.""" 263 | language = "en" 264 | before = get_stop_words(language, cache=False) 265 | letter = random.choice(random.choice(before)) 266 | 267 | def remove_letter(words: list[str], _lang: str | None = None) -> list[str]: 268 | return [w for w in words if letter not in w] 269 | 270 | add_filter(remove_letter) 271 | after = get_stop_words(language, cache=False) 272 | 273 | for word in after: 274 | self.assertNotIn(letter, word) 275 | 276 | self.assertTrue(remove_filter(remove_letter)) 277 | 278 | 279 | class TestStopWordsAllLanguages(TestCase): 280 | """Test all available languages.""" 281 | 282 | def test_all_mapped_languages_loadable(self) -> None: 283 | """All languages in LANGUAGE_MAPPING should be loadable.""" 284 | for code, full_name in LANGUAGE_MAPPING.items(): 285 | with self.subTest(code=code, language=full_name): 286 | sw = safe_get_stop_words(code) 287 | self.assertGreater(len(sw), 0, f"No stop words loaded for {full_name} ({code})") 288 | 289 | def test_random_language_loading(self) -> None: 290 | """Random sample of languages should all load successfully.""" 291 | all_languages = list(LANGUAGE_MAPPING.keys()) + AVAILABLE_LANGUAGES 292 | sample = random.sample(all_languages, min(10, len(all_languages))) 293 | 294 | for language in sample: 295 | with self.subTest(language=language): 296 | sw = safe_get_stop_words(language) 297 | self.assertGreater(len(sw), 0, f"Cannot load stopwords for {language}") 298 | 299 | def test_all_languages_have_unique_words(self) -> None: 300 | """Each language should have at least some unique characteristics.""" 301 | # Compare English and French as they should be different 302 | en = set(get_stop_words("en")) 303 | fr = set(get_stop_words("fr")) 304 | 305 | # Should have different words 306 | self.assertNotEqual(en, fr) 307 | # Should have some overlap (common borrowed words) 308 | self.assertGreater(len(en & fr), 0) 309 | 310 | 311 | class TestStopWordsEdgeCases(TestCase): 312 | """Test edge cases and boundary conditions.""" 313 | 314 | def test_empty_language_string(self) -> None: 315 | """Empty language string should raise error.""" 316 | with self.assertRaises(StopWordError): 317 | get_stop_words("") 318 | 319 | def test_none_language(self) -> None: 320 | """None as language should raise appropriate error.""" 321 | with self.assertRaises((StopWordError, KeyError, TypeError)): 322 | get_stop_words(None) # type: ignore 323 | 324 | def test_case_sensitive_language_codes(self) -> None: 325 | """Language codes should be case-sensitive.""" 326 | # Lowercase should work 327 | sw_lower = get_stop_words("en") 328 | self.assertGreater(len(sw_lower), 0) 329 | 330 | # Uppercase might not be in mapping 331 | with self.assertRaises(StopWordError): 332 | get_stop_words("EN") 333 | 334 | def test_whitespace_in_stop_words(self) -> None: 335 | """Stop words should be properly stripped of whitespace.""" 336 | sw = get_stop_words("en") 337 | for word in sw: 338 | self.assertEqual(word, word.strip(), f"Word '{word}' has extra whitespace") 339 | 340 | def test_duplicate_stop_words(self) -> None: 341 | """Stop words list should not contain duplicates.""" 342 | sw = get_stop_words("en") 343 | unique_words = set(sw) 344 | self.assertEqual(len(sw), len(unique_words), "Stop words list contains duplicates") 345 | 346 | def test_filter_returns_empty_list(self) -> None: 347 | """Filter that returns empty list should work.""" 348 | 349 | def remove_all(words: list[str], _lang: str | None = None) -> list[str]: 350 | return [] 351 | 352 | STOP_WORDS_CACHE.clear() 353 | stop_words._filters.clear() 354 | stop_words._filters[None] = [] 355 | 356 | add_filter(remove_all) 357 | sw = get_stop_words("en", cache=False) 358 | self.assertEqual(sw, []) 359 | 360 | # Cleanup 361 | remove_filter(remove_all) 362 | 363 | def test_filter_adds_words(self) -> None: 364 | """Filter that adds words should work.""" 365 | 366 | def add_custom(words: list[str], _lang: str | None = None) -> list[str]: 367 | return words + ["custom1", "custom2"] 368 | 369 | STOP_WORDS_CACHE.clear() 370 | stop_words._filters.clear() 371 | stop_words._filters[None] = [] 372 | 373 | add_filter(add_custom) 374 | sw = get_stop_words("en", cache=False) 375 | 376 | self.assertIn("custom1", sw) 377 | self.assertIn("custom2", sw) 378 | 379 | # Cleanup 380 | remove_filter(add_custom) 381 | 382 | def test_concurrent_filter_modifications(self) -> None: 383 | """Adding and removing filters should be safe.""" 384 | filters = [ 385 | lambda w, language: w, 386 | lambda w, language: [word.upper() for word in w], 387 | lambda w, language: [word.lower() for word in w], 388 | ] 389 | 390 | STOP_WORDS_CACHE.clear() 391 | stop_words._filters.clear() 392 | stop_words._filters[None] = [] 393 | 394 | # Add all filters 395 | for f in filters: 396 | add_filter(f) 397 | 398 | # Remove them in different order 399 | for f in reversed(filters): 400 | remove_filter(f) 401 | 402 | # Should be back to empty 403 | self.assertEqual(len(stop_words._filters[None]), 0) 404 | 405 | 406 | class TestStopWordsConfiguration(TestCase): 407 | """Test module configuration and constants.""" 408 | 409 | def test_available_languages_is_list(self) -> None: 410 | """AVAILABLE_LANGUAGES should be a list.""" 411 | self.assertIsInstance(AVAILABLE_LANGUAGES, list) 412 | self.assertGreater(len(AVAILABLE_LANGUAGES), 0) 413 | 414 | def test_language_mapping_is_dict(self) -> None: 415 | """LANGUAGE_MAPPING should be a dictionary.""" 416 | self.assertIsInstance(LANGUAGE_MAPPING, dict) 417 | self.assertGreater(len(LANGUAGE_MAPPING), 0) 418 | 419 | def test_cache_is_dict(self) -> None: 420 | """STOP_WORDS_CACHE should be a dictionary.""" 421 | self.assertIsInstance(STOP_WORDS_CACHE, dict) 422 | 423 | def test_stop_words_dir_exists(self) -> None: 424 | """STOP_WORDS_DIR should point to an existing directory.""" 425 | self.assertTrue( 426 | stop_words.STOP_WORDS_DIR.exists(), 427 | f"Stop words directory not found: {stop_words.STOP_WORDS_DIR}", 428 | ) 429 | self.assertTrue(stop_words.STOP_WORDS_DIR.is_dir()) 430 | 431 | def test_language_files_exist(self) -> None: 432 | """Language files referenced in mapping should exist.""" 433 | for lang_name in AVAILABLE_LANGUAGES: 434 | lang_file = stop_words.STOP_WORDS_DIR / f"{lang_name}.txt" 435 | self.assertTrue(lang_file.exists(), f"Language file missing: {lang_file}") 436 | --------------------------------------------------------------------------------